# Simulation Codes for obtaining Tables 6, 7 and 8  
# ``About predictions in spatial autoregressive models:
#   Optimal and almost optimal strategies'' (2016)
#
# Number of cores: simulations done with 48 cores 
# Change the 23/06/2021 ()
#################################################################

outofsample <- function(j, path.dir, param.RData, rho, nb.simul = 1000, inv.W=F, BPw.alpha=1, 
                        table.missing=FALSE)
{
 # - j: an integer, giving the number of cores to be used
 # - path.directory: a character, giving the main path directory 
 # - param.RData: a character, giving the name of the file containing initial parameters to be tested ("param-out.RData" or "param-missing.RData")
 # - nb.simul : an integer, the number of simulations
 # - inv.W : a boolean, if TRUE, we use one matrix for simulating and the other for estimating/predicting
 # - BPw.alpha : a scalar between 0 and 1 : gives the percent of neighbours to be used when using BPn predictor
 # - table.missing : a boolean, if TRUE, we compute the last table of the paper.

  # initialization of the parameters
  load(paste(path.dir, param.RData, sep=""))

  # the number of combinaisons
  nb.poss <- prod(unlist(lapply(list.param, length)))
  # the array with the different possibilities 
  arr.poss <- array(1:nb.poss, unlist(lapply(list.param, length)))
  ind.param <- as.numeric(which(arr.poss==j, arr.ind=T))
  W.listw <- list.param[[1]][[ind.param[1]]]

  if(inv.W) 
    W.listw.inv <- list.param[[1]][[-ind.param[1]]]

  sigma <- list.param[[2]][ind.param[2]]
  outsamp <- list.param[[3]][[ind.param[3]]]
  EM <- list.param[[4]][ind.param[4]]
  #rho <- -0.2
  Wmat <- spdep::listw2mat(W.listw)
  n <- nrow(x) 
  no <- length(outsamp)
 
  if(inv.W)
   {A <- diag(1,n) - rho*spdep::listw2mat(W.listw.inv)
   }else
   {A <- diag(1,n) - rho*Wmat}

  # we source EM algorithm if necessary
  if(EM)
  {source(paste(path.dir, "/function/fEMSAR.R", sep=""))}
  
  # We build Ws
  Ws <- Wmat[-outsamp,-outsamp]
  sub.COL.nb <- spdep::mat2listw(Ws, style="W")
  # sub.COL.nb <- spdep::subset.listw(W.listw, !(1:n %in% outsamp))
  
  # and Wos
  Wos <- Wmat[outsamp,-outsamp]
  
  
  # the neighbours of O (the J index) 
  neigh <- vector("list", no) 
  for(k in 1:no)
  {weight <- W.listw$weights[[outsamp[k]]]
   neigh[[k]] <- W.listw$neighbours[[outsamp[k]]][which(weight>=quantile(weight, 1-BPw.alpha))]
  }
  
  ind.j1 <- sort(setdiff(unique(unlist(neigh)),outsamp))

  # Matrix W{J,O}
  Wn <- Wmat[c(ind.j1, outsamp), c(ind.j1, outsamp)]
  # Wnb <- Wmat[c(ind.j1b,outsamp),c(ind.j1b,outsamp)] 
  
  # Because the size of W{J,O} is different from n, we need to know the
  # relationship between old and new indexes 
  ind.j2 <- rep(NA, 283)
  ind.j2[(1:n)[-outsamp]] <- 1:(n-length(outsamp))
  # ind.j4 are the J observations in S
  ind.j4 <- ind.j2[ind.j1]
  
  # Matrix W[JJ]
  # Wj <- Wmat[ind.j1, ind.j1]
  # sub.COL.nb.J <- spdep::mat2listw(Wj)
  # sub.COL.nb.J <- spdep::subset.listw(W.listw, (1:n %in% ind.j1))
  
  # If we care about the missing table, we just care about the site io
  if(table.missing)
  {n.single <- 1
  }else
  {n.single <- no
  }
  
  # initialisation of the predictors (single prediction)
  res.TS1 <- res.TC1 <- res.BP1 <- res.BPw1<- res.BPn1 <- matrix(0, n.single, nb.simul)
  
  # initialisation of the predictors (global)  
  res.TC <- res.BP <- res.BPw <- res.BPn <- matrix(0, n.single, nb.simul)
  
  # we loop
  for (l in 1:nb.simul)
  {
    set.seed(l)
    Y0 <- x%*%beta0 + rnorm(n, 0, sigma)    
    y <- solve(A, Y0)                      # y = (ys yo)
    don <- as.data.frame(x[,-1])
    don <- data.frame(don, y)
    
    ##########
    # estimate parameter with ML on S sample 
    sar.res <- spdep::lagsarlm(y~., data=don[-outsamp,],
                               sub.COL.nb , method="eigen")
    
    # on r?cup?re les coeff beta
    sar.coeff <- as.numeric(sar.res$coefficients)
    # et rho
    hatrho = sar.res$rho
    # et Sigma2 
    hatsigma <- sar.res$s2
    
    ############   ALGO EM
    if(EM)
    {Ys<-y[-outsamp]
    theta<-fEMSAR(Ys, x, Wmat, hatrho, hatsigma, sar.coeff, outsamp)
    # new beta 
    sar.coeff<-theta$beta
    # new rho
    hatrho=theta$rho
    # new Sigma2 
    hatsigma<-theta$sigma2 
    }
    ################################
    
    # Preparation of the computations
    y0 <- x%*%sar.coeff
    
    # we compute Q
    Q <- (diag(1,n) - hatrho*(Wmat+t(Wmat))+
            hatrho^2*crossprod(Wmat))/hatsigma
   
     # Qoo et Qos
    Qo <- Q[outsamp, outsamp]
    Qos <- Q[outsamp, -outsamp]
    
    # We compute sigma 
    Sigma <- solve(Q)
    Sigmaos <- Sigma[outsamp, -outsamp]
    Sigmas <- Sigma[-outsamp, -outsamp]
    
    # Q[J+O] : precision matrix for units in O+J
    nn <- nrow(Wn)
    Qn <- (diag(1,nn)- hatrho*(Wn+t(Wn)) +
           hatrho^2*crossprod(Wn))/hatsigma
    
    Qno <- Qn[(nn-length(outsamp)+1):nn, (nn-length(outsamp)+1):nn]
    Qnoj <- Qn[(nn-length(outsamp)+1):nn, 1:(nn-length(outsamp))]
    
    
    ############    Single prediction 
  
    res.i <- matrix(0, n.single, 6) # initialisation
    
    for (i in 1:n.single)
    {
      res.i[i,1] <- y[outsamp[i]]
      
      # we drop the missing neighbours to i
      # sub.COL.nb.i <- subset(W.listw,!(1:n %in% outsamp[-i]))
      # Wmat.i <- spdep::listw2mat(sub.COL.nb.i)
      Wmat.i <- Wmat[!(1:n %in% outsamp[-i]), !(1:n %in% outsamp[-i])]
      sub.COL.nb.i <- spdep::mat2listw(Wmat.i, style="W") 
      Wmat.i <- spdep::listw2mat(sub.COL.nb.i)
      
      indice <- attr(sub.COL.nb.i, "region.id")
      new.ind.samp <- which(indice==as.character(outsamp[i]))
      
      # TS1
      wy <- spdep::lag.listw(sub.COL.nb.i,y[as.numeric(indice)])
      res.i[i,2] <- y0[outsamp[i]] + hatrho*wy[new.ind.samp]
      
      # TC1
      B <- diag(1,length(indice))-hatrho*Wmat.i
      if(no>1)
       {yj <- solve(B,y0[-outsamp[-i]])
      }else
      {yj <- solve(B,y0)}
      
      res.i[i,3] <- yj[new.ind.samp]
      
      # BP
      Qi<-(diag(1,length(indice))-hatrho*(Wmat.i+t(Wmat.i))+
             hatrho^2*t(Wmat.i)%*%Wmat.i)/hatsigma
      
      Qio<-Qi[new.ind.samp,new.ind.samp]
      Qios<-Qi[new.ind.samp,-new.ind.samp]
      
      res.i[i,4] <-  res.i[i,3] - solve(Qio)%*%Qios%*%(y[as.numeric(indice)[-new.ind.samp]]-yj[-new.ind.samp])
      
      # BPw
      Sigmai = solve(Qi)
      Sigmaios <- Sigmai[new.ind.samp,-new.ind.samp]
      Sigmais <- Sigmai[-new.ind.samp,-new.ind.samp]
      
      yab <- try(res.i[i,3] + (Sigmaios%*%Wmat.i[new.ind.samp,-new.ind.samp])/
                   (Wmat.i[new.ind.samp,-new.ind.samp]%*%Sigmais%*%(Wmat.i[new.ind.samp,-new.ind.samp]))*
                   (Wmat.i[new.ind.samp,-new.ind.samp]%*%(y[as.numeric(indice)[-new.ind.samp]]-yj[-new.ind.samp])),TRUE)
      res.i[i,5] <- ifelse(class(yab)[1]=="try-error", NaN, yab)
      
      # BPn
      eiw=matrix(0,length(sub.COL.nb.i$neighbours[[new.ind.samp]]),nrow(Sigmais))
      new.ind.e<-1:nrow(Sigmais)
      new.ind.e[(new.ind.samp+1):(nrow(Sigmais)+1)]<- new.ind.e[new.ind.samp:nrow(Sigmais)]
      n.i<-length(sub.COL.nb.i$neighbours[[new.ind.samp]])
      eiw[cbind(1:n.i,new.ind.e[sub.COL.nb.i$neighbours[[new.ind.samp]]])]<-1
      
      ybpn <- try(res.i[i,3] + ((Sigmaios%*%t(eiw))%*%solve(eiw%*%Sigmais%*%t(eiw)))%*%(eiw%*%(y[as.numeric(indice)[-new.ind.samp]]-yj[-new.ind.samp])),TRUE)
      res.i[i,6] <- ifelse(class(yab)[1]=="try-error", NaN, ybpn)
    }
    
    # we save the results
    res.TS1[,l] <- (res.i[,2]-res.i[,1])^2
    res.TC1[,l] <- (res.i[,3]-res.i[,1])^2
    res.BP1[,l] <- (res.i[,4]-res.i[,1])^2
    res.BPw1[,l] <- (res.i[,5]-res.i[,1])^2
    res.BPn1[,l] <- (res.i[,6]-res.i[,1])^2
    ############       
    
    
    ############   Global predictions (Rq : pas de boucles)
    # TC
    B <- diag(1, n) - hatrho*Wmat
    yj <- solve(B, y0)
    yjk <- yj[outsamp]
    
    # BP
    ygt =  yjk - solve(Qo)%*%Qos%*%((y-yj)[-outsamp])
    
    # BPw
    if(no>1)
    {y_bpw = try(yjk + (Sigmaos%*%t(Wos))%*% solve(Wos%*%Sigmas%*%t(Wos))%*%(Wos%*%(y-yj)[-outsamp]),TRUE)
    }else
    {y_bpw =  yjk + sum(Sigmaos*t(Wos))/sum((Wos%*%Sigmas)*t(Wos))*(Wos%*%(y-yj)[-outsamp])}
    
    # BPwn
    y_bpn = yjk - solve(Qno)%*%Qnoj%*%(((y-yj)[-outsamp])[ind.j4])
    
    # Stockage des r?sultats
    res.TC[,l] <- ((y[outsamp]-yjk)^2)[1:n.single]
    res.BP[,l] <- ((y[outsamp]-ygt)^2)[1:n.single]
    if(class(y_bpw)[1]!="try-error") 
    {res.BPw[,l] <- ((y[outsamp]-y_bpw)^2)[1:n.single]
    }else
    {res.BPw[,l] <- rep(NA,length(n.single))}
    res.BPn[,l] <- ((y[outsamp]-y_bpn)^2)[1:n.single]
  }
  

  
  # we print the results
  results<-list(
    MSE=round(c(mean(apply(res.BP,2,mean)),      # BP Prediction mean square error
                mean(apply(res.BP1,2,mean,na.rm=TRUE),na.rm=TRUE),  # BP1 Prediction mean square error
                mean(apply(res.TC,2,mean,na.rm=TRUE),na.rm=TRUE),  # TC Prediction mean square error
                mean(apply(res.TC1,2,mean,na.rm=TRUE),na.rm=TRUE),  # TC1 Prediction mean square error
                mean(apply(res.TS1,2,mean,na.rm=TRUE),na.rm=TRUE),  # TS Prediction mean square error
                mean(apply(res.BPw,2,mean,na.rm=TRUE),na.rm=TRUE),  # BPw Prediction mean square error
                mean(apply(res.BPw1,2,mean,na.rm=TRUE),na.rm=TRUE),  # BPw1 Prediction mean square error
                mean(apply(res.BPn,2,mean,na.rm=TRUE),na.rm=TRUE),  # BPn Prediction mean square error
                mean(apply(res.BPn1,2,mean,na.rm=TRUE),na.rm=TRUE)  # BPn1 Prediction mean square error
    ),3),   #############  End Mean of the MSE
    # Begin the sd
    sd=round(c(sd(apply(res.BP,2,mean)),      # BP Prediction mean square error
               sd(apply(res.BP1,2,mean,na.rm=TRUE),na.rm=TRUE),  # BP1 Prediction mean square error
               sd(apply(res.TC,2,mean,na.rm=TRUE),na.rm=TRUE),  # TC Prediction mean square error
               sd(apply(res.TC1,2,mean,na.rm=TRUE),na.rm=TRUE),  # TC1 Prediction mean square error
               sd(apply(res.TS1,2,mean,na.rm=TRUE),na.rm=TRUE),  # TS Prediction mean square error
               sd(apply(res.BPw,2,mean,na.rm=TRUE),na.rm=TRUE),  # BPw Prediction mean square error
               sd(apply(res.BPw1,2,mean,na.rm=TRUE),na.rm=TRUE),  # BPw1 Prediction mean square error
               sd(apply(res.BPn,2,mean,na.rm=TRUE),na.rm=TRUE),  # BPn Prediction mean square error
               sd(apply(res.BPn1,2,mean,na.rm=TRUE),na.rm=TRUE)  # BPn1 Prediction mean square error
    ),3))# end result

  names(results[[1]])<-c("BP","BP1","TC","TC1","TS1","BPw","BPw1","BPn","BPn1")
  names(results[[2]])<-c("BP","BP1","TC","TC1","TS1","BPw","BPw1","BPn","BPn1")
  print(results)
}  



#############################################
# How to use of parallel computing :
####################

############################
######   OUT-OF-SAMPLE

# we load the data 
# remind the file param-out.RData is obtained in the main document: GLT.html
path.dir <- "Z:/Thibault Pro/research/statistique spatiale/pr?diction Michel Christine/article spatial economic analysis/codes/"
load(paste(path.dir, "parameters/param-out.RData", sep=""))

# the number of combinaisons
nb.poss <- prod(unlist(lapply(list.param, length)))

require("snowfall")
# rho =0.35
sfInit(parallel=TRUE, cpus=16)
result.out <- sfClusterApplyLB(1:nb.poss, outofsample, path.dir=path.dir,
                               param.RData="parameters/param-out.RData", rho=0.35, 
                               nb.simul=1000, inv.W=F, BPw.alpha=1, table.missing=F)
sfStop()

save(result.out, file=paste(path.dir, "result/res-out.RData", sep="")) 

# rho = -0.2
sfInit(parallel=TRUE, cpus=nb.poss)
result.out.neg <- sfClusterApplyLB(1:nb.poss, outofsample, path.dir=path.dir, param.RData="parameters/param-out.RData", rho=-0.2, nb.simul=1000, inv.W=F, BPw.alpha=1, table.missing=F)
sfStop()

save(result.out.neg, file=paste(path.dir, "result/res-out-neg.RData", sep="")) 

############################
######   Permuting the matrices in the simulations and prediction

require("snowfall")
sfInit(parallel=TRUE, cpus=nb.poss)
result.inv <- sfClusterApplyLB(1:nb.poss, outofsample, path.dir=path.dir, param.RData="parameters/param-out.RData", nb.simul=1000, inv.W=T, BPw.alpha=1, table.missing=F)
sfStop()

save(result.inv, file=paste(path.dir, "result/res-out-inv.RData", sep="")) 


############################
######   MISSING
load(paste(path.dir, "param-missing.RData", sep=""))

# the number of combinaisons
nb.poss <- prod(unlist(lapply(list.param, length)))

require("snowfall")
sfInit(parallel=TRUE, cpus=nb.poss)
result.miss <- sfClusterApplyLB(1:nb.poss, outofsample, path.dir=path.dir, param.RData="parameters/param-missing.RData", nb.simul=1000, inv.W=F, BPw.alpha=1, table.missing=T)
sfStop()

save(result.miss, file=paste(path.dir, "result/res-missing.RData", sep="")) 