
I have a dataset data_sp with 132 observations containing 9 predictors and 1 response variable. I perform twenty-fold cross-validation using both ordinary kriging and universal kriging. The raster grid100 on which I krige contains 61124 cells. For universal kriging, the 9 predictors (also available for the raster) are included to account for trends. Twenty-fold cross-validation with ordinary kriging yields quick results; with universal kriging, however, it takes me a minimum of 5 hours, even with parallel kriging as suggested here. Other posts discuss the slowness of universal kriging (e.g. here), and one of the answers points to the number of observations used. With only 132 observations, I assume that is not the bottleneck in my case.

My question: What explains the slow processing of universal kriging, and how can I speed it up further, beyond parallel kriging?
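For illustration, a local-neighborhood variant of the krige() call below would look like the sketch underneath; nmax = 40 is an arbitrary value I have not tuned, and train / grid100 / autofit_params_uk are the objects defined in the full code further down. Restricting the search neighborhood means a smaller kriging system is solved per cell, which may or may not be acceptable for my use case:

#local universal kriging: only the nmax nearest observations enter each prediction
uk_local <- krige(formula = Lopend_gemiddelde ~ 1 + nightlight_450 + nightlight_4950 +
                    population_3000 + road_class_1_5000 + road_class_2_1000 +
                    road_class_2_5000 + road_class_3_100 + road_class_3_300 + trafBuf50,
                  locations = train, newdata = grid100,
                  model = autofit_params_uk,
                  nmax = 40) #arbitrary neighborhood size, not tuned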

Code (universal kriging):

library(sp)       # processing spatial vector data - the way gstat needs it
library(raster)   # processing spatial raster data
library(Metrics)  # mae(), rmse()
library(parallel) # parallel kriging
library(gstat)    # the most popular R package for kriging
library(automap)  # automates some (or all) parts of the gstat workflow

### === Cross Validation === ###

#examine universal kriging performance via 20-fold cross-validation - 75% training; 25% testing

#create empty containers where results per round can be stored
MAE_model = c()  #Mean Absolute Error - will be used later to calculate MAE statistics
RMSE_model = c() #Root Mean Square Error - will be used later to calculate RMSE statistics
R2_model = c()   #R squared - will be used later to calculate R2 statistics

#twenty seeds - encouraging reproducibility
Seeds = list(1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80 , 85, 90, 95)
Rounds = seq_along(Seeds)


i = 1
j = 1
## CREATE TRAIN- AND TEST DATASETS
while(i <= length(Seeds)){
  print(Rounds[[j]]) #confirm start of next round
  print(i) #confirm index of next element in list Seeds
  set.seed(Seeds[[i]])
  dataset = sort(sample(nrow(data_sp), nrow(data_sp)*.75)) #training = 75% (0.75)
  
  train <- data_sp[dataset,]
  test <- data_sp[-dataset,]
  
  
  variogram_uk = autofitVariogram(
    Lopend_gemiddelde ~ 1 + nightlight_450 + nightlight_4950 + population_3000 +
      road_class_1_5000 + road_class_2_1000 + road_class_2_5000 +
      road_class_3_100 + road_class_3_300 + trafBuf50,
    input_data = train)
  autofit_params_uk <- variogram_uk$var_model
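  # Note: with covariates on the right-hand side of the formula,
  # autofitVariogram fits the variogram of the residuals from the linear
  # trend - which is what universal kriging needs - rather than the
  # variogram of the raw response.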
  
  
  #PARALLEL

  
  # Calculate the number of cores
  no_cores <- detectCores() - 1
  
  # Initiate cluster (after loading all the necessary object to R environment: data_sp, grid100, autofit_params_uk)
  cl <- makeCluster(no_cores)
  
  parts <- parallel::splitIndices(length(grid100), no_cores) #one contiguous chunk of cell indices per core
  
  clusterExport(cl = cl, varlist = c("train", "grid100", "autofit_params_uk", "parts"), envir = .GlobalEnv)
  clusterEvalQ(cl = cl, expr = {library(sp); library(gstat)})
  
  
  #universal kriging (each worker kriges one chunk of the grid)
  system.time(parallelX <- parLapply(cl = cl, X = 1:no_cores, fun = function(x)
    krige(formula = Lopend_gemiddelde ~ 1 + nightlight_450 + nightlight_4950 +
            population_3000 + road_class_1_5000 + road_class_2_1000 +
            road_class_2_5000 + road_class_3_100 + road_class_3_300 + trafBuf50,
          locations = train, newdata = grid100[parts[[x]], ], model = autofit_params_uk)))
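  # parallelX is a list with one prediction object per chunk, each carrying
  # var1.pred and var1.var for its share of the grid cells.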
  
  stopCluster(cl)
  
  # Merge all the predictions (handles any number of chunks instead of
  # assuming exactly 15 workers)
  mergeParallelX <- Reduce(maptools::spRbind, parallelX)
  
  
  # Create SpatialPixelsDataFrame from mergeParallelX
  mergeParallelX <- SpatialPixelsDataFrame(points = mergeParallelX, data = mergeParallelX@data)
  
  # Plot mergeParallelX (print() is required for lattice plots inside a loop)
  print(spplot(mergeParallelX["var1.pred"], main = "universal kriging predictions"))
  
  #convert to raster
  raster_uk <- raster(mergeParallelX["var1.pred"])
  #examine
  plot(raster_uk)
  
  
  #make predictions on the testing dataset, using the trained model
  
  #narrow data
  test <- test[,c("Lopend_gemiddelde", "spachar", "nightlight_450", "nightlight_4950",  "population_3000" , "road_class_1_5000", "road_class_2_1000" , "road_class_2_5000","road_class_3_100", "road_class_3_300", "trafBuf50")]
  
  
  
  #spatially join the kriged predictions with the test observations
  predicted_model = raster::extract(raster_uk, test, sp = T) #sp = T: keep all data
  
  #rename the extracted prediction column (base R, avoiding a dplyr dependency)
  names(predicted_model@data)[names(predicted_model@data) == "var1.pred"] <- "predicted"
  
  #rename variable
  Evaluation_UK_train <- predicted_model
  
  ## == model performance (MAE) == ##
  
  Evaluation_UK_train$MAE_model = mae(Evaluation_UK_train$Lopend_gemiddelde, Evaluation_UK_train$predicted)
  
  
  #store results to variable that will be assigned to list
  MAE_model_round = Evaluation_UK_train$MAE_model
  
  
  #store results to list that was defined before loop.
  MAE_model[[paste0("Round ", j)]] <- MAE_model_round #assign the errors for each round to a group "Round x" where x varies per round 
  
  
  ## == model performance (R2) == ##
  
  #R SQUARED error metric -- Coefficient of Determination
  RSQUARE = function(y_actual,y_predict){
    cor(y_actual,y_predict)^2}
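  # Note: squared correlation can differ from 1 - SS_res/SS_tot when the
  # predictions are biased; it only measures linear association.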
  
  Evaluation_UK_train$R2_model = RSQUARE(Evaluation_UK_train$Lopend_gemiddelde,Evaluation_UK_train$predicted)
  
  
  #store results to variable that will be assigned to list
  R2_model_round = Evaluation_UK_train$R2_model
  
  
  #store results to list that was defined before loop.
  R2_model[[paste0("Round ", j)]] <- R2_model_round #assign the errors for each round to a group "Round x" where x varies per round 
  
  
  ## == model performance (RMSE) == ##
 
  
  Evaluation_UK_train$RMSE_model = rmse(Evaluation_UK_train$Lopend_gemiddelde, Evaluation_UK_train$predicted)
  
  
  #store results to variable that will be assigned to list
  RMSE_model_round = Evaluation_UK_train$RMSE_model
  
  
  #store results to list that was defined before loop.
  RMSE_model[[paste0("Round ", j)]] <- RMSE_model_round #assign the errors for each round to a group "Round x" where x varies per round 
  
  
  
  #next round
  i = i+1
  j = j+1
  
}
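
Once the loop has finished, the per-round scores can be summarised; a minimal sketch, assuming the lists filled as above:

#summarise cross-validation results across the twenty rounds
mean(unlist(MAE_model))  #average MAE
mean(unlist(RMSE_model)) #average RMSE
mean(unlist(R2_model))   #average R2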
