I have a dataset data_sp with 132 observations containing 9 predictors and 1 response variable. I perform a twenty-fold cross validation, using ordinary kriging and universal kriging. The raster grid100, on which I do my kriging, contains 61124 features (i.e. cells). For universal kriging, the 9 predictors related to the raster are also included to account for trends. Twenty-fold cross validation with ordinary kriging yields quick results; with universal kriging, however, it takes a minimum of 5 hours (even with parallel kriging as suggested here). Other posts discuss the slowness of universal kriging (e.g. here), and one of the answers points to the number of observations used. In my case, I assume 132 observations should not impact the processing time that much.
My question: What explains the slow processing of universal kriging, and how can I speed this process up beyond parallel kriging?
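For comparison, the ordinary-kriging run (the one that finishes quickly) differs essentially only in dropping the trend covariates from the formula. A minimal sketch, assuming the same train and grid100 objects used below:

variogram_ok = autofitVariogram(Lopend_gemiddelde ~ 1, input_data = train) #variogram fitted to the raw values
autofit_params_ok <- variogram_ok$var_model
#ordinary kriging over the full grid (in practice split over cores like the universal kriging below)
pred_ok <- krige(formula = Lopend_gemiddelde ~ 1, locations = train, newdata = grid100, model = autofit_params_ok)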
Code (universal kriging):
library(sp) # processing spatial vector data - the way gstat needs it
library(raster) # processing spatial raster data.
library(Metrics) #MAE, R2, RMSE
library(parallel) # parallel kriging
library(gstat) # The most popular R-Package for Kriging
library(automap) # Automatize some (or all) parts of the gstat-workflow
### === Cross Validation === ###
#examine and compare model performance between ordinary and universal kriging via 20 rounds of cross-validation - 75% training; 25% testing
#create empty lists where results per round can be stored to
MAE_model = c() #Mean Absolute Error - will be used later to calculate MAE statistics
RMSE_model = c() #Root Mean Square Error - will be used later to calculate RMSE statistics
R2_model = c() #R squared - will be used later to calculate R2 statistics
#twenty seeds - encouraging reproducibility
Seeds = list(1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80 , 85, 90, 95)
Rounds = seq(1,length(Seeds),1)
i = 1
j = 1
## CREATE TRAIN- AND TEST DATASETS
while(i <= length(Seeds)){
print(Rounds[[j]]) #confirm start of next element
print(i) #confirm start of next element in list Seeds
set.seed(Seeds[[i]])
dataset = sort(sample(nrow(data_sp), nrow(data_sp)*.75)) #training = 75% (0.75)
train <- data_sp[dataset,]
test <- data_sp[-dataset,]
variogram_uk = autofitVariogram(Lopend_gemiddelde ~ 1 + nightlight_450 + nightlight_4950 + population_3000 + road_class_1_5000 + road_class_2_1000 + road_class_2_5000 + road_class_3_100 + road_class_3_300 + trafBuf50, input_data = train)
autofit_params_uk <- variogram_uk$var_model
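#note: with covariates on the right-hand side, autofitVariogram fits the
#variogram to the residuals of the linear trend model, not to the raw values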
#PARALLEL
# Calculate the number of cores
no_cores <- detectCores() - 1
# Initiate cluster (after loading all the necessary objects into the R environment: data_sp, grid100, autofit_params_uk)
cl <- makeCluster(no_cores)
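# split the grid-cell indices round-robin into one chunk per core
# (split() recycles f, so the chunk sizes differ by at most one cell)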
parts <- split(x = 1:length(grid100), f = 1:no_cores)
clusterExport(cl = cl, varlist = c("train", "grid100", "autofit_params_uk", "parts"), envir = .GlobalEnv)
clusterEvalQ(cl = cl, expr = c(library('sp'), library('gstat')))
#universal kriging
system.time(parallelX <- parLapply(cl = cl, X = 1:no_cores, fun = function(x) krige(formula = Lopend_gemiddelde ~ 1 + nightlight_450 + nightlight_4950 + population_3000 + road_class_1_5000 + road_class_2_1000 + road_class_2_5000 + road_class_3_100 + road_class_3_300 + trafBuf50, locations = train, newdata = grid100[parts[[x]],], model = autofit_params_uk)))
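#note: with gstat's default global neighbourhood, every one of the 61124
#cells uses all training observations; universal kriging additionally
#evaluates the 9 trend covariates at each prediction location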
stopCluster(cl)
# Merge all the per-core predictions back into one object
mergeParallelX <- Reduce(maptools::spRbind, parallelX)
# Create SpatialPixelsDataFrame from mergeParallelX
mergeParallelX <- SpatialPixelsDataFrame(points = mergeParallelX, data = mergeParallelX@data)
# Plot mergeParallelX
spplot(mergeParallelX["var1.pred"], main = "universal kriging predictions")
#convert to raster
raster_uk <- raster(mergeParallelX["var1.pred"])
#examine
plot(raster_uk)
#make predictions on the testing dataset, using the trained model
#narrow data
test <- test[,c("Lopend_gemiddelde", "spachar", "nightlight_450", "nightlight_4950", "population_3000" , "road_class_1_5000", "road_class_2_1000" , "road_class_2_5000","road_class_3_100", "road_class_3_300", "trafBuf50")]
#spatially join the kriged predictions with the corresponding test points
predicted_model = raster::extract(raster_uk, test, sp=T) #sp = T: keep all data
names(predicted_model)[names(predicted_model) == "var1.pred"] <- "predicted" #rename prediction column (dplyr::rename does not work on sp objects)
#rename variable
Evaluation_UK_train <- predicted_model
## == model performance (MAE) == ##
Evaluation_UK_train$MAE_model = mae(Evaluation_UK_train$Lopend_gemiddelde, Evaluation_UK_train$predicted)
#store results to variable that will be assigned to list
MAE_model_round = Evaluation_UK_train$MAE_model
#store results to list that was defined before loop.
MAE_model[[paste0("Round ", j)]] <- MAE_model_round #assign the errors for each round to a group "Round x" where x varies per round
## == model performance (R2) == ##
#2. R SQUARED error metric -- Coefficient of Determination
RSQUARE = function(y_actual, y_predict){
  cor(y_actual, y_predict)^2
}
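#note: this is the squared Pearson correlation; for kriging predictions it
#can differ from the 1 - SSE/SST definition of the coefficient of determination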
Evaluation_UK_train$R2_model = RSQUARE(Evaluation_UK_train$Lopend_gemiddelde,Evaluation_UK_train$predicted)
#store results to variable that will be assigned to list
R2_model_round = Evaluation_UK_train$R2_model
#store results to list that was defined before loop.
R2_model[[paste0("Round ", j)]] <- R2_model_round #assign the errors for each round to a group "Round x" where x varies per round
## == model performance (RMSE) == ##
Evaluation_UK_train$RMSE_model = rmse(Evaluation_UK_train$Lopend_gemiddelde, Evaluation_UK_train$predicted)
#store results to variable that will be assigned to list
RMSE_model_round = Evaluation_UK_train$RMSE_model
#store results to list that was defined before loop.
RMSE_model[[paste0("Round ", j)]] <- RMSE_model_round #assign the errors for each round to a group "Round x" where x varies per round
#next round
i = i+1
j = j+1
}
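After the loop, the per-round errors stored in MAE_model, RMSE_model and R2_model are summarized (the "later" that the comments above refer to); a minimal sketch of that step:

#mean and standard deviation of each error metric across the 20 rounds
data.frame(
  MAE  = c(mean = mean(unlist(MAE_model)),  sd = sd(unlist(MAE_model))),
  RMSE = c(mean = mean(unlist(RMSE_model)), sd = sd(unlist(RMSE_model))),
  R2   = c(mean = mean(unlist(R2_model)),   sd = sd(unlist(R2_model)))
)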