I’m running a three-level nested foreach loop but cannot keep the code from occupying 100% of the CPUs on a remote server (Linux, CentOS, 14 physical cores, 56 logical cores). The framework I use is:
library(doParallel)
doParallel::registerDoParallel(20)
outRes <- foreach(i = seq1, …) %:%        # foreach1
  foreach(j = seq2, …) %dopar% {          # foreach2
    innerRes <- foreach(k = seq3, …)      # foreach3
  }
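For reference, the size of the registered pool can be checked directly with the standard foreach/parallel helpers (a minimal check, matching the setup above):

library(doParallel)
doParallel::registerDoParallel(20)

foreach::getDoParWorkers()  # 20: the size of the single registered pool
foreach::getDoParName()     # which backend that pool belongs to
parallel::detectCores()     # logical cores visible to R (56 on this machine)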
Three questions occur to me.
- For nested foreach loops, is the registered backend passed down to every foreach level, actually resulting in 20 * 3 = 60 workers? (See the sketch after this list.)
- What is the mathematical relationship between the number of workers and the CPU utilization percentage?
- In my real case, foreach1 and foreach2 are small jobs while foreach3 is a large one, so most of the time the workers sit idle waiting on the small outer loops and are wasted. Is there any way to fix this? (A restructuring sketch follows the reproducible example below.)
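To make questions 1 and 2 concrete, here is a minimal self-contained sketch (tiny worker and task counts purely for illustration; an explicit cluster backend is used so the workers are fresh R sessions). Each task records its process id: %:% unrolls the two outer loops into one task stream for the single registered pool, and a further %dopar% inside a worker finds no registered backend there, so it typically falls back to sequential execution with a warning rather than spawning extra workers. On question 2, the rough relationship for w single-threaded, fully busy workers on c logical cores is a utilization cap near min(w, c) / c * 100%; a multithreaded BLAS can push each worker beyond one core.

library(doParallel)
cl <- parallel::makeCluster(4)  # tiny pool, purely for illustration
doParallel::registerDoParallel(cl)

pids <- foreach(i = 1:2, .combine = "c") %:%
  foreach(j = 1:3, .combine = "c", .packages = "foreach") %dopar% {
    # no backend is registered inside this worker, so the inner %dopar%
    # runs sequentially in the worker's own process
    inner <- foreach(k = 1:2, .combine = "c") %dopar% Sys.getpid()
    unique(c(Sys.getpid(), inner))
  }

length(unique(pids))  # at most 4: one pool of workers, not 4 * 2 processes
parallel::stopCluster(cl)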
PS: a reproducible code example is attached.
library(mlbench)
library(caret)  # for trainControl()/train() below
data("Sonar")
str(Sonar)
table(Sonar$Class)
seed <- 1234
# for cross validation
number_outCV <- 10
repeats_outCV <- 10
number_innerCV <- 10
repeats_innerCV <- 10
# list of numbers of features to model
featureSeq <- c(10, 30, 50)
# for LASSO training
lambda <- exp(seq(-7, 0, 1))
alpha <- 1
dataList <- list(data1 = Sonar, data2 = Sonar, data3 = Sonar,
                 data4 = Sonar, data5 = Sonar, data6 = Sonar)
# library(doMC)
# doMC::registerDoMC(cores = 20)
library(doParallel)
doParallel::registerDoParallel(20)
nestedCV <- foreach::foreach(clust = seq_along(dataList), .combine = "rbind", .verbose = TRUE) %:%
  foreach::foreach(outCV = 1:(number_outCV * repeats_outCV), .combine = "rbind",
                   .packages = "caret", .verbose = TRUE) %dopar% {
    # prepare data
    dataset <- dataList[[clust]]
    table(dataset$Class)

    # split data into development and test sets in the outer CV: repeated 10-fold CV
    set.seed(seed)
    ResampIndex <- caret::createMultiFolds(y = dataset$Class, k = number_outCV, times = repeats_outCV)
    developIndex <- ResampIndex[[outCV]]
    developX <- dataset[developIndex, !colnames(dataset) %in% c("Class")]
    developY <- dataset$Class[developIndex]
    testX <- dataset[-developIndex, !colnames(dataset) %in% c("Class")]
    testY <- dataset$Class[-developIndex]

    # pool of all candidate features
    features_all <- colnames(developX)

    # train the model with inner repeated 10-fold CV
    # foreach loop over the number of features to model
    # (results are data.frames, so combine with rbind rather than c)
    nfeatureRes <- foreach::foreach(featNumIndex = seq_along(featureSeq), .combine = "rbind", .verbose = TRUE) %dopar% {
      nfeature <- featureSeq[featNumIndex]
      selectedFeatures <- features_all[1:nfeature]

      # train LASSO
      lassoCtrl <- trainControl(method = "repeatedcv",  # caret expects "repeatedcv", not "repeatedCV"
                                number = number_innerCV,
                                repeats = repeats_innerCV,
                                verboseIter = TRUE, returnResamp = "all", savePredictions = "all",
                                classProbs = TRUE, summaryFunction = twoClassSummary)
      lassofit.cv <- train(x = developX[, selectedFeatures],
                           y = developY,
                           method = "glmnet",
                           metric = "ROC",
                           trControl = lassoCtrl,
                           tuneGrid = expand.grid(lambda = lambda, alpha = alpha),
                           preProcess = c("center", "scale"))
      AUC.test <- pROC::auc(response = testY,
                            predictor = predict(lassofit.cv, newdata = testX[, selectedFeatures], type = "prob")[[2]])
      # the last expression is returned as this task's result
      data.frame(Class = clust, outCV = outCV, nfeature = nfeature,
                 AUC.cv = max(lassofit.cv$results$ROC), AUC.test = as.numeric(AUC.test))
    }
    # end of the nfeature-search foreach loop
    nfeatureRes
  }
# end of outCV foreach loop as well as the dataList foreach loop
foreach::registerDoSEQ()
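On question 3, one commonly suggested restructuring (a sketch with a trivial body, not a drop-in replacement for the LASSO code above) is to chain all three loops with %:% so that every (clust, outCV, nfeature) combination becomes one task; the expensive inner computation is then the unit that gets distributed, and workers no longer idle behind the small outer loops:

library(doParallel)
cl <- parallel::makeCluster(4)
doParallel::registerDoParallel(cl)

# all three loops unrolled into one stream of 2 * 3 * 4 = 24 tasks;
# the body (the expensive part in the real code) is what gets distributed
res <- foreach(i = 1:2, .combine = "rbind") %:%
  foreach(j = 1:3, .combine = "rbind") %:%
  foreach(k = 1:4, .combine = "rbind") %dopar% {
    data.frame(i = i, j = j, k = k, pid = Sys.getpid())
  }

parallel::stopCluster(cl)

Capping the pool size (e.g. registering 20 workers instead of 56) is also what keeps the server below 100% CPU, since each single-threaded worker occupies roughly one logical core.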