2

I was able to run the following code without any problems:

# first code: works fine
library(dplyr)
library(ranger)

original_data = rbind( data_1 = data.frame( class = 1, height = rnorm(10000, 180,10), weight = rnorm(10000, 90,10), salary = rnorm(10000,50000,10000)),  data_2 = data.frame(class = 0, height = rnorm(100, 160,10), weight = rnorm(100, 100,10), salary = rnorm(100,40000,10000)) )

original_data$class = as.factor(original_data$class)
original_data$id = 1:nrow(original_data)

test_set=  rbind(original_data[ sample( which( original_data$class == "0" ) , replace = FALSE , 30 ) , ], original_data[ sample( which( original_data$class == "1" ) , replace = FALSE, 2000 ) , ])

train_set = anti_join(original_data, test_set)

# Step 2: Create "Balanced" Random Subsets:

results <- list()
for (i in 1:100)
   
{
   iteration_i = i
   
    sample_i =  rbind(train_set[ sample( which( train_set$class == "0" ) , replace = TRUE , 50 ) , ], train_set[ sample( which( train_set$class == "1" ) , replace = TRUE, 60 ) , ])
   
    results_tmp = data.frame(iteration_i, sample_i)
    results_tmp$iteration_i = as.factor(results_tmp$iteration_i)
   results[[i]] <- results_tmp
   
}

results_df <- do.call(rbind.data.frame, results)

X<-split(results_df, results_df$iteration)

 invisible(lapply(seq_along(results),
       function(i,x) {assign(paste0("train_set_",i),x[[i]], envir=.GlobalEnv)},
       x=results))

# Step 3: Train Models on Each Subset:
wd = getwd()
results_1 <- list()

for (i in 1:100){
     
    model_i <- ranger(class ~  height + weight + salary, data = X[[i]], probability = TRUE)
    saveRDS(model_i, paste0("wd", paste("model_", i, ".RDS")))
    results_1[[i]] <- model_i   
}

# Step 4: Combine All Models and Use Combined Model to Make Predictions on the Test Set:
results_2 <- list()
for (i in 1:100){
predict_i <- data.frame(predict( results_1[[i]], data = test_set)$predictions)


predict_i$id = 1:nrow(predict_i)
 results_2[[i]] <- predict_i
   
}

final_predictions = aggregate(.~ id, do.call(rbind, results_2), mean)

I am now trying to run the same code (Step 2, Step 3, Step 4) in parallel - here is my attempt:

# second code: does not work fine
library(doParallel)
library(foreach)

registerDoParallel(cores = detectCores())
foreach(i = 1:100) %dopar% {
        # Step 2: Create "Balanced" Random Subsets:

results <- list()
for (i in 1:100)
   
{
   iteration_i = i
   
    sample_i =  rbind(train_set[ sample( which( train_set$class == "0" ) , replace = TRUE , 50 ) , ], train_set[ sample( which( train_set$class == "1" ) , replace = TRUE, 60 ) , ])
   
    results_tmp = data.frame(iteration_i, sample_i)
    results_tmp$iteration_i = as.factor(results_tmp$iteration_i)
   results[[i]] <- results_tmp
   
}

results_df <- do.call(rbind.data.frame, results)

X<-split(results_df, results_df$iteration)

 invisible(lapply(seq_along(results),
       function(i,x) {assign(paste0("train_set_",i),x[[i]], envir=.GlobalEnv)},
       x=results))

# Step 3: Train Models on Each Subset:
wd = getwd()
results_1 <- list()

for (i in 1:100){
     
    model_i <- ranger(class ~  height + weight + salary, data = X[[i]], probability = TRUE)
    saveRDS(model_i, paste0("wd", paste("model_", i, ".RDS")))
    results_1[[i]] <- model_i   
}

# Step 4: Combine All Models and Use Combined Model to Make Predictions on the Test Set:
results_2 <- list()
for (i in 1:100){
predict_i <- data.frame(predict( results_1[[i]], data = test_set)$predictions)


predict_i$id = 1:nrow(predict_i)
 results_2[[i]] <- predict_i
   
}

final_predictions = aggregate(.~ id, do.call(rbind, results_2), mean)
    
}

stopImplicitCluster()

This is giving me the following error:

Error in { : task 1 failed - "could not find function "ranger""

I am not sure why this error is being produced, seeing as I have loaded the "ranger" library.

My Question: Can someone please show me what I am doing wrong and how can I make the second code run like the first code?

Thanks!

Note : After adding the suggestion made by @Waldi, the code doesn't produce an error, but is taking a very long time to run. Does anyone have any recommendations on how to improve this?

user438383
  • 5,716
  • 8
  • 28
  • 43
stats_noob
  • 5,401
  • 4
  • 27
  • 83

1 Answers1

1

You can specify the packages you need using the .packages argument in foreach:

foreach(i = 1:100, .packages = 'ranger') %dopar% {...}

Detailed explanation on footnote regarding parallel processing being slow can be found here

Waldi
  • 39,242
  • 6
  • 30
  • 78
  • @ Waldi: Thank you so much for your answer! I added this correction - my code has been running for almost 20 minutes now and still hasn't finished running ... whereas before, the entire code ran in under 1 minute. Were you able to get my code to run? Am I still doing something wrong? Thank you so much! – stats_noob Jun 24 '22 at 06:02
  • 1
    It worked, but I agree it took much longer than without `parallel`. So this answers your question about packages handling in parallel, but the next question is how to structure the parallel loops so that the overhead due to parallel processing doesn't make everything slower ;-) This is sometimes tricky : parallel doesn't always mean faster! – Waldi Jun 24 '22 at 06:37
  • @ waldi: thank you so much! Do you have any ideas as to how this code might be re-written so that it might run faster? Thank you so much! – stats_noob Jun 24 '22 at 07:18
  • @stats_noob, correct me if I'm wrong, but the non-parallel code is missing the main `for (i in 1:100)` loop, so that it isn't comparable to the parallel version. An advice would be not to use `i` for inner as well as for outer loop : it makes code less readable and analysis more difficult. – Waldi Jun 24 '22 at 17:11
  • @ waldi; thank you for your reply! I am still new to using loops, I will try to look into this! – stats_noob Jun 24 '22 at 17:24