
I wish to run a for loop as a parallel process. The result I get with the for-loop R code below is good to my taste, but I will be applying it to very large data, so the execution is slow.

library(forecast)
library(dplyr)
arima_order_results = data.frame()
seed_out2 <- c(1, 16, 170, 178, 411, 630, 661, 1242, 1625, 1901, 1926, 1927, 1928, 2170, 2779, 3687, 4139, 4583, 4825, 4828, 4829, 4827, 5103, 5211, 5509, 5561, 5569, 5679, 6344, 6490, 6943, 6944, 6945, 6946, 6948, 6950, 6951, 6952)
for (my_seed in seed_out2){
  set.seed(my_seed)
  ## simulate an AR(1) series with true coefficient 0.8, then refit it
  ar1 <- arima.sim(n = 100, model=list(ar = 0.8, order = c(1, 0, 0)), sd = 1)
  ar2 <- auto.arima(ar1, ic = "aicc")
  arr <- as.data.frame(t(ar2$coef))
  ## keep only fits whose estimated ar1 coefficient starts with "0.800"
  if(substr(as.character(arr[1]), 1, 5) == "0.800") {

    arr <- cbind(data.frame(seed=my_seed),arr)
    print(arr)

    arima_order_results = bind_rows(arima_order_results,arr)
    # write.csv(my_seed, paste0(arr, ".csv"), row.names = FALSE)

  } #else print("NOT AVAILABLE")
}

The result

#  seed       ar1
#1  170 0.8006368
#  seed       ar1
#1  411 0.8004152
#  seed       ar1
#1  630 0.8008459
#  seed       ar1
#1  661 0.8001553
#  seed       ar1 intercept
#1 1242 0.8000623 0.8474553
#  seed       ar1
#1 1625 0.8004982
#  seed       ar1
#1 1901 0.8007815
#  seed       ar1
#1 1927 0.8004587
#  seed       ar1
#1 2170 0.8003091
#  seed       ar1
#1 2779 0.8008643
#:
#:
#:
#  seed      ar1
#1 5679 0.800689
#  seed     ar1 intercept
#1 6344 0.80004 0.9800426
#  seed       ar1
#1 6490 0.8004093
#  seed       ar1
#1 6948 0.8006992

What I want

I want a parallel process that uses all four of my processors at the same time, so that the job executes quickly when I apply it to huge data, while giving me the same result.

See what I tried

library(parallel)    
library(foreach)
library(forecast)
library(dplyr)
library(doSNOW)
cl <- parallel::makeCluster(detectCores(), type = "SOCK")   
doSNOW::registerDoSNOW(cl)
arima_order_results = data.frame()
seed_out2 <- c(1, 16, 170, 178, 411, 630, 661, 1242, 1625, 1901, 1926, 1927, 1928, 2170, 2779, 3687, 4139, 4583, 4825, 4828, 4829, 4827, 5103, 5211, 5509, 5561, 5569, 5679, 6344, 6490, 6943, 6944, 6945, 6946, 6948, 6950, 6951, 6952)
lst_out <- foreach::foreach(my_seed = seq_along(seed_out2), .packages = c("dplyr", "forecast") ) %dopar% {
  set.seed(my_seed)
  ar1 <- arima.sim(n = 100, model=list(ar = 0.8, order = c(1, 0, 0)), sd = 1)
  ar2 <- auto.arima(ar1, ic = "aicc")
  arr <- as.data.frame(t(ar2$coef))
  if(substr(as.character(arr[1]), 1, 5) == "0.800") {

    arr <- cbind(data.frame(seed=my_seed),arr)
    print(arr)

    arima_order_results = bind_rows(arima_order_results,arr)
    # write.csv(my_seed, paste0(arr, ".csv"), row.names = FALSE)

  }
}

See my trial result

#>lst_out
#[[1]]
#NULL

#[[2]]
#NULL

#[[3]]
#NULL

#[[4]]
#NULL
#:
#:
#:
#[[36]]
#NULL

#[[37]]
#NULL

#[[38]]
#NULL

I am operating on Windows.

Edit

I want @jay.sf's answer modified in such a way that it is contained in a function like the one I am providing below.

FUN1 <- function(n, ar, sd, arr, R, FUN2){
  FUN2 <- function(i, n, ar, sd, arr) {
    set.seed(i)
    ar1 <- arima.sim(n=n, model=list(ar=ar, order=c(1, 0, 0)), sd=sd)
    ar2 <- auto.arima(ar1, ic="aicc")
    (cf <- ar2$coef)
    if (length(cf) == 0) {
      rep(NA, 2)
    }
    else if (all(grepl(c("ar1|intercept"), names(cf))) &  ## using `grepl`
             substr(cf["ar1"], 1, 5) %in% "arr") { 
      c(cf, seed=i)
    }
    else {
      rep(NA, 2)
    }
  }

  seedv <- 1:R

  library(parallel)
  cl <- makeCluster(detectCores() - 1)
  clusterExport(cl, c("FUN2"), envir=environment())
  clusterEvalQ(cl, suppressPackageStartupMessages(library(forecast)))

  res <- parLapply(cl, seedv, "FUN2")

  res1 <- res[!sapply(res, anyNA)]  ## filter out NAs

  stopCluster(cl)
}
FUN1(n = 10, ar = 0.8, sd = 1, arr = 0.800, R = 1000, FUN2 = FUN2)
  • Maybe you are looking for `doRNG` as [here](https://stackoverflow.com/questions/8358098/how-to-set-seed-for-random-simulations-with-foreach-and-domc-packages) – akrun Dec 26 '20 at 17:19
  • What I need is parallel processing that will produce the same result in my `MWE1` with the aid of `parallel processing`. That is all. – Daniel James Dec 26 '20 at 17:26
  • The two cases are different: the example you gave has two `foreach` functions that must be made `identical`, while my case is that I have a `for loop` function which works well but lacks speed of execution, and I am asking what to do to achieve that speed. – Daniel James Dec 26 '20 at 17:41
  • My `MWE2` is just an attempt toward achieving such speed; the speed is achieved but it prints no result. – Daniel James Dec 26 '20 at 17:43
  • Let us [continue this discussion in chat](https://chat.stackoverflow.com/rooms/226448/discussion-between-daniel-james-and-akrun). – Daniel James Dec 26 '20 at 17:48
  • Just for the record, did you have a look at the [`sparkly`](https://spark.rstudio.com) package? You have indicated that you are aiming for faster execution when applied to huge data; it depends how huge and whether you want to count the time for establishing the Spark connection (once per session), but in principle Spark is designed to handle situations like that. – Konrad Dec 26 '20 at 17:52
  • you mean `sparklyr`? – Daniel James Dec 26 '20 at 18:19

1 Answer


Here is a similar approach to the answer I gave you on one of your previous related questions.

FUN <- function(i) {
  set.seed(i)
  ar1 <- arima.sim(n=100, model=list(ar=0.8, order=c(1, 0, 0)), sd=1)
  ar2 <- auto.arima(ar1, ic="aicc")
  cf <- ar2$coef
  ## case handling
  if (length(cf) == 0) rep(NA, 2)  ## sometimes result is `character(0)` -> NA
  else if (substr(cf[1], 1, 5) %in% "0.800") c(cf, i)  ## hit, that's what we want
  else rep(NA, 2)  ## all other cases -> NA
}

R <- 1e3  ## this would be your 1e5
seedv <- 1:R  ## or use custom seed vector

library(parallel)
cl <- makeCluster(detectCores() - 1)  ## for all cores remove `- 1`
clusterExport(cl, c("FUN"), envir=environment())
clusterEvalQ(cl, suppressPackageStartupMessages(library(forecast)))

res <- `colnames<-`(t(parSapply(cl, seedv, "FUN")), c("cf", "seed"))

stopCluster(cl)

Result

In the result we want to filter out all the rows with NA.

head(res[!is.na(res[,1]), ])
#             cf seed
# [1,] 0.8006368  170
# [2,] 0.8004152  411
# [3,] 0.8008459  630
# [4,] 0.8001553  661

Edit

To include auto.arima results containing combinations of "ar1" and "intercept" coefficients, we are better off using parLapply:

FUN <- function(i) {
  set.seed(i)
  ar1 <- arima.sim(n=50, model=list(ar=0.8, order=c(1, 0, 0)), sd=1)
  ar2 <- auto.arima(ar1, ic="aicc")
  (cf <- ar2$coef)
  if (length(cf) == 0) {
    rep(NA, 2)
  }
  else if (all(grepl(c("ar1|intercept"), names(cf))) &  ## using `grepl`
           substr(cf["ar1"], 1, 5) %in% "0.800") {
    c(cf, seed=i)
  }
  else {
    rep(NA, 2)
  }
}

R <- 1e4
seedv <- 1:R

library(parallel)
cl <- makeCluster(detectCores() - 1)
clusterExport(cl, c("FUN"), envir=environment())
clusterEvalQ(cl, suppressPackageStartupMessages(library(forecast)))

res <- parLapply(cl, seedv, "FUN")

res1 <- res[!sapply(res, anyNA)]  ## filter out NAs

stopCluster(cl)
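
Just to illustrate the grepl filter inside FUN (a standalone check with a hypothetical helper, for illustration only): it keeps coefficient vectors whose names are only "ar1"/"intercept" and drops fits with extra terms such as "ma1".

## hypothetical helper mirroring the condition inside FUN, not part of the pipeline
keep <- function(cf) all(grepl("ar1|intercept", names(cf)))
keep(c(ar1 = 0.80, intercept = 0.85))  ## TRUE  -> then checked for "0.800"
keep(c(ar1 = 0.80, ma1 = -0.30))       ## FALSE -> dropped as NA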

The parLapply call gives a list of named vectors of unequal length, which we can convert to one-row data frames and merge with Reduce.

res2 <- Reduce(function(...) merge(..., all=T), lapply(res1, function(x) as.data.frame(t(x))))

res2[order(res2$seed), c("ar1", "intercept", "seed")]  ## some ordering
#          ar1 intercept seed
# 1  0.8000531  1.335388  290
# 3  0.8002499        NA 2154
# 10 0.8005477        NA 2888
# 11 0.8006736        NA 3203
# 15 0.8009363        NA 4415
# 14 0.8008462        NA 4572
# 4  0.8003495        NA 4726
# 9  0.8005087        NA 6241
# 2  0.8001865        NA 6417
# 13 0.8008060 -1.700587 6845
# 6  0.8003977        NA 7187
# 8  0.8004316        NA 8981
# 7  0.8004268        NA 9368
# 12 0.8007281        NA 9697
# 5  0.8003903        NA 9793
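
As a side note, since dplyr is already loaded in the question, bind_rows could replace the Reduce/merge step: it NA-fills missing columns (here intercept) automatically. A minimal sketch, assuming res1 is the filtered list from above:

library(dplyr)
## each element of res1 is a named vector; make it a one-row data frame and
## stack them, letting bind_rows() NA-fill the missing `intercept` column
res2 <- bind_rows(lapply(res1, function(x) as.data.frame(t(x))))
res2[order(res2$seed), ]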

Edit 2

Here is a function that only requires the user to specify R, the number of iterations. It internally uses doParallel::registerDoParallel to set up an implicit cluster, which uses the usual detectCores() - 1 by default but may also be specified by the user via ncores. The implicit cluster is stopped automatically (via stopImplicitCluster) when the function finishes. Internally, a foreach loop is applied.

library(forecast)
library(doParallel)

arimaze <- function(R, ncores=detectCores() - 1) {
  registerDoParallel(ncores)
  seedv <- 1:R
  FUN <- function(i) {
    set.seed(i)
    ar1 <- arima.sim(n=50, model=list(ar=0.8, order=c(1, 0, 0)), sd=1)
    ar2 <- auto.arima(ar1, ic="aicc")
    cf <- ar2$coef
    if (length(cf) == 0 | !(all(grepl(c("ar1|intercept"), names(cf))) &
                            substr(cf["ar1"], 1, 5) %in% "0.800")) {
      return(rep(NA, 3))
    } else {
      cf <- `length<-`(cf, 2)
      return(c(cf, seed=i))
    }
  }
  message('processing...')
  res <-
    foreach(i=seedv, .combine=rbind.data.frame, .packages='forecast') %dopar% 
    FUN(i)
  message(' done!\n')
  res <- `rownames<-`(res[rowSums(is.na(res)) == 0, ], NULL)
  stopImplicitCluster()
  return(setNames(res, c('ar', 'intercept', 'seed')))
}

Usage

r <- arimaze(1.5e4)
# processing... done!

Result

r
#          ar intercept  seed
# 1 0.8000531  1.335388   290
# 2 0.8008060 -1.700587  6845
# 3 0.8003690 -1.443856 12137
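
As a final remark, the doRNG package mentioned in the comments is another route to reproducible parallel randomness: %dorng% gives each foreach iteration its own reproducible RNG stream, so no set.seed(i) is needed inside the worker. This is only a minimal, self-contained sketch of that mechanism, not a drop-in replacement for arimaze above:

library(doParallel)
library(doRNG)

registerDoParallel(2)  ## small cluster just for this demo

set.seed(123)          ## seeds the whole %dorng% loop
r1 <- foreach(i = 1:5) %dorng% rnorm(1)

set.seed(123)          ## same seed -> identical parallel draws
r2 <- foreach(i = 1:5) %dorng% rnorm(1)

identical(r1, r2)      ## TRUE
stopImplicitCluster()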