
I wish to run a for loop as a parallel process. The result I get with the for-loop R code below is good to my taste, but I will be applying it to very large data, so the execution is slow.

library(forecast)
library(dplyr)
arima_order_results = data.frame()
seed_out2 <- c(1, 16, 170, 178, 411, 630, 661, 1242, 1625, 1901, 1926, 1927, 1928, 2170, 2779, 3687, 4139, 4583, 4825, 4828, 4829, 4827, 5103, 5211, 5509, 5561, 5569, 5679, 6344, 6490, 6943, 6944, 6945, 6946, 6948, 6950, 6951, 6952)
for (my_seed in seed_out2){
  set.seed(my_seed)
  ## simulate an AR(1) series with true coefficient 0.8, then refit it
  ar1 <- arima.sim(n = 100, model=list(ar = 0.8, order = c(1, 0, 0)), sd = 1)
  ar2 <- auto.arima(ar1, ic = "aicc")
  arr <- as.data.frame(t(ar2$coef))
  ## keep only fits whose estimated ar1 coefficient starts with "0.800"
  if(substr(as.character(arr[1]), 1, 5) == "0.800") {

    arr <- cbind(data.frame(seed=my_seed),arr)
    print(arr)

    arima_order_results = bind_rows(arima_order_results,arr)
    # write.csv(my_seed, paste0(arr, ".csv"), row.names = FALSE)

  } #else print("NOT AVAILABLE")
}

The result

#  seed       ar1
#1  170 0.8006368
#  seed       ar1
#1  411 0.8004152
#  seed       ar1
#1  630 0.8008459
#  seed       ar1
#1  661 0.8001553
#  seed       ar1 intercept
#1 1242 0.8000623 0.8474553
#  seed       ar1
#1 1625 0.8004982
#  seed       ar1
#1 1901 0.8007815
#  seed       ar1
#1 1927 0.8004587
#  seed       ar1
#1 2170 0.8003091
#  seed       ar1
#1 2779 0.8008643
#:
#:
#:
#  seed      ar1
#1 5679 0.800689
#  seed     ar1 intercept
#1 6344 0.80004 0.9800426
#  seed       ar1
#1 6490 0.8004093
#  seed       ar1
#1 6948 0.8006992

What I want

I want a parallel process that uses all four of my processors at the same time, so that the job executes quickly when I apply it to huge data, while giving me the same result.

See what I tried

library(parallel)    
library(foreach)
library(forecast)
library(dplyr)
library(doSNOW)
cl <- parallel::makeCluster(detectCores(), type = "SOCK")   
doSNOW::registerDoSNOW(cl)
arima_order_results = data.frame()
seed_out2 <- c(1, 16, 170, 178, 411, 630, 661, 1242, 1625, 1901, 1926, 1927, 1928, 2170, 2779, 3687, 4139, 4583, 4825, 4828, 4829, 4827, 5103, 5211, 5509, 5561, 5569, 5679, 6344, 6490, 6943, 6944, 6945, 6946, 6948, 6950, 6951, 6952)
lst_out <- foreach::foreach(my_seed = seq_along(seed_out2), .packages = c("dplyr", "forecast") ) %dopar% {
  set.seed(my_seed)
  ar1 <- arima.sim(n = 100, model=list(ar = 0.8, order = c(1, 0, 0)), sd = 1)
  ar2 <- auto.arima(ar1, ic = "aicc")
  arr <- as.data.frame(t(ar2$coef))
  if(substr(as.character(arr[1]), 1, 5) == "0.800") {

    arr <- cbind(data.frame(seed=my_seed),arr)
    print(arr)

    arima_order_results = bind_rows(arima_order_results,arr)
    # write.csv(my_seed, paste0(arr, ".csv"), row.names = FALSE)

  }
}

See my trial result

#>lst_out
#[[1]]
#NULL

#[[2]]
#NULL

#[[3]]
#NULL

#[[4]]
#NULL
#:
#:
#:
#[[36]]
#NULL

#[[37]]
#NULL

#[[38]]
#NULL

I am operating on Windows.

Edit

I want @jay.sf's answer modified in such a way that it is contained in a function like the one I am providing below.

FUN1 <- function(n, ar, sd, arr, R, FUN2){
  FUN2 <- function(i, n, ar, sd, arr) {
    set.seed(i)
    ar1 <- arima.sim(n=n, model=list(ar=ar, order=c(1, 0, 0)), sd=sd)
    ar2 <- auto.arima(ar1, ic="aicc")
    (cf <- ar2$coef)
    if (length(cf) == 0) {
      rep(NA, 2)
    }
    else if (all(grepl(c("ar1|intercept"), names(cf))) &  ## using `grepl`
             substr(cf["ar1"], 1, 5) %in% "arr") { 
      c(cf, seed=i)
    }
    else {
      rep(NA, 2)
    }
  }

  seedv <- 1:R

  library(parallel)
  cl <- makeCluster(detectCores() - 1)
  clusterExport(cl, c("FUN2"), envir=environment())
  clusterEvalQ(cl, suppressPackageStartupMessages(library(forecast)))

  res <- parLapply(cl, seedv, "FUN2")

  res1 <- res[!sapply(res, anyNA)]  ## filter out NAs

  stopCluster(cl)
}
FUN1(n = 10, ar = 0.8, sd = 1, arr = 0.800, R = 1000, FUN2 = FUN2)
  • Maybe you are looking for `doRNG` as [here](https://stackoverflow.com/questions/8358098/how-to-set-seed-for-random-simulations-with-foreach-and-domc-packages) – akrun Dec 26 '20 at 17:19
  • What I need is parallel processing that will produce the same result in my `MWE1` with the aid of `parallel processing`. That is all. – Daniel James Dec 26 '20 at 17:26
  • The two cases are different: the example you gave has two `foreach` functions that must be made `identical`, while my case is that I have a `for loop` function which works well but lacks speed of execution, and I am asking what to do to achieve that speed. – Daniel James Dec 26 '20 at 17:41
  • My `MWE2` is just an attempt toward achieving such speed; the speed is achieved but it prints no result. – Daniel James Dec 26 '20 at 17:43
  • Let us [continue this discussion in chat](https://chat.stackoverflow.com/rooms/226448/discussion-between-daniel-james-and-akrun). – Daniel James Dec 26 '20 at 17:48
  • Just for the record, did you have a look at the [`sparkly`](https://spark.rstudio.com) package? You have indicated that you are aiming for faster execution when applied to huge data; it depends how huge and whether you want to count the time for establishing the Spark connection (once per session), but in principle Spark is designed to handle situations like that. – Konrad Dec 26 '20 at 17:52
  • you mean `sparklyr`? – Daniel James Dec 26 '20 at 18:19

1 Answer


Here is a similar approach to the answer I gave you on one of your previous related questions.

FUN <- function(i) {
  set.seed(i)
  ar1 <- arima.sim(n=100, model=list(ar=0.8, order=c(1, 0, 0)), sd=1)
  ar2 <- auto.arima(ar1, ic="aicc")
  cf <- ar2$coef
  ## case handling
  if (length(cf) == 0) rep(NA, 2)  ## sometimes result is `character(0)` -> NA
  else if (substr(cf[1], 1, 5) %in% "0.800") c(cf, i)  ## hit, that's what we want
  else rep(NA, 2)  ## all other cases -> NA
}

R <- 1e3  ## this would be your 1e5
seedv <- 1:R  ## or use custom seed vector

library(parallel)
cl <- makeCluster(detectCores() - 1)  ## for all cores remove `- 1`
clusterExport(cl, c("FUN"), envir=environment())
clusterEvalQ(cl, suppressPackageStartupMessages(library(forecast)))

res <- `colnames<-`(t(parSapply(cl, seedv, "FUN")), c("cf", "seed"))

stopCluster(cl)

Result

In the result we want to filter out all the rows with NA.

head(res[!is.na(res[,1]), ])
#             cf seed
# [1,] 0.8006368  170
# [2,] 0.8004152  411
# [3,] 0.8008459  630
# [4,] 0.8001553  661

Edit

To include auto.arima results containing combinations of "ar1" and "intercept" coefficients, we are better off using parLapply:

FUN <- function(i) {
  set.seed(i)
  ar1 <- arima.sim(n=50, model=list(ar=0.8, order=c(1, 0, 0)), sd=1)
  ar2 <- auto.arima(ar1, ic="aicc")
  (cf <- ar2$coef)
  if (length(cf) == 0) {
    rep(NA, 2)
  }
  else if (all(grepl(c("ar1|intercept"), names(cf))) &  ## using `grepl`
           substr(cf["ar1"], 1, 5) %in% "0.800") {
    c(cf, seed=i)
  }
  else {
    rep(NA, 2)
  }
}

R <- 1e4
seedv <- 1:R

library(parallel)
cl <- makeCluster(detectCores() - 1)
clusterExport(cl, c("FUN"), envir=environment())
clusterEvalQ(cl, suppressPackageStartupMessages(library(forecast)))

res <- parLapply(cl, seedv, "FUN")

res1 <- res[!sapply(res, anyNA)]  ## filter out NAs

stopCluster(cl)
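
Just to illustrate the grepl filter inside FUN (a standalone check with a hypothetical helper, for illustration only): it keeps coefficient vectors whose names are only "ar1"/"intercept" and drops fits with extra terms such as "ma1".

## hypothetical helper mirroring the condition inside FUN, not part of the pipeline
keep <- function(cf) all(grepl("ar1|intercept", names(cf)))
keep(c(ar1 = 0.80, intercept = 0.85))  ## TRUE  -> then checked for "0.800"
keep(c(ar1 = 0.80, ma1 = -0.30))       ## FALSE -> dropped as NA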

The parLapply call gives a list of named vectors of unequal length, which we can convert to one-row data frames and merge with Reduce.

res2 <- Reduce(function(...) merge(..., all=T), lapply(res1, function(x) as.data.frame(t(x))))

res2[order(res2$seed), c("ar1", "intercept", "seed")]  ## some ordering
#          ar1 intercept seed
# 1  0.8000531  1.335388  290
# 3  0.8002499        NA 2154
# 10 0.8005477        NA 2888
# 11 0.8006736        NA 3203
# 15 0.8009363        NA 4415
# 14 0.8008462        NA 4572
# 4  0.8003495        NA 4726
# 9  0.8005087        NA 6241
# 2  0.8001865        NA 6417
# 13 0.8008060 -1.700587 6845
# 6  0.8003977        NA 7187
# 8  0.8004316        NA 8981
# 7  0.8004268        NA 9368
# 12 0.8007281        NA 9697
# 5  0.8003903        NA 9793
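
As a side note, since dplyr is already loaded in the question, bind_rows could replace the Reduce/merge step: it NA-fills missing columns (here intercept) automatically. A minimal sketch, assuming res1 is the filtered list from above:

library(dplyr)
## each element of res1 is a named vector; make it a one-row data frame and
## stack them, letting bind_rows() NA-fill the missing `intercept` column
res2 <- bind_rows(lapply(res1, function(x) as.data.frame(t(x))))
res2[order(res2$seed), ]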

Edit 2

Here is a function that only requires the user to specify R, the number of iterations. It internally uses doParallel::registerDoParallel to set up an implicit cluster, which uses the usual detectCores() - 1 by default but may also be specified by the user via ncores. The implicit cluster is stopped automatically (via stopImplicitCluster) when the function finishes. Internally, a foreach loop is applied.

library(forecast)
library(doParallel)

arimaze <- function(R, ncores=detectCores() - 1) {
  registerDoParallel(ncores)
  seedv <- 1:R
  FUN <- function(i) {
    set.seed(i)
    ar1 <- arima.sim(n=50, model=list(ar=0.8, order=c(1, 0, 0)), sd=1)
    ar2 <- auto.arima(ar1, ic="aicc")
    cf <- ar2$coef
    if (length(cf) == 0 | !(all(grepl(c("ar1|intercept"), names(cf))) &
                            substr(cf["ar1"], 1, 5) %in% "0.800")) {
      return(rep(NA, 3))
    } else {
      cf <- `length<-`(cf, 2)
      return(c(cf, seed=i))
    }
  }
  message('processing...')
  res <-
    foreach(i=seedv, .combine=rbind.data.frame, .packages='forecast') %dopar% 
    FUN(i)
  message(' done!\n')
  res <- `rownames<-`(res[rowSums(is.na(res)) == 0, ], NULL)
  stopImplicitCluster()
  return(setNames(res, c('ar', 'intercept', 'seed')))
}

Usage

r <- arimaze(1.5e4)
# processing... done!

Result

r
#          ar intercept  seed
# 1 0.8000531  1.335388   290
# 2 0.8008060 -1.700587  6845
# 3 0.8003690 -1.443856 12137
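
As a final remark, the doRNG package mentioned in the comments is another route to reproducible parallel randomness: %dorng% gives each foreach iteration its own reproducible RNG stream, so no set.seed(i) is needed inside the worker. This is only a minimal, self-contained sketch of that mechanism, not a drop-in replacement for arimaze above:

library(doParallel)
library(doRNG)

registerDoParallel(2)  ## small cluster just for this demo

set.seed(123)          ## seeds the whole %dorng% loop
r1 <- foreach(i = 1:5) %dorng% rnorm(1)

set.seed(123)          ## same seed -> identical parallel draws
r2 <- foreach(i = 1:5) %dorng% rnorm(1)

identical(r1, r2)      ## TRUE
stopImplicitCluster()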