1

If calling parLapply multiple times, is it fine to call makeCluster and stopCluster only once, or should they be called before and after each parLapply call? How does this affect memory usage?

Here's a toy example:

library(parallel)

#' Sum the values of one list element.
#'
#' @param list_element A vector accepted by `sum()`.
#' @return The sum of `list_element`.
my_g1 <- function(list_element) {
    element_total <- sum(list_element)
    element_total
}

#' Largest value of one list element, shifted by a constant.
#'
#' @param list_element A vector accepted by `max()`.
#' @param my_parameter Value added to the maximum.
#' @return `max(list_element) + my_parameter`.
my_g2 <- function(list_element, my_parameter) {
    element_max <- max(list_element)
    element_max + my_parameter
}

#' Serial toy driver: repeatedly apply `my_g1` and `my_g2` to every element
#' of a list.
#'
#' @param large_list List of numeric vectors to process. All work is done on
#'   this argument; no global variable is consulted.
#' @param max_iterations Upper bound on the number of passes over
#'   `large_list`; must be at least 1. Exactly this many passes are made
#'   (there is no real convergence test in this toy).
#' @param my_parameter Constant forwarded to `my_g2`.
#' @return The placeholder value `1`.
my_fn <- function(large_list, max_iterations=10, my_parameter=123) {
    stopifnot(max_iterations >= 1)
    iteration <- 1
    repeat {
        message("iteration ", iteration)
        # Iterate over the argument rather than the global `my_large_list`,
        # so the function works for any caller-supplied list.
        list_of_sums <- lapply(large_list, my_g1)
        list_of_max_plus_parameter <- lapply(large_list, my_g2, my_parameter=my_parameter)
        stopifnot(list_of_max_plus_parameter[[1]] == max(large_list[[1]]) + my_parameter)
        ## Pretend there's work to do with list_of*: check for convergence; if converged, break
        iteration <- iteration + 1
        # Strict `>`: the counter was already advanced, so `>=` would stop
        # one pass short of max_iterations.
        if (iteration > max_iterations) break
    }
    return(1)  # Pretend this has something to do with the work done in the loop
}

# Toy stand-in for a list that is expensive to copy.
my_large_list <- list(
    1:10,
    99:157,
    27:54,
    1001:1041
)  # Pretend this takes up lots of memory, want to avoid copying

# The return value is only a placeholder, so it is simply discarded.
unused <- my_fn(large_list = my_large_list)

Now suppose I rewrite my_fn to use a cluster:

#' Parallel toy driver: same work as the serial version, but each pass is
#' dispatched with `parLapply` on a two-worker cluster.
#'
#' The cluster is created once, reused for every `parLapply` call, and shut
#' down when the function exits, whether it returns normally or fails.
#'
#' @param large_list List of numeric vectors to process. All work is done on
#'   this argument; no global variable is consulted.
#' @param max_iterations Upper bound on the number of passes over
#'   `large_list`; must be at least 1. Exactly this many passes are made
#'   (there is no real convergence test in this toy).
#' @param my_parameter Constant forwarded to `my_g2`.
#' @return The placeholder value `1`.
my_fn_parallelized <- function(large_list, max_iterations=10, my_parameter=123) {
    stopifnot(max_iterations >= 1)
    cluster <- makeCluster(2)  # Two cores
    # Register the shutdown immediately so the worker processes are not
    # leaked if anything below (e.g. the stopifnot) raises an error.
    on.exit(stopCluster(cluster), add = TRUE)
    iteration <- 1
    repeat {
        message("iteration ", iteration)
        # Iterate over the argument rather than the global `my_large_list`,
        # so the function works for any caller-supplied list.
        list_of_sums <- parLapply(cluster, large_list, my_g1)
        list_of_max_plus_parameter <- parLapply(cluster, large_list, my_g2,
                                                my_parameter=my_parameter)
        stopifnot(list_of_max_plus_parameter[[1]] == max(large_list[[1]]) + my_parameter)
        ## Pretend there's work to do with list_of*: check for convergence; if converged, break
        iteration <- iteration + 1
        # Strict `>`: the counter was already advanced, so `>=` would stop
        # one pass short of max_iterations.
        if (iteration > max_iterations) break
    }
    # The cluster is stopped by the on.exit handler registered above.
    return(1)  # Pretend this has something to do with the work done in the loop
}

# Run the parallel variant once; its placeholder return value is discarded.
unused <- my_fn_parallelized(large_list = my_large_list)

With stopCluster outside the loop, does my_large_list get copied to the workers on every parLapply call, with that memory not freed until stopCluster is called? In other words, will the memory used for copies of my_large_list grow in proportion to 2*max_iterations (two parLapply calls per iteration), or will it stay constant with respect to max_iterations?

Adrian
  • 3,138
  • 2
  • 28
  • 39

0 Answers