I try to assess the combined uncertainties related to different input parameters using Markov Chain Monte Carlo method in R. In other words, using the uncertainty parameters reported in input data documentation, I try to generate distributions for each of the datasets by creating 1000 random values within the used distribution (normal distribution or truncated normal distribution). However, I don't know how to do this with the purrr::map() functions faster and without exhausting the RAM. The dataset has 2m rows and 80 cols. Here is a simplified example:
library(tidyverse); library(truncnorm);library(data.table); library(dtplyr)
n <- 1000 # number of simulations
n_obs <- 10000 # number of observations. Does not work if e.g. 50000
Create a data.frame
dt <- data.frame(
var1 = runif(n_obs, 0, 100),
var2_low = runif(n_obs, 0, 1),
var2_mean = runif(n_obs, 0, 5),
var2_up = runif(n_obs, 0, 10)
)
Convert to lazy data table to speed things up
dt1 <- dt %>% as.data.table() %>%
lazy_dt(., immutable = FALSE)
Simulate
dt_sim <- dt1 %>%
mutate(mean_val = rep(1, nrow(.)), # just row of 1
var1_rnorm = map(.x = mean_val,~rnorm(n, mean = .x, sd = 0.10)), # normal distribution with given sd
sim_var1 = map2(.x = var1, .y = var1_rnorm, ~(.x*.y))) %>% # multiply the data with simulated distribution
# add truncated normal distribution for each row (var2)
mutate(sim_var2 = pmap(.,~ rtruncnorm(n,
a = dt$var2_low,
b = dt$var2_up,
mean =dt$var2_mean))) %>%
# multiply simulated variables sim_var1 and sim_var2
mutate(sim_vars_multiplied =
pmap(list(x = .$sim_var1,
y = .$sim_var2),
function(x,y) (x*y))) %>%
# derive coefficient of variation
mutate(var_mean =map(.x = sim_vars_multiplied, ~ mean(.x, na.rm = TRUE)),
var_sd = map(.x = sim_vars_multiplied, ~ sd(.x, na.rm = TRUE)),
var_cv = unlist(var_sd) / unlist(var_mean)) %>%
# select only the variables needed
dplyr::select(var_cv)
# collect the results
sim_results <- dt_sim %>% as.data.table()