I am interested in the difference in the mean of some variable according to a binary covariate.

I am computing the confidence interval of this difference by bootstrapping:

library(tidyverse)
df = mtcars %>% 
   select(disp, vs) %>% 
   mutate(vs=factor(vs, labels=c("vshaped", "straight")))
by1="straight"
by2="vshaped"
R=1000

set.seed(1)
beffect = numeric(length=R)
for (i in 1:R) {
    ib = sample(1:nrow(df), replace = TRUE)
    xi = df$disp[ib]
    byi = df$vs[ib]
    beffect[i] = mean(xi[byi==by2], na.rm = TRUE) - mean(xi[byi==by1], na.rm = TRUE)
}
mean(beffect)
#> [1] 175.9203
sd(beffect)
#> [1] 29.3409

Created on 2021-06-13 by the reprex package (v2.0.0)
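
A 95% interval can then be read off the bootstrap distribution; a minimal sketch, assuming the plain percentile method is what's intended:

quantile(beffect, probs = c(0.025, 0.975))  # percentile bootstrap CI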

This works, but I find it quite unreadable, and I wonder about its efficiency, since for loops are often considered bad design in R.

Being a heavy user of the tidyverse, I would like to rewrite this within that framework.

Is there a fast and readable way to do so?

PS: Here is the closest I could get, but it is hardly more readable and it is 250 times slower:

beffect2 = replicate(R, {
    df %>% 
        slice_sample(prop=1, replace = TRUE) %>% 
        group_by(vs) %>% 
        summarise(m=mean(disp)) %>% 
        pivot_wider(names_from = "vs", values_from = "m") %>%
        transmute(x=!!ensym(by2) - !!ensym(by1))
}, simplify = FALSE) %>% 
    map_dbl(identity)
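
For reference, the f_*() functions in the benchmarks below are the methods from this page wrapped in functions of R. Their definitions are omitted; f_forloop(), for instance, is presumably just the loop above:

f_forloop = function(R) {
  beffect = numeric(length = R)
  for (i in 1:R) {
    ib = sample(1:nrow(df), replace = TRUE)
    xi = df$disp[ib]
    byi = df$vs[ib]
    beffect[i] = mean(xi[byi == by2], na.rm = TRUE) - mean(xi[byi == by1], na.rm = TRUE)
  }
  beffect
}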

EDIT: here are the benchmarks of all methods so far:


# with R=50 ***********

# microbenchmark::microbenchmark(f_dc(50), f_akrun(50), f_akrun_diff(50), f_akrun_bindout(50), f_cole(50), f_forloop(50), times = 5)
# Unit: milliseconds
#               expr      min       lq      mean   median       uq      max neval
#             f_dc() 497.4559 524.9582 560.94690 553.6271 572.2261 656.4672     5
#          f_akrun() 101.6295 108.5232 111.22400 110.7238 111.4105 123.8330     5
#     f_akrun_diff() 270.0261 283.3257 308.92806 283.6411 314.7233 392.9241     5
#  f_akrun_bindout()  21.8185  21.9725  76.68770  22.9811  30.2129 286.4535     5
#           f_cole()   2.7685   3.1343   3.63484   3.2679   4.4346   4.5689     5
#        f_forloop()   2.1136   2.1277   3.14156   3.4968   3.6740   4.2957     5

# with R=500 **********

# microbenchmark::microbenchmark(f_dc(500), f_akrun(500), f_akrun_diff(500), f_akrun_bindout(500), f_cole(500), f_forloop(500), times = 5)
# Unit: milliseconds
#               expr       min        lq       mean    median        uq       max neval
#             f_dc() 4270.2451 4535.4618 4543.85930 4539.3032 4613.5823 4760.7041     5
#          f_akrun()  936.3249  951.3230  970.27424  956.3674  992.3162 1015.0397     5
#     f_akrun_diff() 2501.3871 2509.5429 2589.47288 2608.5254 2649.3819 2678.5271     5
#  f_akrun_bindout()  108.3761  108.7238  113.26746  112.2521  118.4673  118.5180     5
#           f_cole()   23.1283   23.4074   24.75386   23.9244   26.4594   26.8498     5
#        f_forloop()   20.4243   21.1367   23.26222   21.2130   22.5616   30.9755     5
Dan Chaltiel

2 Answers


We could use map_dfr and avoid the pivot_wider step:

library(purrr)
library(dplyr)

set.seed(1)
out <- map_dfr(seq_len(R), ~ {
         ib <- sample(1:nrow(df), replace = TRUE)
         df %>% 
            slice(ib) %>%
            summarise(beffect = mean(disp[vs == by2], na.rm = TRUE) -
                  mean(disp[vs == by1], na.rm = TRUE))
      })

Checking:

mean(out$beffect)
#[1] 175.9203
sd(out$beffect)
#[1] 29.3409

Or we may use diff instead of pivot_wider:

set.seed(1)
out2 <- replicate(R, df %>% 
       slice_sample(prop = 1, replace = TRUE) %>%
       group_by(vs) %>%
       summarise(m = mean(disp), .groups = 'drop') %>% 
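       # m[2:1] reverses the factor-level order, so diff() returns
       # m[1] - m[2] = mean(vshaped) - mean(straight), i.e. by2 - by1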
       summarise(beffect = diff(m[2:1])), simplify = FALSE) %>% 
       bind_rows

Checking:

mean(out2$beffect)
#[1] 175.9203

Another option: draw the sample indices, bind them together with a group identifier, use the indices to extract the column values, then group by the identifier and vs and take the group means:

set.seed(1)
out3 <- replicate(R, sample(seq_len(nrow(df)), replace = TRUE) %>%
   as_tibble, simplify = FALSE) %>%
   bind_rows(.id = 'grp') %>%
   mutate(vs = df$vs[value], disp = df$disp[value]) %>% 
   group_by(grp, vs) %>% 
   summarise(beffect = mean(disp), .groups = 'drop_last') %>% 
   group_by(grp) %>% 
   summarise(beffect = diff(beffect[2:1]), .groups = 'drop')

Checking:

mean(out3$beffect)
#[1] 175.9203

Benchmarks

system.time({set.seed(1)
 out3 <- replicate(R, sample(seq_len(nrow(df)), replace = TRUE) %>%
    as_tibble, simplify = FALSE) %>%
    bind_rows(.id = 'grp') %>%
    mutate(vs = df$vs[value], disp = df$disp[value]) %>% 
    group_by(grp, vs) %>% 
    summarise(beffect = mean(disp), .groups = 'drop_last') %>% 
    group_by(grp) %>% 
    summarise(beffect = diff(beffect[2:1]), .groups = 'drop')})
 #  user  system elapsed 
 # 0.202   0.007   0.208 

Or with map

system.time({
  set.seed(1)
 out <- map_dfr(seq_len(R), ~ {
          ib <- sample(1:nrow(df), replace = TRUE)
          df %>% 
             slice(ib) %>%
             summarise(beffect = mean(disp[vs == by2], na.rm = TRUE) -
                   mean(disp[vs == by1], na.rm = TRUE))
       })
 })
#   user  system elapsed 
#  1.329   0.013   1.338 

Or instead of pivot_wider, take the diff

system.time({set.seed(1)
    out2 <- replicate(R, df %>% 
       slice_sample(prop = 1, replace = TRUE) %>%
       group_by(vs) %>%
       summarise(m = mean(disp), .groups = 'drop') %>% 
       summarise(beffect = diff(m[2:1])), simplify = FALSE) %>% 
       bind_rows
     })
 #  user  system elapsed 
 # 3.753   0.027   3.758 

Or a similar approach in data.table

library(data.table)
system.time({
  setDT(df)
  set.seed(1)
  out3 <- rbindlist(
    replicate(R,
      ## keyby sorts the groups by factor level (vshaped, then straight),
      ## whereas plain by would order them by first appearance within each
      ## resample; m[1] - m[2] then matches by2 - by1 as in the other methods
      df[df[, .I[sample(seq_len(.N), replace = TRUE)]]][
         , .(m = mean(disp)), keyby = vs][, .(beffect = m[1] - m[2])],
      simplify = FALSE)
  )
})
# user  system elapsed
#  1.181   0.055   1.230

OP's method:

system.time({replicate(R, {
     df %>% 
         slice_sample(prop=1, replace = TRUE) %>% 
         group_by(vs) %>% 
         summarise(m=mean(disp)) %>% 
         pivot_wider(names_from = "vs", values_from = "m") %>%
         transmute(x=!!ensym(by2) - !!ensym(by1))
 }, simplify = FALSE)})
#   user  system elapsed
#  6.991   0.063   7.009

Wrapping each method in a function and running a microbenchmark over all of them:

microbenchmark::microbenchmark(f_dc(), f_akrun1(), f_akrun2(), f_akrun3(), f_forloop(), times = 5)
Unit: milliseconds
        expr        min         lq      mean     median         uq        max neval  cld
      f_dc() 6453.14052 6512.34196 6772.0079 6534.08171 6939.61358 7420.86152     5    d
  f_akrun1() 1288.96812 1328.96075 1377.0833 1353.79346 1372.30852 1541.38573     5  b  
  f_akrun2() 3685.33619 3703.33018 3814.8367 3801.52657 3915.75432 3968.23609     5   c 
  f_akrun3()  178.30997  179.77604  194.0712  189.18425  205.37485  217.71095     5 a   
 f_forloop()   30.11329   33.37171   35.0534   36.80903   36.95909   38.01389     5 a   
akrun
  • Much better than my method, thanks! (the for loop is still winning the game though) – Dan Chaltiel Jun 13 '21 at 00:45
  • @DanChaltiel I would say the tidyverse approach has its limits when it comes to efficiency. – akrun Jun 13 '21 at 00:46
  • @DanChaltiel Also, when you post a question like this, there is no threshold to compare against. Please state your expectations clearly; some of the functions you want for readability may not be optimal for efficiency. – akrun Jun 13 '21 at 00:47
  • I added a microbenchmark to the end of my question. I guess you are right, the `for` loop might be better suited for this task. – Dan Chaltiel Jun 13 '21 at 00:54

This may be overlooking the obvious, but the equivalent of a for loop would involve something like purrr::map(). The simplest conversion would be to use purrr::map_dbl(1:R, ...) such as:

library(purrr)
## better for memory and performance to extract the vectors ahead of the loop
disp = df$disp
vs = df$vs

set.seed(1)
beffect_map = map_dbl(1:R,
        ~ {
          ib = sample(nrow(df), replace = TRUE)
          xi = disp[ib]
          byi = vs[ib]
          mean(xi[byi == by2], na.rm = TRUE) - mean(xi[byi == by1], na.rm = TRUE)
        })
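
Since sample(nrow(df), ...) draws from the RNG exactly as the for loop's sample(1:nrow(df), ...) does, running this with the same set.seed(1) should reproduce the for-loop results (mean 175.9203, sd 29.3409) exactly.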

Also, since by is binary, you may be able to improve performance by exploiting that structure directly.
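
For illustration (this sketch is not from the original answer), one fully vectorised take on that idea draws all R resamples in a single call and computes both group means with colSums; it assumes every resample contains both groups, which is essentially guaranteed here with n = 32:

x = df$disp
g = df$vs == by2   # logical indicator for the by2 ("vshaped") group
n = nrow(df)

set.seed(1)
idx = matrix(sample(n, n * R, replace = TRUE), nrow = n)  # one column per replicate
xb = matrix(x[idx], nrow = n)   # resampled values
gb = matrix(g[idx], nrow = n)   # resampled group indicators
beffect_vec = colSums(xb * gb) / colSums(gb) -
  colSums(xb * !gb) / colSums(!gb)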

Cole