1

I have a dataframe with the following structure:

# Seed the RNG so the shuffled columns below are reproducible.
set.seed(1)

# Example data: 6000 rows. `gender`, `age` and `value` are balanced vectors
# that get shuffled; `question` and `response` simply cycle through their
# levels. The arguments are evaluated in the order written, so the three
# sample() calls draw from the random stream as gender, then age, then value.
dat <- data.frame(
  gender   = sample(rep(c("Man", "Woman"), times = 3000)),
  age      = sample(rep(c("Young", "Old"), times = 3000)),
  question = rep_len(c("Q1", "Q2", "Q3"), 6000),
  response = rep_len(c("Res1", "Res2"), 6000),
  value    = sample(rep_len(c(0, 1), 6000))
)

# Show the first rows.
head(dat)
#  gender   age question response value
#1    Man   Old       Q1     Res1     0
#2    Man Young       Q2     Res2     1
#3    Man   Old       Q3     Res1     0
#4  Woman   Old       Q1     Res2     1
#5    Man   Old       Q2     Res1     1
#6    Man   Old       Q3     Res2     1

I have created a loop to do a t-test for every response per question, and join the output in a dataframe.

library(tidyverse)
library(rstatix)
# Run a t-test of `value` against each grouping column, separately for every
# question, and stack the results into one data frame.

# Loop invariants, computed once instead of on every iteration.
question_levels <- names(table(dat$question))
response_levels <- names(table(dat$response))
# Grouping columns are all columns except the last three
# (question, response, value). seq_len() yields an empty sequence when there
# are none, whereas 1:0 would iterate over c(1, 0).
group_cols <- colnames(dat)[seq_len(ncol(dat) - 3)]

# Preallocate the per-question result list.
data.list1 <- vector("list", length(question_levels))
for (i in seq_along(question_levels)) {
  # Rows belonging to the current question only.
  dat1 <- dat %>%
    filter(question == question_levels[[i]])
  data.list2 <- vector("list", length(group_cols))
  for (f in seq_along(group_cols)) {
    dat2 <- dat1 %>%
      t_test(reformulate(group_cols[[f]], "value"),
             detailed = TRUE) %>%
      # NOTE(review): the `response` label is picked by position from the
      # levels of dat$response, so it is unrelated to the grouping column
      # being tested (in the example, gender is labelled Res1 and age Res2),
      # and it errors if there are more grouping columns than response
      # levels. Kept as-is to preserve the output; confirm it is intended.
      mutate(question = question_levels[[i]],
             response = response_levels[[f]])
    data.list2[[f]] <- dat2
  }
  data.list1[[i]] <- bind_rows(data.list2)
}

# Combine all questions and keep only the columns of interest.
final.output <- bind_rows(data.list1) %>%
  select(question, response, group1, estimate1,
         group2, estimate2, p)
final.output
#  question response group1 estimate1 group2 estimate2     p
#  <chr>    <chr>    <chr>      <dbl> <chr>      <dbl> <dbl>
#1 Q1       Res1     Man        0.492 Woman      0.494 0.932
#2 Q1       Res2     Old        0.484 Young      0.502 0.418
#3 Q2       Res1     Man        0.500 Woman      0.509 0.687
#4 Q2       Res2     Old        0.489 Young      0.518 0.198
#5 Q3       Res1     Man        0.495 Woman      0.510 0.504
#6 Q3       Res2     Old        0.511 Young      0.494 0.452

My problem is that the dataframe I am actually working with is much larger than the one used in this example and contains more variables, so the loop takes a very long time to run (over 10 minutes). Is there any way to obtain the same output without using a loop?

cholo.trem
  • 314
  • 2
  • 9
  • As a general comment, a common "R" way that avoids you having to assign empty lists is working with the apply family functions. If you want to make your code faster without the need for restructuring, maybe you want to look into library(parallel), mclapply if you are using unix, or parLapply if you are using windows – tlhenvironment Mar 18 '21 at 17:25

1 Answer

2

We could loop over a named vector of the grouping column names with imap_dfr. For each name, select the 'question' column, the current grouping column, and 'value'; then, after grouping by 'question', summarise the t_test output in a list column and unnest (from tidyr) that list column.

library(purrr)
library(dplyr)
library(rstatix)
library(tidyr)
# For each grouping variable, run one t-test of `value` per question.
# imap_dfr() iterates over the named vector and row-binds the results; the
# element names ("gender", "age") become the `variable` column via `.id`.
imap_dfr(c(gender = "gender", age = "age"), ~ {
  # Keep the current column name in a plain local and use it consistently.
  nm1 <- .x
  dat %>%
    # all_of() marks nm1 as an external character vector of column names,
    # so it cannot be mistaken for a data column and avoids the tidyselect
    # warning about bare external vectors.
    select(question, all_of(nm1), value) %>%
    group_by(question) %>%
    # cur_data() is the current group's rows without the grouping column.
    # NOTE(review): dplyr >= 1.1.0 deprecates cur_data() in favour of
    # pick(); left unchanged so the code still runs on older dplyr.
    summarise(out = list(t_test(reformulate(nm1, "value"),
      detailed = TRUE, data = cur_data())))
}, .id = "variable") %>%
  # Expand the list column of t-test results into regular columns.
  unnest(c(out))

-output

# A tibble: 6 x 17
  variable question estimate estimate1 estimate2 .y.   group1 group2    n1    n2 statistic     p    df conf.low conf.high method alternative
  <chr>    <chr>       <dbl>     <dbl>     <dbl> <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl>    <dbl>     <dbl> <chr>  <chr>      
1 gender   Q1       -0.00192     0.492     0.494 value Man    Woman   1006   994   -0.0857 0.932 1998.  -0.0458    0.0420 T-test two.sided  
2 gender   Q2       -0.00901     0.500     0.509 value Man    Woman   1001   999   -0.403  0.687 1998.  -0.0529    0.0349 T-test two.sided  
3 gender   Q3       -0.0150      0.495     0.510 value Man    Woman    993  1007   -0.669  0.504 1998.  -0.0588    0.0289 T-test two.sided  
4 age      Q1       -0.0181      0.484     0.502 value Old    Young    992  1008   -0.810  0.418 1998.  -0.0620    0.0258 T-test two.sided  
5 age      Q2       -0.0288      0.489     0.518 value Old    Young    977  1023   -1.29   0.198 1994.  -0.0727    0.0150 T-test two.sided  
6 age      Q3        0.0168      0.511     0.494 value Old    Young   1031   969    0.752  0.452 1990.  -0.0271    0.0607 T-test two.sided  
akrun
  • 874,273
  • 37
  • 540
  • 662