broom::tidy and dplyr::group_modify in R: Loping functions across variables grouped by species

Question

I am trying to get a data frame of t.test outputs grouped by species for many variables. I have a subset of data that looks like this;

structure(list(Species = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), levels = c("Apetahia longistigmata", 
"Clermontia fauriei", "Cyanea fissa", "Cyanea hardyi", "Lobelia gregoriana", 
"Lobelia hypoleuca", "Lobelia lukwangulensis", "Trematolobelia kauaiensis"
), class = "factor"), Experiment = structure(c(1L, 1L, 1L, 2L, 
2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L), levels = c("standard 1", 
"standard 2"), class = "factor"), Alpha.peak.1b = c(-78.35, -78.34, 
-78.84, -78.85, -78.01, -78.17, -79.17, -79.01, -79.01, -79.18, 
-79.18, -79.72, -77.85, -77.86, -77.69, -78.01, -77.86), Alpha.recrystalization.peak = c(-70.34, 
-78.34, -70.83, -70.67, -70.67, -71.17, -71, -71.17, -70.67, 
-70.84, -71.01, -70.67, -70.67, -70.68, -70.84, -70.67, -70.68
), Beta.peak.1a = c(-39.37, -39.63, -39.85, -39.53, -39.69, -39.85, 
-40.52, -40.52, -40.69, -40.86, -40.69, -40.68, -38.87, -38.56, 
-38.72, -38.86, -38.39)), row.names = c(NA, -17L), class = c("tbl_df", 
"tbl", "data.frame"))

I am trying the following;

df.ttest<-df %>%
  group_by(Species) %>%
  group_modify(~ broom::tidy(df[,3:ncol(df)], 2, function(x) t.test(x ~ Experiment, data = df)))

On my subset of data i get the following error;

Error: unexpected ',' in "ction(x) t.test(x ~ Experiment, data = df)))

However, on my full data, I get an output that looks like this;

structure(list(Species = structure(c(1L, 1L, 1L), levels = c("Apetahia longistigmata", 
"Clermontia fauriei", "Cyanea fissa", "Cyanea hardyi", "Lobelia gregoriana", 
"Lobelia hypoleuca", "Lobelia lukwangulensis", "Trematolobelia kauaiensis"
), class = "factor"), column = c("Alpha.peak.1b", "Alpha.recrystalization.peak", 
"Beta.peak.1a"), n = c(Alpha.peak.1b = 47, Alpha.recrystalization.peak = 47, 
Beta.peak.1a = 47), mean = c(Alpha.peak.1b = -78.5808510638298, 
Alpha.recrystalization.peak = -71.0140425531915, Beta.peak.1a = -39.4
), sd = c(Alpha.peak.1b = 0.577779970860107, Alpha.recrystalization.peak = 1.18348905612851, 
Beta.peak.1a = 0.869930032169005), median = c(Alpha.peak.1b = -78.68, 
Alpha.recrystalization.peak = -70.84, Beta.peak.1a = -39.52), 
    trimmed = c(Alpha.peak.1b = -78.5807692307692, Alpha.recrystalization.peak = -70.8828205128205, 
    Beta.peak.1a = -39.4182051282051), mad = c(Alpha.peak.1b = 0.490000000000009, 
    Alpha.recrystalization.peak = 0.170000000000002, Beta.peak.1a = 0.669999999999995
    ), min = c(Alpha.peak.1b = -79.72, Alpha.recrystalization.peak = -78.34, 
    Beta.peak.1a = -40.86), max = c(Alpha.peak.1b = -77.52, Alpha.recrystalization.peak = -69.51, 
    Beta.peak.1a = -37.69), range = c(Alpha.peak.1b = 2.2, Alpha.recrystalization.peak = 8.83, 
    Beta.peak.1a = 3.17), skew = c(Alpha.peak.1b = 0.027222651634338, 
    Alpha.recrystalization.peak = -5.13007598432232, Beta.peak.1a = 0.112391518548281
    ), kurtosis = c(Alpha.peak.1b = 2.00457929376749, Alpha.recrystalization.peak = 32.7138551153747, 
    Beta.peak.1a = 1.9708376743257), se = c(Alpha.peak.1b = 0.0842778705371632, 
    Alpha.recrystalization.peak = 0.172629621110036, Beta.peak.1a = 0.126892336746095
    )), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -3L), groups = structure(list(Species = structure(1L, levels = c("Apetahia longistigmata", 
"Clermontia fauriei", "Cyanea fissa", "Cyanea hardyi", "Lobelia gregoriana", 
"Lobelia hypoleuca", "Lobelia lukwangulensis", "Trematolobelia kauaiensis"
), class = "factor"), .rows = structure(list(1:3), ptype = integer(0), class = c("vctrs_list_of", 
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -1L), .drop = TRUE))

EDIT for better expected outcome: I am expecting an output that has the t.test output but also columns for species and variables, respectively. For example;

structure(list(Species = "Species x", Variable = "Variable x", 
    estimate = 0.10705965418003, estimate1 = 0.460507391694923, 
    estimate2 = 0.353447737514893, statistic = c(t = 1.5638634504756), 
    p.value = 0.198127496637623, parameter = c(df = 3.72179497232624), 
    conf.low = -0.0887480304981188, conf.high = 0.30286733885818, 
    method = "Welch Two Sample t-test", alternative = "two.sided"), row.names = c(NA, 
-1L), class = c("tbl_df", "tbl", "data.frame"))

Any help is greatly appreciated!

andrew_reece · Answer 1 · 2023-03-06T14:47:27.780

You can use map to iterate over your DVs inside of group_modify.

Update per OP comments
To get a column naming the dvs in your output, name the dvs vector with set_names(), and then invoke the names_to() argument in list_rbind().

Note: This is the current (2023) recommended way of accomplishing what the .id parameter did in map_dfr().

library(tidyverse)

df |> 
  group_by(Species) |> 
  group_modify(function(data, key) {
    dvs <- data |> select(-Experiment) |> colnames() |> set_names()
    map(dvs, \(dv) {
      frml <- as.formula(paste(dv, "~ Experiment"))
      t.test(frml, data = data) |> broom::tidy()
    }) |> list_rbind(names_to = 'variable')
  })

glimpse() output:

Rows: 9
Columns: 12
Groups: Species [3]
$ Species     <fct> Apetahia longistigmata, Apetahia longistigmata, Apetahia longistigmata, Clermont…
$ variable    <chr> "Alpha.peak.1b", "Alpha.recrystalization.peak", "Beta.peak.1a", "Alpha.peak.1b",…
$ estimate    <dbl> -0.16666667, -2.33333333, 0.07333333, 0.29666667, -0.10666667, 0.16666667, 0.135…
$ estimate1   <dbl> -78.51000, -73.17000, -39.61667, -79.06333, -70.94667, -40.57667, -77.80000, -70…
$ estimate2   <dbl> -78.34333, -70.83667, -39.69000, -79.36000, -70.84000, -40.74333, -77.93500, -70…
$ statistic   <dbl> -0.5449288, -0.8994332, 0.4400000, 1.5802413, -0.6040963, 2.0480798, 1.4508303, …
$ p.value     <dbl> 0.6194889, 0.4626742, 0.6858431, 0.2363844, 0.5828158, 0.1099936, 0.2800136, 0.4…
$ parameter   <dbl> 3.405672, 2.016578, 3.482236, 2.348480, 3.490543, 3.996355, 2.068551, 2.032827, …
$ conf.low    <dbl> -1.07760479, -13.40791444, -0.41784732, -0.40643608, -0.62646029, -0.05935351, -…
$ conf.high   <dbl> 0.7442715, 8.7412478, 0.5645140, 0.9997694, 0.4131270, 0.3926868, 0.5229130, 0.1…
$ method      <chr> "Welch Two Sample t-test", "Welch Two Sample t-test", "Welch Two Sample t-test",…
$ alternative <chr> "two.sided", "two.sided", "two.sided", "two.sided", "two.sided", "two.sided", "t…

Notes:

df is the subset of data provided at the top of your post.
I wasn't able to reproduce your error about "Unexpected ','...", that looks like a typo somewhere.

Thank you @andrew_reece! I am getting this error; "Error in list_rbind(map(dvs, function(dv) { : could not find function "list_rbind"". I tried purrr::list_rbind() but then got "Error: 'list_rbind' is not an exported object from 'namespace:purrr'". I of course have tidyverse installed and loaded. I even remade df from my post, just in case. Any ideas why the error? Also, the new piping is cool, I haven't tried it yet! I should mention I am running 4.2.2. — Dustin, Mar 06 '23 at 02:40
Ok, fixed the above error by updating tidyverse and in particular purrr (to 1.0.1). Thus, I no longer get the error. — Dustin, Mar 06 '23 at 03:03
Thanks for this answer it is almost perfect! I just need a way to see which variable is which (i.e. another column that specifies the variable). I assume they are in the same order but that seems dangerous. I will edit my question as I see I did specify this explicitly. — Dustin, Mar 06 '23 at 03:05
Sure thing! See updated solution, includes a `variable` column now. — andrew_reece, Mar 06 '23 at 14:48

score 1 · Accepted Answer · answered Mar 06 '23 at 08:14

Here is another approach using dplyr::rowwise() with nest_by() and expand_grid(). I like having a final data.frame which contains inputs and output and we can verify each piece (the formula form, the data and the variables vars).

library(tidyverse)

dv <- df %>%
  select(-c(Species, Experiment)) %>%
  colnames()

df |> 
  nest_by(Species) %>% 
  expand_grid(vars = dv) %>% 
  rowwise() %>% 
  mutate(form = list(reformulate("Experiment", vars)),
         res  = list(t.test(form, data = data) %>% 
                       broom::tidy()) 
           
  ) %>% 
  unnest(res)

#> # A tibble: 9 x 14
#>   Species           data vars  form      estim~1 estim~2 estim~3 stati~4 p.value
#>   <fct>          <list<> <chr> <list>      <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
#> 1 Apetahia long~ [6 x 4] Alph~ <formula> -0.167    -78.5   -78.3  -0.545   0.619
#> 2 Apetahia long~ [6 x 4] Alph~ <formula> -2.33     -73.2   -70.8  -0.899   0.463
#> 3 Apetahia long~ [6 x 4] Beta~ <formula>  0.0733   -39.6   -39.7   0.440   0.686
#> 4 Clermontia fa~ [6 x 4] Alph~ <formula>  0.297    -79.1   -79.4   1.58    0.236
#> 5 Clermontia fa~ [6 x 4] Alph~ <formula> -0.107    -70.9   -70.8  -0.604   0.583
#> 6 Clermontia fa~ [6 x 4] Beta~ <formula>  0.167    -40.6   -40.7   2.05    0.110
#> 7 Cyanea fissa   [5 x 4] Alph~ <formula>  0.135    -77.8   -77.9   1.45    0.280
#> 8 Cyanea fissa   [5 x 4] Alph~ <formula> -0.0550   -70.7   -70.7  -0.995   0.423
#> 9 Cyanea fissa   [5 x 4] Beta~ <formula> -0.0917   -38.7   -38.6  -0.365   0.766
#> # ... with 5 more variables: parameter <dbl>, conf.low <dbl>, conf.high <dbl>,
#> #   method <chr>, alternative <chr>, and abbreviated variable names
#> #   1: estimate, 2: estimate1, 3: estimate2, 4: statistic

Data from OP

df <- structure(list(Species = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 
                                     2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), levels = c("Apetahia longistigmata", 
                                                                                             "Clermontia fauriei", "Cyanea fissa", "Cyanea hardyi", "Lobelia gregoriana", 
                                                                                             "Lobelia hypoleuca", "Lobelia lukwangulensis", "Trematolobelia kauaiensis"
                                     ), class = "factor"), Experiment = structure(c(1L, 1L, 1L, 2L, 
                                                                                    2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L), levels = c("standard 1", 
                                                                                                                                                    "standard 2"), class = "factor"), Alpha.peak.1b = c(-78.35, -78.34, 
                                                                                                                                                                                                        -78.84, -78.85, -78.01, -78.17, -79.17, -79.01, -79.01, -79.18, 
                                                                                                                                                                                                        -79.18, -79.72, -77.85, -77.86, -77.69, -78.01, -77.86), Alpha.recrystalization.peak = c(-70.34, 
                                                                                                                                                                                                                                                                                                 -78.34, -70.83, -70.67, -70.67, -71.17, -71, -71.17, -70.67, 
                                                                                                                                                                                                                                                                                                 -70.84, -71.01, -70.67, -70.67, -70.68, -70.84, -70.67, -70.68
                                                                                                                                                                                                        ), Beta.peak.1a = c(-39.37, -39.63, -39.85, -39.53, -39.69, -39.85, 
                                                                                                                                                                                                                            -40.52, -40.52, -40.69, -40.86, -40.69, -40.68, -38.87, -38.56, 
                                                                                                                                                                                                                            -38.72, -38.86, -38.39)), row.names = c(NA, -17L), class = c("tbl_df", 
                                                                                                                                                                                                                                                                                         "tbl", "data.frame"))

^{Created on 2023-03-06 by the reprex package (v2.0.1)}

I have to admit, this is pretty darn cool. Thank you! – Dustin Mar 07 '23 at 02:38 — Dustin, Mar 07 '23 at 02:38

broom::tidy and dplyr::group_modify in R: Loping functions across variables grouped by species

2 Answers2