3

I have a list of many iterations of simulated data. Each iteration is in its own data.frame, and saved in a named list. I need to aggregate, for example, the first column of each data.frame in each list, and the second column of each, and the third, and so forth, each one grouped by age. The code below can recreate the structure of the list I have.

x <- list(list(
          list(a = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
               b = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
               c = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
               d = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE)))),
               
          list(a = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
               b = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
               c = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
               d = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))))
          ))

What I need is the mean and sd of p over all lists named a grouped by age, likewise for all lists named b, and so forth. My desired result would look like this, perhaps all as a data.table or something convenient. I just can't deal with the nested structure well enough to get what I want.

$a
      mean_p     sd_p age
1   9.453670 2.034949   5
2  11.881241 1.995676   6
3   9.979276 2.003178   7
4  10.909008 2.104870   8
5   9.338779 1.904653   9
6  11.745993 1.909569  10
7   8.019631 2.050843  11
8   8.875167 2.053025  12
9  10.697181 1.991607  13
10 11.656100 2.005437  14
11 11.960535 2.004246  15
12 10.343899 2.085225  16
13  9.573988 1.975635  17
14  9.038953 2.112180  18
15  9.131533 2.036852  19
16 13.644504 2.160581  20
17 10.284376 1.903301  21
18  9.543758 2.134177  22
19  9.658121 2.202386  23
20 10.633312 1.842427  24
21 11.100520 2.105879  25
22 11.237161 1.871875  26
23 11.530732 1.972589  27
24  9.042670 2.187250  28
25  9.855445 1.970171  29
26 10.649243 2.064264  30

$b
      mean_p     sd_p age
1   9.705460 1.860338   5
2  10.080478 2.109235   6
3   9.712833 2.017124   7
4   9.420388 2.040863   8
5  11.775058 1.955592   9
6   8.124517 2.046651  10
7  10.557953 1.799830  11
8  10.047775 2.001543  12
9   9.229939 1.966953  13
10 11.814084 2.163710  14
11 12.102374 2.105870  15
12  9.870014 1.866519  16
13 10.696258 2.076030  17
14  9.615747 1.987050  18
15  9.781690 1.961923  19
16  9.395733 1.980549  20
17 13.307485 2.115417  21
18  9.589766 2.058452  22
19  7.942926 2.121072  23
20  9.651580 2.178241  24
21 11.736841 1.996304  25
22  8.682040 1.883955  26
23 10.041262 2.143555  27
24 10.834982 2.086041  28
25  9.046422 2.013758  29
26  9.769026 2.023566  30

$c
      mean_p     sd_p age
1  10.022880 2.148975   5
2  12.535348 1.913299   6
3   8.431201 2.252942   7
4   9.930989 1.943403   8
5   9.391383 2.004252   9
6   9.217615 1.897260  10
7  10.974630 2.174417  11
8  10.475837 1.935946  12
9   9.291287 1.917856  13
10  9.191117 1.971489  14
11  9.986106 1.940689  15
12 10.249913 1.984423  16
13 10.802905 2.122448  17
14 10.582817 1.843136  18
15  9.197653 1.864674  19
16 10.648420 2.037330  20
17 10.457500 1.885780  21
18  9.291936 2.050027  22
19 11.137871 1.744456  23
20  9.148791 1.907282  24
21 10.157003 2.183199  25
22 12.019497 1.883032  26
23 10.890207 1.922753  27
24 10.305917 2.070391  28
25  9.355486 2.022310  29
26 10.405735 1.920850  30

$d
      mean_p     sd_p age
1  10.577719 2.157974   5
2  10.557474 2.126788   6
3   9.448008 1.959201   7
4  10.160101 2.021964   8
5   9.664677 2.035892   9
6  10.974770 2.101026  10
7   8.888659 2.026531  11
8  10.185955 2.092113  12
9  10.456310 2.100847  13
10 10.259347 2.091751  14
11  9.150137 2.002525  15
12 11.042025 1.991657  16
13 10.321668 2.102700  17
14  9.537923 1.866761  18
15 10.401667 1.966281  19
16 10.380466 1.934101  20
17  9.947381 1.805547  21
18 10.458567 1.853977  22
19 11.041953 1.970225  23
20  9.826557 1.680464  24
21 10.169353 2.079167  25
22  9.352873 1.907423  26
23  9.084426 2.148295  27
24 10.083584 2.019244  28
25 10.919343 2.099395  29
26 11.621675 2.013150  30

EDIT: First of all, thank you all for your detailed feedback. I simplified the structure of my data object and used some existing code that I Akrun already provided elsewhere. I'm sure someone somewhere in the future may find some of the slick code you guys wrote to be useful, so I'm hesitant to just close the question, but if that's what I'm supposed to do then I will do that.

Adam
  • 33
  • 5

3 Answers3

3

We can transpose (from purrr) by looping over the outer list (map) to get all the 'a' elements together, 'b' together, and so on .., then flatten the list, use bind_rows, do a group_by on 'age' and get the mean and sd of 'p' column

library(dplyr)
library(purrr)
map(x, purrr::transpose) %>% 
   flatten %>%  
   map(~ bind_rows(.x) %>% 
           group_by(age) %>% 
            summarise(mean_p = mean(p), sd_p = sd(p)))

-output

$a
# A tibble: 26 × 3
     age  mean_p  sd_p
   <int>   <dbl> <dbl>
 1     5  0.182  0.854
 2     6 -0.541  0.575
 3     7 -0.0815 0.962
 4     8  0.372  1.24 
 5     9  0.495  1.17 
 6    10  0.528  1.12 
 7    11 -0.0519 0.696
 8    12  0.439  0.627
 9    13  0.188  0.465
10    14  0.232  1.28 
# … with 16 more rows

$b
# A tibble: 26 × 3
     age  mean_p  sd_p
   <int>   <dbl> <dbl>
 1     5 -0.0386 0.930
 2     6  0.0312 0.961
 3     7 -0.0914 1.12 
 4     8  0.218  0.948
 5     9 -0.155  0.970
 6    10  0.669  1.31 
 7    11 -0.844  0.971
 8    12  0.424  1.21 
 9    13  0.306  1.36 
10    14 -0.0380 0.876
# … with 16 more rows

$c
# A tibble: 26 × 3
     age  mean_p  sd_p
   <int>   <dbl> <dbl>
 1     5 -0.447  1.21 
 2     6  0.0458 0.919
 3     7 -0.0733 1.08 
 4     8 -0.424  1.32 
 5     9 -0.149  0.611
 6    10 -0.0812 0.650
 7    11 -0.182  1.08 
 8    12  0.513  1.00 
 9    13  0.466  0.869
10    14  0.587  1.06 
# … with 16 more rows

$d
# A tibble: 26 × 3
     age  mean_p  sd_p
   <int>   <dbl> <dbl>
 1     5  0.749  1.17 
 2     6  0.597  0.971
 3     7  0.294  1.36 
 4     8  0.536  0.842
 5     9  0.202  1.13 
 6    10 -0.267  0.765
 7    11 -0.338  1.19 
 8    12 -0.0775 0.668
 9    13 -0.416  1.04 
10    14 -0.172  0.943
# … with 16 more rows
akrun
  • 874,273
  • 37
  • 540
  • 662
  • This is only the output of one list of `data.frame`s, isn't it? – Dion Groothof Jan 25 '22 at 22:36
  • @DionGroothof OP's post was a bit unclear to me. So, I `transpose`d the list so that the 'a' elements are grouped together,, 'b' elements etc, and then `rbind`ed with `bind_rows` for each element. If they want it separate, it should be done on the nested. But, OP's expected output shows only 4 output data.frame one for 'a', 'b', 'c', and 'd' – akrun Jan 25 '22 at 22:39
  • I'm sorry, I'll try to be more clear. I do actually need the aggregation to be over all of the lists named `a`, all of them named `b`, etc. So in the example problem, I need the aggregation to hit both of those `data.frame`s named `a` from both of the lists. The total would still be 4 data.frames in the output. Is that more clear? – Adam Jan 27 '22 at 00:02
  • @akrun On a side note, it's funny that you are helping me here, because I was already using another very similar solution to a very similar problem that you posted elsewhere, where you also suggested a `transpose`, but the structure of my data object had to change and so I was stuck again. You've helped me so much over the years, thank you so much for all you do. – Adam Jan 27 '22 at 00:04
1

Using base R, we may exploit a bunge of lapplys to accomplish the task.

out <- lapply(a, function(x) {
  lapply(x, function(i) {
    do.call(rbind, lapply(split(i, i$age), function(j) {
      unique(cbind(mean_p = mean(j$p), sd_p = sd(j$p), age = j$age))
      }))
  })
})

What is being done in the background is as follows:

  1. The first call to lapply() grants us access to the two lists, each of which contains four data.frames, i.e. lapply(a, ...)
  2. The second call to lapply() grants us access to the each list of four dataframes separately, i.e. lapply(a, function(x) lapply(x, ...))
  3. The third and final call to lapply() grants us access to each of the four dataframes separately per list, i.e. lapply(a, function(x) lapply(x, function(i) lapply(split(i, i$age), function(j) ...)))
  4. It is at this very stage that we may split the dataframes by age and then compute the mean and sd of p per age. As this yields duplicate rows, we ask to only return the unique rows (which basically boils down to just one row per unit of age). Then, as a final step, we want to bind all splitted rows together with do.call(rbind, ...).

Makes me think of Inception...

Anyway, use lapply(out, function(x) lapply(x, function(i) head(i))) to get the head (i.e. the first six rows) of output in the format you were after per list of dataframes. So [[1]][[1]]$a corresponds to dataframe a of the first list of dataframes (as per your own code construct).

Output

> lapply(out, function(x) lapply(x, function(i) head(i)))
[[1]]
[[1]]$a
         mean_p      sd_p age
[1,] -0.2214052 0.5728252   5
[2,]  0.1512241 1.0183545   6
[3,]  0.5820684 0.6544753   7
[4,] -0.3544267 1.2176304   8
[5,]  0.6891062 0.3257490   9
[6,] -0.6104405 1.0803994  10

[[1]]$b
          mean_p      sd_p age
[1,]  0.65755520 0.9137840   5
[2,]  0.09666811 1.2534887   6
[3,] -0.29419645        NA   7
[4,] -0.61495027        NA   8
[5,] -0.67372766 0.7948035   9
[6,]  0.05492452 0.5962344  10

[[1]]$c
          mean_p      sd_p age
[1,] -0.04997196 0.9735747   5
[2,] -0.18066737 1.0561993   6
[3,]  0.44835144 0.6861611   7
[4,] -0.03450747 0.5618294   8
[5,] -0.36891531 0.6584360   9
[6,] -1.83686991 0.6053109  10

[[1]]$d
         mean_p      sd_p age
[1,]  1.4783206 1.5945939   5
[2,] -0.6051854 0.6201073   6
[3,] -0.8496123 1.2611873   7
[4,]  0.8423030 2.1340601   8
[5,] -0.1505581 0.8284541   9
[6,] -0.5690181 0.5661665  10


[[2]]
[[2]]$a
         mean_p      sd_p age
[1,] -0.1559285 1.1884162   5
[2,] -0.9928200        NA   6
[3,]  0.6378858 0.9583024   7
[4,]  0.3524212 0.4773790   8
[5,] -0.4598595 0.6975926   9
[6,] -1.4468897        NA  10

[[2]]$b
         mean_p       sd_p age
[1,]  0.8621597 0.19021036   5
[2,] -0.6866161 1.47080268   6
[3,] -0.3161500 1.47283570   7
[4,] -0.1491293 1.83977264   8
[5,]  0.3930428 0.47044905  10
[6,]  0.1436387 0.05494105  11

[[2]]$c
          mean_p      sd_p age
[1,] -0.94578695 1.5907651   6
[2,]  0.25843591 0.8750231   7
[3,]  0.60657270 0.2012965   8
[4,]  0.01380658 0.6565536   9
[5,]  0.67645275        NA  10
[6,] -0.57588300 0.1318478  11

[[2]]$d
         mean_p      sd_p age
[1,] -0.3155665 1.2049593   5
[2,]  0.2354498 0.9599451   6
[3,]  0.1003319 1.2857660   7
[4,] -0.1990073 0.8857302   8
[5,] -0.4315429 0.9663623   9
[6,]  0.7719040        NA  10

If you would like to see all the output per list of dataframes, you can use lapply(out, function(x) lapply(x, function(i) i)).

Data

set.seed(1)

a <- list(
  list(a = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
       b = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
       c = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
       d = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE)))),
  
  list(a = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
       b = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
       c = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
       d = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))))
)

Fianally, as a verification step to see whether out approach is valid, let's make things a little bit simpler by just having one list of two dataframes a and b. Note that the first six rows of $a and $b shown below are exactly the same as [[1]][[1]]$a and [[1]][[1]]$b in the previous output.

set.seed(1)

my_list <- list(a = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
                b = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))))

out <- lapply(my_list, function(x) {
  do.call(rbind, lapply(split(x, x$age), function(i) {
    unique(cbind(mean_p = mean(i$p), sd_p = sd(i$p), i$age))
  }))
})

Output

> lapply(out, function(x) head(x))
$a
         mean_p      sd_p age
[1,] -0.2214052 0.5728252   5
[2,]  0.1512241 1.0183545   6
[3,]  0.5820684 0.6544753   7
[4,] -0.3544267 1.2176304   8
[5,]  0.6891062 0.3257490   9
[6,] -0.6104405 1.0803994  10

$b
          mean_p      sd_p age
[1,]  0.65755520 0.9137840   5
[2,]  0.09666811 1.2534887   6
[3,] -0.29419645        NA   7
[4,] -0.61495027        NA   8
[5,] -0.67372766 0.7948035   9
[6,]  0.05492452 0.5962344  10
Dion Groothof
  • 1,406
  • 5
  • 15
  • It looks like your data object `a` is missing one of the shelves in the list. I really wish my data structure wasn't so nested... – Adam Jan 26 '22 at 23:27
  • @Adam Actually, the data object `a` in this answer is exactly the same as the data object `x` in your question, except for the fact that `x` starts with `list(list(list(a = data.frame(...` and `a` starts with `list(list(a = data.frame(...`. You do not have to specify `list` twice before you specify the first list of four `data.frame`s. It is redundant. Have a look youself by first fixing the random number generator with `set.seed()` and then compare `x` with `a`. – Dion Groothof Jan 27 '22 at 13:37
  • Thanks for checking. I'm sure you're correct, it's just when I applied your solution to my code, it wasn't able to find the `age` column, and I chalked it up to maybe that it was drilling into the list one layer too few. I also see that checking `identical(str(a), str(x))` returns `TRUE`, even though `x` has that redundant layer. And yet, your code works on your `a` and not on my `x`. Very strange. Maybe `identical` isn't checking what I think it is? – Adam Jan 27 '22 at 21:31
  • It works if you remove the redundant layer (i.e. the extra `list()`) and replace the `a` with `x` in `out <- lapply(a, ...` :-) So why not do that? – Dion Groothof Jan 27 '22 at 21:47
1

Here is another straightforward approach using rrapply and data.table (and the new pipe and lambda notations):

library(rrapply)
library(data.table)

xout <- rrapply(x, classes = "data.frame", how = "bind") |>
  lapply(\(x) rbindlist(x)[, .(mean_p = mean(p), sd_p = sd(p)), by = "age"])

lapply(xout, head)

#> $`1.a`
#>    age      mean_p      sd_p
#> 1:   5 -0.17463611 0.9986312
#> 2:   6 -0.03944989 1.0236094
#> 3:   7  0.60687614 0.7479393
#> 4:   8 -0.08935872 1.0228175
#> 5:   9  0.16684906 0.7795792
#> 6:  10 -0.74984872 1.0248991
#> 
#> $`1.b`
#>    age     mean_p      sd_p
#> 1:   5  0.7257567 0.7206942
#> 2:   6 -0.1271274 1.2466404
#> 3:   7 -0.3117593 1.2755509
#> 4:   8 -0.2422935 1.6068511
#> 5:   9 -0.6737277 0.7948035
#> 6:  10  0.1901718 0.5493250
#> 
#> $`1.c`
#>    age      mean_p      sd_p
#> 1:   5 -0.04997196 0.9735747
#> 2:   6 -0.48671520 1.2725462
#> 3:   7  0.31023106 0.7987481
#> 4:   8  0.17918592 0.5541580
#> 5:   9 -0.15021709 0.6339513
#> 6:  10 -0.99909569 1.5128772
#> 
#> $`1.d`
#>    age      mean_p      sd_p
#> 1:   5  0.45324227 1.5789165
#> 2:   6 -0.04476195 0.9045707
#> 3:   7 -0.37464021 1.2522912
#> 4:   8  0.45181161 1.7655027
#> 5:   9 -0.32617356 0.8665311
#> 6:  10 -0.30083366 0.7746114

First rrapply() groups the data.frames with the same name together as sublists, then we pass through each sublist and calculate the statistics. If necessary, we can still update the final names:

names(xout) <- substring(names(xout), first = 3)

Data

set.seed(1)

x <- list(list(
  list(a = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
       b = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
       c = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
       d = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE)))),
  
  list(a = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
       b = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
       c = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
       d = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))))
))
Joris C.
  • 5,721
  • 3
  • 12
  • 27