Aggregate over similar columns from different lists in a nested list object-CodePudding

I have a list of many iterations of simulated data. Each iteration is in its own data.frame, and saved in a named list. I need to aggregate, for example, the first column of each data.frame in each list, and the second column of each, and the third, and so forth, each one grouped by age. The code below can recreate the structure of the list I have.

x <- list(list(
          list(a = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
               b = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
               c = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
               d = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE)))),
               
          list(a = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
               b = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
               c = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
               d = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))))
          ))

What I need is the mean and sd of p over all lists named a grouped by age, likewise for all lists named b, and so forth. My desired result would look like this, perhaps all as a data.table or something convenient. I just can't deal with the nested structure well enough to get what I want.

$a
      mean_p     sd_p age
1   9.453670 2.034949   5
2  11.881241 1.995676   6
3   9.979276 2.003178   7
4  10.909008 2.104870   8
5   9.338779 1.904653   9
6  11.745993 1.909569  10
7   8.019631 2.050843  11
8   8.875167 2.053025  12
9  10.697181 1.991607  13
10 11.656100 2.005437  14
11 11.960535 2.004246  15
12 10.343899 2.085225  16
13  9.573988 1.975635  17
14  9.038953 2.112180  18
15  9.131533 2.036852  19
16 13.644504 2.160581  20
17 10.284376 1.903301  21
18  9.543758 2.134177  22
19  9.658121 2.202386  23
20 10.633312 1.842427  24
21 11.100520 2.105879  25
22 11.237161 1.871875  26
23 11.530732 1.972589  27
24  9.042670 2.187250  28
25  9.855445 1.970171  29
26 10.649243 2.064264  30

$b
      mean_p     sd_p age
1   9.705460 1.860338   5
2  10.080478 2.109235   6
3   9.712833 2.017124   7
4   9.420388 2.040863   8
5  11.775058 1.955592   9
6   8.124517 2.046651  10
7  10.557953 1.799830  11
8  10.047775 2.001543  12
9   9.229939 1.966953  13
10 11.814084 2.163710  14
11 12.102374 2.105870  15
12  9.870014 1.866519  16
13 10.696258 2.076030  17
14  9.615747 1.987050  18
15  9.781690 1.961923  19
16  9.395733 1.980549  20
17 13.307485 2.115417  21
18  9.589766 2.058452  22
19  7.942926 2.121072  23
20  9.651580 2.178241  24
21 11.736841 1.996304  25
22  8.682040 1.883955  26
23 10.041262 2.143555  27
24 10.834982 2.086041  28
25  9.046422 2.013758  29
26  9.769026 2.023566  30

$c
      mean_p     sd_p age
1  10.022880 2.148975   5
2  12.535348 1.913299   6
3   8.431201 2.252942   7
4   9.930989 1.943403   8
5   9.391383 2.004252   9
6   9.217615 1.897260  10
7  10.974630 2.174417  11
8  10.475837 1.935946  12
9   9.291287 1.917856  13
10  9.191117 1.971489  14
11  9.986106 1.940689  15
12 10.249913 1.984423  16
13 10.802905 2.122448  17
14 10.582817 1.843136  18
15  9.197653 1.864674  19
16 10.648420 2.037330  20
17 10.457500 1.885780  21
18  9.291936 2.050027  22
19 11.137871 1.744456  23
20  9.148791 1.907282  24
21 10.157003 2.183199  25
22 12.019497 1.883032  26
23 10.890207 1.922753  27
24 10.305917 2.070391  28
25  9.355486 2.022310  29
26 10.405735 1.920850  30

$d
      mean_p     sd_p age
1  10.577719 2.157974   5
2  10.557474 2.126788   6
3   9.448008 1.959201   7
4  10.160101 2.021964   8
5   9.664677 2.035892   9
6  10.974770 2.101026  10
7   8.888659 2.026531  11
8  10.185955 2.092113  12
9  10.456310 2.100847  13
10 10.259347 2.091751  14
11  9.150137 2.002525  15
12 11.042025 1.991657  16
13 10.321668 2.102700  17
14  9.537923 1.866761  18
15 10.401667 1.966281  19
16 10.380466 1.934101  20
17  9.947381 1.805547  21
18 10.458567 1.853977  22
19 11.041953 1.970225  23
20  9.826557 1.680464  24
21 10.169353 2.079167  25
22  9.352873 1.907423  26
23  9.084426 2.148295  27
24 10.083584 2.019244  28
25 10.919343 2.099395  29
26 11.621675 2.013150  30

CodePudding user response：

We can transpose (from purrr) by looping over the outer list (map) to get all the 'a' elements together, 'b' together, and so on .., then flatten the list, use bind_rows, do a group_by on 'age' and get the mean and sd of 'p' column

library(dplyr)
library(purrr)
map(x, purrr::transpose) %>% 
   flatten %>%  
   map(~ bind_rows(.x) %>% 
           group_by(age) %>% 
            summarise(mean_p = mean(p), sd_p = sd(p)))

-output

$a
# A tibble: 26 × 3
     age  mean_p  sd_p
   <int>   <dbl> <dbl>
 1     5  0.182  0.854
 2     6 -0.541  0.575
 3     7 -0.0815 0.962
 4     8  0.372  1.24 
 5     9  0.495  1.17 
 6    10  0.528  1.12 
 7    11 -0.0519 0.696
 8    12  0.439  0.627
 9    13  0.188  0.465
10    14  0.232  1.28 
# … with 16 more rows

$b
# A tibble: 26 × 3
     age  mean_p  sd_p
   <int>   <dbl> <dbl>
 1     5 -0.0386 0.930
 2     6  0.0312 0.961
 3     7 -0.0914 1.12 
 4     8  0.218  0.948
 5     9 -0.155  0.970
 6    10  0.669  1.31 
 7    11 -0.844  0.971
 8    12  0.424  1.21 
 9    13  0.306  1.36 
10    14 -0.0380 0.876
# … with 16 more rows

$c
# A tibble: 26 × 3
     age  mean_p  sd_p
   <int>   <dbl> <dbl>
 1     5 -0.447  1.21 
 2     6  0.0458 0.919
 3     7 -0.0733 1.08 
 4     8 -0.424  1.32 
 5     9 -0.149  0.611
 6    10 -0.0812 0.650
 7    11 -0.182  1.08 
 8    12  0.513  1.00 
 9    13  0.466  0.869
10    14  0.587  1.06 
# … with 16 more rows

$d
# A tibble: 26 × 3
     age  mean_p  sd_p
   <int>   <dbl> <dbl>
 1     5  0.749  1.17 
 2     6  0.597  0.971
 3     7  0.294  1.36 
 4     8  0.536  0.842
 5     9  0.202  1.13 
 6    10 -0.267  0.765
 7    11 -0.338  1.19 
 8    12 -0.0775 0.668
 9    13 -0.416  1.04 
10    14 -0.172  0.943
# … with 16 more rows

CodePudding user response：

Using base R, we may exploit a bunge of lapplys to accomplish the task.

out <- lapply(a, function(x) {
  lapply(x, function(i) {
    do.call(rbind, lapply(split(i, i$age), function(j) {
      unique(cbind(mean_p = mean(j$p), sd_p = sd(j$p), age = j$age))
      }))
  })
})

What is being done in the background is as follows:

The first call to lapply() grants us access to the two lists, each of which contains four data.frames, i.e. lapply(a, ...)
The second call to lapply() grants us access to the each list of four dataframes separately, i.e. lapply(a, function(x) lapply(x, ...))
The third and final call to lapply() grants us access to each of the four dataframes separately per list, i.e. lapply(a, function(x) lapply(x, function(i) lapply(split(i, i$age), function(j) ...)))
It is at this very stage that we may split the dataframes by age and then compute the mean and sd of p per age. As this yields duplicate rows, we ask to only return the unique rows (which basically boils down to just one row per unit of age). Then, as a final step, we want to bind all splitted rows together with do.call(rbind, ...).

Makes me think of Inception...

Anyway, use lapply(out, function(x) lapply(x, function(i) head(i))) to get the head (i.e. the first six rows) of output in the format you were after per list of dataframes. So [[1]][[1]]$a corresponds to dataframe a of the first list of dataframes (as per your own code construct).

Output

> lapply(out, function(x) lapply(x, function(i) head(i)))
[[1]]
[[1]]$a
         mean_p      sd_p age
[1,] -0.2214052 0.5728252   5
[2,]  0.1512241 1.0183545   6
[3,]  0.5820684 0.6544753   7
[4,] -0.3544267 1.2176304   8
[5,]  0.6891062 0.3257490   9
[6,] -0.6104405 1.0803994  10

[[1]]$b
          mean_p      sd_p age
[1,]  0.65755520 0.9137840   5
[2,]  0.09666811 1.2534887   6
[3,] -0.29419645        NA   7
[4,] -0.61495027        NA   8
[5,] -0.67372766 0.7948035   9
[6,]  0.05492452 0.5962344  10

[[1]]$c
          mean_p      sd_p age
[1,] -0.04997196 0.9735747   5
[2,] -0.18066737 1.0561993   6
[3,]  0.44835144 0.6861611   7
[4,] -0.03450747 0.5618294   8
[5,] -0.36891531 0.6584360   9
[6,] -1.83686991 0.6053109  10

[[1]]$d
         mean_p      sd_p age
[1,]  1.4783206 1.5945939   5
[2,] -0.6051854 0.6201073   6
[3,] -0.8496123 1.2611873   7
[4,]  0.8423030 2.1340601   8
[5,] -0.1505581 0.8284541   9
[6,] -0.5690181 0.5661665  10


[[2]]
[[2]]$a
         mean_p      sd_p age
[1,] -0.1559285 1.1884162   5
[2,] -0.9928200        NA   6
[3,]  0.6378858 0.9583024   7
[4,]  0.3524212 0.4773790   8
[5,] -0.4598595 0.6975926   9
[6,] -1.4468897        NA  10

[[2]]$b
         mean_p       sd_p age
[1,]  0.8621597 0.19021036   5
[2,] -0.6866161 1.47080268   6
[3,] -0.3161500 1.47283570   7
[4,] -0.1491293 1.83977264   8
[5,]  0.3930428 0.47044905  10
[6,]  0.1436387 0.05494105  11

[[2]]$c
          mean_p      sd_p age
[1,] -0.94578695 1.5907651   6
[2,]  0.25843591 0.8750231   7
[3,]  0.60657270 0.2012965   8
[4,]  0.01380658 0.6565536   9
[5,]  0.67645275        NA  10
[6,] -0.57588300 0.1318478  11

[[2]]$d
         mean_p      sd_p age
[1,] -0.3155665 1.2049593   5
[2,]  0.2354498 0.9599451   6
[3,]  0.1003319 1.2857660   7
[4,] -0.1990073 0.8857302   8
[5,] -0.4315429 0.9663623   9
[6,]  0.7719040        NA  10

If you would like to see all the output per list of dataframes, you can use lapply(out, function(x) lapply(x, function(i) i)).

Data

set.seed(1)

a <- list(
  list(a = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
       b = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
       c = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
       d = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE)))),
  
  list(a = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
       b = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
       c = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
       d = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))))
)

Fianally, as a verification step to see whether out approach is valid, let's make things a little bit simpler by just having one list of two dataframes a and b. Note that the first six rows of $a and $b shown below are exactly the same as [[1]][[1]]$a and [[1]][[1]]$b in the previous output.

set.seed(1)

my_list <- list(a = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))),
                b = data.frame(p = rnorm(100), age = sort(sample(seq(5,30),100, replace = TRUE))))

out <- lapply(my_list, function(x) {
  do.call(rbind, lapply(split(x, x$age), function(i) {
    unique(cbind(mean_p = mean(i$p), sd_p = sd(i$p), i$age))
  }))
})

Output

> lapply(out, function(x) head(x))
$a
         mean_p      sd_p age
[1,] -0.2214052 0.5728252   5
[2,]  0.1512241 1.0183545   6
[3,]  0.5820684 0.6544753   7
[4,] -0.3544267 1.2176304   8
[5,]  0.6891062 0.3257490   9
[6,] -0.6104405 1.0803994  10

$b
          mean_p      sd_p age
[1,]  0.65755520 0.9137840   5
[2,]  0.09666811 1.2534887   6
[3,] -0.29419645        NA   7
[4,] -0.61495027        NA   8
[5,] -0.67372766 0.7948035   9
[6,]  0.05492452 0.5962344  10