Element-wise median across lists of data frames in R


I have several lists, each one containing many data frames. I would like to calculate the element-wise median across the lists: for every data frame position and every cell, the median of the corresponding elements, i.e. the median of element [[1]][1,1] of list1, element [[1]][1,1] of list2, and element [[1]][1,1] of list3, and so on for every cell of every data frame. The data frames have many columns each, but here is some sample data with only two columns:


# Three lists, each holding two 4-row data frames with integer columns `a` and
# `b` sampled without replacement from 1:10. No seed is set, so the values
# differ from run to run.
list1 <- replicate(
  2,
  data.frame(a = sample.int(10, 4), b = sample.int(10, 4)),
  simplify = FALSE
)

list2 <- replicate(
  2,
  data.frame(a = sample.int(10, 4), b = sample.int(10, 4)),
  simplify = FALSE
)

list3 <- replicate(
  2,
  data.frame(a = sample.int(10, 4), b = sample.int(10, 4)),
  simplify = FALSE
)

This is the expected result:

 a b
 7 4
 9 9
 7 3
 4 6

 a b
 5 7
 8 6
 2 6
 5 2

Any tips?

CodePudding user response:

Using purrr:

lsts <- list(list1, list2, list3)

# transpose(lsts) regroups the data frames by their position inside each list;
# transposing that group again regroups by column, so each innermost element is
# the list of same-named columns whose row-wise median is taken.
map(
  transpose(lsts),
  function(same_position_dfs) {
    columns <- transpose(same_position_dfs)
    map_dfc(columns, function(column_values) {
      apply(list2DF(column_values), 1, median)
    })
  }
)

# A tibble: 4 × 2
      a     b
  <int> <int>
1     7     4
2     9     9
3     7     3
4     4     6

# A tibble: 4 × 2
      a     b
  <int> <int>
1     5     7
2     8     6
3     2     6
4     5     2

In base R, assuming all lists have the same length and all data frames have the same dimensions and numeric columns:

# Stack every data frame into one 4-d array indexed as
# (row, column, data frame position within its list, list). unlist() walks the
# nested lists in exactly that order, so array() fills the axes correctly.
dims <- c(dim(lsts[[1]][[1]]), length(lsts[[1]]), length(lsts))
# Take the median over the last axis (the lists), keeping the first three.
d <- apply(array(unlist(lsts), dims), seq_len(length(dims) - 1L), median)
# `d` is (row, column, position); splitting on axis 3 gives one row-by-column
# matrix per data frame position. Permuting the axes first would split by
# column instead and return row-by-position matrices.
asplit(d, 3)

     [,1] [,2]
[1,]    7    4
[2,]    9    9
[3,]    7    3
[4,]    4    6

     [,1] [,2]
[1,]    5    7
[2,]    8    6
[3,]    2    6
[4,]    5    2

CodePudding user response:

Here's another (base R) option:

# Map() walks the three lists in parallel, handing over the data frames that
# share a position; mapply() then walks those data frames column by column.
Map(
  function(...) {
    same_position_dfs <- list(...)
    # Row-wise median across the same-named column of every data frame.
    row_median <- function(...) {
      column_values <- list(...)
      apply(do.call(cbind, column_values), 1, median)
    }
    do.call(mapply, c(list(FUN = row_median), same_position_dfs))
  },
  list1, list2, list3
)
# [[1]]
#   a b
# 1 7 4
# 2 9 9
# 3 7 3
# 4 4 6
# [[2]]
#   a b
# 1 5 7
# 2 8 6
# 3 2 6
# 4 5 2

Certainly not beautiful, but functional.

A related dplyr option:

# Stack everything into one long table with two ids:
#   id1 = position of the data frame inside its list,
#   id2 = which list the data frame came from.
list(list1, list2, list3) |>
  lapply(bind_rows, .id = "id1") |>
  bind_rows(.id = "id2") |>
  # Number the rows within each original data frame.
  group_by(id1, id2) |>
  mutate(rn = row_number()) |>
  # Median across the lists for each (data frame position, row) cell.
  group_by(id1, rn) |>
  summarize(across(c(a, b), ~ median(.))) |>
  ungroup() |>
  select(-rn) |>
  # Nest per data frame position and extract the nested tibbles as a list.
  group_nest(id1) |>
  pull(data)
# [[1]]
# # A tibble: 4 × 2
#       a     b
#   <int> <int>
# 1     7     4
# 2     9     9
# 3     7     3
# 4     4     6
# [[2]]
# # A tibble: 4 × 2
#       a     b
#   <int> <int>
# 1     5     7
# 2     8     6
# 3     2     6
# 4     5     2

CodePudding user response:

Certainly not the most efficient solution, but one option with tidyverse might be:

# Collect every object whose name contains "list", tag each row with its row
# number and the position of its data frame within the parent list, stack the
# lot, and take the median per (row, data frame position) cell.
map_dfr(
  mget(ls(pattern = "list")),
  function(list_of_dfs) {
    imap(list_of_dfs, function(df, df_position) {
      mutate(df, rowid = row_number(), lists_id = df_position)
    })
  }
) %>%
  group_by(rowid, lists_id) %>%
  summarise(across(c(a, b), median))

  rowid lists_id     a     b
  <int>    <int> <int> <int>
1     1        1    10     3
2     1        2     8     1
3     2        1     5     4
4     2        2     9     8
5     3        1     6     6
6     3        2     6     3
7     4        1     3     2
8     4        2     4     6

If the goal is to return a list:

# Same pipeline as the long-format version: tag each row with its row number
# and the position of its data frame within the parent list, stack everything,
# and take the median per (row, data frame position) cell.
map_dfr(mget(ls(pattern = "list")), 
        function(list_of_lists) imap(list_of_lists, 
                                     function(lists, lists_id)
                                      lists %>%
                                      mutate(rowid = row_number(),
                                             lists_id = lists_id))) %>%
 group_by(rowid, lists_id) %>%
 summarise(across(c(a, b), median)) %>%
 ungroup() %>%
 # Split into one tibble per data frame position so the result is a list.
 group_split(lists_id)

# A tibble: 4 × 4
  rowid lists_id     a     b
  <int>    <int> <int> <int>
1     1        1    10     3
2     2        1     5     4
3     3        1     6     6
4     4        1     3     2

# A tibble: 4 × 4
  rowid lists_id     a     b
  <int>    <int> <int> <int>
1     1        2     8     1
2     2        2     9     8
3     3        2     6     3
4     4        2     4     6

CodePudding user response:

Here is a first draft of a tidyverse solution (I am sure it could be improved):


# Stack all six data frames, label each row with the list it came from
# (8 rows per list), spread the three lists side by side, and take the
# row-wise median of the matching columns.
bind_rows(list1, list2, list3) %>%
  mutate(x = rep(1:3, each = 8, length.out = n())) %>%
  group_by(x) %>%
  pivot_wider(
    names_from = x,
    values_from = c(a, b),
    values_fn = list
  ) %>%
  # Name the columns to unnest explicitly; calling unnest() without `cols`
  # is deprecated and raises a warning.
  unnest(cols = everything()) %>%
  rowwise() %>%
  transmute(
    a = median(c(a_1, a_2, a_3)),
    b = median(c(b_1, b_2, b_3))
  ) %>%
  ungroup() %>%
  # Cut the result rows back into blocks of 4, one per original data frame.
  group_by(x = as.integer(gl(n(), 4, n()))) %>%
  group_split() %>%
  map(., ~ (.x %>% select(-x)))
# A tibble: 4 × 2
      a     b
  <int> <int>
1     7     4
2     9     9
3     7     3
4     4     6

# A tibble: 4 × 2
      a     b
  <int> <int>
1     5     7
2     8     6
3     2     6
4     5     2
