Compute mean on data frame columns for specific rows selected from list elements in R-CodePudding

I would like to obtain the mean of a data frame column considering only the rows selected by a list with different elements. My data looks like this:

I have a list of gene sets, each containing gene names (A, B, C, ...), with the following structure:

# Gene sets: the members of each set, paired with the set names.
mylist <- setNames(
  list(c("B", "C", "D"), c("A", "C", "E"), c("B", "D", "F")),
  c("first_set", "second_set", "third_set")
)

I have a data frame of genes from a differential expression analysis in a table with the gene name and some numeric values such as the following:

# Example gene table: one row per gene (A-F) plus two numeric columns of
# uniform random draws. No seed is set, so the values differ on every run.
mydataframe <- data.frame(
  name = LETTERS[1:6],
  first_value = runif(n = 6),
  second_value = runif(n = 6)
)

I want to compute the mean of the first and second values considering only the genes in each gene set in the list, separately. That is, for first_value I'll end up with a mean for first_set, a mean for second_set, and so on, and likewise for second_value. In the end I want a list or a data frame such as this one:

# Shape of the desired result: one row per gene set and one mean column per
# value column. The numbers are placeholders (random draws), not real means.
desired_output <- data.frame(
  set = c("first_set", "second_set", "third_set"),
  first_value_mean_by_list_group = runif(3),
  second_value_mean_by_list_group = runif(3)
)

I guess it can be done with apply or loops, but my own approach so far is extremely primitive. Thank you very much.

CodePudding user response：

Try this using base R

# For each gene set, keep the rows whose gene name is in the set and take the
# mean of each value column. colMeans() returns a named numeric vector, so the
# combined result has plain numeric columns rather than list cells.
l <- lapply(mylist, \(genes) {
  in_set <- mydataframe$name %in% genes
  colMeans(mydataframe[in_set, c("first_value", "second_value"), drop = FALSE])
})

# Stack the per-set vectors into one row per set; the set names become the
# row names of the resulting data frame.
df <- as.data.frame(do.call(rbind, l))

output

           first_value second_value
first_set  0.5958386   0.307204    
second_set 0.4836385   0.572279    
third_set  0.5559671   0.3666328

CodePudding user response：

library(tidyverse)

# Fix the RNG seed so the random value columns below are reproducible.
set.seed(seed = 1337)

# Gene sets, with members picked out of the built-in LETTERS vector.
mylist <- list(
  first_set = LETTERS[c(2, 3, 4)],
  second_set = LETTERS[c(1, 3, 5)],
  third_set = LETTERS[c(2, 4, 6)]
)

# Gene table: names A-F plus two columns of uniform random draws.
# first_value is drawn before second_value, which fixes the seeded values.
mydataframe <- data.frame(
  name = LETTERS[1:6],
  first_value = runif(n = 6),
  second_value = runif(n = 6)
)

# Turn the named list into a two-column tibble (set name, gene vector), attach
# the matching rows of mydataframe to each set, average each value column over
# those rows, and finally drop the helper list-columns.
mylist |>
  enframe(name = "set", value = "genes") |>
  mutate(
    members = map(genes, \(g) filter(mydataframe, name %in% g)),
    first_value_mean_by_list_group = map_dbl(
      members,
      \(rows) mean(rows$first_value)
    ),
    second_value_mean_by_list_group = map_dbl(
      members,
      \(rows) mean(rows$second_value)
    )
  ) |>
  select(!c(genes, members))
#> # A tibble: 3 × 3
#>   set        first_value_mean_by_list_group second_value_mean_by_list_group
#>   <chr>                               <dbl>                           <dbl>
#> 1 first_set                           0.364                           0.224
#> 2 second_set                          0.341                           0.724
#> 3 third_set                           0.450                           0.474

Created on 2022-06-29 by the reprex package (v2.0.0)

CodePudding user response：

# Gene sets: build the members first, then label each set.
mylist <- list(c("B", "C", "D"), c("A", "C", "E"), c("B", "D", "F"))
names(mylist) <- c("first_set", "second_set", "third_set")

# Gene table: names A-F plus two columns of uniform random draws.
# No seed is set, so the values differ on every run.
mydataframe <- data.frame(
  name = LETTERS[1:6],
  first_value = runif(n = 6),
  second_value = runif(n = 6)
)

library(tidyverse)
# For each gene set: keep its genes, then average every numeric column.
# bind_rows() stacks the one-row summaries and writes each list name into the
# "set" column.
mylist %>%
  map(\(genes) {
    mydataframe %>%
      filter(name %in% genes) %>%
      # na.rm goes through an anonymous function because dplyr 1.1.0
      # deprecated passing extra arguments via across()'s `...`.
      summarise(across(where(is.numeric), \(v) mean(v, na.rm = TRUE)))
  }) %>%
  bind_rows(.id = "set")
#>          set first_value second_value
#> 1  first_set   0.2745788    0.7704410
#> 2 second_set   0.5918136    0.8168732
#> 3  third_set   0.1881355    0.5139927

Created on 2022-06-29 by the reprex package (v2.0.1)