Group-wise summarise and calculate fraction-CodePudding

For each sample in the column variable, I would like to add a new row named "Other" that represents the fraction of ID/Phyla not listed in the data frame, calculated as 100 minus the sum of value within that sample.

tidyverse summarise_each is probably a way?

Desired output:

                             ID             Phyla variable    value
1   s__Streptococcus_lutetiensis     p__Firmicutes  Sample1 24.51658
2       s__Streptococcus_equinus     p__Firmicutes  Sample1 12.33968
3  s__Ligilactobacillus_animalis     p__Firmicutes  Sample1 10.00188
4    s__Prevotella_copri_clade_A  p__Bacteroidetes  Sample1  5.16247
5  s__Catenibacterium_sp_AM22_15     p__Firmicutes  Sample1  4.56347
              Other                   Other         Sample1 43.41592
6             s__GGB6572_SGB9285 p__Proteobacteria  Sample2 34.40184
7            s__Blautia_producta     p__Firmicutes  Sample2 20.38267
8             s__GGB2738_SGB3684 p__Proteobacteria  Sample2 18.77016
9  s__Amedibacterium_intestinale     p__Firmicutes  Sample2  9.94069
10          s__GGB48468_SGB66515  p__Bacteroidetes  Sample2  4.14584
              Other               Other             Sample2  12.3588

Input:

                              ID             Phyla variable    value
1   s__Streptococcus_lutetiensis     p__Firmicutes  Sample1 24.51658
2       s__Streptococcus_equinus     p__Firmicutes  Sample1 12.33968
3  s__Ligilactobacillus_animalis     p__Firmicutes  Sample1 10.00188
4    s__Prevotella_copri_clade_A  p__Bacteroidetes  Sample1  5.16247
5  s__Catenibacterium_sp_AM22_15     p__Firmicutes  Sample1  4.56347
6             s__GGB6572_SGB9285 p__Proteobacteria  Sample2 34.40184
7            s__Blautia_producta     p__Firmicutes  Sample2 20.38267
8             s__GGB2738_SGB3684 p__Proteobacteria  Sample2 18.77016
9  s__Amedibacterium_intestinale     p__Firmicutes  Sample2  9.94069
10          s__GGB48468_SGB66515  p__Bacteroidetes  Sample2  4.14584

Input as dput() output:

structure(list(ID = c("s__Streptococcus_lutetiensis", "s__Streptococcus_equinus", 
"s__Ligilactobacillus_animalis", "s__Prevotella_copri_clade_A", 
"s__Catenibacterium_sp_AM22_15", "s__GGB6572_SGB9285", "s__Blautia_producta", 
"s__GGB2738_SGB3684", "s__Amedibacterium_intestinale", "s__GGB48468_SGB66515"
), Phyla = c("p__Firmicutes", "p__Firmicutes", "p__Firmicutes", 
"p__Bacteroidetes", "p__Firmicutes", "p__Proteobacteria", "p__Firmicutes", 
"p__Proteobacteria", "p__Firmicutes", "p__Bacteroidetes"), variable = c("Sample1", 
"Sample1", "Sample1", "Sample1", "Sample1", "Sample2", "Sample2", 
"Sample2", "Sample2", "Sample2"), value = c(24.51658, 12.33968, 
10.00188, 5.16247, 4.56347, 34.40184, 20.38267, 18.77016, 9.94069, 
4.14584)), row.names = c(NA, -10L), class = "data.frame")

CodePudding user response：

janitor::adorn_totals appends a totals row and/or column to a data.frame. It can be combined with group_modify() to add one totals row per group, which is then converted into the "Other" fraction:

library(dplyr)

# Append one "Other" row per sample, then convert its total into the
# remainder (100 minus the sum of the listed values).
df %>%
  group_by(variable) %>%
  # Use a lambda rather than passing adorn_totals directly: group_modify()
  # calls a bare function as .f(.x, .y, ...), so the group-key tibble .y
  # would be matched positionally to adorn_totals()'s `na.rm` argument.
  group_modify(
    ~ janitor::adorn_totals(.x, where = "row", fill = "Other", name = "Other")
  ) %>%
  ungroup() %>%
  # The appended row holds sum(value) for the sample; flip it to 100 - sum.
  mutate(value = ifelse(ID == "Other", 100 - value, value))

# A tibble: 12 x 4
   variable ID                            Phyla             value
   <chr>    <chr>                         <chr>             <dbl>
 1 Sample1  s__Streptococcus_lutetiensis  p__Firmicutes     24.5 
 2 Sample1  s__Streptococcus_equinus      p__Firmicutes     12.3 
 3 Sample1  s__Ligilactobacillus_animalis p__Firmicutes     10.0 
 4 Sample1  s__Prevotella_copri_clade_A   p__Bacteroidetes   5.16
 5 Sample1  s__Catenibacterium_sp_AM22_15 p__Firmicutes      4.56
 6 Sample1  Other                         Other             43.4 
 7 Sample2  s__GGB6572_SGB9285            p__Proteobacteria 34.4 
 8 Sample2  s__Blautia_producta           p__Firmicutes     20.4 
 9 Sample2  s__GGB2738_SGB3684            p__Proteobacteria 18.8 
10 Sample2  s__Amedibacterium_intestinale p__Firmicutes      9.94
11 Sample2  s__GGB48468_SGB66515          p__Bacteroidetes   4.15
12 Sample2  Other                         Other             12.4

Notes on the arguments:

fill: What should fill the bottom row of non-numeric columns
name: The name of the totals row and/or column

The above is not a general solution, because adorn_totals can only compute sums. A more general approach, which works for arbitrary per-group computations, is:

# General pattern: compute an arbitrary summary row per sample and append it
# beneath that sample's original rows.
df %>%
  group_by(variable) %>%
  group_modify(~ {
    .x %>%
      # Label the summary row explicitly; without ID/Phyla here, bind_rows()
      # would fill those columns with NA instead of "Other".
      summarise(ID = "Other", Phyla = "Other", value = 100 - sum(value)) %>%
      # `.` is the one-row summary; `.x` holds the group's original rows.
      bind_rows(.x, .)
  }) %>%
  ungroup()

CodePudding user response：

library(tidyverse)

# Example input from the question: ten rows, five species for each of the
# two samples in `variable`.
data <- data.frame(
  ID = c(
    "s__Streptococcus_lutetiensis",
    "s__Streptococcus_equinus",
    "s__Ligilactobacillus_animalis",
    "s__Prevotella_copri_clade_A",
    "s__Catenibacterium_sp_AM22_15",
    "s__GGB6572_SGB9285",
    "s__Blautia_producta",
    "s__GGB2738_SGB3684",
    "s__Amedibacterium_intestinale",
    "s__GGB48468_SGB66515"
  ),
  Phyla = c(
    "p__Firmicutes",
    "p__Firmicutes",
    "p__Firmicutes",
    "p__Bacteroidetes",
    "p__Firmicutes",
    "p__Proteobacteria",
    "p__Firmicutes",
    "p__Proteobacteria",
    "p__Firmicutes",
    "p__Bacteroidetes"
  ),
  variable = rep(c("Sample1", "Sample2"), each = 5L),
  value = c(
    24.51658, 12.33968, 10.00188, 5.16247, 4.56347,
    34.40184, 20.38267, 18.77016, 9.94069, 4.14584
  ),
  # Keep the text columns as character vectors on every R version.
  stringsAsFactors = FALSE
)

# One "Other" row per sample: the part of 100 not covered by the listed rows.
other <- summarise(
  group_by(data, variable),
  value = 100 - sum(value),
  Phyla = "Other",
  ID = "Other"
)
other
#> # A tibble: 2 × 4
#>   variable value Phyla ID   
#>   <chr>    <dbl> <chr> <chr>
#> 1 Sample1   43.4 Other Other
#> 2 Sample2   12.4 Other Other

# Append the "Other" rows, then sort by sample. arrange() keeps tied rows in
# their existing order, so each "Other" row ends up last within its sample.
arrange(bind_rows(data, other), variable)
#>                               ID             Phyla variable    value
#> 1   s__Streptococcus_lutetiensis     p__Firmicutes  Sample1 24.51658
#> 2       s__Streptococcus_equinus     p__Firmicutes  Sample1 12.33968
#> 3  s__Ligilactobacillus_animalis     p__Firmicutes  Sample1 10.00188
#> 4    s__Prevotella_copri_clade_A  p__Bacteroidetes  Sample1  5.16247
#> 5  s__Catenibacterium_sp_AM22_15     p__Firmicutes  Sample1  4.56347
#> 6                          Other             Other  Sample1 43.41592
#> 7             s__GGB6572_SGB9285 p__Proteobacteria  Sample2 34.40184
#> 8            s__Blautia_producta     p__Firmicutes  Sample2 20.38267
#> 9             s__GGB2738_SGB3684 p__Proteobacteria  Sample2 18.77016
#> 10 s__Amedibacterium_intestinale     p__Firmicutes  Sample2  9.94069
#> 11          s__GGB48468_SGB66515  p__Bacteroidetes  Sample2  4.14584
#> 12                         Other             Other  Sample2 12.35880

Created on 2022-04-22 by the reprex package (v2.0.0)