For each sample in the column variable, I would like to add a new row named "Other".
It should represent the fraction of ID/Phyla that are not part of the data frame,
calculated as 100 minus the sum of value over all species in that sample.
Is tidyverse's summarise_each probably a way to do this?
Desired output:
ID Phyla variable value
1 s__Streptococcus_lutetiensis p__Firmicutes Sample1 24.51658
2 s__Streptococcus_equinus p__Firmicutes Sample1 12.33968
3 s__Ligilactobacillus_animalis p__Firmicutes Sample1 10.00188
4 s__Prevotella_copri_clade_A p__Bacteroidetes Sample1 5.16247
5 s__Catenibacterium_sp_AM22_15 p__Firmicutes Sample1 4.56347
Other Other Sample1 43.41592
6 s__GGB6572_SGB9285 p__Proteobacteria Sample2 34.40184
7 s__Blautia_producta p__Firmicutes Sample2 20.38267
8 s__GGB2738_SGB3684 p__Proteobacteria Sample2 18.77016
9 s__Amedibacterium_intestinale p__Firmicutes Sample2 9.94069
10 s__GGB48468_SGB66515 p__Bacteroidetes Sample2 4.14584
Other Other Sample2 12.3588
Input:
ID Phyla variable value
1 s__Streptococcus_lutetiensis p__Firmicutes Sample1 24.51658
2 s__Streptococcus_equinus p__Firmicutes Sample1 12.33968
3 s__Ligilactobacillus_animalis p__Firmicutes Sample1 10.00188
4 s__Prevotella_copri_clade_A p__Bacteroidetes Sample1 5.16247
5 s__Catenibacterium_sp_AM22_15 p__Firmicutes Sample1 4.56347
6 s__GGB6572_SGB9285 p__Proteobacteria Sample2 34.40184
7 s__Blautia_producta p__Firmicutes Sample2 20.38267
8 s__GGB2738_SGB3684 p__Proteobacteria Sample2 18.77016
9 s__Amedibacterium_intestinale p__Firmicutes Sample2 9.94069
10 s__GGB48468_SGB66515 p__Bacteroidetes Sample2 4.14584
In dput:
# dput() representation of the input: a 10-row data.frame with character
# columns ID, Phyla and variable (the sample name) and a numeric column value.
structure(list(ID = c("s__Streptococcus_lutetiensis", "s__Streptococcus_equinus",
"s__Ligilactobacillus_animalis", "s__Prevotella_copri_clade_A",
"s__Catenibacterium_sp_AM22_15", "s__GGB6572_SGB9285", "s__Blautia_producta",
"s__GGB2738_SGB3684", "s__Amedibacterium_intestinale", "s__GGB48468_SGB66515"
), Phyla = c("p__Firmicutes", "p__Firmicutes", "p__Firmicutes",
"p__Bacteroidetes", "p__Firmicutes", "p__Proteobacteria", "p__Firmicutes",
"p__Proteobacteria", "p__Firmicutes", "p__Bacteroidetes"), variable = c("Sample1",
"Sample1", "Sample1", "Sample1", "Sample1", "Sample2", "Sample2",
"Sample2", "Sample2", "Sample2"), value = c(24.51658, 12.33968,
10.00188, 5.16247, 4.56347, 34.40184, 20.38267, 18.77016, 9.94069,
4.14584)), row.names = c(NA, -10L), class = "data.frame")
CodePudding user response:
janitor::adorn_totals
appends a totals row and/or column to a data.frame. Combined with group_modify(),
it adds one totals row per sample, which is then converted into the remainder (100 minus the total):
library(dplyr)
# Append a per-sample totals row labelled "Other", then turn each total into
# the remainder (100 - total).
df %>%
  group_by(variable) %>%
  # group_modify() calls a bare function as .f(.x, .y, ...), so passing
  # adorn_totals directly would hand the group key to its next unnamed
  # argument; the lambda forwards only the group's rows.
  group_modify(
    ~ janitor::adorn_totals(.x, where = "row", fill = "Other", name = "Other")
  ) %>%
  ungroup() %>%
  mutate(value = ifelse(ID == "Other", 100 - value, value))
# A tibble: 12 x 4
variable ID Phyla value
<chr> <chr> <chr> <dbl>
1 Sample1 s__Streptococcus_lutetiensis p__Firmicutes 24.5
2 Sample1 s__Streptococcus_equinus p__Firmicutes 12.3
3 Sample1 s__Ligilactobacillus_animalis p__Firmicutes 10.0
4 Sample1 s__Prevotella_copri_clade_A p__Bacteroidetes 5.16
5 Sample1 s__Catenibacterium_sp_AM22_15 p__Firmicutes 4.56
6 Sample1 Other Other 43.4
7 Sample2 s__GGB6572_SGB9285 p__Proteobacteria 34.4
8 Sample2 s__Blautia_producta p__Firmicutes 20.4
9 Sample2 s__GGB2738_SGB3684 p__Proteobacteria 18.8
10 Sample2 s__Amedibacterium_intestinale p__Firmicutes 9.94
11 Sample2 s__GGB48468_SGB66515 p__Bacteroidetes 4.15
12 Sample2 Other Other 12.4
Notes on the arguments:
fill: what should fill the bottom row of non-numeric columns
name: the name of the totals row and/or column
The above is not a general solution because adorn_totals
only deals with summation. A more general approach that can handle arbitrary computations is:
# For each sample, compute the remainder row and append it to that sample's
# rows. Any other per-group computation can replace `100 - sum(value)`.
df %>%
  group_by(variable) %>%
  group_modify(~ {
    # Label the new row explicitly; columns missing from the summary would
    # otherwise be filled with NA by bind_rows().
    other <- summarise(
      .x,
      ID = "Other",
      Phyla = "Other",
      value = 100 - sum(value)
    )
    bind_rows(.x, other)
  }) %>%
  ungroup()
CodePudding user response:
library(tidyverse)
# Example input: five species rows for each of two samples, one row per
# species/sample pair with its value.
data <- data.frame(
  ID = c(
    "s__Streptococcus_lutetiensis",
    "s__Streptococcus_equinus",
    "s__Ligilactobacillus_animalis",
    "s__Prevotella_copri_clade_A",
    "s__Catenibacterium_sp_AM22_15",
    "s__GGB6572_SGB9285",
    "s__Blautia_producta",
    "s__GGB2738_SGB3684",
    "s__Amedibacterium_intestinale",
    "s__GGB48468_SGB66515"
  ),
  Phyla = c(
    "p__Firmicutes",
    "p__Firmicutes",
    "p__Firmicutes",
    "p__Bacteroidetes",
    "p__Firmicutes",
    "p__Proteobacteria",
    "p__Firmicutes",
    "p__Proteobacteria",
    "p__Firmicutes",
    "p__Bacteroidetes"
  ),
  variable = rep(c("Sample1", "Sample2"), each = 5),
  value = c(
    24.51658, 12.33968, 10.00188, 5.16247, 4.56347,
    34.40184, 20.38267, 18.77016, 9.94069, 4.14584
  ),
  # Keep the text columns as character regardless of the R version.
  stringsAsFactors = FALSE
)
# One "Other" row per sample: whatever is left of 100 after summing the
# values of the listed species in that sample.
other <- summarise(
  group_by(data, variable),
  value = 100 - sum(value),
  Phyla = "Other",
  ID = "Other"
)
other
#> # A tibble: 2 × 4
#> variable value Phyla ID
#> <chr> <dbl> <chr> <chr>
#> 1 Sample1 43.4 Other Other
#> 2 Sample2 12.4 Other Other
# Append the "Other" rows to the original data, then sort by sample so that
# each sample's rows are grouped together.
arrange(bind_rows(data, other), variable)
#> ID Phyla variable value
#> 1 s__Streptococcus_lutetiensis p__Firmicutes Sample1 24.51658
#> 2 s__Streptococcus_equinus p__Firmicutes Sample1 12.33968
#> 3 s__Ligilactobacillus_animalis p__Firmicutes Sample1 10.00188
#> 4 s__Prevotella_copri_clade_A p__Bacteroidetes Sample1 5.16247
#> 5 s__Catenibacterium_sp_AM22_15 p__Firmicutes Sample1 4.56347
#> 6 Other Other Sample1 43.41592
#> 7 s__GGB6572_SGB9285 p__Proteobacteria Sample2 34.40184
#> 8 s__Blautia_producta p__Firmicutes Sample2 20.38267
#> 9 s__GGB2738_SGB3684 p__Proteobacteria Sample2 18.77016
#> 10 s__Amedibacterium_intestinale p__Firmicutes Sample2 9.94069
#> 11 s__GGB48468_SGB66515 p__Bacteroidetes Sample2 4.14584
#> 12 Other Other Sample2 12.35880
Created on 2022-04-22 by the reprex package (v2.0.0)