combine redundant row items in r-CodePudding

I have a dataset with the the names of many different plant species (column MTmatch), some of which appear repeatedly. Each of these has a column (ReadSum) with a sum associated with it (as well as many other pieces of information). How do I combine/aggregate all of the redundant plant species and sum the associated ReadSum with each, while leaving the non-redundant rows alone?

I would like to take a dataset like this, and either have it transformed so that each sample has the aggregate of the combined rows, or at least an additional column showing the sum of the ReadSum column for the combined redundant species. Sorry if this is confusing, I'm not sure how to ask this question.

I have been messing about with dplyr, using group_by() and summarise(), but that seems to be summarizing across the whole column rather than just the new group.

structure(list(ESVID = c("ESV_000090", "ESV_000682", "ESV_000028", 
"ESV_000030", "ESV_000010", "ESV_000182", "ESV_000040", "ESV_000135", 
"ESV_000383"), S026401.R1 = c(0.222447727, 0, 0, 0, 0, 0, 0.029074432, 
0, 0), S026404.R1 = c(0.022583349, 0, 0, 0, 0, 0, 0.016390389, 
0.001257217, 0), S026406.R1 = c(0.360895503, 0, 0, 0.00814677, 
0, 0, 0.01513888, 0, 0.00115466)), row.names = c(NA, -9L), class = "data.frame")
> dput(samp5[1:9])
structure(list(ESVID = c("ESV_000090", "ESV_000682", "ESV_000028", 
"ESV_000030", "ESV_000010", "ESV_000182", "ESV_000040", "ESV_000135", 
"ESV_000383"), S026401.R1 = c(0.222447727, 0, 0, 0, 0, 0, 0.029074432, 
0, 0), S026404.R1 = c(0.022583349, 0, 0, 0, 0, 0, 0.016390389, 
0.001257217, 0), S026406.R1 = c(0.360895503, 0, 0, 0.00814677, 
0, 0, 0.01513888, 0, 0.00115466), S026409.R1 = c(0.221175955, 
0, 0, 0, 0, 0, 0.005146173, 0, 0), S026412.R1 = c(0.026058888, 
0, 0, 0, 0, 0, 0, 0, 0), MAX = c(0.400577608, 0.009933177, 0.124412855, 
0.00814677, 0.009824944, 0.086475106, 0.154850408, 0.015593835, 
0.008340888), ReadSum = c(3.54892343, 0.012059346, 0.203303936, 
0.021075546, 0.009824944, 0.128007863, 0.859687787, 0.068159534, 
0.050266853), SPECIES = c("Abies ", "Abies ", "Acer", "Alnus", 
"Berberis", "Betula ", "Boykinia", "Boykinia", "Boykinia")), row.names = c(NA, 
-9L), class = "data.frame")

CodePudding user response：

Do either of these approached produce your intended outcome?

Data:

df <- structure(list(ESVID = c("ESV_000090", "ESV_000682", "ESV_000028", 
                         "ESV_000030", "ESV_000010", "ESV_000182", "ESV_000040", "ESV_000135", 
                         "ESV_000383"), S026401.R1 = c(0.222447727, 0, 0, 0, 0, 0, 0.029074432, 
                                                       0, 0), S026404.R1 = c(0.022583349, 0, 0, 0, 0, 0, 0.016390389, 
                                                                             0.001257217, 0), S026406.R1 = c(0.360895503, 0, 0, 0.00814677, 
                                                                                                             0, 0, 0.01513888, 0, 0.00115466), S026409.R1 = c(0.221175955, 
                                                                                                                                                              0, 0, 0, 0, 0, 0.005146173, 0, 0), S026412.R1 = c(0.026058888, 
                                                                                                                                                                                                                0, 0, 0, 0, 0, 0, 0, 0), MAX = c(0.400577608, 0.009933177, 0.124412855, 
                                                                                                                                                                                                                                                 0.00814677, 0.009824944, 0.086475106, 0.154850408, 0.015593835, 
                                                                                                                                                                                                                                                 0.008340888), ReadSum = c(3.54892343, 0.012059346, 0.203303936, 
                                                                                                                                                                                                                                                                           0.021075546, 0.009824944, 0.128007863, 0.859687787, 0.068159534, 
                                                                                                                                                                                                                                                                           0.050266853), SPECIES = c("Abies ", "Abies ", "Acer", "Alnus", 
                                                                                                                                                                                                                                                                                                     "Berberis", "Betula ", "Boykinia", "Boykinia", "Boykinia")), row.names = c(NA, 
                                                                                                                                                                                                                                                                                                                                                                                -9L), class = "data.frame")

Create a new column "combined_ReadSum" (2nd col) which is the sum of "ReadSum" for each "SPECIES":

library(dplyr)

df %>%
  group_by(SPECIES) %>%
  summarise(combined_ReadSum = sum(ReadSum)) %>%
  left_join(df, by = "SPECIES")
#> # A tibble: 9 × 10
#>   SPECIES  combi…¹ ESVID S0264…² S0264…³ S0264…⁴ S0264…⁵ S0264…⁶     MAX ReadSum
#>   <chr>      <dbl> <chr>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
#> 1 "Abies " 3.56    ESV_…  0.222  0.0226  0.361   0.221    0.0261 0.401   3.55   
#> 2 "Abies " 3.56    ESV_…  0      0       0       0        0      0.00993 0.0121 
#> 3 "Acer"   0.203   ESV_…  0      0       0       0        0      0.124   0.203  
#> 4 "Alnus"  0.0211  ESV_…  0      0       0.00815 0        0      0.00815 0.0211 
#> 5 "Berber… 0.00982 ESV_…  0      0       0       0        0      0.00982 0.00982
#> 6 "Betula… 0.128   ESV_…  0      0       0       0        0      0.0865  0.128  
#> 7 "Boykin… 0.978   ESV_…  0.0291 0.0164  0.0151  0.00515  0      0.155   0.860  
#> 8 "Boykin… 0.978   ESV_…  0      0.00126 0       0        0      0.0156  0.0682 
#> 9 "Boykin… 0.978   ESV_…  0      0       0.00115 0        0      0.00834 0.0503 
#> # … with abbreviated variable names ¹combined_ReadSum, ²S026401.R1,
#> #   ³S026404.R1, ⁴S026406.R1, ⁵S026409.R1, ⁶S026412.R1

Or, summarise columns by summing the values for each unique species:

library(dplyr)

df %>%
  group_by(SPECIES) %>%
  summarise(across(where(is.numeric), sum))
#> # A tibble: 6 × 8
#>   SPECIES    S026401.R1 S026404.R1 S026406.R1 S026409.R1 S0264…¹     MAX ReadSum
#>   <chr>           <dbl>      <dbl>      <dbl>      <dbl>   <dbl>   <dbl>   <dbl>
#> 1 "Abies "       0.222      0.0226    0.361      0.221    0.0261 0.411   3.56   
#> 2 "Acer"         0          0         0          0        0      0.124   0.203  
#> 3 "Alnus"        0          0         0.00815    0        0      0.00815 0.0211 
#> 4 "Berberis"     0          0         0          0        0      0.00982 0.00982
#> 5 "Betula "      0          0         0          0        0      0.0865  0.128  
#> 6 "Boykinia"     0.0291     0.0176    0.0163     0.00515  0      0.179   0.978  
#> # … with abbreviated variable name ¹S026412.R1

^{Created on 2022-10-28 by the reprex package (v2.0.1)}

CodePudding user response：

thanks jared mamrot! worked like a charm. both are helpful, but i'll be using the second suggestion to reorder some of the data.