R pivot wider error. listcols output error-CodePudding

I have a dataset like below,

df1<-structure(list(Uniprot_IDs = c("P0A799|PGK", "P0A853|TNAA", "P0CE47|EFTU1", 
"P0A6F3|GLPK", "P0A6F5|CH60", "P0A9B2|G3P1", "P0A853|TNAA", "P0A6P1|EFTS", 
"P0A6P1|EFTS", "P0A799|PGK"), `1_3ng` = c(12305960196.5721, 24169710612.0476, 
NA, 8553811608.70032, 13176265141.6301, 92994780469.5607, 11373139178.993, 
NA, 8062061247.94512, 3484150815.20598), `2_3ng` = c(11629654800, 
25162283400, 31864546300, 8157173240, 12812379370, 90007498700, 
10191440110, NA, 7911370530, 3406054010), `3_3ng` = c(12503938417.8663, 
25733015601.0117, 34727094361.2997, 8857104380.18179, NA, 93988723611.341, 
11653192532.4546, NA, 7933102839.01341, NA), `4_7-5ng` = c(NA, 
79582218995.1549, 77615759060.3497, 21749287984.8341, 33342436650.5148, 
101254055758.836, 30624750667.6451, 39438567251.7351, 10726988796.4798, 
7850501475.22747), `5_7-5ng` = c(NA, 78743355495.2545, 81948536416.9992, 
NA, 34617564902.3219, 99485017820.8478, NA, 40420212151.9563, 
14804870783.7792, 8280398872.03417), `6_7-5ng` = c(NA, 80272416055.8845, 
77019098847.8474, 23045479130.9574, 32885483296.8046, 90789109337.1181, 
30678346321.0037, 37073444001.0421, 13710097518.7425, 7916821420.64152
), `7_10ng` = c(22617928037.5148, 97473230025.8853, 91579176089.4265, 
28086665669.9634, 38033883000.8102, NA, 37181868033.5073, 44274304023.6936, 
NA, 9288965106.5049), `8_10ng` = c(22091136513.3736, NA, 90754802145.7813, 
26405368418.6503, 36442770423.3661, NA, 36789459227.7515, 42793252584.0984, 
15307787846.1716, 8834742124.86943), `9_10ng` = c(24125219176.3177, 
98420360686.1339, 99355131865.2305, 28271975548.9608, 39837381317.8216, 
NA, 39481996086.9157, 47261977623.5276, 16463020175.2068, 9931809132.696
), `10_15ng` = c(30252776887.1842, 141726904178.35, 130889671408.26, 
38206477283.6549, 56021084469.4745, 100336249543.662, 53295491175.4506, 
62883519160.5278, NA, 13994955303.4972), `11_15ng` = c(32859283128.8916, 
161633827056.573, NA, 45497410866.4248, 61586094337.2513, NA, 
60508117975.6097, 73276218943.4545, NA, 15400735421.5), `12_15ng` = c(34372085877.8071, 
165557046117.222, 153975644961.53, 46279635074.4959, 61867667358.3367, 
106133922907.254, 63526552497.161, 76374667334.5682, NA, 15329671283.3959
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-10L))

And a group data,

groups <- structure(list(samples = c("1_3ng", "2_3ng", "3_3ng", "4_7-5ng", 
"5_7-5ng", "6_7-5ng", "7_10ng", "8_10ng", "9_10ng", "10_15ng", 
"11_15ng", "12_15ng"), groups = c("GrA", "GrA", "GrA", "GrB", 
"GrB", "GrB", "GrC", "GrC", "GrC", "GrD", "GrD", "GrD")), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -12L))

And I have used the following codes for removing the groupwise missing rows from the dataset.

new_colnames <- c("name", "group")
colnames(groups)<-new_colnames

x <- colnames(df1[,1]) 
df2 <- setNames(df1,replace(names(df1),names(df1)==x,"rowid"))

com_data <- df2 %>%
  pivot_longer(!rowid, values_to="mass") %>%
  inner_join(groups, by="name") %>%
  group_by(name) %>%
  filter(!all(is.na(mass))) %>%
  ungroup() %>%
  add_count(rowid) %>%
  filter(n == max(n)) %>%
  select(!c(group, n)) %>%
  pivot_wider(names_from=name, values_from=mass)

But I am not getting the desired output, it was like below,

structure(list(rowid = c("P0A799|PGK", "P0A853|TNAA", "P0A6P1|EFTS"
), `1_3ng` = list(c(12305960196.5721, 3484150815.20598), c(24169710612.0476, 
11373139178.993), c(NA, 8062061247.94512)), `2_3ng` = list(c(11629654800, 
3406054010), c(25162283400, 10191440110), c(NA, 7911370530)), 
    `3_3ng` = list(c(12503938417.8663, NA), c(25733015601.0117, 
    11653192532.4546), c(NA, 7933102839.01341)), `4_7-5ng` = list(
        c(NA, 7850501475.22747), c(79582218995.1549, 30624750667.6451
        ), c(39438567251.7351, 10726988796.4798)), `5_7-5ng` = list(
        c(NA, 8280398872.03417), c(78743355495.2545, NA), c(40420212151.9563, 
        14804870783.7792)), `6_7-5ng` = list(c(NA, 7916821420.64152
    ), c(80272416055.8845, 30678346321.0037), c(37073444001.0421, 
    13710097518.7425)), `7_10ng` = list(c(22617928037.5148, 9288965106.5049
    ), c(97473230025.8853, 37181868033.5073), c(44274304023.6936, 
    NA)), `8_10ng` = list(c(22091136513.3736, 8834742124.86943
    ), c(NA, 36789459227.7515), c(42793252584.0984, 15307787846.1716
    )), `9_10ng` = list(c(24125219176.3177, 9931809132.696), 
        c(98420360686.1339, 39481996086.9157), c(47261977623.5276, 
        16463020175.2068)), `10_15ng` = list(c(30252776887.1842, 
    13994955303.4972), c(141726904178.35, 53295491175.4506), 
        c(62883519160.5278, NA)), `11_15ng` = list(c(32859283128.8916, 
    15400735421.5), c(161633827056.573, 60508117975.6097), c(73276218943.4545, 
    NA)), `12_15ng` = list(c(34372085877.8071, 15329671283.3959
    ), c(165557046117.222, 63526552497.161), c(76374667334.5682, 
    NA))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-3L))

And the warning message was,

Warning message:
Values from `mass` are not uniquely identified; output will contain list-cols.
* Use `values_fn = list` to suppress this warning.
* Use `values_fn = {summary_fun}` to summarise duplicates.
* Use the following dplyr code to identify duplicates.
  {data} %>%
    dplyr::group_by(rowid, name) %>%
    dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
    dplyr::filter(n > 1L)

How to get the dataframe output which contain single values in each column instead of list-cols.

My desired output is,

structure(list(Uniprot_IDs = c("P0A853|TNAA", "P0CE47|EFTU1", 
"P0A6F3|GLPK", "P0A6F5|CH60", "P0A853|TNAA", "P0A799|PGK"), `1_3ng` = c(24169710612.0476, 
NA, 8553811608.70032, 13176265141.6301, 11373139178.993, 3484150815.20598
), `2_3ng` = c(25162283400, 31864546300, 8157173240, 12812379370, 
10191440110, 3406054010), `3_3ng` = c(25733015601.0117, 34727094361.2997, 
8857104380.18179, NA, 11653192532.4546, NA), `4_7-5ng` = c(79582218995.1549, 
77615759060.3497, 21749287984.8341, 33342436650.5148, 30624750667.6451, 
7850501475.22747), `5_7-5ng` = c(78743355495.2545, 81948536416.9992, 
NA, 34617564902.3219, NA, 8280398872.03417), `6_7-5ng` = c(80272416055.8845, 
77019098847.8474, 23045479130.9574, 32885483296.8046, 30678346321.0037, 
7916821420.64152), `7_10ng` = c(97473230025.8853, 91579176089.4265, 
28086665669.9634, 38033883000.8102, 37181868033.5073, 9288965106.5049
), `8_10ng` = c(NA, 90754802145.7813, 26405368418.6503, 36442770423.3661, 
36789459227.7515, 8834742124.86943), `9_10ng` = c(98420360686.1339, 
99355131865.2305, 28271975548.9608, 39837381317.8216, 39481996086.9157, 
9931809132.696), `10_15ng` = c(141726904178.35, 130889671408.26, 
38206477283.6549, 56021084469.4745, 53295491175.4506, 13994955303.4972
), `11_15ng` = c(161633827056.573, NA, 45497410866.4248, 61586094337.2513, 
60508117975.6097, 15400735421.5), `12_15ng` = c(165557046117.222, 
153975644961.53, 46279635074.4959, 61867667358.3367, 63526552497.161, 
15329671283.3959)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-6L))

CodePudding user response：

The issue is that your rowid does not uniquely identify observations as there are multiple obs for some rowids. To account for that you could add an additional identifier before reshaping to long:

library(tidyr)
library(dplyr)

com_data <- df2 %>%
  group_by(rowid) %>%
  mutate(id = row_number()) %>%
  ungroup() %>%
  pivot_longer(-c(rowid, id), values_to="mass") %>%
  inner_join(groups, by="name") %>%
  group_by(name) %>%
  filter(!all(is.na(mass))) %>%
  ungroup() %>%
  add_count(rowid) %>%
  filter(n == max(n)) %>%
  select(!c(group, n)) %>%
  arrange(rowid, id) %>%
  pivot_wider(names_from=name, values_from=mass) %>%
  select(-id)

com_data
#> # A tibble: 6 × 13
#>   rowid         `1_3ng`  `2_3ng`  `3_3ng` `4_7-5ng` `5_7-5ng` `6_7-5ng` `7_10ng`
#>   <chr>           <dbl>    <dbl>    <dbl>     <dbl>     <dbl>     <dbl>    <dbl>
#> 1 P0A6P1|EFTS  NA       NA       NA         3.94e10   4.04e10   3.71e10  4.43e10
#> 2 P0A6P1|EFTS   8.06e 9  7.91e 9  7.93e 9   1.07e10   1.48e10   1.37e10 NA      
#> 3 P0A799|PGK    1.23e10  1.16e10  1.25e10  NA        NA        NA        2.26e10
#> 4 P0A799|PGK    3.48e 9  3.41e 9 NA         7.85e 9   8.28e 9   7.92e 9  9.29e 9
#> 5 P0A853|TNAA   2.42e10  2.52e10  2.57e10   7.96e10   7.87e10   8.03e10  9.75e10
#> 6 P0A853|TNAA   1.14e10  1.02e10  1.17e10   3.06e10  NA         3.07e10  3.72e10
#> # … with 5 more variables: `8_10ng` <dbl>, `9_10ng` <dbl>, `10_15ng` <dbl>,
#> #   `11_15ng` <dbl>, `12_15ng` <dbl>

EDIT To get your desired result is a different issue but could be achieved like so:

com_data <- df2 %>%
  group_by(rowid) %>%
  mutate(id = row_number()) %>%
  ungroup() %>%
  pivot_longer(-c(rowid, id), values_to = "mass") %>%
  inner_join(groups, by = "name") %>%
  add_count(rowid, id, group, wt = !is.na(mass)) %>%
  group_by(rowid, id) %>%
  filter(!any(n == 0)) %>%
  ungroup() %>%
  select(!c(group, n)) %>%
  pivot_wider(names_from = name, values_from = mass) %>%
  select(-id)

com_data
#> # A tibble: 6 × 13
#>   rowid          `1_3ng` `2_3ng`  `3_3ng` `4_7-5ng` `5_7-5ng` `6_7-5ng` `7_10ng`
#>   <chr>            <dbl>   <dbl>    <dbl>     <dbl>     <dbl>     <dbl>    <dbl>
#> 1 P0A853|TNAA    2.42e10 2.52e10  2.57e10   7.96e10   7.87e10   8.03e10  9.75e10
#> 2 P0CE47|EFTU1  NA       3.19e10  3.47e10   7.76e10   8.19e10   7.70e10  9.16e10
#> 3 P0A6F3|GLPK    8.55e 9 8.16e 9  8.86e 9   2.17e10  NA         2.30e10  2.81e10
#> 4 P0A6F5|CH60    1.32e10 1.28e10 NA         3.33e10   3.46e10   3.29e10  3.80e10
#> 5 P0A853|TNAA    1.14e10 1.02e10  1.17e10   3.06e10  NA         3.07e10  3.72e10
#> 6 P0A799|PGK     3.48e 9 3.41e 9 NA         7.85e 9   8.28e 9   7.92e 9  9.29e 9
#> # … with 5 more variables: `8_10ng` <dbl>, `9_10ng` <dbl>, `10_15ng` <dbl>,
#> #   `11_15ng` <dbl>, `12_15ng` <dbl>