Home > Back-end >  R pivot wider error. listcols output error
R pivot wider error. listcols output error

Time:07-25

I have a dataset like below,

df1<-structure(list(Uniprot_IDs = c("P0A799|PGK", "P0A853|TNAA", "P0CE47|EFTU1", 
"P0A6F3|GLPK", "P0A6F5|CH60", "P0A9B2|G3P1", "P0A853|TNAA", "P0A6P1|EFTS", 
"P0A6P1|EFTS", "P0A799|PGK"), `1_3ng` = c(12305960196.5721, 24169710612.0476, 
NA, 8553811608.70032, 13176265141.6301, 92994780469.5607, 11373139178.993, 
NA, 8062061247.94512, 3484150815.20598), `2_3ng` = c(11629654800, 
25162283400, 31864546300, 8157173240, 12812379370, 90007498700, 
10191440110, NA, 7911370530, 3406054010), `3_3ng` = c(12503938417.8663, 
25733015601.0117, 34727094361.2997, 8857104380.18179, NA, 93988723611.341, 
11653192532.4546, NA, 7933102839.01341, NA), `4_7-5ng` = c(NA, 
79582218995.1549, 77615759060.3497, 21749287984.8341, 33342436650.5148, 
101254055758.836, 30624750667.6451, 39438567251.7351, 10726988796.4798, 
7850501475.22747), `5_7-5ng` = c(NA, 78743355495.2545, 81948536416.9992, 
NA, 34617564902.3219, 99485017820.8478, NA, 40420212151.9563, 
14804870783.7792, 8280398872.03417), `6_7-5ng` = c(NA, 80272416055.8845, 
77019098847.8474, 23045479130.9574, 32885483296.8046, 90789109337.1181, 
30678346321.0037, 37073444001.0421, 13710097518.7425, 7916821420.64152
), `7_10ng` = c(22617928037.5148, 97473230025.8853, 91579176089.4265, 
28086665669.9634, 38033883000.8102, NA, 37181868033.5073, 44274304023.6936, 
NA, 9288965106.5049), `8_10ng` = c(22091136513.3736, NA, 90754802145.7813, 
26405368418.6503, 36442770423.3661, NA, 36789459227.7515, 42793252584.0984, 
15307787846.1716, 8834742124.86943), `9_10ng` = c(24125219176.3177, 
98420360686.1339, 99355131865.2305, 28271975548.9608, 39837381317.8216, 
NA, 39481996086.9157, 47261977623.5276, 16463020175.2068, 9931809132.696
), `10_15ng` = c(30252776887.1842, 141726904178.35, 130889671408.26, 
38206477283.6549, 56021084469.4745, 100336249543.662, 53295491175.4506, 
62883519160.5278, NA, 13994955303.4972), `11_15ng` = c(32859283128.8916, 
161633827056.573, NA, 45497410866.4248, 61586094337.2513, NA, 
60508117975.6097, 73276218943.4545, NA, 15400735421.5), `12_15ng` = c(34372085877.8071, 
165557046117.222, 153975644961.53, 46279635074.4959, 61867667358.3367, 
106133922907.254, 63526552497.161, 76374667334.5682, NA, 15329671283.3959
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-10L))

And a group data,

groups <- structure(list(samples = c("1_3ng", "2_3ng", "3_3ng", "4_7-5ng", 
"5_7-5ng", "6_7-5ng", "7_10ng", "8_10ng", "9_10ng", "10_15ng", 
"11_15ng", "12_15ng"), groups = c("GrA", "GrA", "GrA", "GrB", 
"GrB", "GrB", "GrC", "GrC", "GrC", "GrD", "GrD", "GrD")), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -12L))

And I have used the following codes for removing the groupwise missing rows from the dataset.

new_colnames <- c("name", "group")
colnames(groups)<-new_colnames

x <- colnames(df1[,1]) 
df2 <- setNames(df1,replace(names(df1),names(df1)==x,"rowid"))

com_data <- df2 %>%
  pivot_longer(!rowid, values_to="mass") %>%
  inner_join(groups, by="name") %>%
  group_by(name) %>%
  filter(!all(is.na(mass))) %>%
  ungroup() %>%
  add_count(rowid) %>%
  filter(n == max(n)) %>%
  select(!c(group, n)) %>%
  pivot_wider(names_from=name, values_from=mass) 

But I am not getting the desired output, it was like below,

structure(list(rowid = c("P0A799|PGK", "P0A853|TNAA", "P0A6P1|EFTS"
), `1_3ng` = list(c(12305960196.5721, 3484150815.20598), c(24169710612.0476, 
11373139178.993), c(NA, 8062061247.94512)), `2_3ng` = list(c(11629654800, 
3406054010), c(25162283400, 10191440110), c(NA, 7911370530)), 
    `3_3ng` = list(c(12503938417.8663, NA), c(25733015601.0117, 
    11653192532.4546), c(NA, 7933102839.01341)), `4_7-5ng` = list(
        c(NA, 7850501475.22747), c(79582218995.1549, 30624750667.6451
        ), c(39438567251.7351, 10726988796.4798)), `5_7-5ng` = list(
        c(NA, 8280398872.03417), c(78743355495.2545, NA), c(40420212151.9563, 
        14804870783.7792)), `6_7-5ng` = list(c(NA, 7916821420.64152
    ), c(80272416055.8845, 30678346321.0037), c(37073444001.0421, 
    13710097518.7425)), `7_10ng` = list(c(22617928037.5148, 9288965106.5049
    ), c(97473230025.8853, 37181868033.5073), c(44274304023.6936, 
    NA)), `8_10ng` = list(c(22091136513.3736, 8834742124.86943
    ), c(NA, 36789459227.7515), c(42793252584.0984, 15307787846.1716
    )), `9_10ng` = list(c(24125219176.3177, 9931809132.696), 
        c(98420360686.1339, 39481996086.9157), c(47261977623.5276, 
        16463020175.2068)), `10_15ng` = list(c(30252776887.1842, 
    13994955303.4972), c(141726904178.35, 53295491175.4506), 
        c(62883519160.5278, NA)), `11_15ng` = list(c(32859283128.8916, 
    15400735421.5), c(161633827056.573, 60508117975.6097), c(73276218943.4545, 
    NA)), `12_15ng` = list(c(34372085877.8071, 15329671283.3959
    ), c(165557046117.222, 63526552497.161), c(76374667334.5682, 
    NA))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-3L))

And the warning message was,

Warning message:
Values from `mass` are not uniquely identified; output will contain list-cols.
* Use `values_fn = list` to suppress this warning.
* Use `values_fn = {summary_fun}` to summarise duplicates.
* Use the following dplyr code to identify duplicates.
  {data} %>%
    dplyr::group_by(rowid, name) %>%
    dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
    dplyr::filter(n > 1L) 

How to get the dataframe output which contain single values in each column instead of list-cols.

My desired output is,

structure(list(Uniprot_IDs = c("P0A853|TNAA", "P0CE47|EFTU1", 
"P0A6F3|GLPK", "P0A6F5|CH60", "P0A853|TNAA", "P0A799|PGK"), `1_3ng` = c(24169710612.0476, 
NA, 8553811608.70032, 13176265141.6301, 11373139178.993, 3484150815.20598
), `2_3ng` = c(25162283400, 31864546300, 8157173240, 12812379370, 
10191440110, 3406054010), `3_3ng` = c(25733015601.0117, 34727094361.2997, 
8857104380.18179, NA, 11653192532.4546, NA), `4_7-5ng` = c(79582218995.1549, 
77615759060.3497, 21749287984.8341, 33342436650.5148, 30624750667.6451, 
7850501475.22747), `5_7-5ng` = c(78743355495.2545, 81948536416.9992, 
NA, 34617564902.3219, NA, 8280398872.03417), `6_7-5ng` = c(80272416055.8845, 
77019098847.8474, 23045479130.9574, 32885483296.8046, 30678346321.0037, 
7916821420.64152), `7_10ng` = c(97473230025.8853, 91579176089.4265, 
28086665669.9634, 38033883000.8102, 37181868033.5073, 9288965106.5049
), `8_10ng` = c(NA, 90754802145.7813, 26405368418.6503, 36442770423.3661, 
36789459227.7515, 8834742124.86943), `9_10ng` = c(98420360686.1339, 
99355131865.2305, 28271975548.9608, 39837381317.8216, 39481996086.9157, 
9931809132.696), `10_15ng` = c(141726904178.35, 130889671408.26, 
38206477283.6549, 56021084469.4745, 53295491175.4506, 13994955303.4972
), `11_15ng` = c(161633827056.573, NA, 45497410866.4248, 61586094337.2513, 
60508117975.6097, 15400735421.5), `12_15ng` = c(165557046117.222, 
153975644961.53, 46279635074.4959, 61867667358.3367, 63526552497.161, 
15329671283.3959)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-6L))

CodePudding user response:

The issue is that your rowid does not uniquely identify observations as there are multiple obs for some rowids. To account for that you could add an additional identifier before reshaping to long:

library(tidyr)
library(dplyr)

com_data <- df2 %>%
  group_by(rowid) %>%
  mutate(id = row_number()) %>%
  ungroup() %>%
  pivot_longer(-c(rowid, id), values_to="mass") %>%
  inner_join(groups, by="name") %>%
  group_by(name) %>%
  filter(!all(is.na(mass))) %>%
  ungroup() %>%
  add_count(rowid) %>%
  filter(n == max(n)) %>%
  select(!c(group, n)) %>%
  arrange(rowid, id) %>%
  pivot_wider(names_from=name, values_from=mass) %>%
  select(-id)

com_data
#> # A tibble: 6 × 13
#>   rowid         `1_3ng`  `2_3ng`  `3_3ng` `4_7-5ng` `5_7-5ng` `6_7-5ng` `7_10ng`
#>   <chr>           <dbl>    <dbl>    <dbl>     <dbl>     <dbl>     <dbl>    <dbl>
#> 1 P0A6P1|EFTS  NA       NA       NA         3.94e10   4.04e10   3.71e10  4.43e10
#> 2 P0A6P1|EFTS   8.06e 9  7.91e 9  7.93e 9   1.07e10   1.48e10   1.37e10 NA      
#> 3 P0A799|PGK    1.23e10  1.16e10  1.25e10  NA        NA        NA        2.26e10
#> 4 P0A799|PGK    3.48e 9  3.41e 9 NA         7.85e 9   8.28e 9   7.92e 9  9.29e 9
#> 5 P0A853|TNAA   2.42e10  2.52e10  2.57e10   7.96e10   7.87e10   8.03e10  9.75e10
#> 6 P0A853|TNAA   1.14e10  1.02e10  1.17e10   3.06e10  NA         3.07e10  3.72e10
#> # … with 5 more variables: `8_10ng` <dbl>, `9_10ng` <dbl>, `10_15ng` <dbl>,
#> #   `11_15ng` <dbl>, `12_15ng` <dbl>

EDIT To get your desired result is a different issue but could be achieved like so:

com_data <- df2 %>%
  group_by(rowid) %>%
  mutate(id = row_number()) %>%
  ungroup() %>%
  pivot_longer(-c(rowid, id), values_to = "mass") %>%
  inner_join(groups, by = "name") %>%
  add_count(rowid, id, group, wt = !is.na(mass)) %>%
  group_by(rowid, id) %>%
  filter(!any(n == 0)) %>%
  ungroup() %>%
  select(!c(group, n)) %>%
  pivot_wider(names_from = name, values_from = mass) %>%
  select(-id)

com_data
#> # A tibble: 6 × 13
#>   rowid          `1_3ng` `2_3ng`  `3_3ng` `4_7-5ng` `5_7-5ng` `6_7-5ng` `7_10ng`
#>   <chr>            <dbl>   <dbl>    <dbl>     <dbl>     <dbl>     <dbl>    <dbl>
#> 1 P0A853|TNAA    2.42e10 2.52e10  2.57e10   7.96e10   7.87e10   8.03e10  9.75e10
#> 2 P0CE47|EFTU1  NA       3.19e10  3.47e10   7.76e10   8.19e10   7.70e10  9.16e10
#> 3 P0A6F3|GLPK    8.55e 9 8.16e 9  8.86e 9   2.17e10  NA         2.30e10  2.81e10
#> 4 P0A6F5|CH60    1.32e10 1.28e10 NA         3.33e10   3.46e10   3.29e10  3.80e10
#> 5 P0A853|TNAA    1.14e10 1.02e10  1.17e10   3.06e10  NA         3.07e10  3.72e10
#> 6 P0A799|PGK     3.48e 9 3.41e 9 NA         7.85e 9   8.28e 9   7.92e 9  9.29e 9
#> # … with 5 more variables: `8_10ng` <dbl>, `9_10ng` <dbl>, `10_15ng` <dbl>,
#> #   `11_15ng` <dbl>, `12_15ng` <dbl>
  • Related