I have a dataset like below,
df1<-structure(list(Uniprot_IDs = c("P0A799|PGK", "P0A853|TNAA", "P0CE47|EFTU1",
"P0A6F3|GLPK", "P0A6F5|CH60", "P0A9B2|G3P1", "P0A853|TNAA", "P0A6P1|EFTS",
"P0A6P1|EFTS", "P0A799|PGK"), `1_3ng` = c(12305960196.5721, 24169710612.0476,
NA, 8553811608.70032, 13176265141.6301, 92994780469.5607, 11373139178.993,
NA, 8062061247.94512, 3484150815.20598), `2_3ng` = c(11629654800,
25162283400, 31864546300, 8157173240, 12812379370, 90007498700,
10191440110, NA, 7911370530, 3406054010), `3_3ng` = c(12503938417.8663,
25733015601.0117, 34727094361.2997, 8857104380.18179, NA, 93988723611.341,
11653192532.4546, NA, 7933102839.01341, NA), `4_7-5ng` = c(NA,
79582218995.1549, 77615759060.3497, 21749287984.8341, 33342436650.5148,
101254055758.836, 30624750667.6451, 39438567251.7351, 10726988796.4798,
7850501475.22747), `5_7-5ng` = c(NA, 78743355495.2545, 81948536416.9992,
NA, 34617564902.3219, 99485017820.8478, NA, 40420212151.9563,
14804870783.7792, 8280398872.03417), `6_7-5ng` = c(NA, 80272416055.8845,
77019098847.8474, 23045479130.9574, 32885483296.8046, 90789109337.1181,
30678346321.0037, 37073444001.0421, 13710097518.7425, 7916821420.64152
), `7_10ng` = c(22617928037.5148, 97473230025.8853, 91579176089.4265,
28086665669.9634, 38033883000.8102, NA, 37181868033.5073, 44274304023.6936,
NA, 9288965106.5049), `8_10ng` = c(22091136513.3736, NA, 90754802145.7813,
26405368418.6503, 36442770423.3661, NA, 36789459227.7515, 42793252584.0984,
15307787846.1716, 8834742124.86943), `9_10ng` = c(24125219176.3177,
98420360686.1339, 99355131865.2305, 28271975548.9608, 39837381317.8216,
NA, 39481996086.9157, 47261977623.5276, 16463020175.2068, 9931809132.696
), `10_15ng` = c(30252776887.1842, 141726904178.35, 130889671408.26,
38206477283.6549, 56021084469.4745, 100336249543.662, 53295491175.4506,
62883519160.5278, NA, 13994955303.4972), `11_15ng` = c(32859283128.8916,
161633827056.573, NA, 45497410866.4248, 61586094337.2513, NA,
60508117975.6097, 73276218943.4545, NA, 15400735421.5), `12_15ng` = c(34372085877.8071,
165557046117.222, 153975644961.53, 46279635074.4959, 61867667358.3367,
106133922907.254, 63526552497.161, 76374667334.5682, NA, 15329671283.3959
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-10L))
And a group data,
groups <- structure(list(samples = c("1_3ng", "2_3ng", "3_3ng", "4_7-5ng",
"5_7-5ng", "6_7-5ng", "7_10ng", "8_10ng", "9_10ng", "10_15ng",
"11_15ng", "12_15ng"), groups = c("GrA", "GrA", "GrA", "GrB",
"GrB", "GrB", "GrC", "GrC", "GrC", "GrD", "GrD", "GrD")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -12L))
And I have used the following codes for removing the groupwise missing rows from the dataset.
new_colnames <- c("name", "group")
colnames(groups)<-new_colnames
x <- colnames(df1[,1])
df2 <- setNames(df1,replace(names(df1),names(df1)==x,"rowid"))
com_data <- df2 %>%
pivot_longer(!rowid, values_to="mass") %>%
inner_join(groups, by="name") %>%
group_by(name) %>%
filter(!all(is.na(mass))) %>%
ungroup() %>%
add_count(rowid) %>%
filter(n == max(n)) %>%
select(!c(group, n)) %>%
pivot_wider(names_from=name, values_from=mass)
But I am not getting the desired output, it was like below,
structure(list(rowid = c("P0A799|PGK", "P0A853|TNAA", "P0A6P1|EFTS"
), `1_3ng` = list(c(12305960196.5721, 3484150815.20598), c(24169710612.0476,
11373139178.993), c(NA, 8062061247.94512)), `2_3ng` = list(c(11629654800,
3406054010), c(25162283400, 10191440110), c(NA, 7911370530)),
`3_3ng` = list(c(12503938417.8663, NA), c(25733015601.0117,
11653192532.4546), c(NA, 7933102839.01341)), `4_7-5ng` = list(
c(NA, 7850501475.22747), c(79582218995.1549, 30624750667.6451
), c(39438567251.7351, 10726988796.4798)), `5_7-5ng` = list(
c(NA, 8280398872.03417), c(78743355495.2545, NA), c(40420212151.9563,
14804870783.7792)), `6_7-5ng` = list(c(NA, 7916821420.64152
), c(80272416055.8845, 30678346321.0037), c(37073444001.0421,
13710097518.7425)), `7_10ng` = list(c(22617928037.5148, 9288965106.5049
), c(97473230025.8853, 37181868033.5073), c(44274304023.6936,
NA)), `8_10ng` = list(c(22091136513.3736, 8834742124.86943
), c(NA, 36789459227.7515), c(42793252584.0984, 15307787846.1716
)), `9_10ng` = list(c(24125219176.3177, 9931809132.696),
c(98420360686.1339, 39481996086.9157), c(47261977623.5276,
16463020175.2068)), `10_15ng` = list(c(30252776887.1842,
13994955303.4972), c(141726904178.35, 53295491175.4506),
c(62883519160.5278, NA)), `11_15ng` = list(c(32859283128.8916,
15400735421.5), c(161633827056.573, 60508117975.6097), c(73276218943.4545,
NA)), `12_15ng` = list(c(34372085877.8071, 15329671283.3959
), c(165557046117.222, 63526552497.161), c(76374667334.5682,
NA))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-3L))
And the warning message was,
Warning message:
Values from `mass` are not uniquely identified; output will contain list-cols.
* Use `values_fn = list` to suppress this warning.
* Use `values_fn = {summary_fun}` to summarise duplicates.
* Use the following dplyr code to identify duplicates.
{data} %>%
dplyr::group_by(rowid, name) %>%
dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
dplyr::filter(n > 1L)
How to get the dataframe output which contain single values in each column instead of list-cols.
My desired output is,
structure(list(Uniprot_IDs = c("P0A853|TNAA", "P0CE47|EFTU1",
"P0A6F3|GLPK", "P0A6F5|CH60", "P0A853|TNAA", "P0A799|PGK"), `1_3ng` = c(24169710612.0476,
NA, 8553811608.70032, 13176265141.6301, 11373139178.993, 3484150815.20598
), `2_3ng` = c(25162283400, 31864546300, 8157173240, 12812379370,
10191440110, 3406054010), `3_3ng` = c(25733015601.0117, 34727094361.2997,
8857104380.18179, NA, 11653192532.4546, NA), `4_7-5ng` = c(79582218995.1549,
77615759060.3497, 21749287984.8341, 33342436650.5148, 30624750667.6451,
7850501475.22747), `5_7-5ng` = c(78743355495.2545, 81948536416.9992,
NA, 34617564902.3219, NA, 8280398872.03417), `6_7-5ng` = c(80272416055.8845,
77019098847.8474, 23045479130.9574, 32885483296.8046, 30678346321.0037,
7916821420.64152), `7_10ng` = c(97473230025.8853, 91579176089.4265,
28086665669.9634, 38033883000.8102, 37181868033.5073, 9288965106.5049
), `8_10ng` = c(NA, 90754802145.7813, 26405368418.6503, 36442770423.3661,
36789459227.7515, 8834742124.86943), `9_10ng` = c(98420360686.1339,
99355131865.2305, 28271975548.9608, 39837381317.8216, 39481996086.9157,
9931809132.696), `10_15ng` = c(141726904178.35, 130889671408.26,
38206477283.6549, 56021084469.4745, 53295491175.4506, 13994955303.4972
), `11_15ng` = c(161633827056.573, NA, 45497410866.4248, 61586094337.2513,
60508117975.6097, 15400735421.5), `12_15ng` = c(165557046117.222,
153975644961.53, 46279635074.4959, 61867667358.3367, 63526552497.161,
15329671283.3959)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-6L))
CodePudding user response:
The issue is that your rowid
does not uniquely identify observations as there are multiple obs for some rowid
s. To account for that you could add an additional identifier before reshaping to long:
library(tidyr)
library(dplyr)
com_data <- df2 %>%
group_by(rowid) %>%
mutate(id = row_number()) %>%
ungroup() %>%
pivot_longer(-c(rowid, id), values_to="mass") %>%
inner_join(groups, by="name") %>%
group_by(name) %>%
filter(!all(is.na(mass))) %>%
ungroup() %>%
add_count(rowid) %>%
filter(n == max(n)) %>%
select(!c(group, n)) %>%
arrange(rowid, id) %>%
pivot_wider(names_from=name, values_from=mass) %>%
select(-id)
com_data
#> # A tibble: 6 × 13
#> rowid `1_3ng` `2_3ng` `3_3ng` `4_7-5ng` `5_7-5ng` `6_7-5ng` `7_10ng`
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 P0A6P1|EFTS NA NA NA 3.94e10 4.04e10 3.71e10 4.43e10
#> 2 P0A6P1|EFTS 8.06e 9 7.91e 9 7.93e 9 1.07e10 1.48e10 1.37e10 NA
#> 3 P0A799|PGK 1.23e10 1.16e10 1.25e10 NA NA NA 2.26e10
#> 4 P0A799|PGK 3.48e 9 3.41e 9 NA 7.85e 9 8.28e 9 7.92e 9 9.29e 9
#> 5 P0A853|TNAA 2.42e10 2.52e10 2.57e10 7.96e10 7.87e10 8.03e10 9.75e10
#> 6 P0A853|TNAA 1.14e10 1.02e10 1.17e10 3.06e10 NA 3.07e10 3.72e10
#> # … with 5 more variables: `8_10ng` <dbl>, `9_10ng` <dbl>, `10_15ng` <dbl>,
#> # `11_15ng` <dbl>, `12_15ng` <dbl>
EDIT To get your desired result is a different issue but could be achieved like so:
com_data <- df2 %>%
group_by(rowid) %>%
mutate(id = row_number()) %>%
ungroup() %>%
pivot_longer(-c(rowid, id), values_to = "mass") %>%
inner_join(groups, by = "name") %>%
add_count(rowid, id, group, wt = !is.na(mass)) %>%
group_by(rowid, id) %>%
filter(!any(n == 0)) %>%
ungroup() %>%
select(!c(group, n)) %>%
pivot_wider(names_from = name, values_from = mass) %>%
select(-id)
com_data
#> # A tibble: 6 × 13
#> rowid `1_3ng` `2_3ng` `3_3ng` `4_7-5ng` `5_7-5ng` `6_7-5ng` `7_10ng`
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 P0A853|TNAA 2.42e10 2.52e10 2.57e10 7.96e10 7.87e10 8.03e10 9.75e10
#> 2 P0CE47|EFTU1 NA 3.19e10 3.47e10 7.76e10 8.19e10 7.70e10 9.16e10
#> 3 P0A6F3|GLPK 8.55e 9 8.16e 9 8.86e 9 2.17e10 NA 2.30e10 2.81e10
#> 4 P0A6F5|CH60 1.32e10 1.28e10 NA 3.33e10 3.46e10 3.29e10 3.80e10
#> 5 P0A853|TNAA 1.14e10 1.02e10 1.17e10 3.06e10 NA 3.07e10 3.72e10
#> 6 P0A799|PGK 3.48e 9 3.41e 9 NA 7.85e 9 8.28e 9 7.92e 9 9.29e 9
#> # … with 5 more variables: `8_10ng` <dbl>, `9_10ng` <dbl>, `10_15ng` <dbl>,
#> # `11_15ng` <dbl>, `12_15ng` <dbl>