I have the following data frame:
dat <- structure(list(ref_string = c("K", "Y", "Q", "C", "H", "A", "S",
"Y", "L", "Y"), peptide_name = c("p47", "p666", "p506", "p356",
"p598", "p458", "p448", "p117", "p232", "p464"), peptide = c("FKDHKHIDVKgrrrskrrrrTRCYHIDPHH",
"FKDHKHIDVKsrgrkrrrrrTRCYHIDPHH", "FKDHKHIDVKrrrrskrgrrTRCYHIDPHH",
"FKDHKHIDVKrrrgrrrrskTRCYHIDPHH", "FKDHKHIDVKrsgrrrrrkrTRCYHIDPHH",
"FKDHKHIDVKrrrrrkrrgsTRCYHIDPHH", "FKDHKHIDVKrrrrrgrskrTRCYHIDPHH",
"FKDHKHIDVKkrrrrsrrgrTRCYHIDPHH", "FKDHKHIDVKrkrrrrsgrrTRCYHIDPHH",
"FKDHKHIDVKrrrrrksrrgTRCYHIDPHH"), status = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0)), class = c("rowwise_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -10L), groups = structure(list(.rows = structure(list(
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame")))
It looks like this:
# A tibble: 10 × 4
# Rowwise:
ref_string peptide_name peptide status
<chr> <chr> <chr> <dbl>
1 K p47 FKDHKHIDVKgrrrskrrrrTRCYHIDPHH 0
2 Y p666 FKDHKHIDVKsrgrkrrrrrTRCYHIDPHH 0
3 Q p506 FKDHKHIDVKrrrrskrgrrTRCYHIDPHH 0
4 C p356 FKDHKHIDVKrrrgrrrrskTRCYHIDPHH 0
5 H p598 FKDHKHIDVKrsgrrrrrkrTRCYHIDPHH 0
6 A p458 FKDHKHIDVKrrrrrkrrgsTRCYHIDPHH 0
7 S p448 FKDHKHIDVKrrrrrgrskrTRCYHIDPHH 0
8 Y p117 FKDHKHIDVKkrrrrsrrgrTRCYHIDPHH 0
9 L p232 FKDHKHIDVKrkrrrrsgrrTRCYHIDPHH 0
10 Y p464 FKDHKHIDVKrrrrrksrrgTRCYHIDPHH 0
Currently, I have this dplyr's process which is slow for real data set:
library(tidyverse)
dat %>%
dplyr::group_by(peptide_name, peptide) %>%
mutate(tot = sum(status)) %>%
filter(tot == 0) %>%
pivot_wider(names_from = ref_string, values_from = status) %>%
dplyr::select(-tot) %>%
ungroup()
Which produces:
# A tibble: 10 × 10
peptide_name peptide K Y Q C H A S L
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 p47 FKDHKHIDVKgrrrskrrrrTRCYHIDPHH 0 NA NA NA NA NA NA NA
2 p666 FKDHKHIDVKsrgrkrrrrrTRCYHIDPHH NA 0 NA NA NA NA NA NA
3 p506 FKDHKHIDVKrrrrskrgrrTRCYHIDPHH NA NA 0 NA NA NA NA NA
4 p356 FKDHKHIDVKrrrgrrrrskTRCYHIDPHH NA NA NA 0 NA NA NA NA
5 p598 FKDHKHIDVKrsgrrrrrkrTRCYHIDPHH NA NA NA NA 0 NA NA NA
6 p458 FKDHKHIDVKrrrrrkrrgsTRCYHIDPHH NA NA NA NA NA 0 NA NA
7 p448 FKDHKHIDVKrrrrrgrskrTRCYHIDPHH NA NA NA NA NA NA 0 NA
8 p117 FKDHKHIDVKkrrrrsrrgrTRCYHIDPHH NA 0 NA NA NA NA NA NA
9 p232 FKDHKHIDVKrkrrrrsgrrTRCYHIDPHH NA NA NA NA NA NA NA 0
10 p464 FKDHKHIDVKrrrrrksrrgTRCYHIDPHH NA 0 NA NA NA NA NA NA
How can I achieve the same result with data.table?
I'm stuck here:
library(data.table)
dt <- as.data.table(dat)
dt[,by = peptide, tot:=sum(status) ]
dt
with this result:
ref_string peptide_name peptide status tot
1: K p47 FKDHKHIDVKgrrrskrrrrTRCYHIDPHH 0 0
2: Y p666 FKDHKHIDVKsrgrkrrrrrTRCYHIDPHH 0 0
3: Q p506 FKDHKHIDVKrrrrskrgrrTRCYHIDPHH 0 0
4: C p356 FKDHKHIDVKrrrgrrrrskTRCYHIDPHH 0 0
5: H p598 FKDHKHIDVKrsgrrrrrkrTRCYHIDPHH 0 0
6: A p458 FKDHKHIDVKrrrrrkrrgsTRCYHIDPHH 0 0
7: S p448 FKDHKHIDVKrrrrrgrskrTRCYHIDPHH 0 0
8: Y p117 FKDHKHIDVKkrrrrsrrgrTRCYHIDPHH 0 0
9: L p232 FKDHKHIDVKrkrrrrsgrrTRCYHIDPHH 0 0
10: Y p464 FKDHKHIDVKrrrrrksrrgTRCYHIDPHH 0 0
I don't know how to use filter and pivot_wider for data.table.
CodePudding user response:
Here is a data.table
option, where dcast
should be used to pivot wider
dcast(
setDT(dat)[
,
tot := sum(status),
.(peptide_name, peptide)
][tot == 0],
peptide_name peptide ~ ref_string,
value.var = "status"
)
and you will see the result as
peptide_name peptide A C H K L Q S Y
1: p117 FKDHKHIDVKkrrrrsrrgrTRCYHIDPHH NA NA NA NA NA NA NA 0
2: p232 FKDHKHIDVKrkrrrrsgrrTRCYHIDPHH NA NA NA NA 0 NA NA NA
3: p356 FKDHKHIDVKrrrgrrrrskTRCYHIDPHH NA 0 NA NA NA NA NA NA
4: p448 FKDHKHIDVKrrrrrgrskrTRCYHIDPHH NA NA NA NA NA NA 0 NA
5: p458 FKDHKHIDVKrrrrrkrrgsTRCYHIDPHH 0 NA NA NA NA NA NA NA
6: p464 FKDHKHIDVKrrrrrksrrgTRCYHIDPHH NA NA NA NA NA NA NA 0
7: p47 FKDHKHIDVKgrrrskrrrrTRCYHIDPHH NA NA NA 0 NA NA NA NA
8: p506 FKDHKHIDVKrrrrskrgrrTRCYHIDPHH NA NA NA NA NA 0 NA NA
9: p598 FKDHKHIDVKrsgrrrrrkrTRCYHIDPHH NA NA 0 NA NA NA NA NA
10: p666 FKDHKHIDVKsrgrkrrrrrTRCYHIDPHH NA NA NA NA NA NA NA 0
CodePudding user response:
Try dtplyr, which allows you to use dplyr and tidyr commands with data.table as a backend:
library(dplyr)
library(tidyr)
library(dtplyr)
dat_dt <- lazy_dt(dat)
dat_dt %>%
group_by(peptide_name, peptide) %>%
mutate(tot = sum(status)) %>%
filter(tot == 0) %>%
pivot_wider(names_from = ref_string, values_from = status) %>%
select(-tot) %>%
ungroup() %>%
as_tibble()
# A tibble: 10 × 10
peptide_name peptide K Y Q C H A S L
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 p117 FKDHKHIDVKkrrrr… NA 0 NA NA NA NA NA NA
2 p232 FKDHKHIDVKrkrrr… NA NA NA NA NA NA NA 0
3 p356 FKDHKHIDVKrrrgr… NA NA NA 0 NA NA NA NA
4 p448 FKDHKHIDVKrrrrr… NA NA NA NA NA NA 0 NA
5 p458 FKDHKHIDVKrrrrr… NA NA NA NA NA 0 NA NA
6 p464 FKDHKHIDVKrrrrr… NA 0 NA NA NA NA NA NA
7 p47 FKDHKHIDVKgrrrs… 0 NA NA NA NA NA NA NA
8 p506 FKDHKHIDVKrrrrs… NA NA 0 NA NA NA NA NA
9 p598 FKDHKHIDVKrsgrr… NA NA NA NA 0 NA NA NA
10 p666 FKDHKHIDVKsrgrk… NA 0 NA NA NA NA NA NA
Microbenchmark:
Unit: milliseconds
expr min lq mean median uq max neval cld
dplyr 27.80141 29.91216 32.92722 31.35731 33.91613 73.43162 100 b
dtplyr 18.60528 20.17416 22.33375 21.38543 23.15164 52.08883 100 a
CodePudding user response:
Another way of solving your problem using data.table
dcast(data = setDT(dat)[, if(sum(status)==0) .SD, by=.(peptide_name, peptide)],
formula = peptide_name peptide ~ ref_string,
value.var = "status")
peptide_name peptide A C H K L Q S Y
<char> <char> <num> <num> <num> <num> <num> <num> <num> <num>
1: p117 FKDHKHIDVKkrrrrsrrgrTRCYHIDPHH NA NA NA NA NA NA NA 0
2: p232 FKDHKHIDVKrkrrrrsgrrTRCYHIDPHH NA NA NA NA 0 NA NA NA
3: p356 FKDHKHIDVKrrrgrrrrskTRCYHIDPHH NA 0 NA NA NA NA NA NA
4: p448 FKDHKHIDVKrrrrrgrskrTRCYHIDPHH NA NA NA NA NA NA 0 NA
5: p458 FKDHKHIDVKrrrrrkrrgsTRCYHIDPHH 0 NA NA NA NA NA NA NA
6: p464 FKDHKHIDVKrrrrrksrrgTRCYHIDPHH NA NA NA NA NA NA NA 0
7: p47 FKDHKHIDVKgrrrskrrrrTRCYHIDPHH NA NA NA 0 NA NA NA NA
8: p506 FKDHKHIDVKrrrrskrgrrTRCYHIDPHH NA NA NA NA NA 0 NA NA
9: p598 FKDHKHIDVKrsgrrrrrkrTRCYHIDPHH NA NA 0 NA NA NA NA NA
10: p666 FKDHKHIDVKsrgrkrrrrrTRCYHIDPHH NA NA NA NA NA NA NA 0