Home > OS >  How to convert dplyr's group_by, mutate, filter, pivot_wider into data.table approach
How to convert dplyr's group_by, mutate, filter, pivot_wider into data.table approach

Time:11-11

I have the following data frame:

dat <- structure(list(ref_string = c("K", "Y", "Q", "C", "H", "A", "S", 
"Y", "L", "Y"), peptide_name = c("p47", "p666", "p506", "p356", 
"p598", "p458", "p448", "p117", "p232", "p464"), peptide = c("FKDHKHIDVKgrrrskrrrrTRCYHIDPHH", 
"FKDHKHIDVKsrgrkrrrrrTRCYHIDPHH", "FKDHKHIDVKrrrrskrgrrTRCYHIDPHH", 
"FKDHKHIDVKrrrgrrrrskTRCYHIDPHH", "FKDHKHIDVKrsgrrrrrkrTRCYHIDPHH", 
"FKDHKHIDVKrrrrrkrrgsTRCYHIDPHH", "FKDHKHIDVKrrrrrgrskrTRCYHIDPHH", 
"FKDHKHIDVKkrrrrsrrgrTRCYHIDPHH", "FKDHKHIDVKrkrrrrsgrrTRCYHIDPHH", 
"FKDHKHIDVKrrrrrksrrgTRCYHIDPHH"), status = c(0, 0, 0, 0, 0, 
0, 0, 0, 0, 0)), class = c("rowwise_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -10L), groups = structure(list(.rows = structure(list(
    1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), ptype = integer(0), class = c("vctrs_list_of", 
"vctrs_vctr", "list"))), row.names = c(NA, -10L), class = c("tbl_df", 
"tbl", "data.frame")))

It looks like this:

# A tibble: 10 × 4
# Rowwise: 
   ref_string peptide_name peptide                        status
   <chr>      <chr>        <chr>                           <dbl>
 1 K          p47          FKDHKHIDVKgrrrskrrrrTRCYHIDPHH      0
 2 Y          p666         FKDHKHIDVKsrgrkrrrrrTRCYHIDPHH      0
 3 Q          p506         FKDHKHIDVKrrrrskrgrrTRCYHIDPHH      0
 4 C          p356         FKDHKHIDVKrrrgrrrrskTRCYHIDPHH      0
 5 H          p598         FKDHKHIDVKrsgrrrrrkrTRCYHIDPHH      0
 6 A          p458         FKDHKHIDVKrrrrrkrrgsTRCYHIDPHH      0
 7 S          p448         FKDHKHIDVKrrrrrgrskrTRCYHIDPHH      0
 8 Y          p117         FKDHKHIDVKkrrrrsrrgrTRCYHIDPHH      0
 9 L          p232         FKDHKHIDVKrkrrrrsgrrTRCYHIDPHH      0
10 Y          p464         FKDHKHIDVKrrrrrksrrgTRCYHIDPHH      0

Currently, I have this dplyr's process which is slow for real data set:

library(tidyverse)
dat %>% 
  dplyr::group_by(peptide_name, peptide) %>% 
  mutate(tot = sum(status)) %>% 
  filter(tot == 0) %>%  
  pivot_wider(names_from = ref_string, values_from = status) %>%
  dplyr::select(-tot) %>% 
  ungroup()

Which produces:

# A tibble: 10 × 10
   peptide_name peptide                            K     Y     Q     C     H     A     S     L
   <chr>        <chr>                          <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1 p47          FKDHKHIDVKgrrrskrrrrTRCYHIDPHH     0    NA    NA    NA    NA    NA    NA    NA
 2 p666         FKDHKHIDVKsrgrkrrrrrTRCYHIDPHH    NA     0    NA    NA    NA    NA    NA    NA
 3 p506         FKDHKHIDVKrrrrskrgrrTRCYHIDPHH    NA    NA     0    NA    NA    NA    NA    NA
 4 p356         FKDHKHIDVKrrrgrrrrskTRCYHIDPHH    NA    NA    NA     0    NA    NA    NA    NA
 5 p598         FKDHKHIDVKrsgrrrrrkrTRCYHIDPHH    NA    NA    NA    NA     0    NA    NA    NA
 6 p458         FKDHKHIDVKrrrrrkrrgsTRCYHIDPHH    NA    NA    NA    NA    NA     0    NA    NA
 7 p448         FKDHKHIDVKrrrrrgrskrTRCYHIDPHH    NA    NA    NA    NA    NA    NA     0    NA
 8 p117         FKDHKHIDVKkrrrrsrrgrTRCYHIDPHH    NA     0    NA    NA    NA    NA    NA    NA
 9 p232         FKDHKHIDVKrkrrrrsgrrTRCYHIDPHH    NA    NA    NA    NA    NA    NA    NA     0
10 p464         FKDHKHIDVKrrrrrksrrgTRCYHIDPHH    NA     0    NA    NA    NA    NA    NA    NA

How can I achieve the same result with data.table?

I'm stuck here:

library(data.table)
dt <- as.data.table(dat)
dt[,by = peptide, tot:=sum(status) ]
dt

with this result:

    ref_string peptide_name                        peptide status tot
 1:          K          p47 FKDHKHIDVKgrrrskrrrrTRCYHIDPHH      0   0
 2:          Y         p666 FKDHKHIDVKsrgrkrrrrrTRCYHIDPHH      0   0
 3:          Q         p506 FKDHKHIDVKrrrrskrgrrTRCYHIDPHH      0   0
 4:          C         p356 FKDHKHIDVKrrrgrrrrskTRCYHIDPHH      0   0
 5:          H         p598 FKDHKHIDVKrsgrrrrrkrTRCYHIDPHH      0   0
 6:          A         p458 FKDHKHIDVKrrrrrkrrgsTRCYHIDPHH      0   0
 7:          S         p448 FKDHKHIDVKrrrrrgrskrTRCYHIDPHH      0   0
 8:          Y         p117 FKDHKHIDVKkrrrrsrrgrTRCYHIDPHH      0   0
 9:          L         p232 FKDHKHIDVKrkrrrrsgrrTRCYHIDPHH      0   0
10:          Y         p464 FKDHKHIDVKrrrrrksrrgTRCYHIDPHH      0   0

I don't know how to use filter and pivot_wider for data.table.

CodePudding user response:

Here is a data.table option, where dcast should be used to pivot wider

dcast(
  setDT(dat)[
    ,
    tot := sum(status),
    .(peptide_name, peptide)
  ][tot == 0],
  peptide_name   peptide ~ ref_string,
  value.var = "status"
)

and you will see the result as

    peptide_name                        peptide  A  C  H  K  L  Q  S  Y
 1:         p117 FKDHKHIDVKkrrrrsrrgrTRCYHIDPHH NA NA NA NA NA NA NA  0
 2:         p232 FKDHKHIDVKrkrrrrsgrrTRCYHIDPHH NA NA NA NA  0 NA NA NA
 3:         p356 FKDHKHIDVKrrrgrrrrskTRCYHIDPHH NA  0 NA NA NA NA NA NA
 4:         p448 FKDHKHIDVKrrrrrgrskrTRCYHIDPHH NA NA NA NA NA NA  0 NA
 5:         p458 FKDHKHIDVKrrrrrkrrgsTRCYHIDPHH  0 NA NA NA NA NA NA NA
 6:         p464 FKDHKHIDVKrrrrrksrrgTRCYHIDPHH NA NA NA NA NA NA NA  0
 7:          p47 FKDHKHIDVKgrrrskrrrrTRCYHIDPHH NA NA NA  0 NA NA NA NA
 8:         p506 FKDHKHIDVKrrrrskrgrrTRCYHIDPHH NA NA NA NA NA  0 NA NA
 9:         p598 FKDHKHIDVKrsgrrrrrkrTRCYHIDPHH NA NA  0 NA NA NA NA NA
10:         p666 FKDHKHIDVKsrgrkrrrrrTRCYHIDPHH NA NA NA NA NA NA NA  0

CodePudding user response:

Try dtplyr, which allows you to use dplyr and tidyr commands with data.table as a backend:

library(dplyr)
library(tidyr)
library(dtplyr)

dat_dt <- lazy_dt(dat)

dat_dt %>% 
  group_by(peptide_name, peptide) %>% 
  mutate(tot = sum(status)) %>% 
  filter(tot == 0) %>%  
  pivot_wider(names_from = ref_string, values_from = status) %>%
  select(-tot) %>% 
  ungroup() %>%
  as_tibble()
# A tibble: 10 × 10
   peptide_name peptide              K     Y     Q     C     H     A     S     L
   <chr>        <chr>            <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1 p117         FKDHKHIDVKkrrrr…    NA     0    NA    NA    NA    NA    NA    NA
 2 p232         FKDHKHIDVKrkrrr…    NA    NA    NA    NA    NA    NA    NA     0
 3 p356         FKDHKHIDVKrrrgr…    NA    NA    NA     0    NA    NA    NA    NA
 4 p448         FKDHKHIDVKrrrrr…    NA    NA    NA    NA    NA    NA     0    NA
 5 p458         FKDHKHIDVKrrrrr…    NA    NA    NA    NA    NA     0    NA    NA
 6 p464         FKDHKHIDVKrrrrr…    NA     0    NA    NA    NA    NA    NA    NA
 7 p47          FKDHKHIDVKgrrrs…     0    NA    NA    NA    NA    NA    NA    NA
 8 p506         FKDHKHIDVKrrrrs…    NA    NA     0    NA    NA    NA    NA    NA
 9 p598         FKDHKHIDVKrsgrr…    NA    NA    NA    NA     0    NA    NA    NA
10 p666         FKDHKHIDVKsrgrk…    NA     0    NA    NA    NA    NA    NA    NA

Microbenchmark:

Unit: milliseconds
   expr      min       lq     mean   median       uq      max neval cld
  dplyr 27.80141 29.91216 32.92722 31.35731 33.91613 73.43162   100   b
 dtplyr 18.60528 20.17416 22.33375 21.38543 23.15164 52.08883   100  a 

CodePudding user response:

Another way of solving your problem using data.table

dcast(data = setDT(dat)[, if(sum(status)==0) .SD, by=.(peptide_name, peptide)],
      formula = peptide_name   peptide ~ ref_string, 
      value.var = "status")

    peptide_name                        peptide     A     C     H     K     L     Q     S     Y
          <char>                         <char> <num> <num> <num> <num> <num> <num> <num> <num>
 1:         p117 FKDHKHIDVKkrrrrsrrgrTRCYHIDPHH    NA    NA    NA    NA    NA    NA    NA     0
 2:         p232 FKDHKHIDVKrkrrrrsgrrTRCYHIDPHH    NA    NA    NA    NA     0    NA    NA    NA
 3:         p356 FKDHKHIDVKrrrgrrrrskTRCYHIDPHH    NA     0    NA    NA    NA    NA    NA    NA
 4:         p448 FKDHKHIDVKrrrrrgrskrTRCYHIDPHH    NA    NA    NA    NA    NA    NA     0    NA
 5:         p458 FKDHKHIDVKrrrrrkrrgsTRCYHIDPHH     0    NA    NA    NA    NA    NA    NA    NA
 6:         p464 FKDHKHIDVKrrrrrksrrgTRCYHIDPHH    NA    NA    NA    NA    NA    NA    NA     0
 7:          p47 FKDHKHIDVKgrrrskrrrrTRCYHIDPHH    NA    NA    NA     0    NA    NA    NA    NA
 8:         p506 FKDHKHIDVKrrrrskrgrrTRCYHIDPHH    NA    NA    NA    NA    NA     0    NA    NA
 9:         p598 FKDHKHIDVKrsgrrrrrkrTRCYHIDPHH    NA    NA     0    NA    NA    NA    NA    NA
10:         p666 FKDHKHIDVKsrgrkrrrrrTRCYHIDPHH    NA    NA    NA    NA    NA    NA    NA     0
  • Related