Home > Back-end >  Use different approach than inner_join
Use different approach than inner_join

Time:04-01

I would like a faster way to calculate the SPV. I currently use inner_join, which takes considerable computation time. Is there another way to calculate the SPV, without inner_join, that is faster?

library(dplyr)
library(tidyr)
library(lubridate)
library(data.table)

# Sample input: 8 rows keyed by Code and Week, a reference column DR1 and
# fourteen value columns DR01 ... DR014. Dates are kept as character strings.
df1 <- data.frame(
  date1 = rep("2021-06-28", 8),
  date2 = c("2021-06-25", "2021-06-25", "2021-06-27", "2021-07-07",
            "2021-07-07", "2021-07-09", "2021-07-09", "2021-07-09"),
  Code = c("FDE", "ABC", "ABC", "ABC", "CDE", "FGE", "ABC", "CDE"),
  Week = c("Wednesday", "Wednesday", "Friday", "Wednesday",
           "Wednesday", "Friday", "Friday", "Friday"),
  DR1 = c(4, 1, 4, 3, 3, 4, 3, 5),
  DR01 = c(4, 1, 4, 3, 3, 4, 3, 6),
  DR02 = c(4, 2, 6, 7, 3, 2, 7, 4),
  DR03 = c(9, 5, 4, 3, 3, 2, 1, 5),
  DR04 = c(5, 4, 3, 3, 6, 2, 1, 9),
  DR05 = c(5, 4, 5, 3, 6, 2, 1, 9),
  DR06 = c(2, 4, 3, 3, 5, 6, 7, 8),
  DR07 = c(2, 5, 4, 4, 9, 4, 7, 8),
  DR08 = c(0, 0, 0, 1, 2, 0, 0, 0),
  DR09 = rep(0, 8),
  DR010 = rep(0, 8),
  DR011 = c(4, 0, 0, 0, 0, 0, 0, 0),
  DR012 = c(0, 0, 0, 3, 0, 0, 0, 5),
  DR013 = c(0, 0, 1, 0, 0, 0, 2, 0),
  DR014 = c(0, 0, 0, 0, 0, 2, 0, 0),
  stringsAsFactors = FALSE
)

# Replace missing values with 0 in the DR0<n> value columns.
# The prefix has to match the real column names (DR01 ... DR014); the earlier
# "DRM" prefix matched no column of df1, so the NA fill silently never ran.
selection <- startsWith(names(df1), "DR0")

df1[selection][is.na(df1[selection])] <- 0

# Work on a data.table copy so that df1 itself is left untouched.
dt1 <- as.data.table(df1)

# Value columns (DR01 ... DR014) and the names of their derived offset columns.
cols <- grep("^DR0", colnames(dt1), value = TRUE)
pv_cols <- paste0(cols, "_PV")

# Step 1: add, by reference, one <col>_PV column per value column holding
# DR1 minus that column.
dt1[, (pv_cols) := DR1 - .SD, .SDcols = cols]

# Step 2: median of every offset column within each (Code, Week) group.
medi_ana <- dt1[, lapply(.SD, median), by = .(Code, Week), .SDcols = pv_cols]


# Build SPV: attach the per-(Code, Week) medians from `medi_ana` to every row
# of `df1`, then add each DR0<n> column to its matching DR0<n>_PV median.
SPV <- df1 %>%
  inner_join(medi_ana, by = c('Code', 'Week')) %>%
  # `\\d+` (with the quantifier) is needed so multi-digit names such as DR010
  # are selected, and the lambda must add (`+`) the median column that shares
  # the current column's name plus the "_PV" suffix.
  mutate(across(matches("^DR0\\d+$"),
                ~ .x + get(paste0(cur_column(), '_PV')),
                .names = '{col}_{col}_PV')) %>%
  # Keep the identifying columns and the newly created sums only.
  select(date1:Week, DR01_DR01_PV:last_col()) %>%
  data.frame()

> SPV
       date1      date2 Code      Week DR01_DR01_PV DR02_DR02_PV DR03_DR03_PV DR04_DR04_PV DR05_DR05_PV DR06_DR06_PV DR07_DR07_PV
1 2021-06-28 2021-06-25  FDE Wednesday            4          4.0            4          4.0          4.0          4.0          4.0
2 2021-06-28 2021-06-25  ABC Wednesday            1         -0.5            3          2.5          2.5          2.5          2.5
3 2021-06-28 2021-06-27  ABC    Friday            4          3.0            5          4.5          5.5          1.5          2.0
4 2021-06-28 2021-07-07  ABC Wednesday            3          4.5            1          1.5          1.5          1.5          1.5
5 2021-06-28 2021-07-07  CDE Wednesday            3          3.0            3          3.0          3.0          3.0          3.0
6 2021-06-28 2021-07-09  FGE    Friday            4          4.0            4          4.0          4.0          4.0          4.0
7 2021-06-28 2021-07-09  ABC    Friday            3          4.0            2          2.5          1.5          5.5          5.0
8 2021-06-28 2021-07-09  CDE    Friday            5          5.0            5          5.0          5.0          5.0          5.0
  DR08_DR08_PV DR09_DR09_PV DR010_DR010_PV DR011_DR011_PV DR012_DR012_PV DR013_DR013_PV DR014_DR014_PV
1          4.0          4.0            4.0            4.0            4.0              4            4.0
2          1.5          2.0            2.0            2.0            0.5              2            2.0
3          3.5          3.5            3.5            3.5            3.5              3            3.5
4          2.5          2.0            2.0            2.0            3.5              2            2.0
5          3.0          3.0            3.0            3.0            3.0              3            3.0
6          4.0          4.0            4.0            4.0            4.0              4            4.0
7          3.5          3.5            3.5            3.5            3.5              4            3.5
8          5.0          5.0            5.0            5.0            5.0              5            5.0

CodePudding user response:

Since we are already using data.table, a data.table join could be faster:

library(data.table)
# Return the elements of `nm` that match the regular expression `pat`.
f1 <- function(nm, pat) {
  matched <- grep(pattern = pat, x = nm, value = TRUE)
  matched
}
# Column groups used by the update join below.
# `\\d+` (with the quantifier) is required to match names such as DR01 and DR010.
nm1 <- f1(names(df1), "^DR0\\d+$")  # DR0<n> value columns of df1
nm2 <- f1(names(medi_ana), "_PV")   # median columns of medi_ana
nm3 <- paste0("i.", nm2)            # the same medians, addressed via the join's `i.` prefix

# Update join: setDT() turns df1 into a data.table by reference, and for each
# (Code, Week) match the sum of a value column and its median is written into
# a new column named after the median column (nm2).
setDT(df1)[medi_ana, (nm2) := Map(`+`, mget(nm1), mget(nm3)), on = .(Code, Week)]

# Keep the identifying columns plus the freshly computed sums.
SPV2 <- df1[, c('date1', 'date2', 'Code', 'Week', nm2), with = FALSE]
  •  Tags:  
  • r
  • Related