Finding overlapped units based on start and end times-CodePudding

I am wondering if there was an efficient data.table solution for the following problem.

Suppose, that I have the following dataset:

library(data.table)


DT <- data.table(emp = c(1,2,3),
                 start_time = c(90,90,540),
                 duration = c(480, 480,480 ))

DT[, end_time := start_time   duration]

which looks like:

     emp   start_time duration end_time
   <num>      <num>    <num>    <num>
1:     1         90      480      570
2:     2         90      480      570
3:     3        540      480     1020

Here, emp is the employee id, and the start time, duration, and end times of each employee's shift are given by the three columns. I am attempting to determine the amount of overlap that each employee has with each other in minutes. Thus, the output should look something like:

     emp emp_1 emp_2 emp_3
   <num> <num> <num> <num>
1:     1   480   480    30
2:     2   480   480    30
3:     3    30    30   480

where the columns are based on the full set of employees.

I am looking for a data.table solution since the number of employees is quite large.

CodePudding user response：

One possible solution with foverlaps and dcast:

library(data.table)

#Key needed for foverlaps
setkey(DT,start_time,end_time)

dcast(foverlaps(DT,DT)[,ol:=pmin(end_time,i.end_time)-pmax(start_time,i.start_time)],
      emp~i.emp,value.var = "ol")

Key: <emp>
     emp     1     2     3
   <num> <num> <num> <num>
1:     1   480   480    30
2:     2   480   480    30
3:     3    30    30   480

CodePudding user response：

Using pmin and pmax with my method of cross-join is the fastest:

dcast(setkey(DT[,c(k=1,.SD)],k)[
  setNames(DT, paste0(names(DT2), '_2'))[,c(k=1,.SD)],allow.cartesian=TRUE][,k:=NULL][ , 
          overlap :=  pmin(end_time,end_time_2)-pmax(start_time,start_time_2)],
  emp~ emp_2, value.var = "overlap")

Benchmark:

Here I made a larger dataset to test different approaches, including tidyverse ones. I had to create copies of the datatable to account for setting the key in each of the solutions, although it doesn't affect the benchmark considerably;

library(data.table)
library(DescTools)
library(dplyr)
library(tidyr)

set.seed(123)
DT <- data.table(emp = 1:100,
                 start_time = sample.int(1000, 100),
                 duration = sample.int(1000, 100)   1000)

DT[, end_time := start_time   duration]

DT2 <- copy(DT)
DT3 <- copy(DT)
DT4 <- copy(DT)


WvM <- microbenchmark::microbenchmark(

M_DT_Desc = dcast(setkey(DT2[,c(k=1,.SD)],k)[
      setNames(DT2, paste0(names(DT2), '_2'))[,c(k=1,.SD)],allow.cartesian=TRUE][,k:=NULL][ , 
         overlap :=  Overlap(c(start_time , end_time), c(start_time_2, end_time_2)), 
                                                                            by = 1:NROW(DT2)^2],
          emp~ emp_2, value.var = "overlap"),


M_DT_pminmax = dcast(setkey(DT2[,c(k=1,.SD)],k)[
  setNames(DT2, paste0(names(DT2), '_2'))[,c(k=1,.SD)],allow.cartesian=TRUE][,k:=NULL][ , 
          overlap :=  pmin(end_time,end_time_2)-pmax(start_time,start_time_2)],
  emp~ emp_2, value.var = "overlap"),


M_foverlap_Desc = {setkey(DT3,start_time,end_time);dcast(foverlaps(DT,DT)[, irow := .I][,
                    overlap :=  Overlap(c(start_time , end_time), c(i.start_time, i.end_time)), 
                                                                              by = irow],
            emp~i.emp,value.var = "overlap")},


M_dplyr_Desc = DT4 %>% 
  setNames(paste0(names(.), '_2')) %>% 
  crossing(DT4, .) %>% 
  rowwise() %>%
  mutate(overlap = Overlap(c(start_time , end_time), c(start_time_2, end_time_2))) %>%
  ungroup() %>% 
  pivot_wider(id_cols = "emp", names_from = "emp_2", values_from = "overlap"),


M_dplyr_pminmax = DT4 %>% 
  setNames(paste0(names(.), '_2')) %>% 
  crossing(DT4, .) %>% 
  rowwise() %>%
  mutate(overlap = pmin(end_time,end_time_2)-pmax(start_time,start_time_2)) %>%
  ungroup() %>% 
  pivot_wider(id_cols = "emp", names_from = "emp_2", values_from = "overlap"),


Waldi ={setkey(DT,start_time,end_time); dcast(foverlaps(DT,DT)[,
                  ol:=pmin(end_time,i.end_time)-pmax(start_time,i.start_time)],
      emp~i.emp,value.var = "ol")},

times = 10)

Unit: milliseconds
            expr       min        lq       mean    median        uq       max neval
       M_DT_Desc  967.6728  992.4321 1063.98096 1053.6871 1093.5663 1258.7258    10
    M_DT_pminmax    7.3910    8.3103    8.86385    8.4347    9.8666   10.5503    10
 M_foverlap_Desc  966.2051 1001.8745 1043.72299 1034.6016 1095.6339 1128.2970    10
    M_dplyr_Desc 1040.0847 1060.8663 1132.24239 1101.4212 1150.1816 1444.9537    10
 M_dplyr_pminmax  168.4051  172.5951  185.10149  179.1346  197.1055  223.4941    10
           Waldi    8.5117    9.3202   10.54267    9.6550   10.2424   17.6923    10

Original Answer:

Here's another approach for cross join and getting the overlaps using DescTools package.

library(data.table)
library(DescTools)

dcast(setkey(DT[,c(k=1,.SD)],k)[
        setNames(DT, paste0(names(DT), '_2'))[,c(k=1,.SD)],allow.cartesian=TRUE][,k:=NULL][ , 
              overlap :=  DescTools::Overlap(c(start_time , end_time), c(start_time_2, end_time_2)), 
                by = 1:NROW(DT)^2],
      emp~ emp_2, value.var = "overlap")