I have two dataframes with start and end time. I want to compare each row of df2 to each row of df1 and calculate the overlap.
df1
# start end
#1 5 15
#2 20 28
#3 46 68
#4 80 87
df2
# start end
#1 20 40
#2 65 85
So the results should be a vector with results
overlaping_duration_1= 8 (overlap from df2 row 1 with df1 row 1)
overlaping_duration_2= 3 5 = overlap from df2 row 2 with df1 row 3 overlap from df2 row 2 with df1 row 4
I tried it with an ifelse approach and cover the different conditions. This is only for the first row for df2.
overlap = ifelse ( df2$start <= df1$start & df1$start <= df2$end & df2$end <= df1$end, df2$end-df1$start, 0)
overlap2 = ifelse ( df2$start <= df1$start & df1$end <= df2$end, df1$end-df1$start, 0)
overlap3 = ifelse ( df1$start < df2$start & df2$end <= df1$end, df2$end-df2$start, 0)
overlap4 = ifelse ( df1$start < df2$start & df2$start <= df1$end & df1$end <= df2$end, df1$end-df2$start, 0)
Afterwards the different overlap vectors can be merged. This could be applied over a for
loop over df2.
This approach is quite cumbersome. Is there a more comfortable way?
CodePudding user response:
library(data.table)
df1 = data.table(start=c(5,20,46,80),end=c(15,28,68,87))
df2 = data.table(start=c(20,65), end=c(40,85))
# add row identifer (`rn`) and dummy var (`id`) for cartesian join
df1[,`:=`(id=1, rn=.I)]
df2[,`:=`(id=1, rn=.I)]
# do full join
df = df1[df2,on="id", allow.cartesian=T]
# estimate overlap, by row
result = df[,overlap:=.(min(i.end,end)-max(i.start,start)), by=1:nrow(df)]
# retain positive overlaps, and sum by df2 row number
result[overlap>0, .(total = sum(overlap)), by = .(rn=i.rn)]
Output:
rn total
1: 1 8
2: 2 8
Update: You can also avoid the full join, by instead keying df2
on start
and end
and using data.table::foverlaps
:
library(data.table)
df1 = data.table(start=c(5,20,46,80),end=c(15,28,68,87))
df2 = data.table(start=c(20,65), end=c(40,85))
setkey(df2,start,end)
df = foverlaps(df1,df2[,rn:=.I], nomatch=NULL)
df[,overlap:=.(min(i.end,end)-max(i.start,start)), by=1:nrow(df)][, .(total =sum(overlap,na.rm=T)),by=rn]
Output:
rn total
1: 1 8
2: 2 8