Join overlapping ranges from two data frames in r-CodePudding

Note: This question was closed as a 'duplicate'. The solutions offered here and here did not answer my question. They showed how to merge when a single entry fell within a range, I'm trying to identify overlapping ranges and joining them. Perhaps my title could have been better...

I have a main data set main_df with a start and end time (in seconds). I would like to see if the time range in main_df falls within a list of ranges in lookup_df, and if so, grab the value from lookup_df. Additionally, if the main_df falls within two different lookup ranges, duplicate the row so each value is represented.***

main_df <- tibble(start = c(30,124,161),
                end = c(80,152,185))

lookup_df <- tibble(start = c(34,73,126,141,174,221),
                       end = c(69,123,136,157,189,267),
                       value = c('a','b','b','b','b','a'))

# Do something here to get the following:

> final_df
# A tibble: 4 x 4
  start   end value notes                                      
  <dbl> <dbl> <chr> <chr>                                      
1    30    80 a     ""                                         
2    30    80 b     "Duplicate because it falls within a and b"
3   124   152 b     "Falls within two lookups but both are b"  
4   161   185 b     ""

***Edit: Looking at the way I've structured the problem...

#Not actual code
left_join(main_df, lookup_df, by(some_range_join_function) %>% 
  add_rows(through_some_means)

Rather than having to add a new row I could flip how I'm joining them...

semi_join(lookup_df, main_df, by(some_range_join_function))

CodePudding user response：

You could do some logical comparisons and then a case handling what shall happen if all are 'b', 'a' and 'b', etc. In this way you easily could add more cases, e.g. both are 'a', one is 'a', more are 'b' which you didn't declare in OP. The approach yields NULL if there are no matches which gets omitted during rbind.

f <- \(x, y) {
  w <- which((x[1] >= y[, 1] & x[1] <= y[, 2]) | (x[2] >= y[, 1] & x[1] <= y[, 2]))
  if (length(w) > 0) {
    d <- data.frame(t(x), value=cbind(y[w, 3]), notes='')
    if (length(w) >= 2) {
      if (all(d$value == 'b')) {
        d <- d[!duplicated(d$value), ]
        d$notes[1] <- 'both b'
      }
      else {
        d$notes[nrow(d)] <- 'a & b'
      }
    }
    d
  }
}

apply(main_df, 1, f, lookup_df, simplify=F) |> do.call(what=rbind)
#   start end value  notes
# 1    30  80     a       
# 2    30  80     b  a & b
# 3   124 152     b both b
# 4   161 185     b

Data:

main_df <- structure(list(start = c(2, 30, 124, 161), end = c(1, 80, 152, 
185)), row.names = c(NA, -4L), class = "data.frame")

lookup_df <- structure(list(start = c(34, 73, 126, 141, 174, 221), end = c(69, 
123, 136, 157, 189, 267), value = c("a", "b", "b", "b", "b", 
"a")), row.names = c(NA, -6L), class = "data.frame")

CodePudding user response：

Another option is fuzzyjoin::interval_join:

library(fuzzyjoin)
library(dplyr)

interval_join(main_df, lookup_df, by = c("start", "end"), mode = "inner") %>% 
  group_by(value, start.x, end.x) %>% 
  slice(1) %>% 
  select(start = start.x, end = end.x, value)

# A tibble: 4 × 3
# Groups:   value, start, end [4]
  start   end value
  <dbl> <dbl> <chr>
1    30    80 a    
2    30    80 b    
3   124   152 b    
4   161   185 b

CodePudding user response：

You can use foverlaps from data.table for this.

library(data.table)

setDT(main_df) # make it a data.table if needed
setDT(lookup_df) # make it a data.table if needed

setkey(main_df, start, end) # set the keys of 'y'

foverlaps(lookup_df, main_df, nomatch = NULL) # do the lookup

#    start end i.start i.end value
# 1:    30  80      34    69     a
# 2:    30  80      73   123     b
# 3:   124 152     126   136     b
# 4:   124 152     141   157     b
# 5:   161 185     174   189     b

Or to get the cleaned results as end result (OP's final_df)

unique(foverlaps(lookup_df, main_df, nomatch = NULL)[, .(start, end, value)])

   start end value
1:    30  80     a
2:    30  80     b
3:   124 152     b
4:   161 185     b

CodePudding user response：

A possible solution, based on powerjoin:

library(tidyverse)
library(powerjoin)

power_left_join(
  main_df, lookup_df,
  by = ~ (.x$start <= .y$start & .x$end >= .y$end) |
    (.x$start >= .y$start & .x$start <= .y$end) | 
    (.x$start <= .y$start & .x$end >= .y$start), 
  keep = "left") %>% 
  distinct()

#> # A tibble: 4 x 3
#>   start   end value
#>   <dbl> <dbl> <chr>
#> 1    30    80 a    
#> 2    30    80 b    
#> 3   124   152 b    
#> 4   161   185 b

Or using tidyr::crossing:

library(tidyverse)

crossing(main_df, lookup_df,
        .name_repair = ~ c("start", "end", "start2", "end2", "value")) %>% 
  filter((start <= start2 & end >= end2) |
         (start >= start2 & start <= end2) | (start <= start2 & end >= start2)) %>% 
  select(-start2, -end2) %>% 
  distinct()

#> # A tibble: 4 x 3
#>   start   end value
#>   <dbl> <dbl> <chr>
#> 1    30    80 a    
#> 2    30    80 b    
#> 3   124   152 b    
#> 4   161   185 b