Find overlapping ranges between two data frames after grouping in R-CodePudding

I have two large data frames that look like this:

library(tibble)  # tibble() is not available in a fresh session without this

# Reference ranges: one interval [start, end] per row, grouped by `chrom`.
df1 <- tibble(
  chrom = c(1, 1, 1, 2, 2, 2),
  start = c(100, 200, 300, 100, 200, 300),
  end = c(150, 250, 350, 120, 220, 320)
)

# Ranges to test against df1: one interval [start2, end2] per row,
# grouped by the same `chrom` key.
df2 <- tibble(
  chrom = c(1, 1, 1, 2, 2, 2),
  start2 = c(100, 50, 280, 100, 10, 200),
  end2 = c(125, 100, 320, 115, 15, 350)
)

df1
#> # A tibble: 6 × 3
#>   chrom start   end
#>   <dbl> <dbl> <dbl>
#> 1     1   100   150
#> 2     1   200   250
#> 3     1   300   350
#> 4     2   100   120
#> 5     2   200   220
#> 6     2   300   320
df2
#> # A tibble: 6 × 3
#>   chrom start2  end2
#>   <dbl>  <dbl> <dbl>
#> 1     1    100   125
#> 2     1     50   100
#> 3     1    280   320
#> 4     2    100   115
#> 5     2     10    15
#> 6     2    200   350

^{Created on 2023-01-09 with reprex v2.0.2}

I want to find which range[start2-end2] of df2 overlaps with the range[start-end] of df1. An ideal output would be something like this, but it's not necessary. Mostly I want the coordinates of the overlapping ranges.


#> # A tibble: 6 × 8
#>   chrom start   end start2  end2 overlap overlap_start overlap_end
#>   <dbl> <dbl> <dbl>  <dbl> <dbl> <chr>   <chr>         <chr>      
#> 1     1   100   150    100   125 yes     100           125        
#> 2     1   200   250     50   100 no      <NA>          <NA>       
#> 3     1   300   350    280   320 yes     300           320        
#> 4     2   100   120    100   115 yes     100           115        
#> 5     2   200   220     10    15 no      <NA>          <NA>       
#> 6     2   300   320    200   350 yes     200,220       300,320

^{Created on 2023-01-09 with reprex v2.0.2}

Note that on the last line, the df2 range 200-350 overlaps with two ranges from df1 (200-220 and 300-320).

CodePudding user response：

My advice is to use the Bioconductor package GenomicRanges, which uses data structures optimised for finding interval overlaps.

library(GenomicRanges)
library(tibble)  # tibble() is not attached by GenomicRanges

df1 <- tibble(
  chrom = c(1, 1, 1, 2, 2, 2),
  start = c(100, 200, 300, 100, 200, 300),
  end = c(150, 250, 350, 120, 220, 320)
)

df2 <- tibble(
  chrom = c(1, 1, 1, 2, 2, 2),
  start2 = c(100, 50, 280, 100, 10, 200),
  end2 = c(125, 100, 320, 115, 15, 350)
)

# Convert both tables to GRanges. df1 uses column names that
# makeGRangesFromDataFrame() recognises by default (`chrom`, `start`, `end`);
# df2's coordinate columns have to be named explicitly.
ranges1 <- makeGRangesFromDataFrame(df1)
ranges2 <- makeGRangesFromDataFrame(
  df2,
  start.field = "start2",
  end.field = "end2"
)

# One pair per (df1 range, df2 range) combination that overlaps on the same
# sequence; `first` comes from df1 and `second` from df2.
overlaps <- findOverlapPairs(ranges1, ranges2)


> overlaps
Pairs object with 6 pairs and 0 metadata columns:
          first    second
      <GRanges> <GRanges>
  [1] 1:100-150  1:50-100
  [2] 1:100-150 1:100-125
  [3] 1:300-350 1:280-320
  [4] 2:100-120 2:100-115
  [5] 2:200-220 2:200-350
  [6] 2:300-320 2:200-350

# Turn each side of the Pairs object into a data frame, then bind the two
# frames column-wise so that every row describes one overlapping pair.
# Column names are intentionally left as produced by as.data.frame(), so they
# repeat for the first and second range.
do.call(
  cbind,
  lapply(
    list(S4Vectors::first(overlaps), S4Vectors::second(overlaps)),
    as.data.frame
  )
)

  seqnames start end width strand seqnames start end width strand
1        1   100 150    51      *        1    50 100    51      *
2        1   100 150    51      *        1   100 125    26      *
3        1   300 350    51      *        1   280 320    41      *
4        2   100 120    21      *        2   100 115    16      *
5        2   200 220    21      *        2   200 350   151      *
6        2   300 320    21      *        2   200 350   151      *

CodePudding user response：

I believe you are looking for something like this?

I see no need to summarise here, so you'll get two results for the df2-range 200-350.

library(data.table)
library(matrixStats)
# Convert both tables to data.table by reference (no copy is made).
setDT(df1)
setDT(df2)

# Non-equi join: for every row of df2 (`i`), look up the rows of df1 (`x`) on
# the same chromosome whose range overlaps it. The strict inequalities mean
# ranges that only share an endpoint (e.g. 100-150 vs 50-100) do not count as
# overlapping. `nomatch = NA` keeps df2 rows without any overlap, with NA in
# the df1 columns. The x./i. prefixes pull the original coordinates from each
# table instead of the join-condition columns.
ans <- df1[df2, .(chrom,
                  start = x.start, end = x.end,
                  start2 = i.start2, end2 = i.end2),
           on = .(chrom, start < end2, end > start2),
           nomatch = NA]

# The shared region runs from the later of the two starts to the earlier of
# the two ends. pmax()/pmin() are vectorised base R functions that return NA
# when either input is NA, so unmatched rows stay NA and no matrix conversion
# (or matrixStats) is needed for this step.
ans[, overlap_start := pmax(start, start2)]
ans[, overlap_end := pmin(end, end2)]

#    chrom start end start2 end2 overlap_start overlap_end
# 1:     1   100 150    100  125           100         125
# 2:     1    NA  NA     50  100            NA          NA
# 3:     1   300 350    280  320           300         320
# 4:     2   100 120    100  115           100         115
# 5:     2    NA  NA     10   15            NA          NA
# 6:     2   200 220    200  350           200         220
# 7:     2   300 320    200  350           300         320

CodePudding user response：

A lengthier "tidy-style" version:

library(dplyr)

# Pair every df1 range with every df2 range on the same chromosome, flag the
# pairs that overlap, then collect the overlap coordinates per df2 range.
df1 |>
  left_join(df2, by = 'chrom') |>
  rowwise() |>
  # NOTE: start:end enumerates every position of a range, so the cost of this
  # step grows with the width of the ranges, not just the number of rows.
  mutate(intersect = list(intersect(start:end, start2:end2)),
         # sign() is 0 for an empty intersection and 1 otherwise, so
         # `1 + sign(...)` indexes 'no' or 'yes' respectively.
         overlap = c('no', 'yes')[1 + sign(length(intersect))],
         overlap_start = if (length(intersect) > 0) min(intersect) else NA_integer_,
         overlap_end = if (length(intersect) > 0) max(intersect) else NA_integer_
         ) |>
  # One group per distinct df2 range.
  group_by(paste(start2, end2)) |>
  # Keeps every row of the group; the pasted overlap coordinates are computed
  # once per group and repeated on each of its rows.
  # NOTE(review): dplyr >= 1.1.0 deprecates summarise() results with more than
  # one row per group in favour of reframe() - confirm the installed version.
  summarise(across(chrom : end2),
            overlap,
            across(starts_with('overlap_'),
                   ~ paste(na.omit(.x), collapse = ','))
            ) |>
  ungroup() |>
  # Drops the helper grouping column created by group_by().
  select(chrom:overlap_end)

# A tibble: 18 x 8
   chrom start   end start2  end2 overlap overlap_start overlap_end
   <dbl> <dbl> <dbl>  <dbl> <dbl> <chr>   <chr>         <chr>      
 1     2   100   120     10    15 no      ""            ""         
 2     2   200   220     10    15 no      ""            ""         
 3     2   300   320     10    15 no      ""            ""         
 4     2   100   120    100   115 yes     "100"         "115"      
 5     2   200   220    100   115 no      "100"         "115"      
 6     2   300   320    100   115 no      "100"         "115"      
 7     1   100   150    100   125 yes     "100"         "125"      
 8     1   200   250    100   125 no      "100"         "125"      
 9     1   300   350    100   125 no      "100"         "125"      
10     2   100   120    200   350 no      "200,300"     "220,320" 
# ...

To obtain numeric vectors instead of comma-separated strings for multiple overlaps, summarize with the following fragment instead:

## ... (inside summarise(), in place of the paste()-based across() call)
    # Drop the NA entries of each overlap_* column within the group and wrap
    # what is left in a list, so the result is a list-column holding a vector
    # per group instead of a comma-separated string. c() strips the
    # attributes that na.omit() attaches.
    across(starts_with('overlap_'),
           ~ list(c(na.omit(.x)))
           )