I have two large data frames that look like this:
# Example interval tables: one row per range, keyed by chromosome.
df1 <- tibble(
  chrom = rep(c(1, 2), each = 3),
  start = rep(c(100, 200, 300), times = 2),
  end = c(150, 250, 350, 120, 220, 320)
)
df2 <- tibble(
  chrom = rep(c(1, 2), each = 3),
  start2 = c(100, 50, 280, 100, 10, 200),
  end2 = c(125, 100, 320, 115, 15, 350)
)
df1
#> # A tibble: 6 × 3
#> chrom start end
#> <dbl> <dbl> <dbl>
#> 1 1 100 150
#> 2 1 200 250
#> 3 1 300 350
#> 4 2 100 120
#> 5 2 200 220
#> 6 2 300 320
df2
#> # A tibble: 6 × 3
#> chrom start2 end2
#> <dbl> <dbl> <dbl>
#> 1 1 100 125
#> 2 1 50 100
#> 3 1 280 320
#> 4 2 100 115
#> 5 2 10 15
#> 6 2 200 350
Created on 2023-01-09 with reprex v2.0.2
I want to find which range[start2-end2] of df2 overlaps with the range[start-end] of df1. An ideal output would be something like this, but it's not necessary. Mostly I want the coordinates of the overlapping ranges.
#> # A tibble: 6 × 8
#> chrom start end start2 end2 overlap overlap_start overlap_end
#> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
#> 1 1 100 150 100 125 yes 100 125
#> 2 1 200 250 50 100 no <NA> <NA>
#> 3 1 300 350 280 320 yes 300 320
#> 4 2 100 120 100 115 yes 100 115
#> 5 2 200 220 10 15 no <NA> <NA>
#> 6 2 300 320 200 350 yes 200,220 300,320
Created on 2023-01-09 with reprex v2.0.2
Note that on the last line, the df2 range 200-350 overlaps with two ranges from df1 (200-220 and 300-320).
CodePudding user response:
My advice is to use the Bioconductor package GenomicRanges, which uses data structures optimized for finding interval overlaps.
library(GenomicRanges)

# Same example data as in the question.
df1 <- tibble(
  chrom = rep(c(1, 2), each = 3),
  start = rep(c(100, 200, 300), times = 2),
  end = c(150, 250, 350, 120, 220, 320)
)
df2 <- tibble(
  chrom = rep(c(1, 2), each = 3),
  start2 = c(100, 50, 280, 100, 10, 200),
  end2 = c(125, 100, 320, 115, 15, 350)
)

# Convert both tables to GRanges and collect every overlapping (df1, df2) pair.
# df2 uses non-default coordinate column names, so they are passed explicitly.
overlaps <- findOverlapPairs(
  query = makeGRangesFromDataFrame(df1),
  subject = makeGRangesFromDataFrame(
    df2,
    start.field = "start2",
    end.field = "end2"
  )
)
> overlaps
Pairs object with 6 pairs and 0 metadata columns:
first second
<GRanges> <GRanges>
[1] 1:100-150 1:50-100
[2] 1:100-150 1:100-125
[3] 1:300-350 1:280-320
[4] 2:100-120 2:100-115
[5] 2:200-220 2:200-350
[6] 2:300-320 2:200-350
# Flatten each side of the pairs to a data frame and bind them column-wise:
# one row per overlapping pair, first-range columns followed by second-range
# columns.
cbind(
  as.data.frame(S4Vectors::first(overlaps)),
  as.data.frame(S4Vectors::second(overlaps))
)
seqnames start end width strand seqnames start end width strand
1 1 100 150 51 * 1 50 100 51 *
2 1 100 150 51 * 1 100 125 26 *
3 1 300 350 51 * 1 280 320 41 *
4 2 100 120 21 * 2 100 115 16 *
5 2 200 220 21 * 2 200 350 151 *
6 2 300 320 21 * 2 200 350 151 *
CodePudding user response:
I believe you are looking for something like this?
I see no need to summarise here, so you'll get two results for the df2-range 200-350.
library(data.table)
library(matrixStats)
# Convert both tables to data.table in place (setDT works by reference).
setDT(df1)
setDT(df2)

# Non-equi join: for every df2 row, look up the df1 rows on the same chromosome
# that satisfy start < end2 and end > start2. Both comparisons are strict, so
# ranges that merely touch at an endpoint are not matched.
# nomatch = NA keeps df2 rows without a match, with NA in the df1 columns.
ans <- df1[df2,
  .(chrom,
    start = x.start, end = x.end,
    start2 = i.start2, end2 = i.end2),
  on = .(chrom, start < end2, end > start2),
  nomatch = NA
]

# The overlap runs from the larger of the two starts to the smaller of the two
# ends. pmax()/pmin() work element-wise on the columns and yield NA for the
# unmatched rows, so no matrix conversion is needed.
ans[, overlap_start := pmax(start, start2)]
ans[, overlap_end := pmin(end, end2)]
# chrom start end start2 end2 overlap_start overlap_end
# 1: 1 100 150 100 125 100 125
# 2: 1 NA NA 50 100 NA NA
# 3: 1 300 350 280 320 300 320
# 4: 2 100 120 100 115 100 115
# 5: 2 NA NA 10 15 NA NA
# 6: 2 200 220 200 350 200 220
# 7: 2 300 320 200 350 300 320
CodePudding user response:
A lengthier "tidy-style" version:
library(dplyr)

df1 |>
  # Pair every df1 range with every df2 range on the same chromosome.
  left_join(df2, by = "chrom") |>
  # Row-wise so that start:end is expanded separately for each pair.
  rowwise() |>
  mutate(
    # Integer positions shared by both ranges; empty when they are disjoint.
    intersect = list(intersect(start:end, start2:end2)),
    # sign() is 0 for an empty intersection and 1 otherwise -> index 1 or 2.
    overlap = c("no", "yes")[1 + sign(length(intersect))],
    overlap_start = if (length(intersect) > 0) min(intersect) else NA_integer_,
    overlap_end = if (length(intersect) > 0) max(intersect) else NA_integer_
  ) |>
  # One group per df2 range, so its overlaps can be collapsed together.
  group_by(paste(start2, end2)) |>
  summarise(
    across(chrom:end2),
    overlap,
    # Comma-separated overlap coordinates of the whole group, repeated on each
    # of the group's rows; an empty string when the group has no overlap.
    across(
      starts_with("overlap_"),
      ~ paste(na.omit(.x), collapse = ",")
    )
  ) |>
  ungroup() |>
  # Drop the helper grouping column.
  select(chrom:overlap_end)
# A tibble: 18 x 8
chrom start end start2 end2 overlap overlap_start overlap_end
<dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
1 2 100 120 10 15 no "" ""
2 2 200 220 10 15 no "" ""
3 2 300 320 10 15 no "" ""
4 2 100 120 100 115 yes "100" "115"
5 2 200 220 100 115 no "100" "115"
6 2 300 320 100 115 no "100" "115"
7 1 100 150 100 125 yes "100" "125"
8 1 200 250 100 125 no "100" "125"
9 1 300 350 100 125 no "100" "125"
10 2 100 120 200 350 no "200,300" "220,320"
# ...
To obtain numeric vectors instead of comma-separated strings for multiple overlaps, summarise with the following fragment instead:
## ...
# Drop-in replacement for the across() step of the summarise() call: instead of
# pasting the values into a string, wrap the non-NA values in a list.
# c() strips the attributes that na.omit() attaches.
across(starts_with('overlap_'),
~ list(c(na.omit(.x)))
)