Count the number of overlaps between groups-CodePudding

I have two large data sets that look like this.

library(tidyverse)


dat1 <- tibble(chrom=c(rep(c("Chr1","Chr2"),each=5)),
               start=c(9885,11944, 13271,15104,19059,25793,97514,104718,118862,120950),
               end=c(11008,17644,20164,23807,25264,106001,119205, 121576,124981,138514)
)

head(dat1,n=4)
#> # A tibble: 10 × 3
#>    chrom  start    end
#>    <chr>  <dbl>  <dbl>
#>  1 Chr1    9885  11008
#>  2 Chr1   11944  17644
#>  3 Chr1   13271  20164
#>  4 Chr1   15104  23807


dat2 <- tibble(chrom=c(rep(c("Chr1","Chr2"),each=5)),
               start=c(9885,11944, 13271,15104,19059,25793,97514,104718,118862,120950),
               end=c(10203,12546,13669,15638,19283,26703,97773, 105102,119388,121331)
               )

head(dat2, n=4)
#> # A tibble: 10 × 3
#>    chrom  start    end
#>    <chr>  <dbl>  <dbl>
#>  1 Chr1    9885  10203
#>  2 Chr1   11944  12546
#>  3 Chr1   13271  13669
#>  4 Chr1   15104  15638

^{Created on 2022-12-05 with reprex v2.0.2}

I want to group my data based on the chrom and find which ranges [start-end], from the Chr1, from dat2 overlap with ranges [start-end] of Chr1 from dat1.

What I have tried

I have found a lovely package to work it around, but I feel I need to break the data sets into data frames of different chromosomes to compare them.

library(plyranges)

dat1 <- dat1 %>% 
  as_iranges()

dat2  <- dat2 %>% 
  as_iranges()

dat1 %>% 
  mutate(n_olap = count_overlaps(., dat2),
                 n_olap_within = count_overlaps_within(., dat2))

IRanges object with 10 ranges and 3 metadata columns:
           start       end     width |       chrom    n_olap n_olap_within
       <integer> <integer> <integer> | <character> <integer>     <integer>
   [1]      9885     11008      1124 |        Chr1         1             0
   [2]     11944     17644      5701 |        Chr1         3             0
   [3]     13271     20164      6894 |        Chr1         3             0
   [4]     15104     23807      8704 |        Chr1         2             0
   [5]     19059     25264      6206 |        Chr1         1             0
   [6]     25793    106001     80209 |        Chr2         3             0
   [7]     97514    119205     21692 |        Chr2         3             0
   [8]    104718    121576     16859 |        Chr2         3             0
   [9]    118862    124981      6120 |        Chr2         2             0
  [10]    120950    138514     17565 |        Chr2         1             0

To get what I want from here, I need to filter my data and then compare it. But there should a way or dplyr trick to find a solution

dat1 <- dat1 %>% 
  as_iranges() %>% 
  filter(chrom=="Chr1")

dat2  <- dat2 %>% 
  as_iranges() %>% 
  filter(chrom=="Chr1")

dat1 %>% 
  mutate(n_olap = count_overlaps(., dat2),
                 n_olap_within = count_overlaps_within(., dat2))

Is there a way to compare only chromosomes with each other? any help or guidance are appreciated

CodePudding user response：

I'll stick with the data.table theme set forth by RicVillalba, but I think the foverlaps function is meant for things like this (especially with larger datasets).

library(data.table)
setDT(dat1)
setDT(dat2)
setkey(dat1, chrom, start, end)
setkey(dat2, chrom, start, end)
dat1[, id := .I]
foverlaps(dat1, dat2)
#      chrom  start    end i.start  i.end    id
#     <char>  <num>  <num>   <num>  <num> <int>
#  1:   Chr1   9885  10203    9885  11008     1
#  2:   Chr1  11944  12546   11944  17644     2
#  3:   Chr1  13271  13669   11944  17644     2
#  4:   Chr1  15104  15638   11944  17644     2
#  5:   Chr1  13271  13669   13271  20164     3
#  6:   Chr1  15104  15638   13271  20164     3
#  7:   Chr1  19059  19283   13271  20164     3
#  8:   Chr1  15104  15638   15104  23807     4
#  9:   Chr1  19059  19283   15104  23807     4
# 10:   Chr1  19059  19283   19059  25264     5
# ---                                          
# 13:   Chr2 104718 105102   25793 106001     6
# 14:   Chr2  97514  97773   97514 119205     7
# 15:   Chr2 104718 105102   97514 119205     7
# 16:   Chr2 118862 119388   97514 119205     7
# 17:   Chr2 104718 105102  104718 121576     8
# 18:   Chr2 118862 119388  104718 121576     8
# 19:   Chr2 120950 121331  104718 121576     8
# 20:   Chr2 118862 119388  118862 124981     9
# 21:   Chr2 120950 121331  118862 124981     9
# 22:   Chr2 120950 121331  120950 138514    10

(Note that in addition to requiring keys, the order matters: the last two keys must be the "start" and "end" of the ranges to check for overlapping. I added chrom to ensure we do this by-chromosome.)

That's the start. I added the id column into dat1 so that we could efficiently return back to the original columns. If you inspect the columns closely, notice that the i.* columns are from dat1, so those are the ones we want to keep.

Extending that to do the aggregation you're hoping for,

overlaps <- foverlaps(dat1, dat2)[, .(n_olaps = .N, n_within = sum(between(i.start, start, end) & between(i.end, start, end))), by = .(id)]
overlaps
#        id n_olaps n_within
#     <int>   <int>    <int>
#  1:     1       1        0
#  2:     2       3        0
#  3:     3       3        0
#  4:     4       2        0
#  5:     5       1        0
#  6:     6       3        0
#  7:     7       3        0
#  8:     8       3        0
#  9:     9       2        0
# 10:    10       1        0

dat1 <- overlaps[dat1, on = .(id)]
dat1
#        id n_olaps n_within  chrom  start    end
#     <int>   <int>    <int> <char>  <num>  <num>
#  1:     1       1        0   Chr1   9885  11008
#  2:     2       3        0   Chr1  11944  17644
#  3:     3       3        0   Chr1  13271  20164
#  4:     4       2        0   Chr1  15104  23807
#  5:     5       1        0   Chr1  19059  25264
#  6:     6       3        0   Chr2  25793 106001
#  7:     7       3        0   Chr2  97514 119205
#  8:     8       3        0   Chr2 104718 121576
#  9:     9       2        0   Chr2 118862 124981
# 10:    10       1        0   Chr2 120950 138514

I chose to create overlaps and join it back onto dat1 in case there were other columns that needed to be retained. It's certainly feasible to do this more in-place in dat1, especially helpful if your dataset is too big for a temporary copy.

CodePudding user response：

If you have huge datasets, you can consider using data.table this way:

library(data.table)

dat1 <- data.frame(chrom=c(rep(c("Chr1","Chr2"),each=5)),
               start=c(9885,11944, 13271,15104,19059,25793,97514,104718,118862,120950),
               end=c(11008,17644,20164,23807,25264,106001,119205, 121576,124981,138514)
)

dat2 <- data.frame(chrom=c(rep(c("Chr1","Chr2"),each=5)),
               start=c(9885,11944, 13271,15104,19059,25793,97514,104718,118862,120950),
               end=c(10203,12546,13669,15638,19283,26703,97773, 105102,119388,121331)
               )

setDT(dat1)
setDT(dat2)

dat1[dat2, cbind(
  .SD[i.end >= start & end >= i.start],
  start2 = i.start,
  end2 = i.end), on="chrom", by=.EACHI][!is.na(start)]
#>     chrom  start    end start2   end2
#>  1:  Chr1   9885  11008   9885  10203
#>  2:  Chr1  11944  17644  11944  12546
#>  3:  Chr1  13271  20164  13271  13669
#>  4:  Chr1  15104  23807  15104  15638
#>  5:  Chr1  19059  25264  19059  19283
#>  6:  Chr2  25793 106001  25793  26703
#>  7:  Chr2  97514 119205  97514  97773
#>  8:  Chr2 104718 121576 104718 105102
#>  9:  Chr2  97514 119205 118862 119388
#> 10:  Chr2 118862 124981 118862 119388
#> 11:  Chr2 120950 138514 120950 121331

EDIT i made a correction on overlap matching. Thanks to r2evans.