Filter a dataframe based on a shorter dataframe-CodePudding

I have the following dataframes I've simplified for practical purposes:

        head(coords_int)        
        seqnames     start       end
            1      chr2 181529780 181533313
            2      chr2  98396674  98396940
            3      chr5  64919375  64919395
            4      chr9   2795948   2797647
            5      chr7 138873527 138873574
            6      chr4 154736072 154736138
            7      chr6  10762723  10769212
            8     chr10  93614727  93614773
            9     chr17  76539181  76539300
            10     chr6  99608741  99608872
            11    chr10  47330654  47330828
            12    chr10  47331176  47331237
            13    chr10  93612154  93612575
            14    chr10  84248910  84249043
            15    chr17  28547999  28548101
            16    chr17  28548592  28548705
            17    chr11  46701067  46701141
            18    chr16  49847678  49847918
            19    chr16  49822670  49822738
        
head(pdoms_protein)
        tx_id seqnames     start       end width strand         exon_id exon_rank cds_ok
    1  ENST00000339098        2 181573753 181573876   124      - ENSE00003634697         3   TRUE
    2  ENST00000339098        2 181573753 181573876   124      - ENSE00003634697         3   TRUE
    3  ENST00000339098        2 181566058 181566121    64      - ENSE00003523731         4   TRUE
    4  ENST00000393504        2  98395844  98396397   554        ENSE00000963920         8   TRUE
    5  ENST00000393504        2  98395844  98396397   554        ENSE00000963920         8   TRUE
    6  ENST00000393504        2  98396674  98396940   267        ENSE00000963920         8   TRUE
    7  ENST00000381070        5  64774694  64774787    94        ENSE00003522928         2   TRUE
    8  ENST00000381070        5  64774694  64774787    94        ENSE00003522928         2   TRUE
    9  ENST00000381070        5  64774694  64774787    94        ENSE00003522928         2   TRUE
    10 ENST00000381070        5  64781921  64782033   113        ENSE00003582136         3   TRUE
    11 ENST00000381070        5  64781921  64782033   113        ENSE00003582136         3   TRUE
    12 ENST00000382082        9   2718229   2718276    48        ENSE00001490869         1   TRUE
    13 ENST00000382082        9   2718229   2718276    48        ENSE00001490869         1   TRUE
    14 ENST00000422774        7 138881388 138881584   197      - ENSE00001088065        11   TRUE
    15 ENST00000422774        7 138879538 138879653   116      - ENSE00001088074        12   TRUE
    16 ENST00000422774        7 138871157 138871362   206      - ENSE00001088067        13   TRUE
    17 ENST00000336356        4 154744456 154744845   390        ENSE00001344788         2   TRUE
    18 ENST00000502525        4 154744456 154744530    75        ENSE00002048458         4  FALSE
    19 ENST00000507827        4 154744456 154744845   390        ENSE00001344788         2   TRUE
    20 ENST00000313243        6  10830548  10830639    92      - ENSE00003696993         2   TRUE
    21 ENST00000313243        6  10830548  10830639    92      - ENSE00003696993         2   TRUE
    22 ENST00000313243        6  10830548  10830639    92      - ENSE00003696993         2   TRUE
    23 ENST00000313243        6  10830548  10830639    92      - ENSE00003696993         2   TRUE
       protein_start protein_end protein_domain_id protein_domain_source interpro_accession
    1            164         339           PS50146                pfscan          IPR001206
    2            164         339           PF00781                  pfam          IPR001206
    3            164         339           PS50146                pfscan          IPR001206
    4            171         409           PF16526                  pfam          IPR032406
    5            171         409           SM00100                 smart          IPR000595
    6            502         590           PS50042                pfscan          IPR000595
    7             16         166           PR00153                prints          IPR002130
    8             16         166           PR00153                prints          IPR002130
    9             16         166           PR00153                prints          IPR002130
    10            16         166           PS50072                pfscan          IPR002130
    11            16         166           PS00170           scanprosite          IPR020892
    12           164         179           PR01494                prints          IPR003971
    13           164         179           PR01491                prints          IPR003968
    14          1039        1702           PF12877                  pfam          IPR024606
    15          1039        1702           PF12877                  pfam          IPR024606
    16          1039        1702           PF12877                  pfam          IPR024606
    17            44         173           PF04970                  pfam          IPR007053
    18            44          68           PF04970                  pfam          IPR007053
    19            44         173           PF04970                  pfam          IPR007053
    20             4         284           PS50011                pfscan          IPR000719
    21             4         284           PS00107           scanprosite          IPR017441
    22             4         284           PS00108           scanprosite          IPR008271
    23             4         284          SSF56112           superfamily          IPR011009
       prot_dom_start prot_dom_end gene_name
    1             164          339     CERKL
    2             170          334     CERKL
    3             164          339     CERKL
    4             598          668     CNGA3
    5             482          606     CNGA3
    6             482          596     CNGA3
    7             125          140     CWC27
    8              97          112     CWC27
    9             112          124     CWC27
    10             19          166     CWC27
    11             49           66     CWC27
    12            187          199     KCNV2
    13            410          424     KCNV2
    14           1039         1702  KIAA1549
    15           1039         1702  KIAA1549
    16           1039         1702  KIAA1549
    17             44          173      LRAT
    18             44           68      LRAT
    19             44          173      LRAT
    20              4          284       MAK
    21             10           33       MAK
    22            121          133       MAK
    23              1          285       MAK

I would like to know if any of the coords_int$start are part of the pdoms_protein$start / pdoms_protein$end range and the same for the coords_int$end and then filter only the data that falls in this category.

I'd tried

library(tidyverse)    
pdoms_protein %>% 
      mutate(dom.ok = 98396674>= start & 98396674<= end) %>%
      filter(dom.ok == "TRUE")

And it works but only for one value at a time. Is there a more practical way to do it all at once?

CodePudding user response：

You could use data.table::foverlaps(), like this:

library(data.table)

setDT(coords_int)
setDT(pdoms_protein)

setkey(coords_int,start,end)
foverlaps(pdoms_protein,coords_int)

Also see package IRanges

CodePudding user response：

We could do it with fuzzyjoin:

library(fuzzyjoin)
library(dplyr)

long_coords_int <- coords_int %>% 
  pivot_longer(-seqnames) 

fuzzy_left_join(long_coords_int, pdoms_protein[3:4], by = c("value" = "start", "value" = "end"), 
                match_fun  =list(`>=`, `<=`)) %>% 
  mutate(found = c(NA, "YES")[(!is.na(start))   1])

  seqnames name      value    start      end found
   <chr>    <chr>     <int>    <int>    <int> <chr>
 1 chr2     start 181529780       NA       NA NA   
 2 chr2     end   181533313       NA       NA NA   
 3 chr2     start  98396674 98396674 98396940 YES  
 4 chr2     end    98396940 98396674 98396940 YES  
 5 chr5     start  64919375       NA       NA NA   
 6 chr5     end    64919395       NA       NA NA   
 7 chr9     start   2795948       NA       NA NA   
 8 chr9     end     2797647       NA       NA NA   
 9 chr7     start 138873527       NA       NA NA   
10 chr7     end   138873574       NA       NA NA   
# ... with 28 more rows

coords_int <- structure(list(seqnames = c("chr2", "chr2", "chr5", "chr9", "chr7", 
"chr4", "chr6", "chr10", "chr17", "chr6", "chr10", "chr10", "chr10", 
"chr10", "chr17", "chr17", "chr11", "chr16", "chr16"), start = c(181529780L, 
98396674L, 64919375L, 2795948L, 138873527L, 154736072L, 10762723L, 
93614727L, 76539181L, 99608741L, 47330654L, 47331176L, 93612154L, 
84248910L, 28547999L, 28548592L, 46701067L, 49847678L, 49822670L
), end = c(181533313L, 98396940L, 64919395L, 2797647L, 138873574L, 
154736138L, 10769212L, 93614773L, 76539300L, 99608872L, 47330828L, 
47331237L, 93612575L, 84249043L, 28548101L, 28548705L, 46701141L, 
49847918L, 49822738L)), class = "data.frame", row.names = c("1", 
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", 
"14", "15", "16", "17", "18", "19"))

pdoms_protein <- structure(list(tx_id = c("ENST00000339098", "ENST00000339098", 
"ENST00000339098", "ENST00000393504", "ENST00000393504", "ENST00000393504", 
"ENST00000381070", "ENST00000381070", "ENST00000381070", "ENST00000381070", 
"ENST00000381070", "ENST00000382082", "ENST00000382082", "ENST00000422774", 
"ENST00000422774", "ENST00000422774", "ENST00000336356", "ENST00000502525", 
"ENST00000507827", "ENST00000313243", "ENST00000313243", "ENST00000313243", 
"ENST00000313243"), seqnames = c(2L, 2L, 2L, 2L, 2L, 2L, 5L, 
5L, 5L, 5L, 5L, 9L, 9L, 7L, 7L, 7L, 4L, 4L, 4L, 6L, 6L, 6L, 6L
), start = c(181573753L, 181573753L, 181566058L, 98395844L, 98395844L, 
98396674L, 64774694L, 64774694L, 64774694L, 64781921L, 64781921L, 
2718229L, 2718229L, 138881388L, 138879538L, 138871157L, 154744456L, 
154744456L, 154744456L, 10830548L, 10830548L, 10830548L, 10830548L
), end = c(181573876L, 181573876L, 181566121L, 98396397L, 98396397L, 
98396940L, 64774787L, 64774787L, 64774787L, 64782033L, 64782033L, 
2718276L, 2718276L, 138881584L, 138879653L, 138871362L, 154744845L, 
154744530L, 154744845L, 10830639L, 10830639L, 10830639L, 10830639L
), width = c(124L, 124L, 64L, 554L, 554L, 267L, 94L, 94L, 94L, 
113L, 113L, 48L, 48L, 197L, 116L, 206L, 390L, 75L, 390L, 92L, 
92L, 92L, 92L), strand = c("-", "-", "-", " ", " ", " ", " ", 
" ", " ", " ", " ", " ", " ", "-", "-", "-", " ", " ", " ", "-", 
"-", "-", "-"), exon_id = c("ENSE00003634697", "ENSE00003634697", 
"ENSE00003523731", "ENSE00000963920", "ENSE00000963920", "ENSE00000963920", 
"ENSE00003522928", "ENSE00003522928", "ENSE00003522928", "ENSE00003582136", 
"ENSE00003582136", "ENSE00001490869", "ENSE00001490869", "ENSE00001088065", 
"ENSE00001088074", "ENSE00001088067", "ENSE00001344788", "ENSE00002048458", 
"ENSE00001344788", "ENSE00003696993", "ENSE00003696993", "ENSE00003696993", 
"ENSE00003696993"), exon_rank = c(3L, 3L, 4L, 8L, 8L, 8L, 2L, 
2L, 2L, 3L, 3L, 1L, 1L, 11L, 12L, 13L, 2L, 4L, 2L, 2L, 2L, 2L, 
2L), cds_ok = c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, 
TRUE, TRUE, TRUE, TRUE, TRUE), protein_start = c(164L, 164L, 
164L, 171L, 171L, 502L, 16L, 16L, 16L, 16L, 16L, 164L, 164L, 
1039L, 1039L, 1039L, 44L, 44L, 44L, 4L, 4L, 4L, 4L), protein_end = c(339L, 
339L, 339L, 409L, 409L, 590L, 166L, 166L, 166L, 166L, 166L, 179L, 
179L, 1702L, 1702L, 1702L, 173L, 68L, 173L, 284L, 284L, 284L, 
284L), protein_domain_id = c("PS50146", "PF00781", "PS50146", 
"PF16526", "SM00100", "PS50042", "PR00153", "PR00153", "PR00153", 
"PS50072", "PS00170", "PR01494", "PR01491", "PF12877", "PF12877", 
"PF12877", "PF04970", "PF04970", "PF04970", "PS50011", "PS00107", 
"PS00108", "SSF56112"), protein_domain_source = c("pfscan", "pfam", 
"pfscan", "pfam", "smart", "pfscan", "prints", "prints", "prints", 
"pfscan", "scanprosite", "prints", "prints", "pfam", "pfam", 
"pfam", "pfam", "pfam", "pfam", "pfscan", "scanprosite", "scanprosite", 
"superfamily"), interpro_accession = c("IPR001206", "IPR001206", 
"IPR001206", "IPR032406", "IPR000595", "IPR000595", "IPR002130", 
"IPR002130", "IPR002130", "IPR002130", "IPR020892", "IPR003971", 
"IPR003968", "IPR024606", "IPR024606", "IPR024606", "IPR007053", 
"IPR007053", "IPR007053", "IPR000719", "IPR017441", "IPR008271", 
"IPR011009"), prot_dom_start = c(164L, 170L, 164L, 598L, 482L, 
482L, 125L, 97L, 112L, 19L, 49L, 187L, 410L, 1039L, 1039L, 1039L, 
44L, 44L, 44L, 4L, 10L, 121L, 1L), prot_dom_end = c(339L, 334L, 
339L, 668L, 606L, 596L, 140L, 112L, 124L, 166L, 66L, 199L, 424L, 
1702L, 1702L, 1702L, 173L, 68L, 173L, 284L, 33L, 133L, 285L), 
    gene_name = c("CERKL", "CERKL", "CERKL", "CNGA3", "CNGA3", 
    "CNGA3", "CWC27", "CWC27", "CWC27", "CWC27", "CWC27", "KCNV2", 
    "KCNV2", "KIAA1549", "KIAA1549", "KIAA1549", "LRAT", "LRAT", 
    "LRAT", "MAK", "MAK", "MAK", "MAK")), class = "data.frame", row.names = c("1", 
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", 
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23"))

CodePudding user response：

Not tested, but I think you could make a function like this

pdoms_protein %>% 
    mutate(dom.ok = isok(start,end)) %>%
    filter(dom.ok == "TRUE")

isok <- function(local_start, local_end) { 
    df <- coords_int %>%
    filter(start >= local_start & end <= local_end)
    return count(df) > 0
}