I have the following dataframes I've simplified for practical purposes:
head(coords_int)
seqnames start end
1 chr2 181529780 181533313
2 chr2 98396674 98396940
3 chr5 64919375 64919395
4 chr9 2795948 2797647
5 chr7 138873527 138873574
6 chr4 154736072 154736138
7 chr6 10762723 10769212
8 chr10 93614727 93614773
9 chr17 76539181 76539300
10 chr6 99608741 99608872
11 chr10 47330654 47330828
12 chr10 47331176 47331237
13 chr10 93612154 93612575
14 chr10 84248910 84249043
15 chr17 28547999 28548101
16 chr17 28548592 28548705
17 chr11 46701067 46701141
18 chr16 49847678 49847918
19 chr16 49822670 49822738
head(pdoms_protein)
tx_id seqnames start end width strand exon_id exon_rank cds_ok
1 ENST00000339098 2 181573753 181573876 124 - ENSE00003634697 3 TRUE
2 ENST00000339098 2 181573753 181573876 124 - ENSE00003634697 3 TRUE
3 ENST00000339098 2 181566058 181566121 64 - ENSE00003523731 4 TRUE
4 ENST00000393504 2 98395844 98396397 554 ENSE00000963920 8 TRUE
5 ENST00000393504 2 98395844 98396397 554 ENSE00000963920 8 TRUE
6 ENST00000393504 2 98396674 98396940 267 ENSE00000963920 8 TRUE
7 ENST00000381070 5 64774694 64774787 94 ENSE00003522928 2 TRUE
8 ENST00000381070 5 64774694 64774787 94 ENSE00003522928 2 TRUE
9 ENST00000381070 5 64774694 64774787 94 ENSE00003522928 2 TRUE
10 ENST00000381070 5 64781921 64782033 113 ENSE00003582136 3 TRUE
11 ENST00000381070 5 64781921 64782033 113 ENSE00003582136 3 TRUE
12 ENST00000382082 9 2718229 2718276 48 ENSE00001490869 1 TRUE
13 ENST00000382082 9 2718229 2718276 48 ENSE00001490869 1 TRUE
14 ENST00000422774 7 138881388 138881584 197 - ENSE00001088065 11 TRUE
15 ENST00000422774 7 138879538 138879653 116 - ENSE00001088074 12 TRUE
16 ENST00000422774 7 138871157 138871362 206 - ENSE00001088067 13 TRUE
17 ENST00000336356 4 154744456 154744845 390 ENSE00001344788 2 TRUE
18 ENST00000502525 4 154744456 154744530 75 ENSE00002048458 4 FALSE
19 ENST00000507827 4 154744456 154744845 390 ENSE00001344788 2 TRUE
20 ENST00000313243 6 10830548 10830639 92 - ENSE00003696993 2 TRUE
21 ENST00000313243 6 10830548 10830639 92 - ENSE00003696993 2 TRUE
22 ENST00000313243 6 10830548 10830639 92 - ENSE00003696993 2 TRUE
23 ENST00000313243 6 10830548 10830639 92 - ENSE00003696993 2 TRUE
protein_start protein_end protein_domain_id protein_domain_source interpro_accession
1 164 339 PS50146 pfscan IPR001206
2 164 339 PF00781 pfam IPR001206
3 164 339 PS50146 pfscan IPR001206
4 171 409 PF16526 pfam IPR032406
5 171 409 SM00100 smart IPR000595
6 502 590 PS50042 pfscan IPR000595
7 16 166 PR00153 prints IPR002130
8 16 166 PR00153 prints IPR002130
9 16 166 PR00153 prints IPR002130
10 16 166 PS50072 pfscan IPR002130
11 16 166 PS00170 scanprosite IPR020892
12 164 179 PR01494 prints IPR003971
13 164 179 PR01491 prints IPR003968
14 1039 1702 PF12877 pfam IPR024606
15 1039 1702 PF12877 pfam IPR024606
16 1039 1702 PF12877 pfam IPR024606
17 44 173 PF04970 pfam IPR007053
18 44 68 PF04970 pfam IPR007053
19 44 173 PF04970 pfam IPR007053
20 4 284 PS50011 pfscan IPR000719
21 4 284 PS00107 scanprosite IPR017441
22 4 284 PS00108 scanprosite IPR008271
23 4 284 SSF56112 superfamily IPR011009
prot_dom_start prot_dom_end gene_name
1 164 339 CERKL
2 170 334 CERKL
3 164 339 CERKL
4 598 668 CNGA3
5 482 606 CNGA3
6 482 596 CNGA3
7 125 140 CWC27
8 97 112 CWC27
9 112 124 CWC27
10 19 166 CWC27
11 49 66 CWC27
12 187 199 KCNV2
13 410 424 KCNV2
14 1039 1702 KIAA1549
15 1039 1702 KIAA1549
16 1039 1702 KIAA1549
17 44 173 LRAT
18 44 68 LRAT
19 44 173 LRAT
20 4 284 MAK
21 10 33 MAK
22 121 133 MAK
23 1 285 MAK
I would like to know if any of the coords_int$start
are part of the pdoms_protein$start
/ pdoms_protein$end
range and the same for the coords_int$end
and then filter only the data that falls in this category.
I'd tried
library(tidyverse)
pdoms_protein %>%
mutate(dom.ok = 98396674>= start & 98396674<= end) %>%
filter(dom.ok == "TRUE")
And it works but only for one value at a time. Is there a more practical way to do it all at once?
CodePudding user response:
You could use data.table::foverlaps()
, like this:
library(data.table)
setDT(coords_int)
setDT(pdoms_protein)
setkey(coords_int,start,end)
foverlaps(pdoms_protein,coords_int)
Also see package IRanges
CodePudding user response:
We could do it with fuzzyjoin
:
library(fuzzyjoin)
library(dplyr)
long_coords_int <- coords_int %>%
pivot_longer(-seqnames)
fuzzy_left_join(long_coords_int, pdoms_protein[3:4], by = c("value" = "start", "value" = "end"),
match_fun =list(`>=`, `<=`)) %>%
mutate(found = c(NA, "YES")[(!is.na(start)) 1])
seqnames name value start end found
<chr> <chr> <int> <int> <int> <chr>
1 chr2 start 181529780 NA NA NA
2 chr2 end 181533313 NA NA NA
3 chr2 start 98396674 98396674 98396940 YES
4 chr2 end 98396940 98396674 98396940 YES
5 chr5 start 64919375 NA NA NA
6 chr5 end 64919395 NA NA NA
7 chr9 start 2795948 NA NA NA
8 chr9 end 2797647 NA NA NA
9 chr7 start 138873527 NA NA NA
10 chr7 end 138873574 NA NA NA
# ... with 28 more rows
coords_int <- structure(list(seqnames = c("chr2", "chr2", "chr5", "chr9", "chr7",
"chr4", "chr6", "chr10", "chr17", "chr6", "chr10", "chr10", "chr10",
"chr10", "chr17", "chr17", "chr11", "chr16", "chr16"), start = c(181529780L,
98396674L, 64919375L, 2795948L, 138873527L, 154736072L, 10762723L,
93614727L, 76539181L, 99608741L, 47330654L, 47331176L, 93612154L,
84248910L, 28547999L, 28548592L, 46701067L, 49847678L, 49822670L
), end = c(181533313L, 98396940L, 64919395L, 2797647L, 138873574L,
154736138L, 10769212L, 93614773L, 76539300L, 99608872L, 47330828L,
47331237L, 93612575L, 84249043L, 28548101L, 28548705L, 46701141L,
49847918L, 49822738L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19"))
pdoms_protein <- structure(list(tx_id = c("ENST00000339098", "ENST00000339098",
"ENST00000339098", "ENST00000393504", "ENST00000393504", "ENST00000393504",
"ENST00000381070", "ENST00000381070", "ENST00000381070", "ENST00000381070",
"ENST00000381070", "ENST00000382082", "ENST00000382082", "ENST00000422774",
"ENST00000422774", "ENST00000422774", "ENST00000336356", "ENST00000502525",
"ENST00000507827", "ENST00000313243", "ENST00000313243", "ENST00000313243",
"ENST00000313243"), seqnames = c(2L, 2L, 2L, 2L, 2L, 2L, 5L,
5L, 5L, 5L, 5L, 9L, 9L, 7L, 7L, 7L, 4L, 4L, 4L, 6L, 6L, 6L, 6L
), start = c(181573753L, 181573753L, 181566058L, 98395844L, 98395844L,
98396674L, 64774694L, 64774694L, 64774694L, 64781921L, 64781921L,
2718229L, 2718229L, 138881388L, 138879538L, 138871157L, 154744456L,
154744456L, 154744456L, 10830548L, 10830548L, 10830548L, 10830548L
), end = c(181573876L, 181573876L, 181566121L, 98396397L, 98396397L,
98396940L, 64774787L, 64774787L, 64774787L, 64782033L, 64782033L,
2718276L, 2718276L, 138881584L, 138879653L, 138871362L, 154744845L,
154744530L, 154744845L, 10830639L, 10830639L, 10830639L, 10830639L
), width = c(124L, 124L, 64L, 554L, 554L, 267L, 94L, 94L, 94L,
113L, 113L, 48L, 48L, 197L, 116L, 206L, 390L, 75L, 390L, 92L,
92L, 92L, 92L), strand = c("-", "-", "-", " ", " ", " ", " ",
" ", " ", " ", " ", " ", " ", "-", "-", "-", " ", " ", " ", "-",
"-", "-", "-"), exon_id = c("ENSE00003634697", "ENSE00003634697",
"ENSE00003523731", "ENSE00000963920", "ENSE00000963920", "ENSE00000963920",
"ENSE00003522928", "ENSE00003522928", "ENSE00003522928", "ENSE00003582136",
"ENSE00003582136", "ENSE00001490869", "ENSE00001490869", "ENSE00001088065",
"ENSE00001088074", "ENSE00001088067", "ENSE00001344788", "ENSE00002048458",
"ENSE00001344788", "ENSE00003696993", "ENSE00003696993", "ENSE00003696993",
"ENSE00003696993"), exon_rank = c(3L, 3L, 4L, 8L, 8L, 8L, 2L,
2L, 2L, 3L, 3L, 1L, 1L, 11L, 12L, 13L, 2L, 4L, 2L, 2L, 2L, 2L,
2L), cds_ok = c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE,
TRUE, TRUE, TRUE, TRUE, TRUE), protein_start = c(164L, 164L,
164L, 171L, 171L, 502L, 16L, 16L, 16L, 16L, 16L, 164L, 164L,
1039L, 1039L, 1039L, 44L, 44L, 44L, 4L, 4L, 4L, 4L), protein_end = c(339L,
339L, 339L, 409L, 409L, 590L, 166L, 166L, 166L, 166L, 166L, 179L,
179L, 1702L, 1702L, 1702L, 173L, 68L, 173L, 284L, 284L, 284L,
284L), protein_domain_id = c("PS50146", "PF00781", "PS50146",
"PF16526", "SM00100", "PS50042", "PR00153", "PR00153", "PR00153",
"PS50072", "PS00170", "PR01494", "PR01491", "PF12877", "PF12877",
"PF12877", "PF04970", "PF04970", "PF04970", "PS50011", "PS00107",
"PS00108", "SSF56112"), protein_domain_source = c("pfscan", "pfam",
"pfscan", "pfam", "smart", "pfscan", "prints", "prints", "prints",
"pfscan", "scanprosite", "prints", "prints", "pfam", "pfam",
"pfam", "pfam", "pfam", "pfam", "pfscan", "scanprosite", "scanprosite",
"superfamily"), interpro_accession = c("IPR001206", "IPR001206",
"IPR001206", "IPR032406", "IPR000595", "IPR000595", "IPR002130",
"IPR002130", "IPR002130", "IPR002130", "IPR020892", "IPR003971",
"IPR003968", "IPR024606", "IPR024606", "IPR024606", "IPR007053",
"IPR007053", "IPR007053", "IPR000719", "IPR017441", "IPR008271",
"IPR011009"), prot_dom_start = c(164L, 170L, 164L, 598L, 482L,
482L, 125L, 97L, 112L, 19L, 49L, 187L, 410L, 1039L, 1039L, 1039L,
44L, 44L, 44L, 4L, 10L, 121L, 1L), prot_dom_end = c(339L, 334L,
339L, 668L, 606L, 596L, 140L, 112L, 124L, 166L, 66L, 199L, 424L,
1702L, 1702L, 1702L, 173L, 68L, 173L, 284L, 33L, 133L, 285L),
gene_name = c("CERKL", "CERKL", "CERKL", "CNGA3", "CNGA3",
"CNGA3", "CWC27", "CWC27", "CWC27", "CWC27", "CWC27", "KCNV2",
"KCNV2", "KIAA1549", "KIAA1549", "KIAA1549", "LRAT", "LRAT",
"LRAT", "MAK", "MAK", "MAK", "MAK")), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23"))
CodePudding user response:
Not tested, but I think you could make a function like this
pdoms_protein %>%
mutate(dom.ok = isok(start,end)) %>%
filter(dom.ok == "TRUE")
isok <- function(local_start, local_end) {
df <- coords_int %>%
filter(start >= local_start & end <= local_end)
return count(df) > 0
}