I would like to merge these two data frames by column Weight
, however, I would like to allow tolerance of -1 / 1
in this column. It means even if numbers are not equal in column Weight
the function should look if there is number -1
or 1
different and if yes merge it.
dput(dat)
structure(list(Seq1 = c("TKAALAIWCPGYSETQINATQAMK", "CLEQVSQLQGLWRRFNR",
"RFNRPLLK", "YDFTNCDFEK", "DLITYMSGTK", "AALAIWCPGYSETQINATQAMK",
"VTTNKCLEQVSQLQGLWR", "KARK", "ARKSK", "CLEQVSQLQGLWRRFNR", "YDFTNCDFEKIK"
), Weight = c(2653, 2190, 1044, 1351, 1141, 2436, 2172, 514, 601,
2202, 1604)), row.names = c(NA, -11L), class = "data.frame")
dput(data)
structure(list(Seq2 = c("QVQLVQSGAEVKKP", "QVQLVQSGAEVKKP",
"QVQLVQSGAEVKKP", "QVQLVQSGAEVKKPGASVKV", "QVQLVQSGAEVKKPGASVKVSCKA",
"QVQLVQSGAEVKK", "QVQLVQSGAEVKKP", "QVQLVQSGAEVKKPGASVKV", "QVQLVQSGAEVKKPGASVKVSCKA",
"KKPGASVKV", "KKPGASVKV", "KKPGASVKV", "KKPGASVKVSCKA", "KKPGASVKVSCKA",
"KKPGASVKVSCKA", "KKPGASVKVSCKASGYTFTGYYMHWVRQ", "KKPGASVKVSCKASGYTFTGYYMHWVRQ",
"KPGASVKVSCKA", "KPGASVKVSCKA", "KPGASVKVSCKA", "KPGASVKVSCKASGYTFTGYYMHWVRQ",
"KPGASVKVSCKASGYTFTGYYMHWVRQ", "KPGASVKVSCKASGYTFTGYYMHWVRQ",
"KPGASVKVSCKASGYTFTGYYMHWVRQ", "KPGASVKVSCKASGYTFTGYYMHWVRQ",
"KVSCKASGYTFTGYYMHWVRQ", "KVSCKASGYTFTGYYMHWVRQ", "KVSCKASGYTFTGYYMHWVRQ",
"KVSCKASGYTFTGYYMHWVRQ", "KASGYTFTGYYMHWVRQ", "KASGYTFTGYYMHWVRQ",
"KASGYTFTGYYMHWVRQ", "KASGYTFTGYYMHWVRQ", "KASGYTFTGYYMHWVRQ",
"KASGYTFTGYYMHWVRQ", "KASGYTFTGYYMHWVRQ", "RQAPGQGLEWMGWINPNSGGTNYAQKF",
"RQAPGQGLEWMGWINPNSGGTNYAQKF", "RQAPGQGLEWMGWINPNSGGTNYAQKFQGRVTMTRD",
"RITCGGNNIGSKS", "RITCGGNNIGSKSVHWYQQKP", "RITCGGNNIGSKSVHWYQQKP",
"RITCGGNNIGSKSVHWYQQKP", "KSVHWYQQKP", "KSVHWYQQKP", "KSVHWYQQKP",
"KSVHWYQQKP", "KSVHWYQQKPGQAPVLVVYDDSDRPSGIPERF"), Weight = c(505,
1850, 1133, 1045, 1437, 2325, 2325, 1133, 3203, 1087, 3435, 505,
2325, 2201, 1604, 5028, 1140, 1087, 1133, 245, 600, 2172, 1510,
3813, 1087, 1850, 1169, 259, 2653, 2189, 697, 513, 1510, 2172,
2363, 245, 513, 1351, 6131, 1087, 569, 2436, 1140, 1140, 1133,
2992, 373, 1297)), class = "data.frame", row.names = c(1L, 2L,
3L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 13L, 15L, 16L, 17L, 19L, 21L,
22L, 24L, 25L, 26L, 50L, 51L, 52L, 55L, 57L, 59L, 60L, 61L, 62L,
63L, 65L, 66L, 67L, 68L, 70L, 71L, 73L, 74L, 75L, 76L, 77L, 78L,
79L, 81L, 82L, 83L, 84L, 85L))
CodePudding user response:
Have you looked at fuzzyjoin
package? You can find the documentation here.
Here's how you can do the inner_join
.
library(fuzzyjoin)
library(dplyr)
data %>%
difference_inner_join(dat, max_dist = 1)
Output:
Seq2 Weight.x Seq1 Weight.y
1 QVQLVQSGAEVKKPGASVKV 1045 RFNRPLLK 1044
2 KKPGASVKVSCKA 2201 CLEQVSQLQGLWRRFNR 2202
3 KKPGASVKVSCKA 1604 YDFTNCDFEKIK 1604
4 KKPGASVKVSCKASGYTFTGYYMHWVRQ 1140 DLITYMSGTK 1141
5 KPGASVKVSCKASGYTFTGYYMHWVRQ 600 ARKSK 601
6 KPGASVKVSCKASGYTFTGYYMHWVRQ 2172 VTTNKCLEQVSQLQGLWR 2172
7 KVSCKASGYTFTGYYMHWVRQ 2653 TKAALAIWCPGYSETQINATQAMK 2653
8 KASGYTFTGYYMHWVRQ 2189 CLEQVSQLQGLWRRFNR 2190
9 KASGYTFTGYYMHWVRQ 513 KARK 514
10 KASGYTFTGYYMHWVRQ 2172 VTTNKCLEQVSQLQGLWR 2172
11 RQAPGQGLEWMGWINPNSGGTNYAQKF 513 KARK 514
12 RQAPGQGLEWMGWINPNSGGTNYAQKF 1351 YDFTNCDFEK 1351
13 RITCGGNNIGSKSVHWYQQKP 2436 AALAIWCPGYSETQINATQAMK 2436
14 RITCGGNNIGSKSVHWYQQKP 1140 DLITYMSGTK 1141
15 KSVHWYQQKP 1140 DLITYMSGTK 1141
CodePudding user response:
You could use data.table
library(data.table)
set(d1)[,`:=`(wu = Weight 1,wl = Weight-1)]
setDT(d2)[d1, on=.(Weight>=wl, Weight<=wu)][,Weight.1:=NULL][]
Seq2 Weight Seq1 i.Weight
<char> <num> <char> <num>
1: KVSCKASGYTFTGYYMHWVRQ 2652 TKAALAIWCPGYSETQINATQAMK 2653
2: KASGYTFTGYYMHWVRQ 2189 CLEQVSQLQGLWRRFNR 2190
3: QVQLVQSGAEVKKPGASVKV 1043 RFNRPLLK 1044
4: RQAPGQGLEWMGWINPNSGGTNYAQKF 1350 YDFTNCDFEK 1351
5: KKPGASVKVSCKASGYTFTGYYMHWVRQ 1140 DLITYMSGTK 1141
6: RITCGGNNIGSKSVHWYQQKP 1140 DLITYMSGTK 1141
7: KSVHWYQQKP 1140 DLITYMSGTK 1141
8: RITCGGNNIGSKSVHWYQQKP 2435 AALAIWCPGYSETQINATQAMK 2436
9: KPGASVKVSCKASGYTFTGYYMHWVRQ 2171 VTTNKCLEQVSQLQGLWR 2172
10: KASGYTFTGYYMHWVRQ 2171 VTTNKCLEQVSQLQGLWR 2172
11: KASGYTFTGYYMHWVRQ 513 KARK 514
12: RQAPGQGLEWMGWINPNSGGTNYAQKF 513 KARK 514
13: KPGASVKVSCKASGYTFTGYYMHWVRQ 600 ARKSK 601
14: KKPGASVKVSCKA 2201 CLEQVSQLQGLWRRFNR 2202
15: KKPGASVKVSCKA 1603 YDFTNCDFEKIK 1604