Home > Software engineering >  Merge data frames different in nrow by specific column and tolerance of value difference
Merge data frames different in nrow by specific column and tolerance of value difference

Time:03-24

I would like to merge these two data frames by column Weight, however, I would like to allow tolerance of -1 / 1 in this column. It means even if numbers are not equal in column Weight the function should look if there is number -1 or 1 different and if yes merge it.

dput(dat)
structure(list(Seq1 = c("TKAALAIWCPGYSETQINATQAMK", "CLEQVSQLQGLWRRFNR", 
                           "RFNRPLLK", "YDFTNCDFEK", "DLITYMSGTK", "AALAIWCPGYSETQINATQAMK", 
                           "VTTNKCLEQVSQLQGLWR", "KARK", "ARKSK", "CLEQVSQLQGLWRRFNR", "YDFTNCDFEKIK"
), Weight = c(2653, 2190, 1044, 1351, 1141, 2436, 2172, 514, 601, 
           2202, 1604)), row.names = c(NA, -11L), class = "data.frame")



dput(data)
structure(list(Seq2 = c("QVQLVQSGAEVKKP", "QVQLVQSGAEVKKP", 
                           "QVQLVQSGAEVKKP", "QVQLVQSGAEVKKPGASVKV", "QVQLVQSGAEVKKPGASVKVSCKA", 
                           "QVQLVQSGAEVKK", "QVQLVQSGAEVKKP", "QVQLVQSGAEVKKPGASVKV", "QVQLVQSGAEVKKPGASVKVSCKA", 
                           "KKPGASVKV", "KKPGASVKV", "KKPGASVKV", "KKPGASVKVSCKA", "KKPGASVKVSCKA", 
                           "KKPGASVKVSCKA", "KKPGASVKVSCKASGYTFTGYYMHWVRQ", "KKPGASVKVSCKASGYTFTGYYMHWVRQ", 
                           "KPGASVKVSCKA", "KPGASVKVSCKA", "KPGASVKVSCKA", "KPGASVKVSCKASGYTFTGYYMHWVRQ", 
                           "KPGASVKVSCKASGYTFTGYYMHWVRQ", "KPGASVKVSCKASGYTFTGYYMHWVRQ", 
                           "KPGASVKVSCKASGYTFTGYYMHWVRQ", "KPGASVKVSCKASGYTFTGYYMHWVRQ", 
                           "KVSCKASGYTFTGYYMHWVRQ", "KVSCKASGYTFTGYYMHWVRQ", "KVSCKASGYTFTGYYMHWVRQ", 
                           "KVSCKASGYTFTGYYMHWVRQ", "KASGYTFTGYYMHWVRQ", "KASGYTFTGYYMHWVRQ", 
                           "KASGYTFTGYYMHWVRQ", "KASGYTFTGYYMHWVRQ", "KASGYTFTGYYMHWVRQ", 
                           "KASGYTFTGYYMHWVRQ", "KASGYTFTGYYMHWVRQ", "RQAPGQGLEWMGWINPNSGGTNYAQKF", 
                           "RQAPGQGLEWMGWINPNSGGTNYAQKF", "RQAPGQGLEWMGWINPNSGGTNYAQKFQGRVTMTRD", 
                           "RITCGGNNIGSKS", "RITCGGNNIGSKSVHWYQQKP", "RITCGGNNIGSKSVHWYQQKP", 
                           "RITCGGNNIGSKSVHWYQQKP", "KSVHWYQQKP", "KSVHWYQQKP", "KSVHWYQQKP", 
                           "KSVHWYQQKP", "KSVHWYQQKPGQAPVLVVYDDSDRPSGIPERF"), Weight = c(505, 
                                                                                           1850, 1133, 1045, 1437, 2325, 2325, 1133, 3203, 1087, 3435, 505, 
                                                                                           2325, 2201, 1604, 5028, 1140, 1087, 1133, 245, 600, 2172, 1510, 
                                                                                           3813, 1087, 1850, 1169, 259, 2653, 2189, 697, 513, 1510, 2172, 
                                                                                           2363, 245, 513, 1351, 6131, 1087, 569, 2436, 1140, 1140, 1133, 
                                                                                           2992, 373, 1297)), class = "data.frame", row.names = c(1L, 2L, 
                                                                                                                                                  3L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 13L, 15L, 16L, 17L, 19L, 21L, 
                                                                                                                                                  22L, 24L, 25L, 26L, 50L, 51L, 52L, 55L, 57L, 59L, 60L, 61L, 62L, 
                                                                                                                                                  63L, 65L, 66L, 67L, 68L, 70L, 71L, 73L, 74L, 75L, 76L, 77L, 78L, 
                                                                                                                                                  79L, 81L, 82L, 83L, 84L, 85L))

CodePudding user response:

Have you looked at fuzzyjoin package? You can find the documentation here.

Here's how you can do the inner_join.

library(fuzzyjoin)
library(dplyr)

data %>%
  difference_inner_join(dat, max_dist = 1)

Output:

                           Seq2 Weight.x                     Seq1 Weight.y
1          QVQLVQSGAEVKKPGASVKV     1045                 RFNRPLLK     1044
2                 KKPGASVKVSCKA     2201        CLEQVSQLQGLWRRFNR     2202
3                 KKPGASVKVSCKA     1604             YDFTNCDFEKIK     1604
4  KKPGASVKVSCKASGYTFTGYYMHWVRQ     1140               DLITYMSGTK     1141
5   KPGASVKVSCKASGYTFTGYYMHWVRQ      600                    ARKSK      601
6   KPGASVKVSCKASGYTFTGYYMHWVRQ     2172       VTTNKCLEQVSQLQGLWR     2172
7         KVSCKASGYTFTGYYMHWVRQ     2653 TKAALAIWCPGYSETQINATQAMK     2653
8             KASGYTFTGYYMHWVRQ     2189        CLEQVSQLQGLWRRFNR     2190
9             KASGYTFTGYYMHWVRQ      513                     KARK      514
10            KASGYTFTGYYMHWVRQ     2172       VTTNKCLEQVSQLQGLWR     2172
11  RQAPGQGLEWMGWINPNSGGTNYAQKF      513                     KARK      514
12  RQAPGQGLEWMGWINPNSGGTNYAQKF     1351               YDFTNCDFEK     1351
13        RITCGGNNIGSKSVHWYQQKP     2436   AALAIWCPGYSETQINATQAMK     2436
14        RITCGGNNIGSKSVHWYQQKP     1140               DLITYMSGTK     1141
15                   KSVHWYQQKP     1140               DLITYMSGTK     1141

CodePudding user response:

You could use data.table

library(data.table)
set(d1)[,`:=`(wu = Weight 1,wl = Weight-1)]
setDT(d2)[d1, on=.(Weight>=wl, Weight<=wu)][,Weight.1:=NULL][]

                            Seq2 Weight                     Seq1 i.Weight
                          <char>  <num>                   <char>    <num>
 1:        KVSCKASGYTFTGYYMHWVRQ   2652 TKAALAIWCPGYSETQINATQAMK     2653
 2:            KASGYTFTGYYMHWVRQ   2189        CLEQVSQLQGLWRRFNR     2190
 3:         QVQLVQSGAEVKKPGASVKV   1043                 RFNRPLLK     1044
 4:  RQAPGQGLEWMGWINPNSGGTNYAQKF   1350               YDFTNCDFEK     1351
 5: KKPGASVKVSCKASGYTFTGYYMHWVRQ   1140               DLITYMSGTK     1141
 6:        RITCGGNNIGSKSVHWYQQKP   1140               DLITYMSGTK     1141
 7:                   KSVHWYQQKP   1140               DLITYMSGTK     1141
 8:        RITCGGNNIGSKSVHWYQQKP   2435   AALAIWCPGYSETQINATQAMK     2436
 9:  KPGASVKVSCKASGYTFTGYYMHWVRQ   2171       VTTNKCLEQVSQLQGLWR     2172
10:            KASGYTFTGYYMHWVRQ   2171       VTTNKCLEQVSQLQGLWR     2172
11:            KASGYTFTGYYMHWVRQ    513                     KARK      514
12:  RQAPGQGLEWMGWINPNSGGTNYAQKF    513                     KARK      514
13:  KPGASVKVSCKASGYTFTGYYMHWVRQ    600                    ARKSK      601
14:                KKPGASVKVSCKA   2201        CLEQVSQLQGLWRRFNR     2202
15:                KKPGASVKVSCKA   1603             YDFTNCDFEKIK     1604
  •  Tags:  
  • r
  • Related