I have the following function, which checks the percentage of matching positions between two strings:
library(tidyverse)
calculate_sequ_identity <- function(sequ_1 = NULL, sequ_2 = NULL) {
  # The two sequences must be of the same length, e.g.
  # sequ_1 <- "FKDHKHIDVKDRHRTRHLAKTRCYHIDPHH"
  # sequ_2 <- "FKDKKHLDKFSSYHVKTAFFHVCTQNPQDS"
  try(if (nchar(sequ_1) != nchar(sequ_2)) stop("sequ of different length"))
  seq1_dat <- as_tibble(unlist(str_split(string = sequ_1, pattern = ""))) %>%
    dplyr::rename(res.seq1 = 1)
  seq2_dat <- as_tibble(unlist(str_split(string = sequ_2, pattern = ""))) %>%
    dplyr::rename(res.seq2 = 1)
  final_dat <- bind_cols(seq1_dat, seq2_dat) %>%
    rowwise() %>%
    mutate(identity_status = if_else(res.seq1 == res.seq2, 1, 0)) %>%
    unnest(cols = c()) %>%
    mutate(res_no = row_number())
  identity <- sum(final_dat$identity_status) / nchar(sequ_1)
  identity
}
With this example:
sequ_1: FKDHKHIDVKDRHRTRHLAKTRCYHIDPHH
        ||| || |              |         # 7 matches of 30-char seqs
sequ_2: FKDKKHLDKFSSYHVKTAFFHVCTQNPQDS
the matching identity is 7/30 ≈ 0.23.
But I'm not sure this is the fastest routine. Please advise on the fastest way to calculate this; I typically have millions of pairs to check.
Current benchmark:
rbenchmark::benchmark(
  "m1" = {calculate_sequ_identity(sequ_1 = sequ_1, sequ_2 = sequ_2)},
  replications = 100,
  columns = c("test", "replications", "elapsed",
              "relative", "user.self", "sys.self")
)
gives me
test replications elapsed relative user.self sys.self
1 m1 100 2.267 1.000 2.228 0.032
CodePudding user response:
The stringdist package is optimized for this kind of task. Yours is a simple use case: you want the string similarity using the Hamming distance:
stringdist::stringsim(sequ_1, sequ_2, method = "hamming")
[1] 0.2333333
If you have a lot of pairwise comparisons to make, you can use stringsimmatrix().
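For instance, a minimal sketch (the short strings here are made up for illustration; they are all the same length so the Hamming similarity is defined for every pair):
a <- c("FKDHKH", "AAAAAA")
b <- c("FKDKKH", "AAABBB", "CCCCCC")
# One row per element of a, one column per element of b
stringdist::stringsimmatrix(a, b, method = "hamming")
# e.g. entry [1, 1] is 5/6: five of the six positions match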
Benchmarks - size 100k comparisons:
s1 <- stringi::stri_rand_strings(100000, 30)
s2 <- stringi::stri_rand_strings(100000, 30)
bench::mark(
  j1 = f(s1, s2),
  j2 = mapply(f2, s1, s2, USE.NAMES = FALSE),
  ds1 = mapply(f3, s1, s2, USE.NAMES = FALSE),
  ds2 = f4(s1, s2),
  ss = stringdist::stringsim(s1, s2, method = "hamming")
)[1:9] %>%
  dplyr::arrange(median)
# A tibble: 5 × 9
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
1 ss 32.63ms 33.81ms 29.5 4.2MB 0 15 0 507.88ms
2 ds2 136.29ms 160.98ms 6.32 17.93MB 1.58 4 1 633.25ms
3 j1 299.08ms 320.9ms 3.12 83.78MB 1.56 2 1 641.8ms
4 ds1 717.85ms 717.85ms 1.39 3.29MB 2.79 1 2 717.85ms
5 j2 1.39s 1.39s 0.721 58.22MB 1.44 1 2 1.39s
Functions used:
f <- \(x, y) {
  stopifnot((nx <- nchar(x)) == nchar(y))
  colSums(mapply(`==`, strsplit(x, ''), strsplit(y, ''))) / nx
}
f2 <- \(x, y) {
  stopifnot((nx <- nchar(x)) == nchar(y))
  sl <- strsplit(c(x, y), '')
  sum(sl[[1]] == sl[[2]]) / nx
}
f3 <- function(x, y) {
  x <- charToRaw(x)
  y <- charToRaw(y)
  stopifnot((l <- length(x)) == length(y))
  sum(x == y) / l
}
f4 <- \(x, y) {
  l <- nchar(x[1])
  matrixStats::colSums2(
    vapply(x, charToRaw, raw(l)) == vapply(y, charToRaw, raw(l))
  ) / l
}
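As a quick sanity check, all four functions reproduce the 7/30 expected for the example pair from the question:
q1 <- "FKDHKHIDVKDRHRTRHLAKTRCYHIDPHH"
q2 <- "FKDKKHLDKFSSYHVKTAFFHVCTQNPQDS"
c(f(q1, q2), f2(q1, q2), f3(q1, q2), f4(q1, q2))
# [1] 0.2333333 0.2333333 0.2333333 0.2333333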
CodePudding user response:
What about this?
f <- \(x, y) {
  stopifnot((nx <- nchar(x)) == nchar(y))
  sl <- strsplit(c(x, y), '')
  sum(sl[[1]] == sl[[2]]) / nx
}
s1 <- 'FKDHKHIDVKDRHRTRHLAKTRCYHIDPHH'
s2 <- 'FKDKKHLDKFSSYHVKTAFFHVCTQNPQDS'
f(s1, s2)
# [1] 0.2333333
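Note that this compares one pair at a time; to run it over whole vectors of pairs you could wrap it in mapply, as the benchmark in the earlier answer does. A usage sketch with the two strings above:
x <- c(s1, s1)
y <- c(s2, s1)
mapply(f, x, y, USE.NAMES = FALSE)
# [1] 0.2333333 1.0000000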
CodePudding user response:
Generally, the more you assume about your input, the more you can cut from the body of a function in R. Of note here is that the sample strings contain only ASCII characters. One approach, then, is to skip the string splitting entirely and compare the raw bytes directly.
s1 <- 'FKDHKHIDVKDRHRTRHLAKTRCYHIDPHH'
s2 <- 'FKDKKHLDKFSSYHVKTAFFHVCTQNPQDS'
f <- function(x, y) {
  x <- charToRaw(x)
  y <- charToRaw(y)
  stopifnot((l <- length(x)) == length(y))
  sum(x == y) / l
}
This is quite fast, but less safe than the other answers proposed here. Compare, for example, the output of charToRaw("不"), which yields the three bytes [1] e4 b8 8d for a single character, with charToRaw("5"), which yields [1] 35.
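If non-ASCII input is a possibility, one option is to guard the fast path: when the byte count equals the character count, every character is a single byte and byte-wise comparison is positionally correct; otherwise fall back to splitting. A sketch of that idea (f_safe is a hypothetical name, not part of any package):
f_safe <- function(x, y) {
  # Fast path: all characters are single-byte, so comparing raw bytes
  # compares positions correctly.
  if (nchar(x, type = "bytes") == nchar(x) &&
      nchar(y, type = "bytes") == nchar(y)) {
    xr <- charToRaw(x)
    yr <- charToRaw(y)
    stopifnot((l <- length(xr)) == length(yr))
    return(sum(xr == yr) / l)
  }
  # Slow path: split into characters, as in f2() below.
  stopifnot((nx <- nchar(x)) == nchar(y))
  sl <- strsplit(c(x, y), '')
  sum(sl[[1]] == sl[[2]]) / nx
}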
f2 <- \(x, y) {
  stopifnot((nx <- nchar(x)) == nchar(y))
  sl <- strsplit(c(x, y), '')
  sum(sl[[1]] == sl[[2]]) / nx
}
bench::mark(
  one = f(s1, s2),
  two = f2(s1, s2)
)
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 one 3.8µs 4.3µs 189066. 2.08KB 37.8
#> 2 two 6.7µs 7.4µs 118876. 3.37KB 11.9