I have the following function, which checks the percentage of matching positions between two strings:
library(tidyverse)
calculate_sequ_identity <- function(sequ_1 = NULL, sequ_2 = NULL) {
  # The two sequences must be of the same length, e.g.
  # sequ_1 <- "FKDHKHIDVKDRHRTRHLAKTRCYHIDPHH"
  # sequ_2 <- "FKDKKHLDKFSSYHVKTAFFHVCTQNPQDS"
  try(if (nchar(sequ_1) != nchar(sequ_2)) stop("sequ of different length"))
  seq1_dat <- as_tibble(unlist(str_split(string = sequ_1, pattern = ""))) %>%
    dplyr::rename(res.seq1 = 1)
  seq2_dat <- as_tibble(unlist(str_split(string = sequ_2, pattern = ""))) %>%
    dplyr::rename(res.seq2 = 1)
  final_dat <- bind_cols(seq1_dat, seq2_dat) %>%
    rowwise() %>%
    mutate(identity_status = if_else(res.seq1 == res.seq2, 1, 0)) %>%
    unnest(cols = c()) %>%
    mutate(res_no = row_number())
  identity <- sum(final_dat$identity_status) / nchar(sequ_1)
  identity
}
With this example:
sequ_1: FKDHKHIDVKDRHRTRHLAKTRCYHIDPHH
        ||| || |              |         # 7 matches of 30-char seqs
sequ_2: FKDKKHLDKFSSYHVKTAFFHVCTQNPQDS
the matching identity is 7/30 ≈ 0.23.
But I'm not sure this is the fastest routine. Please advise on the fastest way to calculate this; I typically have millions of pairs to check.
Current benchmark:
rbenchmark::benchmark(
  "m1" = {calculate_sequ_identity(sequ_1 = sequ_1, sequ_2 = sequ_2)},
  replications = 100,
  columns = c("test", "replications", "elapsed",
              "relative", "user.self", "sys.self")
)
gives me
test replications elapsed relative user.self sys.self
1 m1 100 2.267 1.000 2.228 0.032
CodePudding user response:
The stringdist package is optimized for this kind of task. Yours is a simple use case: you want the string similarity using the Hamming distance:
stringdist::stringsim(sequ_1, sequ_2, method = "hamming")
[1] 0.2333333
If you have a lot of pairwise comparisons to make, you can use stringsimmatrix().
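For instance, a minimal sketch (the short strings here are made up for illustration; they are all the same length so the Hamming similarity is defined for every pair):
a <- c("FKDHKH", "AAAAAA")
b <- c("FKDKKH", "AAABBB", "CCCCCC")
# One row per element of a, one column per element of b
stringdist::stringsimmatrix(a, b, method = "hamming")
# e.g. entry [1, 1] is 5/6: five of the six positions match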
Benchmarks - size 100k comparisons:
s1 <- stringi::stri_rand_strings(100000, 30)
s2 <- stringi::stri_rand_strings(100000, 30)
bench::mark(
  j1 = f(s1, s2),
  j2 = mapply(f2, s1, s2, USE.NAMES = FALSE),
  ds1 = mapply(f3, s1, s2, USE.NAMES = FALSE),
  ds2 = f4(s1, s2),
  ss = stringdist::stringsim(s1, s2, method = "hamming")
)[1:9] %>%
  dplyr::arrange(median)
# A tibble: 5 × 9
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
1 ss 32.63ms 33.81ms 29.5 4.2MB 0 15 0 507.88ms
2 ds2 136.29ms 160.98ms 6.32 17.93MB 1.58 4 1 633.25ms
3 j1 299.08ms 320.9ms 3.12 83.78MB 1.56 2 1 641.8ms
4 ds1 717.85ms 717.85ms 1.39 3.29MB 2.79 1 2 717.85ms
5 j2 1.39s 1.39s 0.721 58.22MB 1.44 1 2 1.39s
Functions used:
f <- \(x, y) {
  stopifnot((nx <- nchar(x)) == nchar(y))
  colSums(mapply(`==`, strsplit(x, ''), strsplit(y, ''))) / nx
}
f2 <- \(x, y) {
  stopifnot((nx <- nchar(x)) == nchar(y))
  sl <- strsplit(c(x, y), '')
  sum(sl[[1]] == sl[[2]]) / nx
}
f3 <- function(x, y) {
  x <- charToRaw(x)
  y <- charToRaw(y)
  stopifnot((l <- length(x)) == length(y))
  sum(x == y) / l
}
f4 <- \(x, y) {
  l <- nchar(x[1])
  matrixStats::colSums2(
    vapply(x, charToRaw, raw(l)) == vapply(y, charToRaw, raw(l))
  ) / l
}
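As a quick sanity check, all four functions reproduce the 7/30 expected for the example pair from the question:
q1 <- "FKDHKHIDVKDRHRTRHLAKTRCYHIDPHH"
q2 <- "FKDKKHLDKFSSYHVKTAFFHVCTQNPQDS"
c(f(q1, q2), f2(q1, q2), f3(q1, q2), f4(q1, q2))
# [1] 0.2333333 0.2333333 0.2333333 0.2333333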
CodePudding user response:
What about this?
f <- \(x, y) {
  stopifnot((nx <- nchar(x)) == nchar(y))
  sl <- strsplit(c(x, y), '')
  sum(sl[[1]] == sl[[2]]) / nx
}
s1 <- 'FKDHKHIDVKDRHRTRHLAKTRCYHIDPHH'
s2 <- 'FKDKKHLDKFSSYHVKTAFFHVCTQNPQDS'
f(s1, s2)
# [1] 0.2333333
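Note that this compares one pair at a time; to run it over whole vectors of pairs you could wrap it in mapply, as the benchmark in the earlier answer does. A usage sketch with the two strings above:
x <- c(s1, s1)
y <- c(s2, s1)
mapply(f, x, y, USE.NAMES = FALSE)
# [1] 0.2333333 1.0000000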
CodePudding user response:
Generally, the more you assume about your input, the more you can cut from the body of a function in R. Of note here is that the sample strings contain only ASCII characters. One approach, then, is to skip the string splitting entirely and compare the raw bytes directly.
s1 <- 'FKDHKHIDVKDRHRTRHLAKTRCYHIDPHH'
s2 <- 'FKDKKHLDKFSSYHVKTAFFHVCTQNPQDS'
f <- function(x, y) {
  x <- charToRaw(x)
  y <- charToRaw(y)
  stopifnot((l <- length(x)) == length(y))
  sum(x == y) / l
}
This is quite fast, but less safe than the other answers proposed here. Compare, for example, the output of charToRaw("不"), which yields the three bytes [1] e4 b8 8d for a single character, with charToRaw("5"), which yields [1] 35.
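If non-ASCII input is a possibility, one option is to guard the fast path: when the byte count equals the character count, every character is a single byte and byte-wise comparison is positionally correct; otherwise fall back to splitting. A sketch of that idea (f_safe is a hypothetical name, not part of any package):
f_safe <- function(x, y) {
  # Fast path: all characters are single-byte, so comparing raw bytes
  # compares positions correctly.
  if (nchar(x, type = "bytes") == nchar(x) &&
      nchar(y, type = "bytes") == nchar(y)) {
    xr <- charToRaw(x)
    yr <- charToRaw(y)
    stopifnot((l <- length(xr)) == length(yr))
    return(sum(xr == yr) / l)
  }
  # Slow path: split into characters, as in f2() below.
  stopifnot((nx <- nchar(x)) == nchar(y))
  sl <- strsplit(c(x, y), '')
  sum(sl[[1]] == sl[[2]]) / nx
}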
f2 <- \(x, y) {
  stopifnot((nx <- nchar(x)) == nchar(y))
  sl <- strsplit(c(x, y), '')
  sum(sl[[1]] == sl[[2]]) / nx
}
bench::mark(
  one = f(s1, s2),
  two = f2(s1, s2)
)
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 one 3.8µs 4.3µs 189066. 2.08KB 37.8
#> 2 two 6.7µs 7.4µs 118876. 3.37KB 11.9