I have a data.table `dt1`:
id1 | id2 | V1 | V2 |
---|---|---|---|
1 | a | c(1, 2, 3, 4) | c(1, 3, 6) |
2 | b | c(2, 6, 9, 8) | c(8, 5) |
I want to add new columns which are the result of the `setdiff()`, `intersect()` and `union()` operations on the `V1` and `V2` variables.
Expected output:
id1 | id2 | V1 | V2 | diff_V1_V2 | intersect_V1_V2 | union_V1_V2 |
---|---|---|---|---|---|---|
1 | a | c(1, 2, 3, 4) | c(1, 3, 6) | c(2, 4) | c(1, 3) | c(1, 2, 3, 4, 6) |
2 | b | c(2, 6, 9, 8) | c(8, 5) | c(2, 6, 9) | c(8) | c(2, 5, 6, 8, 9) |
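For reproducibility, the example can be stored as a `data.table` with list columns; a minimal sketch (how `V1`/`V2` are actually stored is an assumption):

library(data.table)

dt1 <- data.table(id1 = c(1, 2),
                  id2 = c("a", "b"),
                  V1  = list(c(1, 2, 3, 4), c(2, 6, 9, 8)),
                  V2  = list(c(1, 3, 6), c(8, 5)))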
I tried:
dt_new <- dt1[, `:=` (diff_V1_V2      = setdiff(V1, V2),
                      intersect_V1_V2 = intersect(V1, V2),
                      union_V1_V2     = union(V1, V2))]
But my real vectors are very long, so these operations take a very long time.
So, how can these operations be sped up, or how can I get similar results using other functions/approaches? I'm looking for the most efficient way.
Or is it possible to parallelize calculations?
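For reference, one common row-wise pattern for set operations on list columns is `Map()`; this is only a sketch of the presumed per-row approach (it produces the expected sets above, though element order may differ, and it is still slow for very long vectors):

dt_new <- dt1[, `:=` (diff_V1_V2      = Map(setdiff, V1, V2),
                      intersect_V1_V2 = Map(intersect, V1, V2),
                      union_V1_V2     = Map(union, V1, V2))]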
CodePudding user response:
Naive approach:
Naive <- function (a, b) {
  list(intersect = intersect(a, b),
       union     = union(a, b),
       adiffb    = setdiff(a, b))
}
You can exploit basic mathematics to complete all three operations in one scan of vectors `a` and `b`, instead of three scans by three function calls. In addition, you can skip the expensive `unique()` step if you know for sure that there are no duplicated values in either vector.
SetOp <- function (a, b, no.dup.guaranteed = FALSE) {
  if (no.dup.guaranteed) {
    au <- a
    bu <- b
  } else {
    au <- unique(a)
    bu <- unique(b)
  }
  ## one scan: positions of bu's elements in au (0 where absent)
  ind <- match(bu, au, nomatch = 0)
  INTERSECT <- au[ind]
  ## appending length(au) + 1 keeps the negative indexing valid even when ind
  ## is all zeros (no intersection); see https://stackoverflow.com/a/52772380
  DIFF <- au[-c(ind, length(au) + 1)]
  UNION <- c(bu, DIFF)
  list(intersect = INTERSECT, union = UNION, adiffb = DIFF)
}
SetOp(a = c(1, 2, 3, 4), b = c(1, 3, 6))
SetOp(a = c(2, 6, 9, 8), b = c(8, 5))
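## For the question's example rows these return the expected sets, e.g. the first
## call gives intersect = c(1, 3), adiffb = c(2, 4) and union = c(1, 3, 6, 2, 4);
## the union contains the expected elements, but in input order rather than the
## order returned by base union().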
Some benchmarks:
## no duplicated values in either a or b; can set no.dup.guaranteed = TRUE
a <- sample.int(11000, size = 10000, replace = FALSE)
b <- sample.int(11000, size = 10000, replace = FALSE)
microbenchmark::microbenchmark(naive  = Naive(a, b),
                               better = SetOp(a, b),
                               fly    = SetOp(a, b, no.dup.guaranteed = TRUE))
#Unit: milliseconds
# expr min lq mean median uq max neval
# naive 6.457302 6.489710 6.751996 6.511399 6.567941 8.571623 100
# better 3.251701 3.268873 3.377910 3.277086 3.306723 3.880755 100
# fly 1.734898 1.749300 1.805163 1.755927 1.767114 3.326179 100
## lots of duplicated values in both a and b; must have no.dup.guaranteed = FALSE
a <- sample.int(100, size = 10000, replace = TRUE)
b <- sample.int(100, size = 10000, replace = TRUE)
microbenchmark::microbenchmark(naive = Naive(a, b),
better = SetOp(a, b))
#Unit: microseconds
# expr min lq mean median uq max neval
# naive 1421.702 1431.023 1653.1339 1443.147 1483.255 3809.031 100
# better 396.193 398.695 446.7062 400.293 412.046 1995.294 100
If you want further speedup, you need to think about how to speed up `unique()`. This is not easy, because you probably can't beat the internal algorithms used by R.
I see additional speed improvement from replacing `unique()` with `Rfast::sort_unique()` from the Rfast package.
Thank you, @M.Viking. It is nice to see your answer. I have not had time to install the GNU Scientific Library (GSL) on my operating system, so I haven't been able to install and try Rfast myself.
Here are some comments on your benchmark results.
The reason why "better2" is faster than "fly" is that `match` is faster on sorted vectors. So yes, even if there are no duplicated values in `a` and `b`, it is still a good idea to apply `sort_unique`.
You may also want to try the `Match` function from Rfast. I spotted this function in the package's documentation, but don't know how fast it is compared with R's base version. Also, the documentation does not state clearly how `Match` handles a no-match. By contrast, the base `match` has a useful argument `nomatch`, which I set to 0 to avoid NA indexing.
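To illustrate that last point, a small base-R example (added for clarity, not from the original comment): with the default `nomatch = NA` an unmatched value turns into an NA index, whereas `nomatch = 0` produces a zero index that is silently dropped when subsetting.

au <- c(2, 6, 9, 8)
bu <- c(8, 5)
au[match(bu, au)]               ## 8 NA  (the NA index propagates)
au[match(bu, au, nomatch = 0)]  ## 8     (the zero index is dropped)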
CodePudding user response:
A version of @Zheyuan Li's answer, benchmarked using `Rfast::sort_unique()` and `fastmatch::fmatch()`:
#install.packages("Rfast") ## Takes time to compile all the Rfast functions
library(Rfast)
SetOp2 <- function (a, b, no.dup.guaranteed = FALSE) {
  if (no.dup.guaranteed) {
    au <- a
    bu <- b
  } else {
    au <- sort_unique(a)  ## Only difference from @Zheyuan Li's version
    bu <- sort_unique(b)  ## ""
  }
  ind <- match(bu, au, nomatch = 0)
  INTERSECT <- au[ind]
  DIFF <- au[-c(ind, length(au) + 1)]
  UNION <- c(bu, DIFF)
  list(intersect = INTERSECT, union = UNION, adiffb = DIFF)
}
SetOp3 <- function (a, b, no.dup.guaranteed = FALSE) {
  if (no.dup.guaranteed) {
    au <- a
    bu <- b
  } else {
    au <- sort_unique(a)                         ## Updated
    bu <- sort_unique(b)                         ## Updated
  }
  ind <- fastmatch::fmatch(bu, au, nomatch = 0)  ## Updated
  INTERSECT <- au[ind]
  DIFF <- au[-c(ind, length(au) + 1)]
  UNION <- c(bu, DIFF)
  list(intersect = INTERSECT, union = UNION, adiffb = DIFF)
}
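As a quick sanity check (not part of the original answer), `SetOp2()` and `SetOp3()` return the same sets as `SetOp()`; only the element order differs because `sort_unique()` sorts its result, so compare set-wise rather than with `identical()`:

a <- c(1, 2, 3, 4); b <- c(1, 3, 6)
all(mapply(setequal, SetOp(a, b), SetOp2(a, b)))  ## TRUE
all(mapply(setequal, SetOp(a, b), SetOp3(a, b)))  ## TRUE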
a <- sample.int(11000, size = 10000, replace = FALSE)
b <- sample.int(11000, size = 10000, replace = FALSE)
microbenchmark::microbenchmark(naive   = Naive(a, b),
                               better  = SetOp(a, b),
                               better2 = SetOp2(a, b),
                               better3 = SetOp3(a, b),
                               fly     = SetOp(a, b, no.dup.guaranteed = TRUE),
                               times = 10000)
Unit: microseconds
    expr      min        lq      mean    median        uq       max neval
   naive 2970.053 3110.7045 3409.1796 3215.7650 3352.5110 396391.39 10000
  better 1749.386 1819.7195 1987.1681 1877.7340 1984.3705  39385.95 10000
 better2  840.537  873.3440  977.6893  902.5495  969.4585  35723.44 10000
 better3  421.832  460.3975  530.7420  480.0845  519.7660  38398.51 10000
     fly  935.782  972.0125 1074.5115  993.6900 1062.4290  39954.88 10000
x <- sample.int(100, size = 10000, replace = TRUE)
y <- sample.int(100, size = 10000, replace = TRUE)
microbenchmark::microbenchmark(naive   = Naive(x, y),
                               better  = SetOp(x, y),
                               better2 = SetOp2(x, y),
                               better3 = SetOp3(x, y),
                               times = 10000)
Unit: microseconds
    expr     min      lq      mean   median       uq       max neval
   naive 595.526 623.245 771.52724 654.2590 718.4740 38635.977 10000
  better 188.642 196.369 255.91779 206.0725 227.1735 41866.460 10000
 better2  54.856  59.675  71.53553  63.6070  74.8945   549.182 10000
 better3  63.609  70.503  94.74702  80.8270 100.8745 25584.469 10000
Nota Bene: R version 4.0.4 compiled with `-mtune=native -march=native -O3 -fopenmp -fpic`, and OpenBLAS for BLAS/LAPACK.
CodePudding user response:
A possible parallelization approach building off @Zheyuan Li's answer:
library(data.table)
library(Rfast)
library(parallel)
SetOp <- function(V1, V2) {
  n <- length(V1)
  DIFF <- vector("list", n)
  INTERSECT <- vector("list", n)
  UNION <- vector("list", n)
  for (i in 1:n) {
    u1 <- sort_unique(V1[[i]])
    u2 <- sort_unique(V2[[i]])
    ind <- match(u2, u1, nomatch = 0)
    DIFF[[i]] <- u1[-c(ind, length(u1) + 1)]
    INTERSECT[[i]] <- u1[ind]
    UNION[[i]] <- c(u2, DIFF[[i]])
  }
  list(DIFF, INTERSECT, UNION)
}
SetOpParallel <- function(V1, V2, ncl = detectCores() - 1L) {
  ncl <- min(length(V1), ncl)
  cl <- makeCluster(ncl)
  on.exit(stopCluster(cl))
  # assign rows to nodes in a zig-zag pattern ordered by decreasing workload
  # (product of the two vector lengths) to balance the load across nodes
  node <- rep(c(1:ncl, ncl:1), ceiling(length(V1)/ncl/2))[1:length(V1)][
    rank(-as.numeric(lengths(V1))*as.numeric(lengths(V2)), ties.method = "first")
  ]
  clusterEvalQ(cl, {library(data.table); library(Rfast)})
  rowidx <- integer(length(V1))
  lastidx <- 0L
  for (i in 1:ncl) {
    # pass only the needed data to each node
    nodeidx <- which(node == i)
    rowidx[nodeidx] <- (lastidx + 1L):(lastidx + length(nodeidx))
    lastidx <- lastidx + length(nodeidx)
    v1 <- V1[nodeidx]
    v2 <- V2[nodeidx]
    clusterExport(cl[i], c("v1", "v2"), environment())
  }
  rm("v1", "v2")
  clusterExport(cl, "SetOp")
  # bind the per-node results and restore the original row order
  rbindlist(clusterEvalQ(cl, SetOp(v1, v2)))[rowidx]
}
A smaller data set to check the equivalence of the two functions:
n <- 1e2L
dt <- data.table(id1 = 1:n,
                 id2 = n:1,
                 V1  = replicate(n, sample.int(1e5, sample(5e4:6e4)), FALSE),
                 V2  = replicate(n, sample.int(1e5, sample(3e4:4e4)), FALSE))
identical(copy(dt)[, c("diff_V1_V2", "intersect_V1_V2", "union_V1_V2") := SetOp(V1, V2)],
          copy(dt)[, c("diff_V1_V2", "intersect_V1_V2", "union_V1_V2") := SetOpParallel(V1, V2)])
#> [1] TRUE
A much larger data set for speed comparison:
n <- 1e4L
dt <- data.table(id1 = 1:n,
                 id2 = n:1,
                 V1  = replicate(n, sample.int(1e5, sample(5e4:6e4)), FALSE),
                 V2  = replicate(n, sample.int(1e5, sample(3e4:4e4)), FALSE))
dt2 <- copy(dt)
system.time(dt[, c("diff_V1_V2", "intersect_V1_V2", "union_V1_V2") := SetOpParallel(V1, V2)])
#> user system elapsed
#> 8.30 11.22 34.21
rm(dt)
invisible(gc())
system.time(dt2[, c("diff_V1_V2", "intersect_V1_V2", "union_V1_V2") := SetOp(V1, V2)])
#> user system elapsed
#> 75.11 5.76 82.89