Speed up setdiff(), intersect(), union() operations on vectors in R


I have a data.table dt1:

id1 id2 V1            V2
  1   a c(1, 2, 3, 4) c(1, 3, 6)
  2   b c(2, 6, 9, 8) c(8, 5)

I want to add new columns which are the result of the setdiff(), intersect() and union() operations on the V1 and V2 variables.

Expected output:

id1 id2 V1            V2         diff_V1_V2 intersect_V1_V2 union_V1_V2
  1   a c(1, 2, 3, 4) c(1, 3, 6) c(2, 4)    c(1, 3)         c(1, 2, 3, 4, 6)
  2   b c(2, 6, 9, 8) c(8, 5)    c(2, 6, 9) c(8)            c(2, 5, 6, 8, 9)

I tried:

dt_new <- dt1[, `:=`(diff_V1_V2      = Map(setdiff, V1, V2),
                     intersect_V1_V2 = Map(intersect, V1, V2),
                     union_V1_V2     = Map(union, V1, V2))]

But my real vectors are very long, so these operations take a very long time.

So, how can these operations be sped up, or how can I get similar results using other functions/approaches? I'm looking for the most efficient way.
Or is it possible to parallelize the calculations?

CodePudding user response:

Naive approach:

Naive <- function (a, b) {
  ## three separate passes over a and b, each with its own unique() inside
  list(intersect = intersect(a, b),
       union = union(a, b),
       adiffb = setdiff(a, b))
}

You can exploit basic set algebra to complete all three operations in one scan of the vectors a and b, instead of the three scans made by the three separate calls: once you know which elements of a are matched in b, the intersection is the matched elements, the difference a \ b is the unmatched ones, and the union is b plus that difference. In addition, you can skip the expensive unique() step if you know for sure that there are no duplicated values in either vector.

SetOp <- function (a, b, no.dup.guaranteed = FALSE) {
  if (no.dup.guaranteed) {
    au <- a
    bu <- b
  } else {
    au <- unique(a)
    bu <- unique(b)
  }
  ## one scan: for each element of bu, its position in au (0 if absent)
  ind <- match(bu, au, nomatch = 0)
  INTERSECT <- au[ind]  ## zero indices are dropped when subsetting
  ## drop the matched positions; the extra index length(au) + 1 is out of
  ## range (hence harmless) but guarantees the negative subscript vector is
  ## never all zeros; see https://stackoverflow.com/a/52772380
  DIFF <- au[-c(ind, length(au) + 1)]
  UNION <- c(bu, DIFF)
  list(intersect = INTERSECT, union = UNION, adiffb = DIFF)
}

SetOp(a = c(1, 2, 3, 4), b = c(1, 3, 6))
SetOp(a = c(2, 6, 9, 8), b = c(8, 5))
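
The first call returns (hand-checked; note that the union comes back as c(bu, DIFF), so it has the same elements as the expected c(1, 2, 3, 4, 6) but not the same order):

# $intersect
# [1] 1 3
#
# $union
# [1] 1 3 6 2 4
#
# $adiffb
# [1] 2 4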

Some benchmarks:

## no duplicated values in either a or b; can set no.dup.guaranteed = TRUE
a <- sample.int(11000, size = 10000, replace = FALSE)
b <- sample.int(11000, size = 10000, replace = FALSE)

microbenchmark::microbenchmark(naive = Naive(a, b),
                               better = SetOp(a, b),
                               fly = SetOp(a, b, no.dup.guaranteed = TRUE))
#Unit: milliseconds
#   expr      min       lq     mean   median       uq      max neval
#  naive 6.457302 6.489710 6.751996 6.511399 6.567941 8.571623   100
# better 3.251701 3.268873 3.377910 3.277086 3.306723 3.880755   100
#    fly 1.734898 1.749300 1.805163 1.755927 1.767114 3.326179   100

## lots of duplicated values in both a and b; must have no.dup.guaranteed = FALSE
a <- sample.int(100, size = 10000, replace = TRUE)
b <- sample.int(100, size = 10000, replace = TRUE)

microbenchmark::microbenchmark(naive = Naive(a, b),
                               better = SetOp(a, b))
#Unit: microseconds
#   expr      min       lq      mean   median       uq      max neval
#  naive 1421.702 1431.023 1653.1339 1443.147 1483.255 3809.031   100
# better  396.193  398.695  446.7062  400.293  412.046 1995.294   100

If you want further speedup, you need to think about how to speed up unique() itself. This is not easy, because you probably can't beat the internal algorithms used by R. (A row-wise application of SetOp to the question's data.table is sketched below.)
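
For completeness, here is one way to plug SetOp into the question's data.table (my own glue code, a minimal sketch: Map applies SetOp row by row over the two list columns, and lapply extracts each component; element order within the new columns may differ from the expected output above):

library(data.table)

dt1 <- data.table(id1 = 1:2, id2 = c("a", "b"),
                  V1 = list(c(1, 2, 3, 4), c(2, 6, 9, 8)),
                  V2 = list(c(1, 3, 6), c(8, 5)))

ops <- Map(SetOp, dt1$V1, dt1$V2)  ## one SetOp call per row
dt1[, `:=`(diff_V1_V2      = lapply(ops, `[[`, "adiffb"),
           intersect_V1_V2 = lapply(ops, `[[`, "intersect"),
           union_V1_V2     = lapply(ops, `[[`, "union"))]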


I see an additional speed improvement from replacing unique() with Rfast::sort_unique().

Thank you, @M.Viking. It is nice to see your answer. I have not had time to install the GNU Scientific Library (GSL) on my operating system, so I haven't been able to install and try Rfast myself.

Here are some comments on your benchmark results.

  • The reason “better2” is faster than “fly” is that match is faster on sorted vectors. So yes, even if there are no duplicated values in a and b, it is still a good idea to apply sort_unique.

  • You may also want to try the Match function from Rfast. I spotted this function in the package's documentation, but I don't know how fast it is compared with R's base version. Also, the documentation does not state clearly how Match handles non-matches. By contrast, base match has the useful nomatch argument, which I set to 0 to avoid NA indexing; see the small illustration below.
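
To see why that matters (a standalone illustration, not from the original comments): a zero index is silently dropped when subsetting, whereas an NA index puts an NA into the result.

au <- c(2, 6, 9, 8)
bu <- c(8, 5)
match(bu, au)                   # 4 NA  (default nomatch = NA)
match(bu, au, nomatch = 0)      # 4  0
au[match(bu, au)]               # 8 NA  -- the NA leaks into the intersection
au[match(bu, au, nomatch = 0)]  # 8     -- the zero index is dropped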

CodePudding user response:

A version of @Zheyuan Li's answer, benchmarked using Rfast::sort_unique() and fastmatch::fmatch().

#install.packages("Rfast")  ## Takes time to compile all the Rfast functions
library(Rfast)

SetOp2 <- function (a, b, no.dup.guaranteed = FALSE) {
  if (no.dup.guaranteed) {
    au <- a
    bu <- b
  } else {
    au <- sort_unique(a)  ## only difference from @Zheyuan Li's version
    bu <- sort_unique(b)  ## ditto
  }
  ind <- match(bu, au, nomatch = 0)
  INTERSECT <- au[ind]
  DIFF <- au[-c(ind, length(au) + 1)]
  UNION <- c(bu, DIFF)
  list(intersect = INTERSECT, union = UNION, adiffb = DIFF)
}

SetOp3 <- function (a, b, no.dup.guaranteed = FALSE) {
  if (no.dup.guaranteed) {
    au <- a
    bu <- b
  } else {
    au <- sort_unique(a)  ## Updated
    bu <- sort_unique(b)  ## Updated
  }
  ind <- fastmatch::fmatch(bu, au, nomatch = 0)   ## Updated
  INTERSECT <- au[ind]
  DIFF <- au[-c(ind, length(au) + 1)]
  UNION <- c(bu, DIFF)
  list(intersect = INTERSECT, union = UNION, adiffb = DIFF)
}

a <- sample.int(11000, size = 10000, replace = FALSE)
b <- sample.int(11000, size = 10000, replace = FALSE)

microbenchmark::microbenchmark(naive = Naive(a, b),
                               better = SetOp(a, b),
                               better2 = SetOp2(a, b),
                               better3 = SetOp3(a, b),
                               fly = SetOp(a, b, no.dup.guaranteed = TRUE),
                               times=10000)
Unit: microseconds
    expr      min        lq      mean    median        uq       max neval
   naive 2970.053 3110.7045 3409.1796 3215.7650 3352.5110 396391.39 10000
  better 1749.386 1819.7195 1987.1681 1877.7340 1984.3705  39385.95 10000
 better2  840.537  873.3440  977.6893  902.5495  969.4585  35723.44 10000
 better3  421.832  460.3975  530.7420  480.0845  519.7660  38398.51 10000
     fly  935.782  972.0125 1074.5115  993.6900 1062.4290  39954.88 10000

x <- sample.int(100, size = 10000, replace = TRUE)
y <- sample.int(100, size = 10000, replace = TRUE)

microbenchmark::microbenchmark(naive = Naive(x, y),
                               better = SetOp(x, y),
                               better2 = SetOp2(x, y),
                               better3 = SetOp3(x, y),
                               times=10000)
Unit: microseconds
    expr     min      lq      mean   median       uq       max neval
   naive 595.526 623.245 771.52724 654.2590 718.4740 38635.977 10000
  better 188.642 196.369 255.91779 206.0725 227.1735 41866.460 10000
 better2  54.856  59.675  71.53553  63.6070  74.8945   549.182 10000
 better3  63.609  70.503  94.74702  80.8270 100.8745 25584.469 10000

Nota Bene: R version 4.0.4 compiled with -mtune=native -march=native -O3 -fopenmp -fpic, and OpenBLAS for BLAS/LAPACK.

CodePudding user response:

A possible parallelization approach building off @Zheyuan Li's answer:

library(data.table)
library(Rfast)
library(parallel)

SetOp <- function(V1, V2) {
  n <- length(V1)
  DIFF <- vector("list", n)
  INTERSECT <- vector("list", n)
  UNION <- vector("list", n)
  for (i in 1:n) {
    u1 <- sort_unique(V1[[i]])
    u2 <- sort_unique(V2[[i]])
    ind <- match(u2, u1, nomatch = 0)
    DIFF[[i]] <- u1[-c(ind, length(u1) + 1)]
    INTERSECT[[i]] <- u1[ind]
    UNION[[i]] <- c(u2, DIFF[[i]])
  }
  
  list(DIFF, INTERSECT, UNION)
}

SetOpParallel <- function(V1, V2, ncl = detectCores() - 1L) {
  ncl <- min(length(V1), ncl)
  cl <- makeCluster(ncl)
  on.exit(stopCluster(cl))
  # Assign rows to nodes in a "snake" pattern (1..ncl, ncl..1, repeating),
  # visiting rows in decreasing order of estimated work lengths(V1) * lengths(V2),
  # so the largest tasks are spread across different nodes (a toy illustration
  # follows the function definition).
  snake <- rep(c(1:ncl, ncl:1), ceiling(length(V1)/ncl/2))[1:length(V1)]
  node <- snake[rank(-as.numeric(lengths(V1)) * as.numeric(lengths(V2)),
                     ties.method = "first")]
  clusterEvalQ(cl, {library(data.table); library(Rfast)})
  rowidx <- integer(length(V1))
  lastidx <- 0L
  for (i in 1:ncl) {
    # pass only the needed data to each node
    nodeidx <- which(node == i)
    rowidx[nodeidx] <- (lastidx   1L):(lastidx   length(nodeidx))
    lastidx <- lastidx   length(nodeidx)
    v1 <- V1[nodeidx]
    v2 <- V2[nodeidx]
    clusterExport(cl[i], c("v1", "v2"), environment())
  }
  rm("v1", "v2")
  clusterExport(cl, "SetOp")
  rbindlist(clusterEvalQ(cl, SetOp(v1, v2)))[rowidx]
}
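
To see what the node assignment does in isolation, here is a toy illustration with made-up work sizes (my own example, not part of the answer's code): the snake pattern is walked in decreasing order of task size, so the largest tasks land on different nodes.

ncl <- 3L
sizes <- c(10, 50, 20, 40, 30, 60, 25)  # hypothetical per-row work sizes
snake <- rep(c(1:ncl, ncl:1), ceiling(length(sizes)/ncl/2))[seq_along(sizes)]
node <- snake[rank(-sizes, ties.method = "first")]
rbind(sizes, node)
#>       [,1] [,2] [,3] [,4] [,5] [,6] [,7]
#> sizes   10   50   20   40   30   60   25
#> node     1    2    1    3    3    1    2
# per-node workloads: node 1 = 90, node 2 = 75, node 3 = 70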

A smaller data set to check the equivalence of the two functions:

n <- 1e2L
dt <- data.table(id1 = 1:n,
                 id2 = n:1,
                 V1 = replicate(n, sample.int(1e5, sample(5e4:6e4, 1)), FALSE),
                 V2 = replicate(n, sample.int(1e5, sample(3e4:4e4, 1)), FALSE))
identical(copy(dt)[, c("diff_V1_V2", "intersect_V1_V2", "union_V1_V2") := SetOp(V1, V2)],
          copy(dt)[, c("diff_V1_V2", "intersect_V1_V2", "union_V1_V2") := SetOpParallel(V1, V2)])
#> [1] TRUE

A much larger data set for speed comparison:

n <- 1e4L
dt <- data.table(id1 = 1:n,
                 id2 = n:1,
                 V1 = replicate(n, sample.int(1e5, sample(5e4:6e4, 1)), FALSE),
                 V2 = replicate(n, sample.int(1e5, sample(3e4:4e4, 1)), FALSE))
dt2 <- copy(dt)
system.time(dt[, c("diff_V1_V2", "intersect_V1_V2", "union_V1_V2") := SetOpParallel(V1, V2)])
#>    user  system elapsed 
#>    8.30   11.22   34.21
rm(dt)
invisible(gc())
system.time(dt2[, c("diff_V1_V2", "intersect_V1_V2", "union_V1_V2") := SetOp(V1, V2)])
#>    user  system elapsed 
#>   75.11    5.76   82.89