Home > Back-end >  How can I fast outer-join and filter two lists, preferably in base R?
How can I fast outer-join and filter two lists, preferably in base R?

Time:09-06

## outer join and filter
outer_join <- function(x, y, FUN) {
  if (missing(y)) {y = x}
  cp <- list()
  for (d1 in x) {
    for (d2 in  y) {
      if ( missing(FUN) || FUN(d1, d2) ) {
            cp <- c( cp, list(c(d1, d2)))       # and collect
       }
    }
  }
  return(cp)
}

# examples
system.time(outer_join(seq(2^8))                                                   )  # cartesian product
system.time(outer_join(seq(2^8), FUN=function(a,b){return( a == b)} )              )  # all equal pairs
system.time(outer_join(seq(2^8), FUN=function(a,b,n=7){return( ((a-b) %% n) == 0)}))  # difference is 0 modulo 7

However above method is not suitable for larger datasets (> 1000). Clearly, the nested for loop suggests room for improvement. What is the best practice to do this in R?

Note that the ideal solution works when the expand.grid does not fit in memory (before filtering) but the resulting output does. It is outer-join and filter instantaneously.

CodePudding user response:

You could achieve it with expand.grid subset:

outer_join <- function(x, y, FUN = `==`) {
  if (missing(y)) {y = x}
  subset(expand.grid(x = x, y = y), FUN(x, y))
}
Test
system.time(res1 <- outer_join(seq(2^8)))
#   user  system elapsed 
#  0.005   0.001   0.005

system.time(res2 <- outer_join(seq(2^8), FUN = function(a, b){ return( a == b) }))
#   user  system elapsed 
#  0.003   0.000   0.004

system.time(res3 <- outer_join(seq(2^8), FUN = function(a, b, n = 7){ return( ((a-b) %% n) == 0) }))
#   user  system elapsed 
#  0.007   0.001   0.007

all.equal(res1, res2)
# [1] TRUE

res3
#        x  y
# 1      1  1
# 8      8  1
# 15    15  1
# 22    22  1
# 29    29  1
# 36    36  1
# etc.

CodePudding user response:

If your FUN is something that can be translated to sql by dbplyr, you could use duckdb dbplyr and supply FUN as an expression rather than an actual function. This

library(duckdb)
#> Loading required package: DBI
library(dplyr, warn = FALSE)
library(dbplyr, warn = FALSE)

outer_join <- function(x, y, FUN, show_query = FALSE){
  if (missing(y)) y <- x
  con <- dbConnect(duckdb(), dbdir = ':memory:')
  dbWriteTable(con, 'x', tibble(x))
  dbWriteTable(con, 'y', tibble(y))
  x_tbl <- tbl(con, 'x')
  y_tbl <- tbl(con, 'y')
  out <- 
    x_tbl %>% 
      inner_join(y_tbl, sql_on = '1 = 1') %>% 
      filter({{ FUN }}) %>% 
      {if (show_query) show_query(.) else .} %>% 
      collect
  dbDisconnect(con)
  out
}

Example:

outer_join(seq(2^8), FUN = x == y, show_query = TRUE)
#> <SQL>
#> SELECT *
#> FROM (
#>   SELECT "x", "y"
#>   FROM "x" AS "LHS"
#>   INNER JOIN "y" AS "RHS"
#>     ON (1 = 1)
#> ) "q01"
#> WHERE ("x" = "y")
#> # A tibble: 256 × 2
#>        x     y
#>    <int> <int>
#>  1     1     1
#>  2     2     2
#>  3     3     3
#>  4     4     4
#>  5     5     5
#>  6     6     6
#>  7     7     7
#>  8     8     8
#>  9     9     9
#> 10    10    10
#> # … with 246 more rows

Benchmark (note the memory allocation):

expand_oj <- function(x, y, FUN = `==`) {
  if (missing(y)) {y = x}
  subset(expand.grid(x = x, y = y), FUN(x, y))
}

x <- seq(2^8)
bench::mark(
  duck = outer_join(x, FUN = x == y),
  expand = expand_oj(x, FUN = `==`),
  check = function(a, b) all(a == b)
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 duck        53.31ms  57.79ms      17.3  266.34KB     21.1
#> 2 expand       1.22ms   1.55ms     461.     3.08MB     29.9

x <- seq(2^13)
bench::mark(
  duck = outer_join(x, FUN = x == y),
  expand = expand_oj(x, FUN = `==`),
  check = function(a, b) all(a == b)
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 duck        50.69ms  53.34ms    18.2       359KB     5.46
#> 2 expand        1.44s    1.44s     0.693       3GB     2.77

Created on 2022-09-05 with reprex v2.0.2

  •  Tags:  
  • r
  • Related