## outer join and filter
outer_join <- function(x, y, FUN) {
if (missing(y)) {y = x}
cp <- list()
for (d1 in x) {
for (d2 in y) {
if ( missing(FUN) || FUN(d1, d2) ) {
cp <- c( cp, list(c(d1, d2))) # and collect
}
}
}
return(cp)
}
# examples
system.time(outer_join(seq(2^8)) ) # cartesian product
system.time(outer_join(seq(2^8), FUN=function(a,b){return( a == b)} ) ) # all equal pairs
system.time(outer_join(seq(2^8), FUN=function(a,b,n=7){return( ((a-b) %% n) == 0)})) # difference is 0 modulo 7
However above method is not suitable for larger datasets (> 1000). Clearly, the nested for loop suggests room for improvement. What is the best practice to do this in R?
Note that the ideal solution works when the expand.grid does not fit in memory (before filtering) but the resulting output does. It is outer-join and filter instantaneously.
CodePudding user response:
You could achieve it with expand.grid
subset
:
outer_join <- function(x, y, FUN = `==`) {
if (missing(y)) {y = x}
subset(expand.grid(x = x, y = y), FUN(x, y))
}
Test
system.time(res1 <- outer_join(seq(2^8)))
# user system elapsed
# 0.005 0.001 0.005
system.time(res2 <- outer_join(seq(2^8), FUN = function(a, b){ return( a == b) }))
# user system elapsed
# 0.003 0.000 0.004
system.time(res3 <- outer_join(seq(2^8), FUN = function(a, b, n = 7){ return( ((a-b) %% n) == 0) }))
# user system elapsed
# 0.007 0.001 0.007
all.equal(res1, res2)
# [1] TRUE
res3
# x y
# 1 1 1
# 8 8 1
# 15 15 1
# 22 22 1
# 29 29 1
# 36 36 1
# etc.
CodePudding user response:
If your FUN
is something that can be translated to sql by dbplyr, you could use duckdb dbplyr and supply FUN
as an expression rather than an actual function. This
library(duckdb)
#> Loading required package: DBI
library(dplyr, warn = FALSE)
library(dbplyr, warn = FALSE)
outer_join <- function(x, y, FUN, show_query = FALSE){
if (missing(y)) y <- x
con <- dbConnect(duckdb(), dbdir = ':memory:')
dbWriteTable(con, 'x', tibble(x))
dbWriteTable(con, 'y', tibble(y))
x_tbl <- tbl(con, 'x')
y_tbl <- tbl(con, 'y')
out <-
x_tbl %>%
inner_join(y_tbl, sql_on = '1 = 1') %>%
filter({{ FUN }}) %>%
{if (show_query) show_query(.) else .} %>%
collect
dbDisconnect(con)
out
}
Example:
outer_join(seq(2^8), FUN = x == y, show_query = TRUE)
#> <SQL>
#> SELECT *
#> FROM (
#> SELECT "x", "y"
#> FROM "x" AS "LHS"
#> INNER JOIN "y" AS "RHS"
#> ON (1 = 1)
#> ) "q01"
#> WHERE ("x" = "y")
#> # A tibble: 256 × 2
#> x y
#> <int> <int>
#> 1 1 1
#> 2 2 2
#> 3 3 3
#> 4 4 4
#> 5 5 5
#> 6 6 6
#> 7 7 7
#> 8 8 8
#> 9 9 9
#> 10 10 10
#> # … with 246 more rows
Benchmark (note the memory allocation):
expand_oj <- function(x, y, FUN = `==`) {
if (missing(y)) {y = x}
subset(expand.grid(x = x, y = y), FUN(x, y))
}
x <- seq(2^8)
bench::mark(
duck = outer_join(x, FUN = x == y),
expand = expand_oj(x, FUN = `==`),
check = function(a, b) all(a == b)
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 duck 53.31ms 57.79ms 17.3 266.34KB 21.1
#> 2 expand 1.22ms 1.55ms 461. 3.08MB 29.9
x <- seq(2^13)
bench::mark(
duck = outer_join(x, FUN = x == y),
expand = expand_oj(x, FUN = `==`),
check = function(a, b) all(a == b)
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 duck 50.69ms 53.34ms 18.2 359KB 5.46
#> 2 expand 1.44s 1.44s 0.693 3GB 2.77
Created on 2022-09-05 with reprex v2.0.2