How to perform a conditional join on a statement containing both AND and OR operators?

I have a large dataset and would like to form all pairs of rows satisfying some condition and then calculate some variables based on which parts of the condition were satisfied. The following MWE illustrates what I would like to achieve:

library(data.table)
set.seed(1234)

IDs <- data.table(id = letters[1:10],
                  c1 = sample(1:5, 10, replace = T),
                  c2 = sample(1:5, 10, replace = T),
                  c3 = sample(1:5, 10, replace = T),
                  c = 1)

IDs.joined <- IDs[IDs, on = 'c', allow.cartesian = T
                  ][c1 != i.c1 & (c2 == i.c2 | c3 == i.c3)  # condition defining which pairs are joined
                  ][, c('Ic2', 'Ic3') := .(c2 == i.c2, c3 == i.c3)
                  ][, overlap_id := fifelse(Ic2 == 1, 2, 3)
                  ][, overlap := Ic2 + Ic3
                  ][, -c('i.c1', 'i.c2', 'i.c3', 'Ic2', 'Ic3')]

The problem is that the full dataset is far too large (~5 million rows) to form the Cartesian self-join. Is there a way to use data.table's syntax to perform a conditional join like this directly, without first forming the Cartesian join and then imposing the desired condition?

I have seen similar problems on SO, but those can typically be expressed as a rolling join. I am not aware of a way to include X | Y statements or X != Y conditions in the rolling (or non-equi) join syntax.
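
For reference, the non-equi join syntax in data.table's on= argument only combines ==, >, >=, <, and <= clauses with an implicit AND, so a condition containing | or != cannot be passed to a single join directly. A minimal illustration of the supported form, using the IDs table from the MWE above:

# one conjunctive non-equi join: pairs with a larger c1 AND an equal c2
IDs[IDs, on = .(c1 > c1, c2 == c2), nomatch = NULL, allow.cartesian = TRUE]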

CodePudding user response:

The best option I've found so far for relatively simple conditions like these is to bind multiple joins. It's not pretty, but it is fast and memory efficient.

library(data.table)
set.seed(1234)

IDs <- data.table(id = 1:1e4,
                  c1 = sample(5e3, 1e4, replace = T),
                  c2 = sample(5e3, 1e4, replace = T),
                  c3 = sample(5e3, 1e4, replace = T),
                  c = 0L)

f1 <- function(dt) {
  dt[
    dt, on = 'c', allow.cartesian = TRUE
  ][
    c1 != i.c1 & (c2 == i.c2 | c3 == i.c3)
  ]
}

f2 <- function(dt) {
  unique(
    rbindlist(
      list(
        dt[dt, on = .(c1 > c1, c2 == c2), .(id = x.id, c1 = x.c1, c2 = x.c2, c3 = x.c3, c = x.c, i.id = i.id, i.c1 = i.c1, i.c2 = i.c2, i.c3 = i.c3), nomatch = 0],
        dt[dt, on = .(c1 < c1, c2 == c2), .(id = x.id, c1 = x.c1, c2 = x.c2, c3 = x.c3, c = x.c, i.id = i.id, i.c1 = i.c1, i.c2 = i.c2, i.c3 = i.c3), nomatch = 0],
        dt[dt, on = .(c1 > c1, c3 == c3), .(id = x.id, c1 = x.c1, c2 = x.c2, c3 = x.c3, c = x.c, i.id = i.id, i.c1 = i.c1, i.c2 = i.c2, i.c3 = i.c3), nomatch = 0],
        dt[dt, on = .(c1 < c1, c3 == c3), .(id = x.id, c1 = x.c1, c2 = x.c2, c3 = x.c3, c = x.c, i.id = i.id, i.c1 = i.c1, i.c2 = i.c2, i.c3 = i.c3), nomatch = 0]
      )
    )
  )
}

microbenchmark::microbenchmark(f1(IDs),
                               f2(IDs),
                               times = 10)
#> Unit: milliseconds
#>     expr       min        lq      mean    median        uq       max neval
#>  f1(IDs) 2553.3594 3305.0062 3256.9072 3343.6174 3396.6990 3470.7870    10
#>  f2(IDs)  375.0594  400.9712  428.4382  440.4604  449.4586  490.7598    10

identical(setorder(f1(IDs), id, i.id), setorder(f2(IDs), id, i.id))
#> [1] TRUE
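
The unique() wrapper in f2 matters because a pair satisfying both c2 == i.c2 and c3 == i.c3 is returned by the c2-joins and again by the c3-joins, so the bound result contains it twice. A minimal sketch with a hypothetical two-row table (not part of the benchmark above):

library(data.table)
dt <- data.table(id = c("a", "b"), c1 = 1:2, c2 = 5L, c3 = 7L, c = 0L)
rbindlist(list(
  dt[dt, on = .(c1 > c1, c2 == c2), .(id = x.id, i.id = i.id), nomatch = 0],  # matched via c2
  dt[dt, on = .(c1 > c1, c3 == c3), .(id = x.id, i.id = i.id), nomatch = 0]   # matched via c3
))
#>    id i.id
#> 1:  b    a
#> 2:  b    a
# The same (b, a) pair appears twice; unique() keeps a single copy.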

To address Waldi's comment, another option is to flag and drop the duplicates introduced when both c2 == i.c2 and c3 == i.c3 hold, rather than calling unique(), so that genuine duplicate rows in IDs are preserved:

IDs <- data.table(id = letters[1:10],
                  c1 = sample(1:5, 10, replace = T),
                  c2 = sample(1:5, 10, replace = T),
                  c3 = sample(1:5, 10, replace = T),
                  c = 1)
IDs <- rbindlist(list(IDs, IDs))[sample(20)]

f2 <- function(dt) {
  setorderv(dt, names(dt))
  rbindlist(
    list(
      dt[dt, on = .(c1 > c1, c2 == c2), .(id = x.id, c1 = x.c1, c2 = x.c2, c3 = x.c3, c = x.c, i.id = i.id, i.c1 = i.c1, i.c2 = i.c2, i.c3 = i.c3), nomatch = 0],
      dt[dt, on = .(c1 < c1, c2 == c2), .(id = x.id, c1 = x.c1, c2 = x.c2, c3 = x.c3, c = x.c, i.id = i.id, i.c1 = i.c1, i.c2 = i.c2, i.c3 = i.c3), nomatch = 0],
      dt[dt, on = .(c1 > c1, c3 == c3), .(id = x.id, c1 = x.c1, c2 = x.c2, c3 = x.c3, c = x.c, i.id = i.id, i.c1 = i.c1, i.c2 = i.c2, i.c3 = i.c3), nomatch = 0],
      dt[dt, on = .(c1 < c1, c3 == c3), .(id = x.id, c1 = x.c1, c2 = x.c2, c3 = x.c3, c = x.c, i.id = i.id, i.c1 = i.c1, i.c2 = i.c2, i.c3 = i.c3), nomatch = 0]
    )
  )[
    c2 == i.c2 & c3 == i.c3, r := rep(c(NA, FALSE), .N/2L)
  ][
    is.na(r)
  ][
    , r := NULL
  ]
}

identical(setorder(f1(IDs), id, i.id), setorder(f2(IDs), id, i.id))
#> [1] TRUE

CodePudding user response:

Building on @jblood94's answer, and because a comment would be too long, the full list of possible cases is:

rbindlist(list(
  # cases with c2 equal
  IDs[IDs, .(id = x.id, c = x.c, c1 = x.c1, c2 = x.c2, c3 = x.c3, i.c1, i.c2, i.c3), on = .(c, c1 > c1, c2 == c2, c3 == c3), allow.cartesian = TRUE, nomatch = NULL],
  IDs[IDs, .(id = x.id, c = x.c, c1 = x.c1, c2 = x.c2, c3 = x.c3, i.c1, i.c2, i.c3), on = .(c, c1 > c1, c2 == c2, c3 > c3), allow.cartesian = TRUE, nomatch = NULL],
  IDs[IDs, .(id = x.id, c = x.c, c1 = x.c1, c2 = x.c2, c3 = x.c3, i.c1, i.c2, i.c3), on = .(c, c1 > c1, c2 == c2, c3 < c3), allow.cartesian = TRUE, nomatch = NULL],
  IDs[IDs, .(id = x.id, c = x.c, c1 = x.c1, c2 = x.c2, c3 = x.c3, i.c1, i.c2, i.c3), on = .(c, c1 < c1, c2 == c2, c3 == c3), allow.cartesian = TRUE, nomatch = NULL],
  IDs[IDs, .(id = x.id, c = x.c, c1 = x.c1, c2 = x.c2, c3 = x.c3, i.c1, i.c2, i.c3), on = .(c, c1 < c1, c2 == c2, c3 > c3), allow.cartesian = TRUE, nomatch = NULL],
  IDs[IDs, .(id = x.id, c = x.c, c1 = x.c1, c2 = x.c2, c3 = x.c3, i.c1, i.c2, i.c3), on = .(c, c1 < c1, c2 == c2, c3 < c3), allow.cartesian = TRUE, nomatch = NULL],

  # cases with c3 equal (and c2 not equal)
  IDs[IDs, .(id = x.id, c = x.c, c1 = x.c1, c2 = x.c2, c3 = x.c3, i.c1, i.c2, i.c3), on = .(c, c1 > c1, c2 > c2, c3 == c3), allow.cartesian = TRUE, nomatch = NULL],
  IDs[IDs, .(id = x.id, c = x.c, c1 = x.c1, c2 = x.c2, c3 = x.c3, i.c1, i.c2, i.c3), on = .(c, c1 > c1, c2 < c2, c3 == c3), allow.cartesian = TRUE, nomatch = NULL],
  IDs[IDs, .(id = x.id, c = x.c, c1 = x.c1, c2 = x.c2, c3 = x.c3, i.c1, i.c2, i.c3), on = .(c, c1 < c1, c2 > c2, c3 == c3), allow.cartesian = TRUE, nomatch = NULL],
  IDs[IDs, .(id = x.id, c = x.c, c1 = x.c1, c2 = x.c2, c3 = x.c3, i.c1, i.c2, i.c3), on = .(c, c1 < c1, c2 < c2, c3 == c3), allow.cartesian = TRUE, nomatch = NULL]
))

This is necessary to avoid counting the cases where c2 == i.c2 & c3 == i.c3 twice while still allowing duplicate rows in IDs, which would disappear with unique().
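
For completeness, the ten join specifications could also be generated programmatically rather than written out by hand. This is an untested sketch, assuming data.table's character form of on= with operators embedded in the strings (e.g. "c1>c1"); ops and res are illustrative names, not part of the answers above:

library(data.table)

# Enumerate operator triples for (c1, c2, c3): c1 must differ (> or <),
# and at least one of c2, c3 must match exactly: 2 * 5 = 10 cases,
# matching the list above.
ops <- CJ(o1 = c(">", "<"), o2 = c("==", ">", "<"), o3 = c("==", ">", "<"))
ops <- ops[o2 == "==" | o3 == "=="]

res <- rbindlist(lapply(seq_len(nrow(ops)), function(k) {
  # build the on= clause for case k as a character vector, e.g. c("c", "c1>c1", "c2==c2", "c3<c3")
  on_k <- c("c",
            paste0("c1", ops$o1[k], "c1"),
            paste0("c2", ops$o2[k], "c2"),
            paste0("c3", ops$o3[k], "c3"))
  IDs[IDs, .(id = x.id, c = x.c, c1 = x.c1, c2 = x.c2, c3 = x.c3, i.c1, i.c2, i.c3),
      on = on_k, allow.cartesian = TRUE, nomatch = NULL]
}))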
