R - Speeding up a search with apply-CodePudding

Suppose I have a data.table in which an observation is a pair-wise combination of products my consumers buy together.

I would like to find, for each pair of products (a row in dt) in my data.table, if they have a third product in common that is sometimes also bought with one of the products.

I want to include the "common products" as a new column in dt.

Currently, I do this as follows. But my real data holds millions of rows. It takes 20 hours to compute data from 1 week.

How can I speed this up? Is an apply function smart, or should I think about mapping?

Mock example:

library(data.table)
library(stringi)
library(future.apply)

set.seed(1)

# build mock data
dt <- data.table(V1 = stri_rand_strings(100, 1),
                 V2 = stri_rand_strings(100, 1))

head(dt,17)
#   V1 V2
#1:  G  e
#2:  N  L
#3:  Z  G
#4:  u  z
#5:  C  d
#6:  t  D
# 7:  w  8
# 8:  e  T
# 9:  d  v
#10:  3  b
#11:  C  y
#12:  A  j
#13:  g  M
#14:  N  Q
#15:  l  9
#16:  U  0
#17:  i  i

#function to find common products
find_products <- function(a, b){
  library(data.table)
  toString(unique((dt[.(c(a, b)), on=.(V1), V2[duplicated(V2)]])))
}

#initiate parallel processing
plan(multisession) # on Windows machine - use plan(multicore) on Linux

#apply function across rows
common_products <- future_apply(dt, 1, function(y) find_products(y['V1'], y['V2']))

dt_final <- cbind(dt, common_products)

#head(dt, 17)
#    V1 V2 common_products
# 1:  G  e                
# 2:  N  L                
# 3:  Z  G                
# 4:  u  z                
# 5:  C  d                
# 6:  t  D                
# 7:  w  8                
# 8:  e  T                
# 9:  d  v                
#10:  3  b                
#11:  C  y                
#12:  A  j                
#13:  g  M                
#14:  N  Q                
#15:  l  9                
#16:  U  0                
#17:  i  i      i, z, B, l

CodePudding user response：

One can think of the three-way pairs as triangles in an undirected graph. The package igraph can find these efficiently. I included an example with 20M pairs of 3-character product codes. It ran on a single thread in about 70 seconds.

library(data.table)
library(stringi)
library(igraph)

getpaired <- function(g, id) names(unique(neighbors(g, id)))
commonProducts <- function(dt) {
  blnSort <- dt$V1 > dt$V2
  dt[blnSort, c("V2", "V1") := list(V1, V2)] # sort each row
  # get triangles
  g <- graph_from_data_frame(dt, FALSE)
  m <- matrix(V(g)$name[triangles(g)], ncol = 3, byrow = TRUE)
  # sort each row
  m <- matrix(m[order(row(m), m, method = "radix")], ncol = 3, byrow = TRUE)
  dt3 <- as.data.table(m)
  # map common products back to the original dataframe
  dt3 <- rbindlist(
    list(
      # the three ordered pairs in each triangle
      dt3,
      dt3[, c(1, 3, 2)],
      dt3[, c(2, 3, 1)],
      # common products in "two-sided" triangles
      dt[V1 == V2][
        , .(V2 = V2, V3 = .(getpaired(g, V1))), by = "V1"
      ][
        , .(V1 = rep(rep.int(V1, lengths(V3)), 2),
            V2 = c(rep.int(V1, lengths(V3)), unlist(V3)),
            V3 = c(unlist(V3), rep.int(V1, lengths(V3))))
      ][ # sort (V1, V2) in each row
        V1 > V2, c("V2", "V1") := list(V1, V2)
      ]
    ),
    FALSE # bind by index
  )[ # collapse common products into a single vector for each pair
    , .(V3 = .(V3)),
    by = c("V1", "V2")
  ][ # join into the original (row-sorted) data.table
    dt, on = c("V1", "V2")
  ][ # unsort V1, V2 in each row to match the original (unsorted) data.table
    , c("V1", "V2") := dt[blnSort, c("V2", "V1") := list(V1, V2)]
  ]
}

set.seed(1)

# build mock data
dt <- data.table(V1 = stri_rand_strings(100, 1),
                 V2 = stri_rand_strings(100, 1))

dt3 <- commonProducts(dt)
print(dt3)
#>      V1 V2              V3
#>   1:  G  e                
#>   2:  N  L                
#>   3:  Z  G                
#>   4:  u  z                
#>   5:  C  d               B
#>   6:  t  D               t
#>   7:  w  8                
#>   8:  e  T                
#>   9:  d  v                
#>  10:  3  b                
#>  11:  C  y                
#>  12:  A  j                
#>  13:  g  M                
#>  14:  N  Q                
#>  15:  l  9                
#>  16:  U  0                
#>  17:  i  i i,7,E,B,5,z,...
#>  18:  z  6                
#>  19:  N  R               S
#>  20:  m  d                
#>  21:  v  z                
#>  22:  D  U                
#>  23:  e  U                
#>  24:  7  A                
#>  25:  G  k                
#>  26:  N  S               R
#>  27:  0  V                
#>  28:  N  C                
#>  29:  r  E                
#>  30:  L  a                
#>  31:  T  Z                
#>  32:  b  4                
#>  33:  U  2                
#>  34:  B  d             C,6
#>  35:  p  v                
#>  36:  f  b                
#>  37:  n  Y                
#>  38:  6  W               j
#>  39:  i  z               i
#>  40:  P  V                
#>  41:  o  g                
#>  42:  e  b                
#>  43:  m  E                
#>  44:  Y  G                
#>  45:  W  j                
#>  46:  m  S                
#>  47:  1  A                
#>  48:  T  k                
#>  49:  j  6               W
#>  50:  g  r                
#>  51:  T  c                
#>  52:  r  Y                
#>  53:  R  K                
#>  54:  F  S                
#>  55:  4  V                
#>  56:  6  B               d
#>  57:  J  W                
#>  58:  W  4                
#>  59:  f  H                
#>  60:  P  D                
#>  61:  u  H                
#>  62:  I  t               t
#>  63:  S  R               N
#>  64:  K  m                
#>  65:  e  s                
#>  66:  F  P                
#>  67:  T  3                
#>  68:  l  K                
#>  69:  5  i               i
#>  70:  s  K               O
#>  71:  L  d                
#>  72:  q  q           P,q,q
#>  73:  L  r               d
#>  74:  K  O               s
#>  75:  T  N                
#>  76:  t  t       D,I,t,x,t
#>  77:  r  d               L
#>  78:  O  j                
#>  79:  m  b                
#>  80:  x  t               t
#>  81:  Q  I                
#>  82:  i  B               i
#>  83:  O  s               K
#>  84:  K  V                
#>  85:  k  s                
#>  86:  C  B               d
#>  87:  i  l               i
#>  88:  7  i               i
#>  89:  F  w                
#>  90:  8  X                
#>  91:  E  i               i
#>  92:  3  O                
#>  93:  d  6               B
#>  94:  s  v                
#>  95:  m  H                
#>  96:  n  a                
#>  97:  S  6                
#>  98:  P  q               q
#>  99:  o  J                
#> 100:  b  m                
#>      V1 V2              V3

# timing a much larger dataset
dt <- data.table(V1 = stri_rand_strings(2e7, 3),
                 V2 = stri_rand_strings(2e7, 3))

system.time(dt3 <- commonProducts(dt))
#>    user  system elapsed 
#>   72.75    3.05   71.88
dt3[lengths(V3) != 0L] # show only those pairs with common products
#>           V1  V2  V3
#>       1: GBW mDN lxF
#>       2: ix6 jpR 0VI
#>       3: xLG VeE aik
#>       4: A36 RzJ YYu
#>       5: zAo OYu zAo
#>      ---            
#> 1841567: qX9 xrW 7lb
#> 1841568: knO 2G6 knO
#> 1841569: rsU 5Rw ER8
#> 1841570: Bts 3L1 1bQ
#> 1841571: c0h pgd jxJ

This handles "2-sided triangles" created when V1==V2 (as with row 17 in the OP example data). For example, if the entire dataset consisted of the pairs (t, i) and (i, i), then i would be a common product for (t, i) (i is paired with both t and i), and i, t would be the common products for (i, i) (i and t are each paired with both i and i).

CodePudding user response：

Perhaps this could help

library(igraph)

g <- simplify(graph_from_data_frame(dt, directed = FALSE))
g <- simplify(graph_from_data_frame(dt, directed = FALSE))
dcast(
    melt(dt[, id := 1:.N], "id")[
        ,
        common := toString(names(V(g))[do.call(intersect, ego(g, nodes = value, mindist = 1))]),
        id
    ],
    id   common ~ variable
)[, .(V1, V2, common)]

which gives

     V1 V2           common
  1:  G  e
  2:  N  L
  3:  Z  G
  4:  u  z
  5:  C  d                B
  6:  t  D
  7:  w  8
  8:  e  T
  9:  d  v
 10:  3  b
 11:  C  y
 12:  A  j
 13:  g  M
 14:  N  Q
 15:  l  9
 16:  U  0
 17:  i  i l, z, 7, B, 5, E
 18:  z  6
 19:  N  R                S
 20:  m  d
 21:  v  z
 22:  D  U
 23:  e  U
 24:  7  A
 25:  G  k
 26:  N  S                R
 27:  0  V
 28:  N  C
 29:  r  E
 30:  L  a
 31:  T  Z
 32:  b  4
 33:  U  2
 34:  B  d             C, 6
 35:  p  v
 36:  f  b
 37:  n  Y
 38:  6  W                j
 39:  i  z
 40:  P  V
 41:  o  g
 42:  e  b
 43:  m  E
 44:  Y  G
 45:  W  j                6
 46:  m  S
 47:  1  A
 48:  T  k
 49:  j  6                W
 50:  g  r
 51:  T  c
 52:  r  Y
 53:  R  K
 54:  F  S
 55:  4  V
 56:  6  B                d
 57:  J  W
 58:  W  4
 59:  f  H
 60:  P  D
 61:  u  H
 62:  I  t
 63:  S  R                N
 64:  K  m
 65:  e  s
 66:  F  P
 67:  T  3
 68:  l  K
 69:  5  i
 70:  s  K                O
 71:  L  d                r
 72:  q  q                P
 73:  L  r                d
 74:  K  O                s
 75:  T  N
 76:  t  t          D, I, x
 77:  r  d                L
 78:  O  j
 79:  m  b
 80:  x  t
 81:  Q  I
 82:  i  B
 83:  O  s                K
 84:  K  V
 85:  k  s
 86:  C  B                d
 87:  i  l
 88:  7  i
 89:  F  w
 90:  8  X
 91:  E  i
 92:  3  O
 93:  d  6                B
 94:  s  v
 95:  m  H
 96:  n  a
 97:  S  6
 98:  P  q
 99:  o  J
100:  b  m
     V1 V2           common