Suppose I have a data.table
in which an observation is a pair-wise combination of products my consumers buy together.
I would like to find, for each pair of products (a row in dt
) in my data.table
, if they have a third product in common that is sometimes also bought with one of the products.
I want to include the "common products" as a new column in dt
.
Currently, I do this as follows. But my real data holds millions of rows. It takes 20 hours to compute data from 1 week.
How can I speed this up? Is an apply
function smart, or should I think about mapping?
Mock example:
library(data.table)
library(stringi)
library(future.apply)
set.seed(1)
# build mock data
dt <- data.table(V1 = stri_rand_strings(100, 1),
V2 = stri_rand_strings(100, 1))
head(dt,17)
# V1 V2
#1: G e
#2: N L
#3: Z G
#4: u z
#5: C d
#6: t D
# 7: w 8
# 8: e T
# 9: d v
#10: 3 b
#11: C y
#12: A j
#13: g M
#14: N Q
#15: l 9
#16: U 0
#17: i i
#function to find common products
find_products <- function(a, b){
library(data.table)
toString(unique((dt[.(c(a, b)), on=.(V1), V2[duplicated(V2)]])))
}
#initiate parallel processing
plan(multisession) # on Windows machine - use plan(multicore) on Linux
#apply function across rows
common_products <- future_apply(dt, 1, function(y) find_products(y['V1'], y['V2']))
dt_final <- cbind(dt, common_products)
#head(dt, 17)
# V1 V2 common_products
# 1: G e
# 2: N L
# 3: Z G
# 4: u z
# 5: C d
# 6: t D
# 7: w 8
# 8: e T
# 9: d v
#10: 3 b
#11: C y
#12: A j
#13: g M
#14: N Q
#15: l 9
#16: U 0
#17: i i i, z, B, l
CodePudding user response:
One can think of the three-way pairs as triangles in an undirected graph. The package igraph
can find these efficiently. I included an example with 20M pairs of 3-character product codes. It ran on a single thread in about 70 seconds.
library(data.table)
library(stringi)
library(igraph)
getpaired <- function(g, id) names(unique(neighbors(g, id)))
commonProducts <- function(dt) {
blnSort <- dt$V1 > dt$V2
dt[blnSort, c("V2", "V1") := list(V1, V2)] # sort each row
# get triangles
g <- graph_from_data_frame(dt, FALSE)
m <- matrix(V(g)$name[triangles(g)], ncol = 3, byrow = TRUE)
# sort each row
m <- matrix(m[order(row(m), m, method = "radix")], ncol = 3, byrow = TRUE)
dt3 <- as.data.table(m)
# map common products back to the original dataframe
dt3 <- rbindlist(
list(
# the three ordered pairs in each triangle
dt3,
dt3[, c(1, 3, 2)],
dt3[, c(2, 3, 1)],
# common products in "two-sided" triangles
dt[V1 == V2][
, .(V2 = V2, V3 = .(getpaired(g, V1))), by = "V1"
][
, .(V1 = rep(rep.int(V1, lengths(V3)), 2),
V2 = c(rep.int(V1, lengths(V3)), unlist(V3)),
V3 = c(unlist(V3), rep.int(V1, lengths(V3))))
][ # sort (V1, V2) in each row
V1 > V2, c("V2", "V1") := list(V1, V2)
]
),
FALSE # bind by index
)[ # collapse common products into a single vector for each pair
, .(V3 = .(V3)),
by = c("V1", "V2")
][ # join into the original (row-sorted) data.table
dt, on = c("V1", "V2")
][ # unsort V1, V2 in each row to match the original (unsorted) data.table
, c("V1", "V2") := dt[blnSort, c("V2", "V1") := list(V1, V2)]
]
}
set.seed(1)
# build mock data
dt <- data.table(V1 = stri_rand_strings(100, 1),
V2 = stri_rand_strings(100, 1))
dt3 <- commonProducts(dt)
print(dt3)
#> V1 V2 V3
#> 1: G e
#> 2: N L
#> 3: Z G
#> 4: u z
#> 5: C d B
#> 6: t D t
#> 7: w 8
#> 8: e T
#> 9: d v
#> 10: 3 b
#> 11: C y
#> 12: A j
#> 13: g M
#> 14: N Q
#> 15: l 9
#> 16: U 0
#> 17: i i i,7,E,B,5,z,...
#> 18: z 6
#> 19: N R S
#> 20: m d
#> 21: v z
#> 22: D U
#> 23: e U
#> 24: 7 A
#> 25: G k
#> 26: N S R
#> 27: 0 V
#> 28: N C
#> 29: r E
#> 30: L a
#> 31: T Z
#> 32: b 4
#> 33: U 2
#> 34: B d C,6
#> 35: p v
#> 36: f b
#> 37: n Y
#> 38: 6 W j
#> 39: i z i
#> 40: P V
#> 41: o g
#> 42: e b
#> 43: m E
#> 44: Y G
#> 45: W j
#> 46: m S
#> 47: 1 A
#> 48: T k
#> 49: j 6 W
#> 50: g r
#> 51: T c
#> 52: r Y
#> 53: R K
#> 54: F S
#> 55: 4 V
#> 56: 6 B d
#> 57: J W
#> 58: W 4
#> 59: f H
#> 60: P D
#> 61: u H
#> 62: I t t
#> 63: S R N
#> 64: K m
#> 65: e s
#> 66: F P
#> 67: T 3
#> 68: l K
#> 69: 5 i i
#> 70: s K O
#> 71: L d
#> 72: q q P,q,q
#> 73: L r d
#> 74: K O s
#> 75: T N
#> 76: t t D,I,t,x,t
#> 77: r d L
#> 78: O j
#> 79: m b
#> 80: x t t
#> 81: Q I
#> 82: i B i
#> 83: O s K
#> 84: K V
#> 85: k s
#> 86: C B d
#> 87: i l i
#> 88: 7 i i
#> 89: F w
#> 90: 8 X
#> 91: E i i
#> 92: 3 O
#> 93: d 6 B
#> 94: s v
#> 95: m H
#> 96: n a
#> 97: S 6
#> 98: P q q
#> 99: o J
#> 100: b m
#> V1 V2 V3
# timing a much larger dataset
dt <- data.table(V1 = stri_rand_strings(2e7, 3),
V2 = stri_rand_strings(2e7, 3))
system.time(dt3 <- commonProducts(dt))
#> user system elapsed
#> 72.75 3.05 71.88
dt3[lengths(V3) != 0L] # show only those pairs with common products
#> V1 V2 V3
#> 1: GBW mDN lxF
#> 2: ix6 jpR 0VI
#> 3: xLG VeE aik
#> 4: A36 RzJ YYu
#> 5: zAo OYu zAo
#> ---
#> 1841567: qX9 xrW 7lb
#> 1841568: knO 2G6 knO
#> 1841569: rsU 5Rw ER8
#> 1841570: Bts 3L1 1bQ
#> 1841571: c0h pgd jxJ
This handles "2-sided triangles" created when V1
==V2
(as with row 17 in the OP example data). For example, if the entire dataset consisted of the pairs (t, i)
and (i, i)
, then i
would be a common product for (t, i)
(i
is paired with both t
and i
), and i, t
would be the common products for (i, i)
(i
and t
are each paired with both i
and i
).
CodePudding user response:
Perhaps this could help
library(igraph)
g <- simplify(graph_from_data_frame(dt, directed = FALSE))
g <- simplify(graph_from_data_frame(dt, directed = FALSE))
dcast(
melt(dt[, id := 1:.N], "id")[
,
common := toString(names(V(g))[do.call(intersect, ego(g, nodes = value, mindist = 1))]),
id
],
id common ~ variable
)[, .(V1, V2, common)]
which gives
V1 V2 common
1: G e
2: N L
3: Z G
4: u z
5: C d B
6: t D
7: w 8
8: e T
9: d v
10: 3 b
11: C y
12: A j
13: g M
14: N Q
15: l 9
16: U 0
17: i i l, z, 7, B, 5, E
18: z 6
19: N R S
20: m d
21: v z
22: D U
23: e U
24: 7 A
25: G k
26: N S R
27: 0 V
28: N C
29: r E
30: L a
31: T Z
32: b 4
33: U 2
34: B d C, 6
35: p v
36: f b
37: n Y
38: 6 W j
39: i z
40: P V
41: o g
42: e b
43: m E
44: Y G
45: W j 6
46: m S
47: 1 A
48: T k
49: j 6 W
50: g r
51: T c
52: r Y
53: R K
54: F S
55: 4 V
56: 6 B d
57: J W
58: W 4
59: f H
60: P D
61: u H
62: I t
63: S R N
64: K m
65: e s
66: F P
67: T 3
68: l K
69: 5 i
70: s K O
71: L d r
72: q q P
73: L r d
74: K O s
75: T N
76: t t D, I, x
77: r d L
78: O j
79: m b
80: x t
81: Q I
82: i B
83: O s K
84: K V
85: k s
86: C B d
87: i l
88: 7 i
89: F w
90: 8 X
91: E i
92: 3 O
93: d 6 B
94: s v
95: m H
96: n a
97: S 6
98: P q
99: o J
100: b m
V1 V2 common