Frequent product pair given transaction in R-CodePudding

I have the following dataset:

df <- data.frame(transaction = c("A1","A1","A1","B1","B1","C1","C1","C1",
                                 "C1","C1","D1","E1","E1","E1","E1","F1",
                                 "G1","G1","G1","H2","H2","H2","I3","I3"),
                 product = c("milk","eggs","butter",
                             "cocoa","tea",
                             "eggs","cookies","tea","toothpaste","water",
                             "cocoa",
                             "eggs","oil","tea","milk",
                             "ham",
                             "sugar","oil","milk",
                             "tea","milk","sugar",
                             "tea","milk"),
                 place_order = c(1,2,3,1,2,1,2,3,4,5,1,
                                 1,2,3,4,1,1,2,3,1,2,3,1,2),
                 client_id = c("x1","x1","x1","x2","x2",
                               "x3","x3","x3","x3","x3",
                               "x2","x4","x4","x4","x4",
                               "x2","x5","x5","x5",
                               "x3","x3","x3","x4","x4"))
df
   transaction    product place_order client_id
1           A1       milk           1        x1
2           A1       eggs           2        x1
3           A1     butter           3        x1
4           B1      cocoa           1        x2
5           B1        tea           2        x2
6           C1       eggs           1        x3
7           C1    cookies           2        x3
8           C1        tea           3        x3
9           C1 toothpaste           4        x3
10          C1      water           5        x3
11          D1      cocoa           1        x2
12          E1       eggs           1        x4
13          E1        oil           2        x4
14          E1        tea           3        x4
15          E1       milk           4        x4
16          F1        ham           1        x2
17          G1      sugar           1        x5
18          G1        oil           2        x5
19          G1       milk           3        x5
20          H2        tea           1        x3
21          H2       milk           2        x3
22          H2      sugar           3        x3
23          I3        tea           1        x4
24          I3       milk           2        x4

And I want the most frequent product pairs. So I guess grouping by transaction is the right approach, since per each group I can pair the combinations. For example, transaction A1 would have this combination:

> t(combn(sort(c("milk","eggs","butter")),2))
     [,1]     [,2]  
[1,] "butter" "eggs"
[2,] "butter" "milk"
[3,] "eggs"   "milk"

And I could join these pairs:

t(combn(sort(c("milk","eggs","butter")),2)) %>% as.data.frame() %>% unite("pair",V1,V2,sep = "_")
         pair
1 butter_eggs
2 butter_milk
3   eggs_milk

But I'm stuck here, how can I do this for each transaction? If I have the pairs per each group I could count the most frequent pairs, so the expected pairs (the first three for example) are:

pair      count
tea_milk  3
eggs_milk 2
milk_oil 2

Is there a function in dplyr I am missing? Perhaps my method is complicated, do you know a better approach?

CodePudding user response：

Applying the same logic you use, I think this might be a tidy way of doing it

get_combs <- function(x) {
  if (length(x$product)>1){
  t(combn(sort(x$product),2)) %>% as.data.frame() %>% unite("prods",V1,V2,sep = "_") %>% pull(prods)
  }
  else return(NULL)
}

df %>% 
  group_by(transaction) %>% 
  nest() %>% 
  mutate(comb = map(data, get_combs)) %>% 
  unnest(comb) %>% 
  ungroup() %>% 
  count(comb, sort = TRUE)

# A tibble: 21 × 2
   comb             n
   <chr>        <int>
 1 milk_tea         3
 2 eggs_milk        2
 3 eggs_tea         2
 4 milk_oil         2
 5 milk_sugar       2
 6 butter_eggs      1
 7 butter_milk      1
 8 cocoa_tea        1
 9 cookies_eggs     1
10 cookies_tea      1
# … with 11 more rows

CodePudding user response：

A possible crossproduct solution on a tabulation of the transaction and product. I'm not sure how well it will scale, but it seems to work:

tab <- crossprod(table(df$transaction, df$product))
tab[upper.tri(tab, diag=TRUE)] <- 0
tab <- as.data.frame.table(tab)
tab <- tab[tab$Freq > 0,]
tab <- tab[order(tab$Freq, decreasing=TRUE),]

##          Var1       Var2 Freq
##64         tea       milk    3
##39        milk       eggs    2
##42         tea       eggs    2
##62         oil       milk    2
##63       sugar       milk    2
##4         eggs     butter    1
## ...

With a sparse Matrix in case that helps efficiency:

tab <- crossprod(xtabs(value ~ transaction   product,
                data=cbind(df,value=1), sparse=TRUE))
summ <- summary(tab)
summ <- summ[summ$i != summ$j,]
summ <- summ[order(summ$x, decreasing=TRUE),]
data.frame(product1 = rownames(tab)[summ$i],
           product2 = colnames(tab)[summ$j],
           count    = summ$x)

##     product1   product2 count
##1        milk        tea     3
##2        eggs       milk     2
##3        milk        oil     2
##4        milk      sugar     2
##5        eggs        tea     2
##6      butter       eggs     1
## ...