Combinations of rows, where values in columns aren't repeated-CodePudding

I need to get all possible combinations of rows, where values in the first 2 columns aren't repeated in both of these columns. Let's say I have a dataset:

A	B	PRICE
1	3	8
2	3	7
1	4	6
2	4	5
1	5	4
2	5	3
3	5	2

And I need to get all combinations of 2 rows, then the outcome would be:

(1, 3); (2, 4) = 13
(2, 3); (1, 4) = 13
(1, 3); (2, 5) = 11
(2, 3); (1, 5) = 11
(1, 4); (2, 5) = 9
(2, 4); (1, 5) = 9
(1, 4); (3, 5) = 8
(2, 4); (3, 5) = 7

In the real dataset I need to get combinations, that contain more than 2 rows. I tried using for and while loops, but got nowhere.

CodePudding user response：

Possibly there will be a more optimal method, but try this:

dt <- data.table(
           A = c(1L, 2L, 1L, 2L, 1L, 2L, 3L),
           B = c(3L, 3L, 4L, 4L, 5L, 5L, 5L),
       PRICE = c(8L, 7L, 6L, 5L, 4L, 3L, 2L)
)

library(data.table)
x <- list()
for (i in 1:nrow(dt)){
  # get i row
  a1 <- dt$A[i]
  b1 <- dt$B[i]
  d <- c(a1, b1)
  p1 <- dt$PRICE[i]
  # get rows with A,B not in i row
  x[[i]] <- dt %>% 
    filter(!A %in% d, !B %in% d) %>%
    rename(a2=A, b2=B) %>%
    mutate(a1=a1, b1=b1, price=PRICE p1) %>%
    select(a1, b1, a2, b2, price,-PRICE) %>% 
    # create dummy cols and order to detect duplicated data
    mutate(a1b1=pmin(paste0(a1,b1), paste0(a2,b2)),
           a2b2=pmax(paste0(a1,b1), paste0(a2,b2)))
}
# bind the list of data frames and remove duplicated data
x <- rbindlist(x) %>% 
  distinct(a1b1, a2b2, .keep_all = T) %>% 
  select(-a1b1, -a2b2)

> x
   a1 b1 a2 b2 price
1:  1  3  2  4    13
2:  1  3  2  5    11
3:  2  3  1  4    13
4:  2  3  1  5    11
5:  1  4  2  5     9
6:  1  4  3  5     8
7:  2  4  1  5     9
8:  2  4  3  5     7

CodePudding user response：

A brute-force approach may work depending on the size of your actual data.

df <- data.frame(
  A = sequence(c(2,2,3)),
  B = rep.int(3:5, c(2,2,3)),
  PRICE = 8:2
)

m <- combn(nrow(df), 2)

df2 <- cbind(df[m[1,], 1:2], df[m[2,], 1:2], PRICE = df[m[1,], 3]   df[m[2,], 3])
rownames(df2) <- NULL
df2[rowSums(matrixStats::rowTabulates(as.matrix(df2[,1:4])) == 2L) == 0L,]
#>    A B A B PRICE
#> 3  1 3 2 4    13
#> 5  1 3 2 5    11
#> 7  2 3 1 4    13
#> 9  2 3 1 5    11
#> 14 1 4 2 5     9
#> 15 1 4 3 5     8
#> 16 2 4 1 5     9
#> 18 2 4 3 5     7

CodePudding user response：

You can try the code below with combn

Filter(
  length,
  combn(
    1:nrow(df),
    2,
    function(k) {
      d <- df[k, ]
      if (!any(duplicated(unlist(d[c("A", "B")])))) {
        d
      }
    },
    simplify = FALSE
  )
)

which gives

[[1]]
  A B PRICE
1 1 3     8
4 2 4     5

[[2]]
  A B PRICE
1 1 3     8
6 2 5     3

[[3]]
  A B PRICE
2 2 3     7
3 1 4     6

[[4]]
  A B PRICE
2 2 3     7
5 1 5     4

[[5]]
  A B PRICE
3 1 4     6
6 2 5     3

[[6]]
  A B PRICE
3 1 4     6
7 3 5     2

[[7]]
  A B PRICE
4 2 4     5
5 1 5     4

[[8]]
  A B PRICE
4 2 4     5
7 3 5     2

do.call(
  rbind,
  combn(
    1:nrow(df),
    2,
    function(k) {
      d <- df[k, ]
      if (!any(duplicated(unlist(d[c("A", "B")])))) {
        cbind(d[1, c("A", "B")], d[2, c("A", "B")], totPrice = sum(d$PRICE))
      }
    },
    simplify = FALSE
  )
)

gives

   A B A B totPrice
1  1 3 2 4       13
2  1 3 2 5       11
21 2 3 1 4       13
22 2 3 1 5       11
3  1 4 2 5        9
31 1 4 3 5        8
4  2 4 1 5        9
41 2 4 3 5        7