Home > OS >  all possible combinations in the list
all possible combinations in the list

Time:10-19

my data frame:

data <- structure(list(group = c(1L, 1L, 1L, 1L, 1L, 1L,1L,1L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), col1 = c(9, 
9.05, 7.15, 7.21, 7.34, 8.12, 7.5, 7.84, 7.8, 7.52, 8.84, 6.98, 
6.1, 6.89, 6.5, 7.5, 7.8, 5.5, 6.61, 7.65, 7.68,8.0,9.0), col2 = c(11L, 
11L, 10L, 1L, 3L, 7L, 11L, 11L, 11L, 11L, 4L, 1L, 1L, 1L, 2L, 
2L, 1L, 4L, 8L, 8L, 1L,3L,4L), col3 = c(7L, 11L, 3L, 7L, 11L, 2L, 11L, 
5L, 11L, 11L, 5L, 11L, 11L, 2L, 9L, 9L, 3L, 8L, 11L, 11L, 2L,5L,6L), 
    col4 = c(11L, 11L, 11L, 11L, 6L, 11L, 11L, 11L, 10L, 7L, 
    11L, 2L, 11L, 3L, 11L, 11L, 6L, 11L, 1L, 11L, 11L,13L,12L), col5 = c(11L, 
    1L, 2L, 2L, 11L, 11L, 1L, 10L, 2L, 11L, 1L, 3L, 11L, 11L, 
    8L, 8L, 11L, 11L, 11L, 2L, 9L,4L,5L)), .Names = c("group", "col1", 
"col2", "col3", "col4", "col5"), class = "data.frame", row.names = c(NA, 
-21L))

function:

comb <- list(c(2, 4), c(3, 5), c(4, 6))

test.fun <- function(dat) { 
  do.call(rbind, lapply(comb, function(x) {
    SUM <- dat[[x[1]]] dat[[x[2]]]
    data.frame(NAME = sprintf('Group %s by Group %s', x[1], x[2]),
                SUM)
   
  }))
}

result <- purrr::map_df(split(data, data$group), test.fun, .id = 'Group')

Now this function processes 2 columns of list in the list(c(2, 4), c(3, 5), c(4, 6)). I want it to process any amount for example:

list(c(2, 4, 6), c(3, 5, 6), c(3, 4, 6), c(2, 3), c(3, 5))

CodePudding user response:

May be this helps

test.fun <- function(dat, comb) { 
  do.call(rbind, lapply(comb, function(x) {
    SUM <- rowSums(dat[x], na.rm = TRUE)
    data.frame(NAME = paste0("Group ", toString(x)),
                SUM)
   
  }))
}

-testing

comb2 <- list(c(2, 4, 6), c(3, 5, 6), c(3, 4, 6), c(2, 3), c(3, 5))

purrr::map_df(split(data, data$group), test.fun, comb = comb2, .id = 'Group') %>%
    as_tibble

-output

# A tibble: 115 × 3
   Group NAME            SUM
   <chr> <chr>         <dbl>
 1 1     Group 2, 4, 6  27  
 2 1     Group 2, 4, 6  21.0
 3 1     Group 2, 4, 6  12.2
 4 1     Group 2, 4, 6  16.2
 5 1     Group 2, 4, 6  29.3
 6 1     Group 2, 4, 6  21.1
 7 1     Group 2, 4, 6  19.5
 8 1     Group 2, 4, 6  22.8
 9 1     Group 3, 5, 6  33  
10 1     Group 3, 5, 6  23  
# … with 105 more rows

For pairwise, use combn

 bind_rows(lapply(comb2, function(x) {
   SUM <- combn(x, 2, FUN = function(y) rowSums(data[y], 
          na.rm = TRUE))
   nm1 <- rep(combn(x, 2, FUN = paste, collapse="_"), 
       each = nrow(data))
   data.frame(NAME= nm1, SUM)}))

CodePudding user response:

A way with outer that avoids expensive splitting of the data. The SUM is achieved using rowSums on a subset derived in a Vectorized FUNction. I used different data with more groups,

groupFun <- function(data, comb2) {
  FUN <- Vectorize(function(x, y) rowSums(data[data$group == x, y]), SIMPLIFY=F)
  o <- outer(unique(data$group), comb2, FUN)
  res <- cbind(
    expand.grid(
      Group=as.character(data$group), 
      NAME=sapply(comb2, function(x) sprintf('Group %s by Group %s', x[1], x[2])),
      stringsAsFactors=FALSE), 
    SUM=unlist(o))
  res <- res[order(res$Group), ]  ## optional
  return(res)
}

where:

all.equal(purrr::map_df(split(data, data$group), test.fun, .id = 'Group'),
          `rownames<-`(groupFun(data, comb), NULL))
# [1] TRUE

Result

comb2 <- list(c(2, 4, 6), c(3, 5, 6), c(3, 4, 6), c(2, 3), c(3, 5))

groupFun(data, comb2)
#    Group               NAME   SUM
# 1      1 Group 2 by Group 4 16.56
# 2      1 Group 2 by Group 4 13.26
# 13     1 Group 3 by Group 5 14.00
# 14     1 Group 3 by Group 5 12.00
# 25     1 Group 3 by Group 4 22.00
# 26     1 Group 3 by Group 4 14.00
# 37     1 Group 2 by Group 3 12.56
# 38     1 Group 2 by Group 3  7.26
# 49     1 Group 3 by Group 5 11.00
# 50     1 Group 3 by Group 5  9.00
# 3      2 Group 2 by Group 4 17.84
# 4      2 Group 2 by Group 4 24.43
# 15     2 Group 3 by Group 5 14.00
# 16     2 Group 3 by Group 5 10.00
# 27     2 Group 3 by Group 4 14.00
# 28     2 Group 3 by Group 4 17.00
# 39     2 Group 2 by Group 3 13.84
# 40     2 Group 2 by Group 3 11.43
# 51     2 Group 3 by Group 5 10.00
# 52     2 Group 3 by Group 5  4.00
# 5      3 Group 2 by Group 4 24.10
# 6      3 Group 2 by Group 4 17.02
# 17     3 Group 3 by Group 5 26.00
# 18     3 Group 3 by Group 5 22.00
# 29     3 Group 3 by Group 4 23.00
# 30     3 Group 3 by Group 4 21.00
# 41     3 Group 2 by Group 3 15.10
# 42     3 Group 2 by Group 3 14.02
# 53     3 Group 3 by Group 5 15.00
# 54     3 Group 3 by Group 5 12.00
# 7      4 Group 2 by Group 4 20.97
# 8      4 Group 2 by Group 4 22.87
# 19     4 Group 3 by Group 5 24.00
# 20     4 Group 3 by Group 5 11.00
# 31     4 Group 3 by Group 4 25.00
# 32     4 Group 3 by Group 4 19.00
# 43     4 Group 2 by Group 3 11.97
# 44     4 Group 2 by Group 3 13.87
# 55     4 Group 3 by Group 5 18.00
# 56     4 Group 3 by Group 5  7.00
# 9      5 Group 2 by Group 4 25.45
# 10     5 Group 2 by Group 4 21.83
# 21     5 Group 3 by Group 5 22.00
# 22     5 Group 3 by Group 5 22.00
# 33     5 Group 3 by Group 4 25.00
# 34     5 Group 3 by Group 4 20.00
# 45     5 Group 2 by Group 3 20.45
# 46     5 Group 2 by Group 3 19.83
# 57     5 Group 3 by Group 5 16.00
# 58     5 Group 3 by Group 5 14.00
# 11     6 Group 2 by Group 4 14.89
# 12     6 Group 2 by Group 4 25.77
# 23     6 Group 3 by Group 5 22.00
# 24     6 Group 3 by Group 5 18.00
# 35     6 Group 3 by Group 4 20.00
# 36     6 Group 3 by Group 4 20.00
# 47     6 Group 2 by Group 3 14.89
# 48     6 Group 2 by Group 3 15.77
# 59     6 Group 3 by Group 5 17.00
# 60     6 Group 3 by Group 5 12.00

Benchmark

# Unit: milliseconds
#  expr      min       lq     mean   median       uq       max neval cld
# purrr 7.067411 7.173874 7.887650 7.253130 8.325982 17.341812   100   b
# outer 3.233047 3.296393 3.499889 3.366309 3.466582  5.976934   100  a 

More than twice as fast.


Data

data <- structure(list(group = c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 
5L, 6L, 6L), col1 = c(2.13, 8.91, 8.72, 1.26, 10.7, 1.23, 2.05, 
2.77, 9.35, 8.82, 8.41, 1.1), col2 = c(4, 8, 9, 1, 11, 6, 6, 
11, 1, 7, 9, 1), col3 = c(10, 2, 9, 7, 10, 7, 4, 10, 4, 10, 11, 
1), col4 = c(5, 6, 8, 7, 2, 6, 10, 4, 10, 3, 4, 10), col5 = c(4, 
8, 9, 4, 1, 9, 7, 7, 7, 8, 2, 9)), row.names = c(NA, -12L), class = "data.frame")
  • Related