my data frame:
data <- structure(list(group = c(1L, 1L, 1L, 1L, 1L, 1L,1L,1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), col1 = c(9,
9.05, 7.15, 7.21, 7.34, 8.12, 7.5, 7.84, 7.8, 7.52, 8.84, 6.98,
6.1, 6.89, 6.5, 7.5, 7.8, 5.5, 6.61, 7.65, 7.68,8.0,9.0), col2 = c(11L,
11L, 10L, 1L, 3L, 7L, 11L, 11L, 11L, 11L, 4L, 1L, 1L, 1L, 2L,
2L, 1L, 4L, 8L, 8L, 1L,3L,4L), col3 = c(7L, 11L, 3L, 7L, 11L, 2L, 11L,
5L, 11L, 11L, 5L, 11L, 11L, 2L, 9L, 9L, 3L, 8L, 11L, 11L, 2L,5L,6L),
col4 = c(11L, 11L, 11L, 11L, 6L, 11L, 11L, 11L, 10L, 7L,
11L, 2L, 11L, 3L, 11L, 11L, 6L, 11L, 1L, 11L, 11L,13L,12L), col5 = c(11L,
1L, 2L, 2L, 11L, 11L, 1L, 10L, 2L, 11L, 1L, 3L, 11L, 11L,
8L, 8L, 11L, 11L, 11L, 2L, 9L,4L,5L)), .Names = c("group", "col1",
"col2", "col3", "col4", "col5"), class = "data.frame", row.names = c(NA,
-21L))
function:
comb <- list(c(2, 4), c(3, 5), c(4, 6))
test.fun <- function(dat) {
do.call(rbind, lapply(comb, function(x) {
SUM <- dat[[x[1]]] dat[[x[2]]]
data.frame(NAME = sprintf('Group %s by Group %s', x[1], x[2]),
SUM)
}))
}
result <- purrr::map_df(split(data, data$group), test.fun, .id = 'Group')
Now this function processes 2 columns of list in the list(c(2, 4), c(3, 5), c(4, 6))
. I want it to process any amount for example:
list(c(2, 4, 6), c(3, 5, 6), c(3, 4, 6), c(2, 3), c(3, 5))
CodePudding user response:
May be this helps
test.fun <- function(dat, comb) {
do.call(rbind, lapply(comb, function(x) {
SUM <- rowSums(dat[x], na.rm = TRUE)
data.frame(NAME = paste0("Group ", toString(x)),
SUM)
}))
}
-testing
comb2 <- list(c(2, 4, 6), c(3, 5, 6), c(3, 4, 6), c(2, 3), c(3, 5))
purrr::map_df(split(data, data$group), test.fun, comb = comb2, .id = 'Group') %>%
as_tibble
-output
# A tibble: 115 × 3
Group NAME SUM
<chr> <chr> <dbl>
1 1 Group 2, 4, 6 27
2 1 Group 2, 4, 6 21.0
3 1 Group 2, 4, 6 12.2
4 1 Group 2, 4, 6 16.2
5 1 Group 2, 4, 6 29.3
6 1 Group 2, 4, 6 21.1
7 1 Group 2, 4, 6 19.5
8 1 Group 2, 4, 6 22.8
9 1 Group 3, 5, 6 33
10 1 Group 3, 5, 6 23
# … with 105 more rows
For pairwise, use combn
bind_rows(lapply(comb2, function(x) {
SUM <- combn(x, 2, FUN = function(y) rowSums(data[y],
na.rm = TRUE))
nm1 <- rep(combn(x, 2, FUN = paste, collapse="_"),
each = nrow(data))
data.frame(NAME= nm1, SUM)}))
CodePudding user response:
A way with outer
that avoids expensive split
ting of the data. The SUM
is achieved using rowSums
on a subset derived in a Vectorize
d FUN
ction. I used different data with more groups,
groupFun <- function(data, comb2) {
FUN <- Vectorize(function(x, y) rowSums(data[data$group == x, y]), SIMPLIFY=F)
o <- outer(unique(data$group), comb2, FUN)
res <- cbind(
expand.grid(
Group=as.character(data$group),
NAME=sapply(comb2, function(x) sprintf('Group %s by Group %s', x[1], x[2])),
stringsAsFactors=FALSE),
SUM=unlist(o))
res <- res[order(res$Group), ] ## optional
return(res)
}
where:
all.equal(purrr::map_df(split(data, data$group), test.fun, .id = 'Group'),
`rownames<-`(groupFun(data, comb), NULL))
# [1] TRUE
Result
comb2 <- list(c(2, 4, 6), c(3, 5, 6), c(3, 4, 6), c(2, 3), c(3, 5))
groupFun(data, comb2)
# Group NAME SUM
# 1 1 Group 2 by Group 4 16.56
# 2 1 Group 2 by Group 4 13.26
# 13 1 Group 3 by Group 5 14.00
# 14 1 Group 3 by Group 5 12.00
# 25 1 Group 3 by Group 4 22.00
# 26 1 Group 3 by Group 4 14.00
# 37 1 Group 2 by Group 3 12.56
# 38 1 Group 2 by Group 3 7.26
# 49 1 Group 3 by Group 5 11.00
# 50 1 Group 3 by Group 5 9.00
# 3 2 Group 2 by Group 4 17.84
# 4 2 Group 2 by Group 4 24.43
# 15 2 Group 3 by Group 5 14.00
# 16 2 Group 3 by Group 5 10.00
# 27 2 Group 3 by Group 4 14.00
# 28 2 Group 3 by Group 4 17.00
# 39 2 Group 2 by Group 3 13.84
# 40 2 Group 2 by Group 3 11.43
# 51 2 Group 3 by Group 5 10.00
# 52 2 Group 3 by Group 5 4.00
# 5 3 Group 2 by Group 4 24.10
# 6 3 Group 2 by Group 4 17.02
# 17 3 Group 3 by Group 5 26.00
# 18 3 Group 3 by Group 5 22.00
# 29 3 Group 3 by Group 4 23.00
# 30 3 Group 3 by Group 4 21.00
# 41 3 Group 2 by Group 3 15.10
# 42 3 Group 2 by Group 3 14.02
# 53 3 Group 3 by Group 5 15.00
# 54 3 Group 3 by Group 5 12.00
# 7 4 Group 2 by Group 4 20.97
# 8 4 Group 2 by Group 4 22.87
# 19 4 Group 3 by Group 5 24.00
# 20 4 Group 3 by Group 5 11.00
# 31 4 Group 3 by Group 4 25.00
# 32 4 Group 3 by Group 4 19.00
# 43 4 Group 2 by Group 3 11.97
# 44 4 Group 2 by Group 3 13.87
# 55 4 Group 3 by Group 5 18.00
# 56 4 Group 3 by Group 5 7.00
# 9 5 Group 2 by Group 4 25.45
# 10 5 Group 2 by Group 4 21.83
# 21 5 Group 3 by Group 5 22.00
# 22 5 Group 3 by Group 5 22.00
# 33 5 Group 3 by Group 4 25.00
# 34 5 Group 3 by Group 4 20.00
# 45 5 Group 2 by Group 3 20.45
# 46 5 Group 2 by Group 3 19.83
# 57 5 Group 3 by Group 5 16.00
# 58 5 Group 3 by Group 5 14.00
# 11 6 Group 2 by Group 4 14.89
# 12 6 Group 2 by Group 4 25.77
# 23 6 Group 3 by Group 5 22.00
# 24 6 Group 3 by Group 5 18.00
# 35 6 Group 3 by Group 4 20.00
# 36 6 Group 3 by Group 4 20.00
# 47 6 Group 2 by Group 3 14.89
# 48 6 Group 2 by Group 3 15.77
# 59 6 Group 3 by Group 5 17.00
# 60 6 Group 3 by Group 5 12.00
Benchmark
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# purrr 7.067411 7.173874 7.887650 7.253130 8.325982 17.341812 100 b
# outer 3.233047 3.296393 3.499889 3.366309 3.466582 5.976934 100 a
More than twice as fast.
Data
data <- structure(list(group = c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L,
5L, 6L, 6L), col1 = c(2.13, 8.91, 8.72, 1.26, 10.7, 1.23, 2.05,
2.77, 9.35, 8.82, 8.41, 1.1), col2 = c(4, 8, 9, 1, 11, 6, 6,
11, 1, 7, 9, 1), col3 = c(10, 2, 9, 7, 10, 7, 4, 10, 4, 10, 11,
1), col4 = c(5, 6, 8, 7, 2, 6, 10, 4, 10, 3, 4, 10), col5 = c(4,
8, 9, 4, 1, 9, 7, 7, 7, 8, 2, 9)), row.names = c(NA, -12L), class = "data.frame")