dplyr rowwise summarise by column, grouped by name-CodePudding

Let's consider this simple dataset

set.seed(12345)
df <- data.frame(a1 = rnorm(5), a2 = rnorm(5), a3 = rnorm(5), 
                 b1 = rnorm(5), b2 = rnorm(5), b3 = rnorm(5),
                 c1 = rnorm(5), c2 = rnorm(5), c3 = rnorm(5))

Which looks like

          a1         a2         a3         b1         b2         b3        c1          c2         c3
1  0.5855288 -1.8179560 -0.1162478  0.8168998  0.7796219  1.8050975 0.8118732  0.49118828  1.1285108
2  0.7094660  0.6300986  1.8173120 -0.8863575  1.4557851 -0.4816474 2.1968335 -0.32408658 -2.3803581
3 -0.1093033 -0.2761841  0.3706279 -0.3315776 -0.6443284  0.6203798 2.0491903 -1.66205024 -1.0602656
4 -0.4534972 -0.2841597  0.5202165  1.1207127 -1.5531374  0.6121235 1.6324456  1.76773385  0.9371405
5  0.6058875 -0.9193220 -0.7505320  0.2987237 -1.5977095 -0.1623110 0.2542712  0.02580105  0.8544517

Now, I would like to get the mean of columns starting with a specific letter, specified in a vector.

So, for instance if I have

cols <- c("a", "c")

I'd like to output a dataframe with two columns (a and c) containing the mean of the a1/a2/a3 and c1/c2/c3 columns respectively.

            a          c
1 -0.449558319  0.8105241
2  1.052292204 -0.1692037
3 -0.004953185 -0.2243752
4 -0.072480153  1.4457733
5 -0.354655514  0.3781747

I've been playing around with starts_with and row_wise but I can't quite get the correct syntax.

CodePudding user response：

select columns that starts_with a or c, then use split.default to split the columns, and apply rowMeans on each of the groups:

library(dplyr)
library(purrr)
select(df, starts_with(cols)) %>% 
  split.default(gsub("\\d", "", names(.))) %>% 
  map_dfc(rowMeans)

         a      c
1 -0.450    0.811
2  1.05    -0.169
3 -0.00495 -0.224
4 -0.0725   1.45 
5 -0.355    0.378

Note that the gsub part might need to be changed depending on the structure of your column names.

CodePudding user response：

Calculate mean for all column groups, then subset:

data.frame(lapply(split.default(df, gsub("\\d", "", colnames(df))), rowMeans))[, cols]

Depending on the actual data it might be more efficient to get the means then subset.

Edit: Subsetting then getting mean is faster for bigger data.

ix <- gsub("\\d", "", colnames(df)) %in% cols
x1 <- data.frame(lapply(split.default(df[, ix], gsub("\\d", "", colnames(df)[ ix ])), rowMeans))

CodePudding user response：

Benchmarking

data:

library(dplyr)
library(purrr)

n = 100000 * 10 * 10
set.seed(12345); df <- data.frame(matrix(runif(n), ncol = 100))
colnames(df) <- make.unique(rep(letters[1:10], each = 10), sep = "")
cols <- letters[c(1,3,7,9)]

benchmark:

microbenchmark::microbenchmark(
  base1 = {
    ix <- gsub("\\d", "", colnames(df)) %in% cols
    data.frame(lapply(split.default(df[, ix], gsub("\\d", "", colnames(df)[ ix ])), rowMeans))
  },
  base2 = {
    data.frame(lapply(split.default(df, gsub("\\d", "", colnames(df))), rowMeans))[, cols]    
  },
  tidy1 = {
    select(df, starts_with(cols)) %>% 
      split.default(gsub("\\d", "", names(.))) %>% 
      map_dfc(rowMeans)
  },
  tidy2 = {
    split.default(df, gsub("\\d", "", names(df))) %>% 
      map_dfc(rowMeans) %>% 
      select({{cols}})
  },
  tidy3 = {
    df %>%
      mutate(map_df(set_names(cols), ~ rowMeans(across(starts_with(.x)))), .keep = "none")
  },
  check = "equivalent"
)
  
# Unit: milliseconds
#  expr     min       lq      mean   median       uq      max neval
# base1 26.8613 29.42070  36.75314 33.28410  40.9296 114.7524   100
# base2 67.9460 80.95455  94.08483 86.47900 100.4189 200.9891   100
# tidy1 33.8004 36.53475  44.94198 44.44600  49.8111  76.5831   100
# tidy2 73.4420 89.79525 101.52422 97.48275 108.5292 294.6857   100
# tidy3 39.8900 44.37110  53.25316 52.23770  56.4041  99.9845   100

CodePudding user response：

You can iterate across the selector values inside a mutate() call:

library(dplyr)
library(purrr)

df %>%
  mutate(map_df(set_names(cols), ~ rowMeans(across(starts_with(.x)))), .keep = "none")

             a          c
1 -0.449558319  0.8105241
2  1.052292204 -0.1692037
3 -0.004953185 -0.2243752
4 -0.072480153  1.4457733
5 -0.354655514  0.3781747

CodePudding user response：

cbind.data.frame(df %>% select(starts_with("a")) %>% 
  rowMeans() %>% 
  cbind.data.frame() %>% 
  rename("a"="."),
  df %>% select(starts_with("b")) %>% 
                       rowMeans() %>% 
                       cbind.data.frame()%>% 
                     rename("b"="."))