Alternative (faster) approach to dynamically create row means for multiple groups of columns-CodePudding

I'm trying to automatically calculate the mean score per row for multiple groups of columns. E.g. a set of columns could represent items of different scales. The columns are also systematically named (scale_itemnumber).

For example, the dummy data frame below has items from three different scales. (It can happen that not all items of each scale are included, indicated here as the missing VAR_3).

#library(tidyverse)
set.seed(123)
df <- tibble(  G_1 =  sample(1:5, size = 10000, replace = TRUE),
               G_2 =  sample(1:5, size = 10000, replace = TRUE),
               G_3 =  sample(1:5, size = 10000, replace = TRUE),
             MOT_1 =  sample(1:5, size = 10000, replace = TRUE),
             MOT_2 =  sample(1:5, size = 10000, replace = TRUE),
             MOT_3 =  sample(1:5, size = 10000, replace = TRUE),
             VAR_1 =  sample(1:5, size = 10000, replace = TRUE),
             VAR_2 =  sample(1:5, size = 10000, replace = TRUE),
             VAR_4 =  sample(1:5, size = 10000, replace = TRUE))

What I'm trying to do is to create an extra column for each construct (with dynamic names such as mean_G, mean_MOT, mean_VAR) that represents the row mean for their respective set of columns.

# A tibble: 6 x 12
    G_1   G_2   G_3 MOT_1 MOT_2 MOT_3 VAR_1 VAR_2 VAR_4 mean_G mean_MOT mean_VAR
  <int> <int> <int> <int> <int> <int> <int> <int> <int>  <dbl>    <dbl>    <dbl>
1     3     3     1     1     1     1     1     5     4   2.33     1        3.33
2     3     5     3     3     2     1     4     3     5   3.67     2        4   
3     2     5     4     5     3     2     4     1     1   3.67     3.33     2   
4     2     5     4     4     4     1     2     5     4   3.67     3        3.67
5     3     4     2     1     4     5     2     2     3   3        3.33     2.33
6     5     3     4     4     3     4     1     1     4   4        3.67     2

I actually have a working approach using rowwise() and c_across() in combination with purrr but its execution is just so slow compared to doing it manually (mutate rowMeans combo). However, the true df has way more scales with many more items, so I would rather not have to hard code every mean column and insert each item (especially as the exact selection included might also vary per data frame).

#functional but slow approach

#get list of variable prefixes
var_names <- str_extract(names(df), "^.*(?=(_))") %>% 
  unique()

#use map and c_across to calculate the means rowwise per variable group

df_functional <-
      df %>% 
      bind_cols(
        map_dfc(.x = var_names, 
                .f = ~ .y %>% 
                  rowwise() %>% 
                  transmute(!!str_c("mean_", .x) := mean(c_across(starts_with(.x)))),
                .y = .))



#manual approach
df_manual <- df %>% mutate(mean_G   = rowMeans(select(., G_1,   G_2,   G_3)),
                             mean_MOT = rowMeans(select(., MOT_1,   MOT_2,   MOT_3)),
                             mean_VAR = rowMeans(select(., VAR_1,   VAR_2,   VAR_4)))

The result is identical but the dynamic/functional approach is significantly slower! Not sure what this would look like for dfs with many more columns/groups. How could I speed this up while still keeping the flexibility of the dynamic approach?

> identical(df_manual, df_functional)
[1] TRUE

#Benchmark (using the microbenchmark package)
benchmark
Unit: milliseconds
       expr        min         lq        mean     median         uq        max neval
 functional 37198.3569 38592.6855 48313.00156 52936.3254 55349.0561 59831.0141   100
     manual    16.0662    18.0139    27.53403    19.9085    22.9384   138.5401   100

CodePudding user response：

This should be way faster:

library(dplyr, warn.conflicts = FALSE)
library(purrr)
df <- tibble(  G_1 =  sample(1:5, size = 10000, replace = TRUE),
               G_2 =  sample(1:5, size = 10000, replace = TRUE),
               G_3 =  sample(1:5, size = 10000, replace = TRUE),
               MOT_1 =  sample(1:5, size = 10000, replace = TRUE),
               MOT_2 =  sample(1:5, size = 10000, replace = TRUE),
               MOT_3 =  sample(1:5, size = 10000, replace = TRUE),
               VAR_1 =  sample(1:5, size = 10000, replace = TRUE),
               VAR_2 =  sample(1:5, size = 10000, replace = TRUE),
               VAR_4 =  sample(1:5, size = 10000, replace = TRUE))
f <- function(df){
    row_means <- split.default(df, stringr::str_remove(names(df), '_[0-9]')) %>% 
        map(rowMeans) %>% 
        setNames(paste0("mean_", names(.)))
    df %>% 
        mutate(
            !!!row_means
        )
}
    manual <- function(df) {
        df %>% mutate(
            mean_G = rowMeans(select(., G_1, G_2, G_3)),
            mean_MOT = rowMeans(select(., MOT_1, MOT_2, MOT_3)),
            mean_VAR = rowMeans(select(., VAR_1, VAR_2, VAR_4))
        )
    }
    microbenchmark::microbenchmark(prog = f(df), man = manual(df))
#> Unit: milliseconds
#>  expr    min      lq     mean   median       uq     max neval cld
#>  prog 2.6982 2.91245  3.30497  3.09260  3.30435  7.5209   100  a 
#>   man 9.1948 9.85690 10.79482 10.13105 10.81000 19.4007   100   b

^{Created on 2022-07-01 by the reprex package (v2.0.1)}

CodePudding user response：

Here is a approach using base Reduce. Not as fast as the manual approach but nearly:

functional <- function(df) {
  df %>%
    Reduce(function(x, y) {
      mutate(x, "mean_{y}" := rowMeans(across(starts_with(y)), na.rm = TRUE))
    }, var_names, init = .)
}

manual <- function(df) {
  df %>% mutate(
    mean_G = rowMeans(select(., G_1, G_2, G_3)),
    mean_MOT = rowMeans(select(., MOT_1, MOT_2, MOT_3)),
    mean_VAR = rowMeans(select(., VAR_1, VAR_2, VAR_4))
  )
}


microbenchmark::microbenchmark(functional(df), manual(df))
#> Unit: milliseconds
#>            expr      min       lq     mean   median       uq      max neval cld
#>  functional(df) 7.582979 7.891255 8.702247 7.994792 8.440233 20.11192   100   a
#>      manual(df) 7.362384 7.816135 8.312074 7.988434 8.433740 11.55050   100   a

CodePudding user response：

Here are two more approaches using purrr::map_dfc and dplyover::over, both identical in terms of speed, a bit faster than the reduce approach, but much slower than the split_mutate approach from @Baraliuhs answer.

library(dplyr)
library(purrr)
library(dplyover) # https://github.com/TimTeaFan/dplyover
library(stringr)

# purrr's `map_dfc()` inside mutate
f_map_dfc <- function(df) {

    var_names <- str_extract(names(df), "^.*(?=(_))") %>% 
    unique()

    df %>% 
      mutate(map_dfc(set_names(var_names, paste0("mean_", var_names)),
                     ~ rowMeans(across(starts_with(.x)), na.rm = TRUE)
                     )
             )
}

# dplyover's `over()` (disclaimer: I'm the maintainer)
f_over <- function(df) {
  
  df %>% 
    mutate(over(cut_names("_[0-9]"),
                ~ rowMeans(across(starts_with(.x)), na.rm = TRUE),
                .names = "mean_{x}"
    )
    )
}

# Baraliuhs answer
split_mutate <- function(df){
  row_means <- split.default(df, stringr::str_remove(names(df), '_[0-9]')) %>% 
    map(rowMeans)
  df %>% 
    mutate(
      !!!row_means
    ) %>% 
    rename_with(~paste0('mean_', .), .cols = !matches('_'))
}

# Stefans functional approach
functional <- function(df) {
  
  var_names <- str_extract(names(df), "^.*(?=(_))") %>% 
    unique()
  
  df %>%
    Reduce(function(x, y) {
      mutate(x, "mean_{y}" := rowMeans(across(starts_with(y)), na.rm = TRUE))
    }, var_names, init = .)
}

# Stefans manual dplyr approach
manual <- function(df) {
  df %>% mutate(
    mean_G = rowMeans(select(., G_1, G_2, G_3)),
    mean_MOT = rowMeans(select(., MOT_1, MOT_2, MOT_3)),
    mean_VAR = rowMeans(select(., VAR_1, VAR_2, VAR_4))
  )
}

# benchmark using the {bench} package:
bench::mark(map_dfc = f_map_dfc(df),
                               over = f_over(df),
                               reduce = functional(df),
                               dplyr_manual = manual(df),
                               split_mutate = split_mutate(df))

#> # A tibble: 5 × 6
#>   expression        min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 map_dfc       12.68ms  13.54ms      72.5    3.14MB     9.07
#> 2 over          12.28ms  13.06ms      75.2  818.39KB     6.64
#> 3 reduce        16.27ms  17.27ms      57.4   719.1KB    12.5 
#> 4 dplyr_manual  32.99ms  34.91ms      28.2  951.76KB     7.69
#> 5 split_mutate   6.04ms   6.42ms     151.   758.86KB     8.73

^{Created on 2022-07-01 by the reprex package (v0.3.0)}

CodePudding user response：

Here is a fully automated version:

library(data.table)
library(tidyverse)

df %>% 
  pivot_longer(everything()) %>% 
  mutate(x = paste0(str_extract(name, '\\w \\_'), "mean")) %>% 
  mutate(newcol = rleid(x)) %>% 
  group_by(newcol, x) %>% 
  mutate(mean = mean(value, na.rm=TRUE)) %>% 
  slice(1) %>% 
  ungroup() %>% 
  select(x, mean) %>% 
  group_by(x) %>% 
  mutate(row = row_number()) %>% 
  pivot_wider(names_from = x, values_from = mean) %>% 
  bind_cols(df)

   row G_mean MOT_mean VAR_mean   G_1   G_2   G_3 MOT_1 MOT_2 MOT_3 VAR_1 VAR_2 VAR_4
   <int>  <dbl>    <dbl>    <dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int>
 1     1   1.33     3.33     2        2     1     1     1     4     5     2     2     2
 2     2   2        2        3        1     4     1     4     1     1     3     5     1
 3     3   4        4.33     3        4     5     3     4     5     4     2     2     5
 4     4   2.67     3        3.67     4     3     1     5     3     1     4     3     4
 5     5   3        3.33     2.33     3     4     2     5     2     3     5     1     1
 6     6   2.67     1.33     3.33     1     2     5     1     2     1     5     3     2
 7     7   2.33     3.33     2.33     2     4     1     3     2     5     2     1     4
 8     8   3.67     3        3.67     5     5     1     3     4     2     2     5     4
 9     9   2.33     3.33     2.33     3     2     2     4     1     5     1     5     1
10    10   4        2        2.67     5     3     4     1     1     4     5     1     2
# ... with 9,990 more rows