I'm trying to automatically calculate the mean score per row for several groups of columns, where each group of columns represents the items of one scale. The columns are systematically named (scale_itemnumber).
For example, the dummy data frame below contains items from three different scales. (Not every item of a scale is necessarily present, indicated here by the missing VAR_3.)
library(tidyverse)
set.seed(123)
df <- tibble(G_1   = sample(1:5, size = 10000, replace = TRUE),
             G_2   = sample(1:5, size = 10000, replace = TRUE),
             G_3   = sample(1:5, size = 10000, replace = TRUE),
             MOT_1 = sample(1:5, size = 10000, replace = TRUE),
             MOT_2 = sample(1:5, size = 10000, replace = TRUE),
             MOT_3 = sample(1:5, size = 10000, replace = TRUE),
             VAR_1 = sample(1:5, size = 10000, replace = TRUE),
             VAR_2 = sample(1:5, size = 10000, replace = TRUE),
             VAR_4 = sample(1:5, size = 10000, replace = TRUE))
What I'm trying to do is to create an extra column for each construct (with dynamic names such as mean_G, mean_MOT, mean_VAR) that represents the row mean for their respective set of columns.
# A tibble: 6 x 12
G_1 G_2 G_3 MOT_1 MOT_2 MOT_3 VAR_1 VAR_2 VAR_4 mean_G mean_MOT mean_VAR
<int> <int> <int> <int> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl>
1 3 3 1 1 1 1 1 5 4 2.33 1 3.33
2 3 5 3 3 2 1 4 3 5 3.67 2 4
3 2 5 4 5 3 2 4 1 1 3.67 3.33 2
4 2 5 4 4 4 1 2 5 4 3.67 3 3.67
5 3 4 2 1 4 5 2 2 3 3 3.33 2.33
6 5 3 4 4 3 4 1 1 4 4 3.67 2
I actually have a working approach using rowwise() and c_across() in combination with purrr, but it runs far more slowly than doing it manually (a mutate() + rowMeans() combo). The real data frame has many more scales with many more items, so I would rather not hard-code every mean column and list each item by hand (especially as the exact selection of items may also vary per data frame).
# functional but slow approach
# get the list of unique variable prefixes
var_names <- str_extract(names(df), "^.*(?=(_))") %>%
  unique()

# use map_dfc() and c_across() to calculate the means rowwise per variable group
df_functional <-
  df %>%
  bind_cols(
    map_dfc(.x = var_names,
            .f = ~ .y %>%
              rowwise() %>%
              transmute(!!str_c("mean_", .x) := mean(c_across(starts_with(.x)))),
            .y = .))
# manual approach
df_manual <- df %>% mutate(mean_G   = rowMeans(select(., G_1, G_2, G_3)),
                           mean_MOT = rowMeans(select(., MOT_1, MOT_2, MOT_3)),
                           mean_VAR = rowMeans(select(., VAR_1, VAR_2, VAR_4)))
The results are identical, but the dynamic/functional approach is dramatically slower, and I'm not sure how it scales for data frames with many more columns/groups. How can I speed this up while still keeping the flexibility of the dynamic approach?
> identical(df_manual, df_functional)
[1] TRUE
#Benchmark (using the microbenchmark package)
benchmark
Unit: milliseconds
expr min lq mean median uq max neval
functional 37198.3569 38592.6855 48313.00156 52936.3254 55349.0561 59831.0141 100
manual 16.0662 18.0139 27.53403 19.9085 22.9384 138.5401 100
CodePudding user response:
This should be way faster:
library(dplyr, warn.conflicts = FALSE)
library(purrr)
df <- tibble(G_1   = sample(1:5, size = 10000, replace = TRUE),
             G_2   = sample(1:5, size = 10000, replace = TRUE),
             G_3   = sample(1:5, size = 10000, replace = TRUE),
             MOT_1 = sample(1:5, size = 10000, replace = TRUE),
             MOT_2 = sample(1:5, size = 10000, replace = TRUE),
             MOT_3 = sample(1:5, size = 10000, replace = TRUE),
             VAR_1 = sample(1:5, size = 10000, replace = TRUE),
             VAR_2 = sample(1:5, size = 10000, replace = TRUE),
             VAR_4 = sample(1:5, size = 10000, replace = TRUE))
f <- function(df) {
  row_means <- split.default(df, stringr::str_remove(names(df), '_[0-9]')) %>%
    map(rowMeans) %>%
    setNames(paste0("mean_", names(.)))
  df %>%
    mutate(
      !!!row_means
    )
}

manual <- function(df) {
  df %>% mutate(
    mean_G   = rowMeans(select(., G_1, G_2, G_3)),
    mean_MOT = rowMeans(select(., MOT_1, MOT_2, MOT_3)),
    mean_VAR = rowMeans(select(., VAR_1, VAR_2, VAR_4))
  )
}
microbenchmark::microbenchmark(prog = f(df), man = manual(df))
#> Unit: milliseconds
#> expr min lq mean median uq max neval cld
#> prog 2.6982 2.91245 3.30497 3.09260 3.30435 7.5209 100 a
#> man 9.1948 9.85690 10.79482 10.13105 10.81000 19.4007 100 b
Created on 2022-07-01 by the reprex package (v2.0.1)
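Why this is so much faster: split.default() partitions the data frame column-wise by prefix, rowMeans() then runs once per group on a whole block of columns instead of once per row, and !!! splices the resulting named list into mutate(). Below is a minimal sketch of the intermediate objects on a tiny toy tibble (the toy data and object names are only for illustration):
# toy example, assuming the same prefix_itemnumber naming scheme
library(dplyr, warn.conflicts = FALSE)
library(purrr)

toy <- tibble(G_1 = 1:3, G_2 = 4:6, MOT_1 = 7:9)

# split.default() splits the tibble column-wise by the prefix that is left
# after stripping the "_<digit>" suffix: a list with elements G and MOT
groups <- split.default(toy, stringr::str_remove(names(toy), "_[0-9]"))

# one vectorised rowMeans() call per group ...
means <- map(groups, rowMeans) %>% setNames(paste0("mean_", names(.)))

# ... and !!! splices the named list into mutate() as new columns
toy %>% mutate(!!!means)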
CodePudding user response:
Here is an approach using base Reduce(). It is not quite as fast as the manual approach, but nearly:
functional <- function(df) {
  # uses the `var_names` prefix vector defined in the question
  df %>%
    Reduce(function(x, y) {
      mutate(x, "mean_{y}" := rowMeans(across(starts_with(y)), na.rm = TRUE))
    }, var_names, init = .)
}

manual <- function(df) {
  df %>% mutate(
    mean_G   = rowMeans(select(., G_1, G_2, G_3)),
    mean_MOT = rowMeans(select(., MOT_1, MOT_2, MOT_3)),
    mean_VAR = rowMeans(select(., VAR_1, VAR_2, VAR_4))
  )
}
microbenchmark::microbenchmark(functional(df), manual(df))
#> Unit: milliseconds
#> expr min lq mean median uq max neval cld
#> functional(df) 7.582979 7.891255 8.702247 7.994792 8.440233 20.11192 100 a
#> manual(df) 7.362384 7.816135 8.312074 7.988434 8.433740 11.55050 100 a
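For readers less familiar with Reduce(): it folds the vector of prefixes into a chain of mutate() calls, starting from the data frame supplied as init. For the three example prefixes the call unrolls to roughly the following (written out by hand purely for illustration):
# roughly equivalent hand-written chain for the prefixes "G", "MOT" and "VAR"
df %>%
  mutate(mean_G   = rowMeans(across(starts_with("G")),   na.rm = TRUE)) %>%
  mutate(mean_MOT = rowMeans(across(starts_with("MOT")), na.rm = TRUE)) %>%
  mutate(mean_VAR = rowMeans(across(starts_with("VAR")), na.rm = TRUE))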
CodePudding user response:
Here are two more approaches, using purrr::map_dfc() and dplyover::over(). Both are practically identical in speed: a bit faster than the Reduce() approach, but much slower than the split_mutate() approach from @Baraliuh's answer.
library(dplyr)
library(purrr)
library(dplyover) # https://github.com/TimTeaFan/dplyover
library(stringr)
# purrr's `map_dfc()` inside mutate
f_map_dfc <- function(df) {
  var_names <- str_extract(names(df), "^.*(?=(_))") %>%
    unique()
  df %>%
    mutate(map_dfc(set_names(var_names, paste0("mean_", var_names)),
                   ~ rowMeans(across(starts_with(.x)), na.rm = TRUE)))
}

# dplyover's `over()` (disclaimer: I'm the maintainer)
f_over <- function(df) {
  df %>%
    mutate(over(cut_names("_[0-9]"),
                ~ rowMeans(across(starts_with(.x)), na.rm = TRUE),
                .names = "mean_{x}"))
}
# Baraliuh's answer
split_mutate <- function(df) {
  row_means <- split.default(df, stringr::str_remove(names(df), '_[0-9]')) %>%
    map(rowMeans)
  df %>%
    mutate(
      !!!row_means
    ) %>%
    rename_with(~ paste0('mean_', .), .cols = !matches('_'))
}
# Stefan's functional approach
functional <- function(df) {
  var_names <- str_extract(names(df), "^.*(?=(_))") %>%
    unique()
  df %>%
    Reduce(function(x, y) {
      mutate(x, "mean_{y}" := rowMeans(across(starts_with(y)), na.rm = TRUE))
    }, var_names, init = .)
}

# Stefan's manual dplyr approach
manual <- function(df) {
  df %>% mutate(
    mean_G   = rowMeans(select(., G_1, G_2, G_3)),
    mean_MOT = rowMeans(select(., MOT_1, MOT_2, MOT_3)),
    mean_VAR = rowMeans(select(., VAR_1, VAR_2, VAR_4))
  )
}
# benchmark using the {bench} package:
bench::mark(map_dfc      = f_map_dfc(df),
            over         = f_over(df),
            reduce       = functional(df),
            dplyr_manual = manual(df),
            split_mutate = split_mutate(df))
#> # A tibble: 5 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 map_dfc 12.68ms 13.54ms 72.5 3.14MB 9.07
#> 2 over 12.28ms 13.06ms 75.2 818.39KB 6.64
#> 3 reduce 16.27ms 17.27ms 57.4 719.1KB 12.5
#> 4 dplyr_manual 32.99ms 34.91ms 28.2 951.76KB 7.69
#> 5 split_mutate 6.04ms 6.42ms 151. 758.86KB 8.73
Created on 2022-07-01 by the reprex package (v0.3.0)
CodePudding user response:
Here is a fully automated version:
library(data.table)
library(tidyverse)
df %>%
  pivot_longer(everything()) %>%
  # build the target column name from the prefix, e.g. "G_" -> "G_mean"
  mutate(x = paste0(str_extract(name, '\\w+_'), "mean")) %>%
  # rleid() from data.table gives each consecutive run of the same prefix its
  # own id, i.e. one id per original row and scale
  mutate(newcol = rleid(x)) %>%
  group_by(newcol, x) %>%
  mutate(mean = mean(value, na.rm = TRUE)) %>%
  slice(1) %>%
  ungroup() %>%
  select(x, mean) %>%
  group_by(x) %>%
  mutate(row = row_number()) %>%
  pivot_wider(names_from = x, values_from = mean) %>%
  bind_cols(df)
row G_mean MOT_mean VAR_mean G_1 G_2 G_3 MOT_1 MOT_2 MOT_3 VAR_1 VAR_2 VAR_4
<int> <dbl> <dbl> <dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 1 1.33 3.33 2 2 1 1 1 4 5 2 2 2
2 2 2 2 3 1 4 1 4 1 1 3 5 1
3 3 4 4.33 3 4 5 3 4 5 4 2 2 5
4 4 2.67 3 3.67 4 3 1 5 3 1 4 3 4
5 5 3 3.33 2.33 3 4 2 5 2 3 5 1 1
6 6 2.67 1.33 3.33 1 2 5 1 2 1 5 3 2
7 7 2.33 3.33 2.33 2 4 1 3 2 5 2 1 4
8 8 3.67 3 3.67 5 5 1 3 4 2 2 5 4
9 9 2.33 3.33 2.33 3 2 2 4 1 5 1 5 1
10 10 4 2 2.67 5 3 4 1 1 4 5 1 2
# ... with 9,990 more rows
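As a quick sanity check (assuming the pipeline above is assigned to an object, here hypothetically called result), each reshaped mean column should match a direct rowMeans() over the corresponding items of the original data frame:
# `result` is assumed to hold the output of the pivot pipeline above
all.equal(result$G_mean,   rowMeans(df[c("G_1", "G_2", "G_3")]))
all.equal(result$MOT_mean, rowMeans(df[c("MOT_1", "MOT_2", "MOT_3")]))
all.equal(result$VAR_mean, rowMeans(df[c("VAR_1", "VAR_2", "VAR_4")]))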