Running the same dplyr commands over a bunch of group

I want to run the same commands (basically some general filter(), mutate() summarise() etc.) over a bunch of variables with the group_by() command. Is there a way to bundle this process instead of having to use a block of code for each individual variable? I've tried using a for loop, which unfortunately didn't work for me. My goal is to only have to write one block of code for all group_by variables.

library(dplyr)
         
df <- mtcars         
         
Test1 <- df %>%
  group_by(mpg) %>%
  filter(hp > 100) %>%
  summarise(N = n())

Test2 <- df %>%
  group_by(cyl) %>%
  filter(hp > 100) %>%
  summarise(N = n())

Test3 <- df %>%
  group_by(disp) %>%
  filter(hp > 100) %>%
  summarise(N = n())

Thank you in advance!

CodePudding user response：

You can do it using lapply, which will result in a list of data frames. Then you can unlist it to individual data frames.

After my code, you should find three data frames in your global environment with the name of "Test1", "Test2" and "Test3".

library(dplyr)

setNames(
  lapply(c("mpg", "cyl", "disp"), function(x)
    mtcars %>% group_by(mtcars[x]) %>% filter(hp > 100) %>% summarize(N = n())),
  c("Test1", "Test2", "Test3")
) %>% list2env(envir = .GlobalEnv)

Output

Test1
# A tibble: 18 x 2
     mpg     N
   <dbl> <int>
 1  10.4     2
 2  13.3     1
 3  14.3     1
 4  14.7     1
 5  15       1
 6  15.2     2
 7  15.5     1
 8  15.8     1
 9  16.4     1
10  17.3     1
11  17.8     1
12  18.1     1
13  18.7     1
14  19.2     2
15  19.7     1
16  21       2
17  21.4     2
18  30.4     1

Test2
# A tibble: 3 x 2
    cyl     N
  <dbl> <int>
1     4     2
2     6     7
3     8    14

Test3
# A tibble: 18 x 2
    disp     N
   <dbl> <int>
 1  95.1     1
 2 121       1
 3 145       1
 4 160       2
 5 168.      2
 6 225       1
 7 258       1
 8 276.      3
 9 301       1
10 304       1
11 318       1
12 350       1
13 351       1
14 360       2
15 400       1
16 440       1
17 460       1
18 472       1

CodePudding user response：

One option would be to store the variable names in a vector or list, then loop over this list using e.g. lapply where I make use of the .data pro-noun:

library(dplyr)

df <- mtcars         

var_list <- c("mpg", "cyl", "disp")

lapply(var_list, function(x) {
  df %>%
    group_by(.data[[x]]) %>%
    filter(hp > 100) %>%
    summarise(N = n())  
})
#> [[1]]
#> # A tibble: 18 × 2
#>      mpg     N
#>    <dbl> <int>
#>  1  10.4     2
#>  2  13.3     1
#>  3  14.3     1
#>  4  14.7     1
#>  5  15       1
#>  6  15.2     2
#>  7  15.5     1
#>  8  15.8     1
#>  9  16.4     1
#> 10  17.3     1
#> 11  17.8     1
#> 12  18.1     1
#> 13  18.7     1
#> 14  19.2     2
#> 15  19.7     1
#> 16  21       2
#> 17  21.4     2
#> 18  30.4     1
#> 
#> [[2]]
#> # A tibble: 3 × 2
#>     cyl     N
#>   <dbl> <int>
#> 1     4     2
#> 2     6     7
#> 3     8    14
#> 
#> [[3]]
#> # A tibble: 18 × 2
#>     disp     N
#>    <dbl> <int>
#>  1  95.1     1
#>  2 121       1
#>  3 145       1
#>  4 160       2
#>  5 168.      2
#>  6 225       1
#>  7 258       1
#>  8 276.      3
#>  9 301       1
#> 10 304       1
#> 11 318       1
#> 12 350       1
#> 13 351       1
#> 14 360       2
#> 15 400       1
#> 16 440       1
#> 17 460       1
#> 18 472       1

EDIT To store the results in a dataframe I would first name the vector of variables, inside the function use the same name for the grouping column. i.e. I simply used "value" then use bind_rows to bind the results together where as an identifier I added the variable name:

var_list <- c("mpg", "cyl", "disp")
names(var_list) <- var_list
lapply(var_list, function(x) {
  df %>%
    group_by(value = .data[[x]]) %>%
    filter(hp > 100) %>%
    summarise(N = n())  
}) %>% 
  bind_rows(.id = "var")
#> # A tibble: 39 x 3
#>    var   value     N
#>    <chr> <dbl> <int>
#>  1 mpg    10.4     2
#>  2 mpg    13.3     1
#>  3 mpg    14.3     1
#>  4 mpg    14.7     1
#>  5 mpg    15       1
#>  6 mpg    15.2     2
#>  7 mpg    15.5     1
#>  8 mpg    15.8     1
#>  9 mpg    16.4     1
#> 10 mpg    17.3     1
#> # ... with 29 more rows

CodePudding user response：

Since you already use a tidyverse approach you could just pivot the table:

library(tidyverse)

df <- mtcars

df %>%
  filter(hp > 100) %>%
  select(mpg, cyl, hp) %>%
  pivot_longer(cols = c(mpg, cyl, hp),
               names_to = "var",
               values_to = "value") %>%
  group_by(var) %>%
  summarise(N = n())
#> # A tibble: 3 × 2
#>   var       N
#>   <chr> <int>
#> 1 cyl      23
#> 2 hp       23
#> 3 mpg      23

Would that solve your problem? This way you can put any functions after the pivoted dataset and apply them to all variables as you wish. You can pivot it back if you prefer to have them as variables again. The main advantage of this approach is that you never split your dataset. However, there are downsides to this as well.

Another option could be using purrr.