Home > database >  Is there a way to use `across` together with `group_by` and `mutate`?
Is there a way to use `across` together with `group_by` and `mutate`?

Time:04-30

I have a dataset with quartile columns

library(dplyr)
library(glue)

set.seed(234)
df = 
tibble(x1 = sample(1:100, size = 100), # simulate data
       y1 = sample(1:100, size = 100),
       z1 = sample(1:100, size = 100)) %>% 
  mutate(across(ends_with("1"), ~factor(ntile(.x, 4)), # assign to quartiles
                                        .names = "{.col}_q"))
  
head(df)         
#> # A tibble: 6 × 6
#>      x1    y1    z1 x1_q  y1_q  z1_q 
#>   <int> <int> <int> <fct> <fct> <fct>
#> 1    97    94     9 4     4     1    
#> 2    31    66    66 2     3     3    
#> 3    34    43    62 2     2     3    
#> 4    46    57    41 2     3     2    
#> 5    98     3    38 4     1     2    
#> 6    18    37    40 1     2     2

For each of these quartile columns, I would like to append the min and the max of the quartile like so:

df %>% 
  group_by(x1_q) %>% 
  mutate(x1_q = glue("Q{x1_q} ({min(x1)} - {max(x1)})")) %>% 
  head()
#> # A tibble: 6 × 6
#> # Groups:   x1_q [3]
#>      x1    y1    z1 x1_q          y1_q  z1_q 
#>   <int> <int> <int> <glue>        <fct> <fct>
#> 1    97    94     9 Q4 (76 - 100) 4     1    
#> 2    31    66    66 Q2 (26 - 50)  3     3    
#> 3    34    43    62 Q2 (26 - 50)  2     3    
#> 4    46    57    41 Q2 (26 - 50)  3     2    
#> 5    98     3    38 Q4 (76 - 100) 1     2    
#> 6    18    37    40 Q1 (1 - 25)   2     2

Created on 2022-04-29 by the reprex package (v2.0.1)

Is there a scalable was I can apply this group_by mutate pattern to all of my quartile columns?

CodePudding user response:

Here is one method - loop across the _q columns, extract the corresponding columns without the _q by removing the substring suffix from the column name, and get the value ('tmp'), then use one of the formatting functions to append the column values along with the min and max

library(dplyr)
library(stringr)
df %>%
    mutate(across(ends_with("_q"),  ~ 
       {
        tmp <- get(str_remove(cur_column(), "_q"))
        str_c("Q", .x, " (", ave(tmp, .x, FUN = min), " - ", 
                 ave(tmp, .x, FUN = max), ")")
        }
  ))

-output

# A tibble: 100 × 6
      x1    y1    z1 x1_q          y1_q          z1_q         
   <int> <int> <int> <chr>         <chr>         <chr>        
 1    97    94     9 Q4 (76 - 100) Q4 (76 - 100) Q1 (1 - 25)  
 2    31    66    66 Q2 (26 - 50)  Q3 (51 - 75)  Q3 (51 - 75) 
 3    34    43    62 Q2 (26 - 50)  Q2 (26 - 50)  Q3 (51 - 75) 
 4    46    57    41 Q2 (26 - 50)  Q3 (51 - 75)  Q2 (26 - 50) 
 5    98     3    38 Q4 (76 - 100) Q1 (1 - 25)   Q2 (26 - 50) 
 6    18    37    40 Q1 (1 - 25)   Q2 (26 - 50)  Q2 (26 - 50) 
 7    56     4    98 Q3 (51 - 75)  Q1 (1 - 25)   Q4 (76 - 100)
 8     1    17    27 Q1 (1 - 25)   Q1 (1 - 25)   Q2 (26 - 50) 
 9    68    99    73 Q3 (51 - 75)  Q4 (76 - 100) Q3 (51 - 75) 
10    92    65    16 Q4 (76 - 100) Q3 (51 - 75)  Q1 (1 - 25)  
# … with 90 more rows

Or loop over the names in map2

library(purrr)
map2_dfc(names(df)[4:6], names(df)[1:3], 
    ~ df %>%
          group_by(across(all_of(.x))) %>%
          transmute(!! .x := glue::glue("Q{.data[[.x]]} ({min(.data[[.y]])} - {max(.data[[.y]])})")) %>% ungroup ) %>% bind_cols(df[1:3], .)
# A tibble: 100 × 6
      x1    y1    z1 x1_q          y1_q          z1_q         
   <int> <int> <int> <glue>        <glue>        <glue>       
 1    97    94     9 Q4 (76 - 100) Q4 (76 - 100) Q1 (1 - 25)  
 2    31    66    66 Q2 (26 - 50)  Q3 (51 - 75)  Q3 (51 - 75) 
 3    34    43    62 Q2 (26 - 50)  Q2 (26 - 50)  Q3 (51 - 75) 
 4    46    57    41 Q2 (26 - 50)  Q3 (51 - 75)  Q2 (26 - 50) 
 5    98     3    38 Q4 (76 - 100) Q1 (1 - 25)   Q2 (26 - 50) 
 6    18    37    40 Q1 (1 - 25)   Q2 (26 - 50)  Q2 (26 - 50) 
 7    56     4    98 Q3 (51 - 75)  Q1 (1 - 25)   Q4 (76 - 100)
 8     1    17    27 Q1 (1 - 25)   Q1 (1 - 25)   Q2 (26 - 50) 
 9    68    99    73 Q3 (51 - 75)  Q4 (76 - 100) Q3 (51 - 75) 
10    92    65    16 Q4 (76 - 100) Q3 (51 - 75)  Q1 (1 - 25)  
# … with 90 more rows

CodePudding user response:

You could write your own myntile function and just use across as usual. Below myntile is based on ntile, cut and some string manipulation.

library(dplyr)
library(stringr)

set.seed(234)

myntile <- function(x, n) {
  
  q <- ntile(x, n)
  rng <- as.character(cut(x, n))
  rng2 <- str_replace_all(rng, "[0-9\\.] ", function(x) round(as.numeric(x), 0)) %>%
    str_replace(",", " - ") %>%
    str_replace("]", ")")

  paste0("Q", q, " ", rng2)
}

df = 
  tibble(x1 = sample(1:100, size = 100), # simulate data
         y1 = sample(1:100, size = 100),
         z1 = sample(1:100, size = 100)) %>% 
  mutate(across(ends_with("1"), ~ myntile(.x, 4), # assign to quartiles
                .names = "{.col}_q"))

head(df)

#> # A tibble: 6 x 6
#>      x1    y1    z1 x1_q          y1_q          z1_q        
#>   <int> <int> <int> <chr>         <chr>         <chr>       
#> 1    97    94     9 Q4 (75 - 100) Q4 (75 - 100) Q1 (1 - 26) 
#> 2    31    66    66 Q2 (26 - 50)  Q3 (50 - 75)  Q3 (50 - 75)
#> 3    34    43    62 Q2 (26 - 50)  Q2 (26 - 50)  Q3 (50 - 75)
#> 4    46    57    41 Q2 (26 - 50)  Q3 (50 - 75)  Q2 (26 - 50)
#> 5    98     3    38 Q4 (75 - 100) Q1 (1 - 26)   Q2 (26 - 50)
#> 6    18    37    40 Q1 (1 - 26)   Q2 (26 - 50)  Q2 (26 - 50)

Created on 2022-04-29 by the reprex package (v2.0.1)

  • Related