I have a dataset with quartile columns
library(dplyr)
library(glue)
set.seed(234)
df =
tibble(x1 = sample(1:100, size = 100), # simulate data
y1 = sample(1:100, size = 100),
z1 = sample(1:100, size = 100)) %>%
mutate(across(ends_with("1"), ~factor(ntile(.x, 4)), # assign to quartiles
.names = "{.col}_q"))
head(df)
#> # A tibble: 6 × 6
#> x1 y1 z1 x1_q y1_q z1_q
#> <int> <int> <int> <fct> <fct> <fct>
#> 1 97 94 9 4 4 1
#> 2 31 66 66 2 3 3
#> 3 34 43 62 2 2 3
#> 4 46 57 41 2 3 2
#> 5 98 3 38 4 1 2
#> 6 18 37 40 1 2 2
For each of these quartile columns, I would like to append the min and the max of the quartile like so:
df %>%
group_by(x1_q) %>%
mutate(x1_q = glue("Q{x1_q} ({min(x1)} - {max(x1)})")) %>%
head()
#> # A tibble: 6 × 6
#> # Groups: x1_q [3]
#> x1 y1 z1 x1_q y1_q z1_q
#> <int> <int> <int> <glue> <fct> <fct>
#> 1 97 94 9 Q4 (76 - 100) 4 1
#> 2 31 66 66 Q2 (26 - 50) 3 3
#> 3 34 43 62 Q2 (26 - 50) 2 3
#> 4 46 57 41 Q2 (26 - 50) 3 2
#> 5 98 3 38 Q4 (76 - 100) 1 2
#> 6 18 37 40 Q1 (1 - 25) 2 2
Created on 2022-04-29 by the reprex package (v2.0.1)
Is there a scalable was I can apply this group_by
mutate
pattern to all of my quartile columns?
CodePudding user response:
Here is one method - loop across
the _q
columns, extract the corresponding columns without the _q
by removing the substring suffix from the column name, and get
the value ('tmp'), then use one of the formatting functions to append the column values along with the min
and max
library(dplyr)
library(stringr)
df %>%
mutate(across(ends_with("_q"), ~
{
tmp <- get(str_remove(cur_column(), "_q"))
str_c("Q", .x, " (", ave(tmp, .x, FUN = min), " - ",
ave(tmp, .x, FUN = max), ")")
}
))
-output
# A tibble: 100 × 6
x1 y1 z1 x1_q y1_q z1_q
<int> <int> <int> <chr> <chr> <chr>
1 97 94 9 Q4 (76 - 100) Q4 (76 - 100) Q1 (1 - 25)
2 31 66 66 Q2 (26 - 50) Q3 (51 - 75) Q3 (51 - 75)
3 34 43 62 Q2 (26 - 50) Q2 (26 - 50) Q3 (51 - 75)
4 46 57 41 Q2 (26 - 50) Q3 (51 - 75) Q2 (26 - 50)
5 98 3 38 Q4 (76 - 100) Q1 (1 - 25) Q2 (26 - 50)
6 18 37 40 Q1 (1 - 25) Q2 (26 - 50) Q2 (26 - 50)
7 56 4 98 Q3 (51 - 75) Q1 (1 - 25) Q4 (76 - 100)
8 1 17 27 Q1 (1 - 25) Q1 (1 - 25) Q2 (26 - 50)
9 68 99 73 Q3 (51 - 75) Q4 (76 - 100) Q3 (51 - 75)
10 92 65 16 Q4 (76 - 100) Q3 (51 - 75) Q1 (1 - 25)
# … with 90 more rows
Or loop over the names
in map2
library(purrr)
map2_dfc(names(df)[4:6], names(df)[1:3],
~ df %>%
group_by(across(all_of(.x))) %>%
transmute(!! .x := glue::glue("Q{.data[[.x]]} ({min(.data[[.y]])} - {max(.data[[.y]])})")) %>% ungroup ) %>% bind_cols(df[1:3], .)
# A tibble: 100 × 6
x1 y1 z1 x1_q y1_q z1_q
<int> <int> <int> <glue> <glue> <glue>
1 97 94 9 Q4 (76 - 100) Q4 (76 - 100) Q1 (1 - 25)
2 31 66 66 Q2 (26 - 50) Q3 (51 - 75) Q3 (51 - 75)
3 34 43 62 Q2 (26 - 50) Q2 (26 - 50) Q3 (51 - 75)
4 46 57 41 Q2 (26 - 50) Q3 (51 - 75) Q2 (26 - 50)
5 98 3 38 Q4 (76 - 100) Q1 (1 - 25) Q2 (26 - 50)
6 18 37 40 Q1 (1 - 25) Q2 (26 - 50) Q2 (26 - 50)
7 56 4 98 Q3 (51 - 75) Q1 (1 - 25) Q4 (76 - 100)
8 1 17 27 Q1 (1 - 25) Q1 (1 - 25) Q2 (26 - 50)
9 68 99 73 Q3 (51 - 75) Q4 (76 - 100) Q3 (51 - 75)
10 92 65 16 Q4 (76 - 100) Q3 (51 - 75) Q1 (1 - 25)
# … with 90 more rows
CodePudding user response:
You could write your own myntile
function and just use across
as usual. Below myntile
is based on ntile
, cut
and some string manipulation.
library(dplyr)
library(stringr)
set.seed(234)
myntile <- function(x, n) {
q <- ntile(x, n)
rng <- as.character(cut(x, n))
rng2 <- str_replace_all(rng, "[0-9\\.] ", function(x) round(as.numeric(x), 0)) %>%
str_replace(",", " - ") %>%
str_replace("]", ")")
paste0("Q", q, " ", rng2)
}
df =
tibble(x1 = sample(1:100, size = 100), # simulate data
y1 = sample(1:100, size = 100),
z1 = sample(1:100, size = 100)) %>%
mutate(across(ends_with("1"), ~ myntile(.x, 4), # assign to quartiles
.names = "{.col}_q"))
head(df)
#> # A tibble: 6 x 6
#> x1 y1 z1 x1_q y1_q z1_q
#> <int> <int> <int> <chr> <chr> <chr>
#> 1 97 94 9 Q4 (75 - 100) Q4 (75 - 100) Q1 (1 - 26)
#> 2 31 66 66 Q2 (26 - 50) Q3 (50 - 75) Q3 (50 - 75)
#> 3 34 43 62 Q2 (26 - 50) Q2 (26 - 50) Q3 (50 - 75)
#> 4 46 57 41 Q2 (26 - 50) Q3 (50 - 75) Q2 (26 - 50)
#> 5 98 3 38 Q4 (75 - 100) Q1 (1 - 26) Q2 (26 - 50)
#> 6 18 37 40 Q1 (1 - 26) Q2 (26 - 50) Q2 (26 - 50)
Created on 2022-04-29 by the reprex package (v2.0.1)