I am working with the R programming language.
I have the following dataset:
set.seed(123)
library(dplyr)
# Four independent numeric columns, each 10000 draws from N(mean 100, sd 100).
var1 <- rnorm(10000, mean = 100, sd = 100)
var2 <- rnorm(10000, mean = 100, sd = 100)
var3 <- rnorm(10000, mean = 100, sd = 100)
var4 <- rnorm(10000, mean = 100, sd = 100)

# Grouping factor with five equally likely levels.
# NOTE(review): only 1000 values are drawn here; data.frame() recycles them
# ten times to fill the 10000 rows. Confirm whether 10000 was intended.
var5 <- factor(
  sample(LETTERS[1:5], size = 1000, replace = TRUE, prob = rep(0.2, 5))
)

my_data <- data.frame(var1, var2, var3, var4, var5)
> head(my_data)
var1 var2 var3 var4 var5
1 43.95244 337.07252 16.370326 80.63936 E
2 76.98225 83.31880 77.942701 125.81473 E
3 255.87083 192.69614 -110.351477 46.16874 E
4 107.05084 43.18483 -66.780754 -17.90628 A
5 112.92877 122.50901 -9.796286 190.06474 C
6 271.50650 213.19859 -66.562121 98.37505 E
I would like to calculate the "grouped" percentiles (e.g. 4 levels for var1 and 5 levels for var2) for var1 and var2 relative to the groups in var5.
An example of the final output should look something like this:
# sample of final output (with hypothetical numbers)
var1 var2 var3 var4 var5 var1_q var2_q
1 43.95244 337.07252 16.370326 80.63936 E one one
2 76.98225 83.31880 77.942701 125.81473 E two two
3 255.87083 192.69614 -110.351477 46.16874 E one one
4 107.05084 43.18483 -66.780754 -17.90628 A three three
5 112.92877 122.50901 -9.796286 190.06474 C five four
6 271.50650 213.19859 -66.562121 98.37505 E two one
I know that there are many ways to do this, but many of these ways have been giving me problems (e.g. R: Using DPLYR to Calculate Quantiles?).
I found this approach that seems to be working so far:
# Per-group cut points for var1: the 25%, 50% and 75% quantiles plus the
# group maximum (probs = 1), one row per level of var5.
var1_df <- my_data %>%
  group_by(var5) %>%
  summarize(
    first = quantile(var1, probs = 0.25),
    second = quantile(var1, probs = 0.5),
    third = quantile(var1, probs = 0.75),
    fourth = quantile(var1, probs = 1)
  )

# Per-group cut points for var2: quintile boundaries at 20% steps, again
# ending with the group maximum.
var2_df <- my_data %>%
  group_by(var5) %>%
  summarize(
    first = quantile(var2, probs = 0.2),
    second = quantile(var2, probs = 0.4),
    third = quantile(var2, probs = 0.6),
    fourth = quantile(var2, probs = 0.8),
    fifth = quantile(var2, probs = 1)
  )
# Coerce the cut-point columns to plain numeric vectors.
# lapply() always returns one vector per column, so the assignment has the
# same shape regardless of how many groups there are; sapply() would collapse
# to a bare vector when the summary has a single row.
num.cols <- c("first", "second", "third", "fourth")
var1_df[num.cols] <- lapply(var1_df[num.cols], as.numeric)

# var2_df carries one extra cut point.
num.cols <- c("first", "second", "third", "fourth", "fifth")
var2_df[num.cols] <- lapply(var2_df[num.cols], as.numeric)
An example of this output looks something like this:
head(var1_df)
# A tibble: 5 x 5
var5 first second third fourth
<fct> <dbl> <dbl> <dbl> <dbl>
1 A 36.6 96.7 166. 439.
2 B 30.5 97.0 166. 429.
3 C 29.6 99.2 166. 485.
4 D 33.4 102. 170. 440.
5 E 35.5 99.5 168. 472.
Based on this code, I had the following idea on how to accomplish my task (i.e. use "greater than or equal to conditions" to find the corresponding quantile for each group):
# Attempt to label var1 by quantile bin, looping over the rows of var1_df.
# NOTE(review): this does not parse as written - the last line of the call
# closes three parentheses while only `mutate(` is open.
for (i in 1:length(levels(var1_df$var5)))
{
# Row i of the summary: column 1 is var5, columns 2-5 are the cut points.
temp_i = var1_df[i,]
# NOTE(review): the `condition ~ label` formulas are passed straight to
# mutate(); that syntax is what dplyr::case_when() consumes. The comparisons
# use the whole my_data$var1 column rather than only the rows belonging to
# group i, and my_data$var1_q is overwritten on every iteration, so at best
# only the last group's cut points would survive.
my_data$var1_q <- my_data%>% mutate(my_data$var1 <= temp_i[2] ~ "one",
my_data$var1 > temp_i[2] & my_data$var1 <= temp_i[3] ~ "two",
my_data$var1 > temp_i[3] & my_data$var1 <= temp_i[4] ~ "three",
TRUE ~ "four")))
}
# Attempt to label var2 by quintile bin, looping over the rows of var2_df.
# NOTE(review): this does not parse as written - the last line of the call
# closes three parentheses while only `mutate(` is open.
for (i in 1:length(levels(var2_df$var5)))
{
# Row i of the summary: column 1 is var5, columns 2-6 are the cut points.
temp_i = var2_df[i,]
# NOTE(review): the `condition ~ label` formulas are passed straight to
# mutate() instead of being wrapped in dplyr::case_when(). The comparisons are
# made against the whole my_data$var2 column rather than the rows of group i,
# and my_data$var2_q is overwritten on every iteration.
my_data$var2_q <- my_data%>% mutate(my_data$var2 <= temp_i[2] ~ "one",
my_data$var2 > temp_i[2] & my_data$var2 <= temp_i[3] ~ "two",
my_data$var2 > temp_i[3] & my_data$var2 <= temp_i[4] ~ "three", my_data$var2 <= temp_i[5] ~ "four",
TRUE ~ "five")))
}
But I am getting lost while writing these loops - can someone please show me how to fix them?
Thanks!
CodePudding user response:
If the intention is to create columns in `my_data`, use `cut` with `quantile` as breaks and specify the `labels` corresponding to it
library(dplyr)
library(english)
# Bin var1 into quartiles and var2 into quintiles within each var5 group.
# -Inf is prepended as the lowest break so that each group's minimum lands in
# the first bin; the labels are the English number words from english().
my_data %>%
  group_by(var5) %>%
  mutate(
    var1_q = cut(
      var1,
      breaks = c(-Inf, quantile(var1, probs = seq(0.25, 1, by = 0.25))),
      labels = english(1:4)
    ),
    var2_q = cut(
      var2,
      breaks = c(-Inf, quantile(var2, probs = seq(0.2, 1, by = 0.2))),
      labels = english(1:5)
    )
  ) %>%
  ungroup()
-output
# A tibble: 10,000 × 7
var1 var2 var3 var4 var5 var1_q var2_q
<dbl> <dbl> <dbl> <dbl> <fct> <fct> <fct>
1 44.0 337. 16.4 80.6 E two five
2 77.0 83.3 77.9 126. E two three
3 256. 193. -110. 46.2 E four five
4 107. 43.2 -66.8 -17.9 A three two
5 113. 123. -9.80 190. C three three
6 272. 213. -66.6 98.4 E four five
7 146. 238. 95.0 118. D three five
8 -26.5 76.7 256. 160. D one three
9 31.3 -60.1 59.5 126. A one one
10 55.4 70.2 179. 130. B two two
# … with 9,990 more rows
In this way, we directly create the columns instead of creating a list
of summarised output and then trying to join with original data to create the columns
CodePudding user response:
You can just use `dplyr::summarize(quantile(var, probs = ...))` and supply customized vectors of quantiles for each variable, as long as you adjust the length of the output accordingly.
library(tidyverse)
# Small example data: a two-level group and two uniform numeric columns.
n <- 50
d <- tibble(
  grp = sample(c("A", "B"), size = n, replace = TRUE),
  v1 = runif(n, min = 1, max = 10),
  v2 = runif(n, min = 50, max = 500)
)

# Probabilities requested for each variable.
v1_quantiles <- c(0.25, 0.5, 0.75, 1)
# NOTE(review): the last entry is 0.1, not 1; the printed output that follows
# was produced with 0.1 - confirm whether 1 was intended.
v2_quantiles <- c(0.2, 0.4, 0.6, 0.8, 0.1)

# Long-format result: per group, one row for every requested quantile of v1
# followed by one row for every requested quantile of v2.
e <- d %>%
  group_by(grp) %>%
  summarise(
    var = rep(
      c("v1", "v2"),
      times = c(length(v1_quantiles), length(v2_quantiles))
    ),
    val = c(quantile(v1, v1_quantiles), quantile(v2, v2_quantiles)),
    q = c(v1_quantiles, v2_quantiles),
    .groups = "drop"
  )
e
#> # A tibble: 18 × 4
#> grp var val q
#> <chr> <chr> <dbl> <dbl>
#> 1 A v1 2.46 0.25
#> 2 A v1 5.77 0.5
#> 3 A v1 7.34 0.75
#> 4 A v1 9.55 1
#> 5 A v2 145. 0.2
#> 6 A v2 232. 0.4
#> 7 A v2 314. 0.6
#> 8 A v2 355. 0.8
#> 9 A v2 80.2 0.1
#> 10 B v1 3.64 0.25
#> 11 B v1 4.81 0.5
#> 12 B v1 7.08 0.75
#> 13 B v1 9.87 1
#> 14 B v2 218. 0.2
#> 15 B v2 279. 0.4
#> 16 B v2 321. 0.6
#> 17 B v2 376. 0.8
#> 18 B v2 169. 0.1
If you want to split into separate tables and convert to wide format try this:
# One table per variable, each widened so that every requested probability
# becomes its own "q<prob>" column.
e %>%
  group_split(var) %>%
  map(function(tbl) {
    pivot_wider(tbl, names_from = q, values_from = val, names_prefix = "q")
  })
#> [[1]]
#> # A tibble: 2 × 6
#> grp var q0.25 q0.5 q0.75 q1
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 A v1 2.46 5.77 7.34 9.55
#> 2 B v1 3.64 4.81 7.08 9.87
#>
#> [[2]]
#> # A tibble: 2 × 7
#> grp var q0.2 q0.4 q0.6 q0.8 q0.1
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A v2 145. 232. 314. 355. 80.2
#> 2 B v2 218. 279. 321. 376. 169.
Created on 2023-01-05 with reprex v2.0.2
CodePudding user response:
My own attempt at an answer after much trial and error - is this correct?
# Attach each row's per-group cut points from var1_df, then label var1 with
# the first bin whose upper cut point it falls below. Rows with a missing
# var1 match no condition and stay NA.
my_data_final <- left_join(my_data, var1_df, by = "var5") %>%
  mutate(
    var1_q = case_when(
      var1 < first ~ "one",
      var1 < second ~ "two",
      var1 < third ~ "three",
      # `fourth` is the group maximum (probs = 1), so the top bin has to be
      # closed on the right; a strict `<` leaves each group's largest value
      # labelled NA.
      var1 <= fourth ~ "four"
    )
  ) %>%
  # Drop the helper cut-point columns, keeping the original columns + label.
  select(all_of(names(my_data)), var1_q)
-output
> head(my_data_final)
var1 var2 var3 var4 var5 var1_q
1 43.95244 337.07252 16.370326 80.63936 E two
2 76.98225 83.31880 77.942701 125.81473 E two
3 255.87083 192.69614 -110.351477 46.16874 E four
4 107.05084 43.18483 -66.780754 -17.90628 A three
5 112.92877 122.50901 -9.796286 190.06474 C three
6 271.50650 213.19859 -66.562121 98.37505 E four