I am working with the R programming language.
I have the following dataset:
set.seed(123)
library(dplyr)
var1 = rnorm(10000, 100,100)
var2 = rnorm(10000, 100,100)
var3 = rnorm(10000, 100,100)
var4 = rnorm(10000, 100,100)
var5 <- factor(sample(c("Yes", "No"), 1000, replace=TRUE, prob=c(0.4, 0.6)))
var6 <- factor(sample(c("Yes", "No"), 1000, replace=TRUE, prob=c(0.4, 0.6)))
my_data = data.frame( var1, var2, var3, var4, var5, var6)
I want to calculate "grouped percentiles" (e.g. at arbitrary levels) for different columns in this dataset according to the categorical variables.
Initially, I was trying to do this using a function - but this kept giving me a lot of difficulty (e.g. R: Difficulty Calculating Percentiles?).
As a result, I am trying to do this "manually" in the meantime. For instance, suppose:
- Based on groupings of var5 and var6
- I want to create a variable "class3" that splits var3 into groups of 10 percentiles
- And I want to create a variable "class4" that splits var4 into groups of 20 percentiles
As an example, here are two different ways I am trying to do this:
Method 1: Produces some NA's?
library(dplyr)
final = my_data %>% group_by(var5, var6) %>%
mutate(class3 = case_when(ntile(var3, 10) == 1 ~ paste0(round(min(var3), 2), " to ", round(quantile(var3, 0.1), 2), " decile 1"),
ntile(var3, 10) == 2 ~ paste0(round(quantile(var3, 0.1), 2), " to ", round(quantile(var3, 0.2), 2), " decile 2"),
ntile(var3, 10) == 3 ~ paste0(round(quantile(var3, 0.2), 2), " to ", round(quantile(var3, 0.3), 2), " decile 3"),
ntile(var3, 10) == 4 ~ paste0(round(quantile(var3, 0.3), 2), " to ", round(quantile(var3, 0.4), 2), " decile 4"),
ntile(var3, 10) == 5 ~ paste0(round(quantile(var3, 0.4), 2), " to ", round(quantile(var3, 0.5), 2), " decile 5"),
ntile(var3, 10) == 6 ~ paste0(round(quantile(var3, 0.5), 2), " to ", round(quantile(var3, 0.6), 2), " decile 6"),
ntile(var3, 10) == 7 ~ paste0(round(quantile(var3, 0.6), 2), " to ", round(quantile(var3, 0.7), 2), " decile 7"),
ntile(var3, 10) == 8 ~ paste0(round(quantile(var3, 0.7), 2), " to ", round(quantile(var3, 0.8), 2), " decile 8"),
ntile(var3, 10) == 9 ~ paste0(round(quantile(var3, 0.8), 2), " to ", round(quantile(var3, 0.9), 2), " decile 9"),
ntile(var3, 10) == 10 ~ paste0(round(quantile(var3, 0.9), 2), " to ", round(max(var3), 2), " decile 10"))) %>%
mutate(class4 = case_when(ntile(var4, 20) == 1 ~ paste0(round(min(var4), 2), " to ", round(quantile(var4, 0.1), 2), " pcile 1"),
ntile(var4, 20) == 2 ~ paste0(round(quantile(var4, 0.1), 2), " to ", round(quantile(var4, 0.2), 2), " pcile 2"),
ntile(var4, 20) == 3 ~ paste0(round(quantile(var4, 0.2), 2), " to ", round(quantile(var4, 0.3), 2), " pcile 3"),
ntile(var4, 20) == 4 ~ paste0(round(quantile(var4, 0.3), 2), " to ", round(quantile(var4, 0.4), 2), " pcile 4"),
ntile(var4, 20) == 5 ~ paste0(round(quantile(var4, 0.4), 2), " to ", round(quantile(var4, 0.5), 2), " pcile 5")))
Method 2: Fewer NA's?
final = my_data %>% group_by(var5, var6) %>% mutate(class3 = paste0(cut(var3, breaks = c(-Inf, quantile(var3, c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)), Inf),
labels = c("ptile 1", "ptile 2", "ptile 3", "ptile 4", "ptile 5", "ptile 6", "ptile 7", "ptile 8", "ptile 9", "ptile 10")),
" (", round(min(var3), 2), " to ", round(max(var3), 2), ")")) %>%
mutate(class4 = paste0(cut(var4, breaks = c(-Inf, quantile(var4, c(0.2, 0.4, 0.6, 0.8)), Inf),
labels = c("ptile 1", "ptile 2", "ptile 3", "ptile 4", "ptile 5")),
" (", round(min(var4), 2), " to ", round(max(var4), 2), ")"))
I think that Method 2 might be more correct, seeing as there are fewer NA values being produced - but in the meantime, could someone please help me verify if I am doing this correctly (in Method 2) ... and if not, how can I correct this?
Thanks!
CodePudding user response:
For Method 1, I believe you mainly have NA
values because you need to use ntile(..., 5)
instead of ntile(..., 20)
.
final = my_data %>% group_by(var5, var6) %>%
mutate(class3 = case_when(ntile(var3, 10) == 1 ~ paste0(round(min(var3), 2), " to ", round(quantile(var3, 0.1), 2), " decile 1"),
ntile(var3, 10) == 2 ~ paste0(round(quantile(var3, 0.1), 2), " to ", round(quantile(var3, 0.2), 2), " decile 2"),
ntile(var3, 10) == 3 ~ paste0(round(quantile(var3, 0.2), 2), " to ", round(quantile(var3, 0.3), 2), " decile 3"),
ntile(var3, 10) == 4 ~ paste0(round(quantile(var3, 0.3), 2), " to ", round(quantile(var3, 0.4), 2), " decile 4"),
ntile(var3, 10) == 5 ~ paste0(round(quantile(var3, 0.4), 2), " to ", round(quantile(var3, 0.5), 2), " decile 5"),
ntile(var3, 10) == 6 ~ paste0(round(quantile(var3, 0.5), 2), " to ", round(quantile(var3, 0.6), 2), " decile 6"),
ntile(var3, 10) == 7 ~ paste0(round(quantile(var3, 0.6), 2), " to ", round(quantile(var3, 0.7), 2), " decile 7"),
ntile(var3, 10) == 8 ~ paste0(round(quantile(var3, 0.7), 2), " to ", round(quantile(var3, 0.8), 2), " decile 8"),
ntile(var3, 10) == 9 ~ paste0(round(quantile(var3, 0.8), 2), " to ", round(quantile(var3, 0.9), 2), " decile 9"),
ntile(var3, 10) == 10 ~ paste0(round(quantile(var3, 0.9), 2), " to ", round(max(var3), 2), " decile 10"))) %>%
mutate(class4 = case_when(ntile(var4, 5) == 1 ~ paste0(round(min(var4), 2), " to ", round(quantile(var4, 0.1), 2), " pcile 1"),
ntile(var4, 5) == 2 ~ paste0(round(quantile(var4, 0.2), 2), " to ", round(quantile(var4, 0.4), 2), " pcile 2"),
ntile(var4, 5) == 3 ~ paste0(round(quantile(var4, 0.4), 2), " to ", round(quantile(var4, 0.6), 2), " pcile 3"),
ntile(var4, 5) == 4 ~ paste0(round(quantile(var4, 0.6), 2), " to ", round(quantile(var4, 0.8), 2), " pcile 4"),
ntile(var4, 5) == 5 ~ paste0(round(quantile(var4, 0.8), 2), " to ", round(quantile(var4, 1), 2), " pcile 5")))
Edit: If I were approaching this from scratch, I would use this data.table approach:
library(data.table)
dt = as.data.table(my_data)
dt[, c("class3", "class4") := {
class3_cut = cut(var3,
breaks = c(min(var3),
quantile(var3, seq(.1, .9, .1)),
max(var3)),
include.lowest = TRUE)
class4_cut = cut(var4,
breaks = c(min(var4),
quantile(var4, seq(.2, .8, .2)),
max(var4)),
include.lowest = TRUE)
.(paste(class3_cut, as.integer(class3_cut)),
paste(class4_cut, as.integer(class4_cut)))
}]
dt
## var1 var2 var3 var4 var5 var6 class3 class4
## <num> <num> <num> <num> <fctr> <fctr> <char> <char>
## 1: 43.952435 337.07252 16.370326 80.63936 Yes Yes (14.9,46.1] 3 (74.2,125] 3
## 2: 76.982251 83.31880 77.942701 125.81473 Yes Yes (72.3,98.8] 5 (125,183] 4
## 3: 255.870831 192.69614 -110.351477 46.16874 Yes No [-255,-29.4] 1 (14.7,74.2] 2
## 4: 107.050839 43.18483 -66.780754 -17.90628 Yes Yes [-255,-29.4] 1 [-313,14.7] 1
## 5: 112.928774 122.50901 -9.796286 190.06474 No No (-29.4,14.9] 2 (183,498] 5
## ---
## 9996: -2.478396 61.26773 168.942861 314.75822 No Yes (153,185] 8 (183,498] 5
## 9997: 63.288361 93.22906 238.307092 116.06609 No No (228,532] 10 (74.2,125] 3
## 9998: 24.252710 190.50561 82.354479 159.20177 Yes No (72.3,98.8] 5 (125,183] 4
## 9999: 207.932887 288.07347 83.362980 28.31323 No Yes (72.3,98.8] 5 (14.7,74.2] 2
## 10000: 35.507583 192.95621 88.647989 73.35896 Yes No (72.3,98.8] 5 (14.7,74.2] 2
CodePudding user response:
We can easily calculate percentiles in R using the quantile() function, which uses the following syntax:
quantile(x, probs = seq(0, 1, 0.25))
x: a numeric vector whose percentiles we wish to find probs: a numeric vector of probabilities in [0,1] that represent the percentiles we wish to find
#create vector of 100 random values uniformly distributed between 0 and 500
data <- runif(100, 0, 500)
#Find the quartiles (25th, 50th, and 75th percentiles) of the vector
quantile(data, probs = c(.25, .5, .75))
# 25% 50% 75%
# 97.78961 225.07593 356.47943
#Find the deciles (10th, 20th, 30th, ..., 90th percentiles) of the vector
quantile(data, probs = seq(.1, .9, by = .1))
# 10% 20% 30% 40% 50% 60% 70% 80%
# 45.92510 87.16659 129.49574 178.27989 225.07593 300.79690 337.84393 386.36108
# 90%
#423.28070
#Find the 37th, 53rd, and 87th percentiles
quantile(data, probs = c(.37, .53, .87))
# 37% 53% 87%
#159.9561 239.8420 418.4787