Home > OS >  calculating n quantiles by group in tidyverse
calculating n quantiles by group in tidyverse

Time:05-17

I have a unique problem where I would like to add a column of percentiles for each group in a data frame. Here is how my data look like:

library(tidyverse)
set.seed(123)

df <- tibble(id = 1:100, 
                 group = rep(letters[1:4], 25), 
                 x = c(sample(1:100, 25, replace = T),
                       sample(101:200, 25, replace = T),
                       sample(201:300, 25, replace = T),
                       sample(301:400, 25, replace = T)))

> df
# A tibble: 100 x 3
      id group     x
   <int> <chr> <int>
 1     1 a        78
 2     2 b        80
 3     3 c         7
 4     4 d       100
 5     5 a        45
 6     6 b        76
 7     7 c        25
 8     8 d        91
 9     9 a        13
10    10 b        84
# ... with 90 more rows

# Function to create a table ten percentiles for a numeric vector
percentiles_table <- function(x) {
  res <- round(quantile(x, probs = seq(from=.1, to=1, by=0.1)), 0)
  res <- data.frame(percentile = names(res), to = res )
  res <- res %>% 
    mutate(from = lag(to, default = 0)) %>% 
    select(from,to,percentile)
}

# Table of percentiles
percentiles <- df %>% 
  group_by(group) %>% 
  summarise(percentiles_table(x)) %>% 
  ungroup()

> percentiles
# A tibble: 40 x 4
   group  from    to percentile
   <chr> <dbl> <dbl> <chr>     
 1 a         0    25 10%       
 2 a        25    71 20%       
 3 a        71   106 30%       
 4 a       106   125 40%       
 5 a       125   198 50%       
 6 a       198   236 60%       
 7 a       236   278 70%       
 8 a       278   325 80%       
 9 a       325   379 90%       
10 a       379   389 100%   

I would like to add the percentile column to df for each group where the value of x falls between from and to.

CodePudding user response:

install.packages("zoo")

library(zoo)

y=as.data.frame(c(0:max(percentiles$to)))

y=merge(y,unique(percentiles[,c(1)]))

y=merge(y,percentiles[,c(1,2,4)], by.x = c("group","c(0:max(percentiles$to))"), by.y = c("group","from"), all.x = TRUE)

y=na.locf(y)

df=merge(df,y, all.x = TRUE, by.x = c("group","x"), by.y = c("group","c(0:max(percentiles$to))"))

CodePudding user response:

Using data.table:

setDT(df)[
  , 
  percentile := cut(
    x, 
    quantile(x, seq(0, 1, 0.1)),
    include.lowest = TRUE,
    labels = paste0(seq(10, 100, 10), "%")
  ), 
  by = group
]
  • Related