Home > Blockchain >  R: Calculate Quantiles Using Loops
R: Calculate Quantiles Using Loops

Time:01-05

I am working with the R programming language.

I have the following dataset:

set.seed(123)
library(dplyr)

var1 = rnorm(10000, 100,100)
var2 = rnorm(10000, 100,100)
var3 = rnorm(10000, 100,100)
var4 = rnorm(10000, 100,100)
var5 <- factor(sample(c("A","B", "C", "D", "E"), 1000, replace=TRUE, prob=c(0.2, 0.2, 0.2, 0.2, 0.2)))

my_data = data.frame( var1, var2, var3, var4, var5)

> head(my_data)
       var1      var2        var3      var4 var5
1  43.95244 337.07252   16.370326  80.63936    E
2  76.98225  83.31880   77.942701 125.81473    E
3 255.87083 192.69614 -110.351477  46.16874    E
4 107.05084  43.18483  -66.780754 -17.90628    A
5 112.92877 122.50901   -9.796286 190.06474    C
6 271.50650 213.19859  -66.562121  98.37505    E

I would like to calculate the "grouped" percentiles (e.g. 4 levels for var1 and 5 levels for var2) for var1 and var2 relative to the groups in var5.

An example of the final output should look something like this:

# sample of final output (with hypothetical numbers)
       var1      var2        var3      var4 var5 var1_q var2_q
1  43.95244 337.07252   16.370326  80.63936    E    one    one
2  76.98225  83.31880   77.942701 125.81473    E    two    two
3 255.87083 192.69614 -110.351477  46.16874    E    one    one
4 107.05084  43.18483  -66.780754 -17.90628    A  three  three
5 112.92877 122.50901   -9.796286 190.06474    C   five   four
6 271.50650 213.19859  -66.562121  98.37505    E    two    one

I know that there are many ways to do this, but many of these ways have been giving me problems (e.g. R: Using DPLYR to Calculate Quantiles?).

I found this approach that seems to be working so far:

var1_df = my_data %>% group_by(var5) %>%
  summarize(first=quantile(var1,probs=0.25),
            second=quantile(var1,probs=0.5),
            third=quantile(var1,probs=0.75),
            fourth=quantile(var1,probs=1))

var2_df = my_data %>% group_by(var5) %>%
  summarize(first=quantile(var2,probs=0.2),
            second=quantile(var2,probs=0.4),
            third=quantile(var2,probs=0.6),
            fourth=quantile(var2,probs=0.8), fifth=quantile(var2,probs=1))

num.cols <- c('first','second','third', 'fourth')
var1_df[num.cols] <- sapply(var1_df[num.cols], as.numeric)
 num.cols <- c('first','second','third', 'fourth', "fifth")
var2_df[num.cols] <- sapply(var2_df[num.cols], as.numeric)

An example of this output looks something like this:

head(var1_df)
# A tibble: 5 x 5
  var5  first second third fourth
  <fct> <dbl>  <dbl> <dbl>  <dbl>
1 A      36.6   96.7  166.   439.
2 B      30.5   97.0  166.   429.
3 C      29.6   99.2  166.   485.
4 D      33.4  102.   170.   440.
5 E      35.5   99.5  168.   472.

Based on this code, I had the following idea on how to accomplish my task (i.e. use "greater than or equal to conditions" to find the corresponding quantile for each group):

for (i in 1:length(levels(var1_df$var5)))
{
    temp_i = var1_df[i,]

    my_data$var1_q <- my_data%>% mutate(my_data$var1 <= temp_i[2] ~ "one",
                                                   my_data$var1 > temp_i[2] & my_data$var1 <= temp_i[3] ~ "two",
                                                   my_data$var1 > temp_i[3] & my_data$var1 <= temp_i[4] ~ "three",
                                                   TRUE ~ "four")))
}


for (i in 1:length(levels(var2_df$var5)))
{
    temp_i = var2_df[i,]

    my_data$var2_q <- my_data%>% mutate(my_data$var2 <= temp_i[2] ~ "one",
                                                   my_data$var2 > temp_i[2] & my_data$var2 <= temp_i[3] ~ "two",
                                                   my_data$var2 > temp_i[3] & my_data$var2 <= temp_i[4] ~ "three", my_data$var2 <= temp_i[5] ~ "four",
                                                   TRUE ~ "five")))
}

But I am getting lost while this writing these loops - can someone please show me how to fix these?

Thanks!

CodePudding user response:

If the intention is to create columns in my_data, use cut with quantile as breaks and specify the labels corresponding to it

library(dplyr)
library(english)
my_data %>%
   group_by(var5) %>% 
   mutate(var1_q = cut(var1, breaks = c(-Inf, quantile(var1,
     probs = seq(0.25, 1, by = 0.25))), 
     labels = english(1:4)), 
    var2_q = cut(var2, breaks = c(-Inf, quantile(var2,
      probs = seq(0.2, 1, by = 0.2))), labels = english(1:5))) %>%
   ungroup

-output

# A tibble: 10,000 × 7
    var1  var2    var3  var4 var5  var1_q var2_q
   <dbl> <dbl>   <dbl> <dbl> <fct> <fct>  <fct> 
 1  44.0 337.    16.4   80.6 E     two    five  
 2  77.0  83.3   77.9  126.  E     two    three 
 3 256.  193.  -110.    46.2 E     four   five  
 4 107.   43.2  -66.8  -17.9 A     three  two   
 5 113.  123.    -9.80 190.  C     three  three 
 6 272.  213.   -66.6   98.4 E     four   five  
 7 146.  238.    95.0  118.  D     three  five  
 8 -26.5  76.7  256.   160.  D     one    three 
 9  31.3 -60.1   59.5  126.  A     one    one   
10  55.4  70.2  179.   130.  B     two    two   
# … with 9,990 more rows

In this way, we directly create the columns instead of creating a list of summarised output and then trying to join with original data to create the columns

CodePudding user response:

You can just use dplyr::summarize(quantile(var, probs = ...) and supply customized vectors of quantiles for each variable as long as you adjust the length of the output accordingly.

library(tidyverse)

n <- 50

d <- tibble(grp = sample(c("A", "B"), n, T), v1 = runif(n, 1, 10), v2 = runif(n, 50, 500))

v1_quantiles <- c(0.25, 0.5, 0.75, 1)
v2_quantiles <- c(0.2, 0.4, 0.6, 0.8, 0.1)

e <- d %>% 
  group_by(grp) %>% 
  summarise(var = c(rep("v1", length(v1_quantiles)),
                        rep("v2", length(v2_quantiles))),
                        val = c(quantile(v1, v1_quantiles),
                        quantile(v2, v2_quantiles)),
                q = c(v1_quantiles, v2_quantiles),
            .groups = "drop")
e
#> # A tibble: 18 × 4
#>    grp   var      val     q
#>    <chr> <chr>  <dbl> <dbl>
#>  1 A     v1      2.46  0.25
#>  2 A     v1      5.77  0.5 
#>  3 A     v1      7.34  0.75
#>  4 A     v1      9.55  1   
#>  5 A     v2    145.    0.2 
#>  6 A     v2    232.    0.4 
#>  7 A     v2    314.    0.6 
#>  8 A     v2    355.    0.8 
#>  9 A     v2     80.2   0.1 
#> 10 B     v1      3.64  0.25
#> 11 B     v1      4.81  0.5 
#> 12 B     v1      7.08  0.75
#> 13 B     v1      9.87  1   
#> 14 B     v2    218.    0.2 
#> 15 B     v2    279.    0.4 
#> 16 B     v2    321.    0.6 
#> 17 B     v2    376.    0.8 
#> 18 B     v2    169.    0.1

If you want to split into separate tables and convert to wide format try this:

e %>% 
  group_split(var) %>% 
  map(~.x %>% pivot_wider(names_from = q, values_from = val, names_prefix = "q"))
#> [[1]]
#> # A tibble: 2 × 6
#>   grp   var   q0.25  q0.5 q0.75    q1
#>   <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 A     v1     2.46  5.77  7.34  9.55
#> 2 B     v1     3.64  4.81  7.08  9.87
#> 
#> [[2]]
#> # A tibble: 2 × 7
#>   grp   var    q0.2  q0.4  q0.6  q0.8  q0.1
#>   <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A     v2     145.  232.  314.  355.  80.2
#> 2 B     v2     218.  279.  321.  376. 169.

Created on 2023-01-05 with reprex v2.0.2

CodePudding user response:

My own attempt at an answer after much trial and error - is this correct?

my_data_final = left_join(my_data, var1_df) %>% 
    mutate(var1_q = if_else(var1 < first, "one",
     if_else(var1 >= first & var1 <second, "two",
     if_else(var1 >=second & var1 <third, "three", 
   ifelse(var1 >= third & var1 < fourth, "four", NA_character_) )))) %>%
    select(names(my_data), var1_q)

-output

> head(my_data_final)
       var1      var2        var3      var4 var5 var1_q
1  43.95244 337.07252   16.370326  80.63936    E    two
2  76.98225  83.31880   77.942701 125.81473    E    two
3 255.87083 192.69614 -110.351477  46.16874    E   four
4 107.05084  43.18483  -66.780754 -17.90628    A  three
5 112.92877 122.50901   -9.796286 190.06474    C  three
6 271.50650 213.19859  -66.562121  98.37505    E   four
  •  Tags:  
  • r
  • Related