Using stat_summary to produce individualized boxplots with aes(group = ...)?-CodePudding

I tried to produce 12 boxplots per ggplots stat_summary() functions, as you can see below in the reproducible example. I used stat_summary() instead of geom_boxplot(), because I want to whiskers to end at the 1st and 99th percentile of the data or to be individualized so to speak. I coded two functions, one for the whiskers and one for the outliers and used them as arguments in stat_summary(). This is the result:

I see two problems with this plot:

Not all outliers are coloured in red.
Outliers cut the whiskers, which is not supposed to happen by definition of my functions.

The help file has not been helping me in solving this issue. Comments are welcome.

The code:

library(stats)
library(ggplot2)
library(dplyr)

# Example Data
{
  set.seed(123)
  
  indexnumber_of_entity = rep(c(1:30),
              each = 12)
  
  month = rep(c(1:12), 
             each = 1, 
             times = 30)
  
  variable_of_interest = runif(n = 360,
                               min = 0,
                               max = 100)
  
  Data = as.data.frame(cbind(indexnumber_of_entity,
                            month,
                            variable_of_interest)) %>% mutate_at(.vars = c(1,2,3),
                                                                 as.numeric)
  Data_Above_99th_Percentile = filter(Data,
                                      variable_of_interest > stats::quantile(Data$variable_of_interest,
                                                                             0.99))
  Data_Below_1st_Percentile = filter(Data,
                                     variable_of_interest < stats::quantile(Data$variable_of_interest,
                                                                            0.01))
}

# Functions that enable individualizing boxplots
{
  Individualized_Boxplot_Quantiles <- function(x){
    d <- data.frame(ymin = stats::quantile(x,0.01),
                    lower = stats::quantile(x,0.25),
                    middle = stats::quantile(x,0.5),
                    upper = stats::quantile(x,0.75),
                    ymax = stats::quantile(x,0.99),
                    row.names = NULL)
    d[1, ]
  }
  
  Definition_of_Outliers = function(x) 
  {
    subset(x, 
           stats::quantile(x,0.99) < x | stats::quantile(x,0.01) > x)
  }
}

# Producing the ggplot
ggplot(data = Data)   
  
  aes(x = month,
      y = variable_of_interest,
      group = month)  
  
  stat_summary(fun.data = Individualized_Boxplot_Quantiles, 
               geom="boxplot",
               lwd = 0.5)  
  
  stat_summary(fun.y = Definition_of_Outliers, 
               geom="point",
               size = 1)   
  
  labs(title = "Distributions of Variable of Interest based on months",
       x = "Month",
       y = "Variable of Interest")   
  
  theme(plot.title = element_text(size = 20,
                                  hjust = 0.5, 
                                  face = "bold"),
        axis.ticks.x = element_blank(),
        axis.text.x = element_text(size = 12,
                                   face = "bold"),
        axis.text.y = element_text(size = 12,
                                   face = "bold"),
        axis.title.x = element_text(size = 16,
                                    face = "bold",
                                    vjust = -3),
        axis.title.y = element_text(size = 16,
                                    face = "bold",
                                    vjust = 3))   
  
  scale_x_continuous(breaks = c(seq(from = 1,
                                    to = 12,
                                    by = 1)))  
  
  scale_y_continuous(breaks = c(seq(from = 0,
                                    to = 100,
                                    by = 10)))   
  
  geom_point(data = Data_Above_99th_Percentile,
             colour = "red",
             size = 1)   
  
  geom_point(data = Data_Below_1st_Percentile,
             colour = "red",
             size = 1)

CodePudding user response：

You can simplify the functions a little bit like this:

boxplot_quantiles <- function(x) {
  y <- as.data.frame(t(stats::quantile(x, c(0.01, 0.25, 0.5, 0.75, 0.99))))
  setNames(y, c('ymin', 'lower', 'middle', 'upper', 'ymax'))
}

outliers <- function(x) {
  subset(x, stats::quantile(x,0.99) < x | stats::quantile(x,0.01) > x)
}

You can rely on the summary functions, since the Data_above_99th_Percentile and Data_Below_1st_Percentile were not groupwise calculations in your own code.

ggplot(data = Data, aes(x = month, y = variable_of_interest, group = month))  
  stat_summary(fun = outliers, geom = "point", col = 'red', size = 1)   
  stat_summary(fun.data = boxplot_quantiles, geom = "boxplot", lwd = 0.5)  
  scale_x_continuous('Month', breaks = 1:12)  
  scale_y_continuous('Variable of Interest' , breaks = 0:10 * 10)  
  labs(title = "Distributions of Variable of Interest based on months")   
  theme(text = element_text(face = 'bold', size = 12),
        plot.title = element_text(size = 20, hjust = 0.5),
        axis.ticks.x = element_blank(),
        axis.title.x = element_text(size = 16, margin = margin(20, 0, 0, 0)),
        axis.title.y = element_text(size = 16, vjust = 3))

Edit

As long as you perform groupwise operations on the filtered data frames, your alternative method of drawing the outliers will work too. Note that I have added these in colored layers above the existing plot so that the red points are overplotted with blue and green dots:

Data_Above_99th_Percentile <- Data %>%
  group_by(month) %>%
  filter(variable_of_interest > quantile(variable_of_interest,0.99))

Data_Below_1st_Percentile <- Data %>% 
  group_by(month) %>%
  filter(variable_of_interest < quantile(variable_of_interest, 0.01))


ggplot(data = Data, aes(x = month, y = variable_of_interest, group = month))  
  stat_summary(fun = outliers, geom = "point", col = 'red', size = 1)   
  stat_summary(fun.data = boxplot_quantiles, geom = "boxplot", lwd = 0.5)  
  scale_x_continuous('Month', breaks = 1:12)  
  scale_y_continuous('Variable of Interest' , breaks = 0:10 * 10)  
  labs(title = "Distributions of Variable of Interest based on months")   
  theme(text = element_text(face = 'bold', size = 12),
        plot.title = element_text(size = 20, hjust = 0.5),
        axis.ticks.x = element_blank(),
        axis.title.x = element_text(size = 16, margin = margin(20, 0, 0, 0)),
        axis.title.y = element_text(size = 16, vjust = 3))  
  geom_point(data = Data_Below_1st_Percentile, color = 'green')  
  geom_point(data = Data_Above_99th_Percentile, color = 'blue')