Home > OS >  Adding summary statistics labels to box plot using ggplot in R
Adding summary statistics labels to box plot using ggplot in R

Time:12-01

I am trying to add labels that sit above the box plots. For instance, in the example below, instead of NA, I would like the label above A to say "total number of var3 = 11" and the label above B to say "total number of var3 = 34". With my real data, numbers are produced, but they bear no relation to the original data set (I cannot work out how they could possibly have been calculated from the original data, so I must be doing something wrong!).

# Toy data: a two-level grouping column (var1) and two numeric columns.
var1 <- c("A", "B", "A", "B", "B", "B", "A", "B", "B")
var2 <- as.numeric(4:12)
var3 <- as.numeric(seq_len(9))

# Column names are spelled out; they match the variable names above.
df <- data.frame(var1 = var1, var2 = var2, var3 = var3)

# Build the text annotation for a set of y values; used below as the
# `fun.data` argument of stat_summary().
#
# y            numeric vector of y values (var2 in the plot below).
# upper_limit  height reference for the label; defaults to 115% of the
#              largest var2 in the global `df`.
#
# Returns a one-row data.frame with columns `y` (label position, 95% of
# `upper_limit`) and `label` (the annotation text).
stat_box_data <- function(y, upper_limit = max(df$var2) * 1.15) {
  n_obs <- length(y)

  # NOTE(review): `y` holds var2 *values*, yet it is used here as a row index
  # into df$var3. Values beyond nrow(df) index past the end and give NA, which
  # is why the resulting label does not reflect the group's real var3 total.
  var3_total <- sum(df$var3[y])

  label_text <- paste(
    "number of var1 =", n_obs, "\n",
    "total number of var3 =", var3_total
  )

  data.frame(y = 0.95 * upper_limit, label = label_text)
}

# Box plot of var2 by var1 with a text label produced by stat_box_data().
# The layers have to be chained with `+`; without it each call is evaluated
# as a separate statement and nothing is added to the plot.
ggplot(df, aes(var1, var2)) +
  geom_boxplot() +
  stat_summary(
    fun.data = stat_box_data,
    geom = "text",
    hjust = 0.5,
    vjust = 0.9
  )

# Per-group totals of var3, for comparison with the labels on the plot.
df %>%
  group_by(var1) %>%
  summarise(sum = sum(var3))

example

You can automate this a little bit using the following code:

  # Split the data by var1 so each group's var3 total can be looked up later.
  group1 <- filter(df, var1 == "A")
  group2 <- filter(df, var1 == "B")
  
  # Build the text annotation for a set of y values; used below as the
  # `fun.data` argument of stat_summary().
  #
  # y            numeric vector of y values (var2 in the plot below).
  # upper_limit  height reference for the label; defaults to 115% of the
  #              largest var2 in the global `df`.
  # y2           never read by the body; kept only so existing calls that
  #              pass it keep working.
  #
  # Returns a one-row data.frame with columns `y` (label position, 95% of
  # `upper_limit`) and `label` (the annotation text).
  #
  # Limitation: the group is guessed from its size. If `y` has more elements
  # than there are "A" rows in `df`, the total of `group2` is reported,
  # otherwise the total of `group1`. This only works for two groups where "A"
  # is strictly the smaller one.
  stat_box_data <- function(y, upper_limit = max(df$var2) * 1.15,
                            y2 = df[c(1, 3)]) {
    larger_than_a <- sum(df$var1 == "A") < length(y)

    # Scalar condition, so plain if/else instead of ifelse(). An NA condition
    # (NA in df$var1) still yields an NA total rather than an error.
    var3_total <- if (is.na(larger_than_a)) {
      NA
    } else if (larger_than_a) {
      sum(group2$var3)
    } else {
      sum(group1$var3)
    }

    data.frame(
      y = 0.95 * upper_limit,
      label = paste(
        "number of var1 =", length(y), "\n",
        "total number of var3 =", var3_total, "\n"
      )
    )
  }
  
  # Box plot of var2 by var1 with a text label produced by stat_box_data().
  # The layers have to be chained with `+`; without it each call is evaluated
  # as a separate statement and nothing is added to the plot.
  ggplot(df, aes(var1, var2, group = var1)) +
    geom_boxplot() +
    stat_summary(
      fun.data = stat_box_data,
      geom = "text",
      hjust = 0.5,
      vjust = 0.9
    )

CodePudding user response:

You could get the result you want using this rather convoluted method.

library(dplyr)
library(ggplot2)
# Same toy data as in the question: one grouping column, two numeric columns.
var1 <- c("A", "B", "A", "B", "B", "B", "A", "B", "B")
var2 <- as.numeric(4:12)
var3 <- as.numeric(seq_len(9))

df <- data.frame(var1 = var1, var2 = var2, var3 = var3)

# Build the text annotation for a set of y values; used below as the
# `fun.data` argument of stat_summary().
#
# y            numeric vector of y values (var2 in the plot below).
# upper_limit  height reference for the label; defaults to 115% of the
#              largest var2 in the global `df`.
#
# Returns a one-row data.frame with columns `y` (label position, 95% of
# `upper_limit`) and `label` (count of values and the sum of the matching
# var3 entries).
stat_box_data <- function(y, upper_limit = max(df$var2) * 1.15) {
  # Map each y value back to its row in `df` via var2, then read var3 there.
  # match() returns the first matching row only, so this relies on the var2
  # values being unique across the whole data set.
  rows <- match(y, df$var2)
  var3_sum <- sum(df$var3[rows])

  data.frame(
    y = 0.95 * upper_limit,
    # The value is a sum, so it is labelled "sum" (it used to say "mean").
    label = paste(
      "count =", length(y), "\n",
      "sum =", var3_sum, "\n"
    )
  )
}

# Per-group totals of var3 as a plain vector. `d` is not used by the plot
# below; it is only a reference for the values the labels should show.
d <- df %>%
  group_by(var1) %>%
  summarise(sum = sum(var3)) %>%
  pull(sum)

# Box plot of var2 by var1 with a text label produced by stat_box_data().
# The layers have to be chained with `+`; without it each call is evaluated
# as a separate statement and nothing is added to the plot.
ggplot(df, aes(var1, var2)) +
  geom_boxplot() +
  stat_summary(
    fun.data = stat_box_data,
    geom = "text",
    hjust = 0.5,
    vjust = 0.9
  )

enter image description here

  • Related