Home > OS >  R: Trying to remove NAs from a boxplot
R: Trying to remove NAs from a boxplot

Time:02-02

I am trying to draw a basic boxplot, but I can't get rid of the NA values in it. I have tried several tricks for this, for example this one:

# Attempt: drop rows whose sum_variable is NA, then draw the boxplot.
# Only sum_variable is filtered here, so the row with gender = NA is still
# passed to ggplot and shows up as its own "NA" box on the x axis.
ggplot(data = subset(df, !is.na(sum_variable)),
       aes(x = gender, y = sum_variable, fill = gender)) +
  stat_boxplot(geom = "errorbar", width = 0.5) +
  geom_boxplot(fill = "light blue") +
  # `fun` replaces the deprecated `fun.y` argument (ggplot2 >= 3.3.0).
  stat_summary(fun = mean, geom = "point", shape = 10, size = 3.5,
               color = "black") +
  ggtitle("Title") +
  theme_bw() +
  theme(legend.position = "none")

And this ggplot(na.omit(data), aes(x=luse, y=rich))

And none of these solve the issue. What would you recommend?

Data↓

# Example data (dput output): 76 rows with a two-level factor `gender` and a
# numeric `sum_variable`. Both columns contain NA values. The object is a
# rowwise tibble (class "rowwise_df").
structure(list(gender = structure(c(2L, 2L, NA, 2L, 1L, 1L, 
1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 
2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 
1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 
1L, 2L, 1L, 2L, 1L, 1L), .Label = c("1", "2"), class = "factor"), 
    sum_variable = c(9, 6, 13, 3, 4, 3, 12, 2, 7, 8, 7, 4, 5, 
    10, 2, 5, 4, NA, 14, 9, 2, 5, 7, 3, NA, 3, 5, 7, 3, 8, 3, 
    3, 4, 8, 10, 9, 5, 7, 8, 4, 9, NA, 10, 14, 10, 3, 4, 10, 
    3, NA, 5, 3, 4, 4, NA, 5, 4, 6, 6, 9, 6, 2, 3, NA, 4, NA, 
    2, 2, 6, 5, 5, 3, 5, NA, 4, 4)), class = c("rowwise_df", 
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -76L), groups = structure(list(
    .rows = structure(list(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 
        10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 
        21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 
        32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 
        43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 
        54L, 55L, 56L, 57L, 58L, 59L, 60L, 61L, 62L, 63L, 64L, 
        65L, 66L, 67L, 68L, 69L, 70L, 71L, 72L, 73L, 74L, 75L, 
        76L), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, -76L), class = c("tbl_df", 
"tbl", "data.frame")))

CodePudding user response:

You can filter your NA values before creating the plot:

library(dplyr)
library(ggplot2)

# Drop rows with a missing value in either plotted column before plotting.
# Filtering on gender removes the "NA" category from the x axis; filtering on
# sum_variable avoids ggplot2's "removed rows containing non-finite values"
# warning.
df %>%
  filter(!is.na(gender), !is.na(sum_variable)) %>%
  ggplot(aes(x = gender, y = sum_variable, fill = gender)) +
  stat_boxplot(geom = "errorbar", width = 0.5) +
  geom_boxplot(fill = "light blue") +
  # `fun` replaces the deprecated `fun.y` argument (ggplot2 >= 3.3.0).
  stat_summary(fun = mean, geom = "point", shape = 10, size = 3.5,
               color = "black") +
  ggtitle("Title") +
  theme_bw() +
  theme(legend.position = "none")

enter image description here

CodePudding user response:

Try removing NAs first before passing in the dataset.

library(ggplot2)

# na.omit() drops every row that has an NA in any column, so both the
# missing gender and the missing sum_variable rows are removed up front.
# `df` is the data set from the question.
sub_dta <- na.omit(df)

ggplot(data = sub_dta, aes(x = gender, y = sum_variable, fill = gender)) +
  stat_boxplot(geom = "errorbar", width = 0.5) +
  geom_boxplot(fill = "light blue") +
  stat_summary(fun = mean, geom = "point", shape = 10, size = 3.5,
               color = "black") +
  ggtitle("Title") +
  theme_bw() +
  theme(legend.position = "none")

CodePudding user response:

One solution is to use complete.cases

library(ggplot2)

# complete.cases() is TRUE only for rows with no NA in any column, so the
# subset keeps just the fully observed rows.
ggplot(df[complete.cases(df), ],
       aes(x = gender, y = sum_variable, fill = gender)) +
  stat_boxplot(geom = "errorbar", width = 0.5) +
  geom_boxplot(fill = "light blue") +
  # `fun` replaces the deprecated `fun.y` argument (ggplot2 >= 3.3.0).
  stat_summary(fun = mean, geom = "point", shape = 10, size = 3.5,
               color = "black") +
  ggtitle("Title") +
  theme_bw() +
  theme(legend.position = "none")

boxplot

Data

# Example data (dput output) assigned to `df`: 76 rows with a two-level factor
# `gender` and a numeric `sum_variable`. Both columns contain NA values. The
# object is a rowwise tibble (class "rowwise_df").
df <- structure(list(gender = structure(c(2L, 2L, NA, 2L, 1L, 1L, 1L, 
2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 
1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 
2L, 1L, 2L, 1L, 1L), levels = c("1", "2"), class = "factor"), 
    sum_variable = c(9, 6, 13, 3, 4, 3, 12, 2, 7, 8, 7, 4, 5, 
    10, 2, 5, 4, NA, 14, 9, 2, 5, 7, 3, NA, 3, 5, 7, 3, 8, 3, 
    3, 4, 8, 10, 9, 5, 7, 8, 4, 9, NA, 10, 14, 10, 3, 4, 10, 
    3, NA, 5, 3, 4, 4, NA, 5, 4, 6, 6, 9, 6, 2, 3, NA, 4, NA, 
    2, 2, 6, 5, 5, 3, 5, NA, 4, 4)), class = c("rowwise_df", 
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -76L), groups = structure(list(
    .rows = structure(list(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 
        10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 
        21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 
        32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 
        43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 
        54L, 55L, 56L, 57L, 58L, 59L, 60L, 61L, 62L, 63L, 64L, 
        65L, 66L, 67L, 68L, 69L, 70L, 71L, 72L, 73L, 74L, 75L, 
        76L), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, -76L), class = c("tbl_df", 
"tbl", "data.frame")))
  • Related