Home > database >  Bar chart geom_jitter with the points of the jitter inside the fills of the bar chart
Bar chart geom_jitter with the points of the jitter inside the fills of the bar chart

Time:10-28

I would like to have a bar chart with the distribution of the population I am studying, this bar chart has age as x axis (count is y) and the filling is ethnicity. I'd like to overlay some of the subjects inside their corresponding groups with a geom_scatter. However it creates its own axes. I can't share the data but here's a dummy tibble

df = tribble(
  ~id, ~agegroup, ~ethnicity,
  #--|--|----
  "a", "20s", "African Descent",
  "b", "30s", "White",
  "c", "50s", "White",
  "d", "40s", "Hispanic",
  "e", "20s", "White",
  "f", "30s", "Hispanic",
  "g", "20s", "Hispanic",
  "h", "30s", "White",
  "i", "20s", "African Descent",
  "j", "30s", "White",
  "k", "50s", "White",
  "l", "20s", "White",
  "m", "30s", "Hispanic",
  "n", "20s", "Hispanic",
  "o", "30s", "White",
  
)
df
dmplot <- ggplot(df, aes(x = agegroup, fill = ethnicity ))  
  geom_bar(stat = "count") 
  labs(
    x = "Age Group",
    y = paste0("Population (total = ", df %>% nrow(), ")")) 
  geom_jitter(df,
              aes(x = agegroup,
                  y = ethnicity)) # this is where I would need to retrieve the geom_bar fill location
dmplot

CodePudding user response:

This is a bit hacky, but I think something like this would do it. unfortunately it seems like you cannot assign jitter height to the aestetics, but you might be able to find another way to make height dependent on the height of the rect.

df = tribble(
  ~id, ~agegroup, ~ethnicity,
  #--|--|----
  "a", "20s", "African Descent",
  "b", "30s", "White",
  "c", "50s", "White",
  "d", "40s", "Hispanic",
  "e", "20s", "White",
  "f", "30s", "Hispanic",
  "g", "20s", "Hispanic",
  "h", "30s", "White",
  "i", "20s", "African Descent",
  "j", "30s", "White",
  "k", "50s", "White",
  "l", "20s", "White",
  "m", "30s", "Hispanic",
  "n", "20s", "Hispanic",
  "o", "30s", "White",
  
)
df_2 <- df %>%
  count(agegroup, ethnicity) %>%
  group_by(agegroup ) %>%
  mutate(top_rect = cumsum(n),
         bottom_rect = lag(top_rect, default = 0))

df_2_uncounted <- df_2 %>%
  ungroup() %>%
  uncount(n)


ggplot(df_2)  
  geom_rect( aes(xmin = as.numeric(as.factor(agegroup)) - .45,
                 xmax=  as.numeric(as.factor(agegroup))   .45,
                 ymin = bottom_rect, 
                 ymax = top_rect,
                 fill = ethnicity ))  
  geom_jitter(data = df_2_uncounted,
              aes(x = as.numeric(as.factor(agegroup)),
                  y = (bottom_rect   top_rect)/2),
              width = .3,
              height = .5)  
  scale_x_continuous(breaks = unique(as.numeric(as.factor(df_2$agegroup))), 
                     labels = levels(as.factor(df_2$agegroup)))   
  labs(
    x = "Age Group",
    y = paste0("Population (total = ", df_2_uncounted %>% nrow(), ")"))

enter image description here

Update

now with labels


df_2_uncounted <- df_2 %>%
  ungroup() %>%
  uncount(n)%>%
  arrange(agegroup, ethnicity) %>%
  group_by(agegroup, ethnicity) %>%
  mutate(id2 = 1:n()) %>%
  left_join(df %>%
              arrange(agegroup, ethnicity) %>%
              group_by(agegroup, ethnicity) %>%
              mutate(id2 = 1:n()),
            by = c("agegroup", "ethnicity", "id2"))


ggplot(df_2)  
  geom_rect( aes(xmin = as.numeric(as.factor(agegroup)) - .45,
                 xmax=  as.numeric(as.factor(agegroup))   .45,
                 ymin = bottom_rect, 
                 ymax = top_rect,
                 fill = ethnicity ))  
  geom_jitter(data = df_2_uncounted,
              aes(x = as.numeric(as.factor(agegroup)),
                  y = (bottom_rect   top_rect)/2),
              position = position_jitter(seed = 1, height =0.5))  
  geom_text(data = df_2_uncounted,
            aes(x = as.numeric(as.factor(agegroup)),
                y = (bottom_rect   top_rect)/2,
                label = id),
            position = position_jitter(seed = 1, height =0.5)) 
  scale_x_continuous(breaks = unique(as.numeric(as.factor(df_2$agegroup))), 
                     labels = levels(as.factor(df_2$agegroup)))   
  labs(
    x = "Age Group",
    y = paste0("Population (total = ", df_2_uncounted %>% nrow(), ")"))

enter image description here

  • Related