I am trying to summarize a UMAP scatter plot of single cell sequencing data with hexagons. As the goal is to simplify very busy clustering results, I am mixing colors for each bin (=hexagon) according to how many cells of each cluster are in the bin. In other words, if there are 2 cells from cluster 1 and 8 from cluster 2, I mix the colors assigned to those clusters in the proportions of the cells. This means I need to assign a specific color to each hexagon.
Please excuse the long code, I tried to shorten it as far as I could.
library(hexbin)
library(ggplot2)
library(tibble)
####################
# helper functions #
####################
#' Determine the most frequent value in a vector
#'
#' @description
#' Changed version of mclust::majorityVote. Ties are broken randomly.
#' Values are tallied with `table()`, so `NA`s are not counted.
#'
#' @param x A vector (a one-column data frame is tallied by its column).
#'
#' @returns
#' A length-one character vector holding the label of the most frequent
#' value of `x` (labels come from `names(table(x))`, hence character).
#' `NA_character_` if `x` has no countable values.
#'
get_majority <- function(x) {
  tally <- table(as.vector(x))
  # Nothing to vote on (empty or all-NA input): max() of an empty table
  # would warn and the lookup below would return character(0).
  if (length(tally) == 0L) {
    return(NA_character_)
  }
  top <- which(as.vector(tally) == max(tally))
  # sample() is only reached for length > 1, so it never expands a single
  # index into 1:index.
  if (length(top) > 1L) {
    top <- sample(top, size = 1L)
  }
  names(tally)[top]
}
###################
# Toy data: 1000 random 2D points, evenly split across five clusters.
umap_coords <- tibble(
  x = rnorm(1000),
  y = rnorm(1000),
  cluster = rep(c(1, 2, 3, 4, 5), 200)
)

# One fixed color per cluster, named by cluster id.
colors <- c("#8DD3C7", "#FFFFB3", "#BEBADA", "#FB8072", "#80B1D3")
names(colors) <- 1:5

# Bin the points into hexagons. IDs = TRUE stores the bin id of every input
# point in hexb@cID, which is what links points back to their hexagon.
hexb <- hexbin::hexbin(
  umap_coords$x,
  umap_coords$y,
  xbins = 10,
  xbnds = range(umap_coords$x),
  ybnds = range(umap_coords$y),
  IDs = TRUE
)

# One row per non-empty hexagon: center (x, y), point count, bin id and
# center of mass (xo, yo).
gghex <- data.frame(
  hexbin::hcell2xy(hexb),
  count = hexb@count,
  cell = hexb@cell,
  xo = hexb@xcm,
  yo = hexb@ycm
)

# Record for every point which hexagon it fell into.
umap_coords$hexbin <- hexb@cID

# Majority cluster of the points inside each hexagon.
gghex$hexclust <- vapply(
  gghex$cell,
  function(cell_id) get_majority(umap_coords$cluster[hexb@cID == cell_id]),
  character(1)
)

# For simplicity, here I assign a fixed color per cluster.
hex_colors <- unname(colors[gghex$hexclust])
gghex$colors <- hex_colors
# I define the data in the geom because I combine it with a scatterplot from
# a different data.frame.
# (The scatter plot is not relevant for the question though.)
# Note: `fill` is passed as a constant vector outside aes() here; this is the
# call whose hexagon colors come out wrong.
p <- ggplot2::ggplot() +
  ggplot2::geom_hex(
    data = gghex,
    mapping = ggplot2::aes(x = x, y = y),
    fill = gghex$colors,
    alpha = 0.8,
    stat = "identity"
  )
p
However, the resulting plot clearly does not assign the colors to the correct hexagons. If I plot the clusters by assigning it inside of aes()
I get a different picture:
# Same layer, but with the majority cluster mapped to `fill` inside aes().
# The plot is stored in `p` so that printing `p` shows this plot rather than
# an earlier one.
p <- ggplot2::ggplot() +
  ggplot2::geom_hex(
    data = gghex,
    mapping = ggplot2::aes(x = x, y = y, fill = hexclust),
    alpha = 0.8,
    stat = "identity"
  )
p
Now, for this particular toy problem I can just assign the colors via scale_fill_manual
:
# Name each color by its cluster so scale_fill_manual() can match colors to
# the `hexclust` values mapped to `fill`.
names(hex_colors) <- gghex$hexclust
ggplot2::ggplot() +
  ggplot2::geom_hex(
    data = gghex,
    mapping = ggplot2::aes(x = x, y = y, fill = hexclust),
    alpha = 0.8,
    stat = "identity"
  ) +
  ggplot2::scale_fill_manual(values = hex_colors, guide = "none")
But remember, in my actual problem, I have to assign each hexagon a specific color. And here geom_hex seems to break down:
# Name each color by its hexagon id and map the (character) hexagon id to
# `fill`, so every hexagon gets its own manual color.
names(hex_colors) <- as.character(gghex$cell)
p <- ggplot2::ggplot() +
  ggplot2::geom_hex(
    data = gghex,
    mapping = ggplot2::aes(x = x, y = y, fill = as.character(cell)),
    alpha = 0.8,
    stat = "identity"
  ) +
  ggplot2::scale_fill_manual(values = hex_colors, guide = "none")
p
As you can see the size of the hexagons suddenly is completely wrong. I read a short suggestion by Hadley to use group = 1
in aes
to make the hexagons aware of each other, but this does not work for me either.
Does anybody have a suggestion on how to get a working plot with geom_hex?
Thanks a lot!
EDIT: The answer by @Allen Cameron solves the question originally posed and I will mark it as the solution if there is no final answer to the edit.
However, I found that if I actually assign unique colors to the data, geom_hex once again produces hexagons of differing sizes:
library(hexbin)
library(ggplot2)
library(tibble)
####################
# helper functions #
####################
#' Determine the most frequent value in a vector
#'
#' @description
#' Changed version of mclust::majorityVote. Ties are broken randomly.
#' Values are tallied with `table()`, so `NA`s are not counted.
#'
#' @param x A vector (a one-column data frame is tallied by its column).
#'
#' @returns
#' A length-one character vector holding the label of the most frequent
#' value of `x` (labels come from `names(table(x))`, hence character).
#' `NA_character_` if `x` has no countable values.
#'
get_majority <- function(x) {
  tally <- table(as.vector(x))
  # Nothing to vote on (empty or all-NA input): max() of an empty table
  # would warn and the lookup below would return character(0).
  if (length(tally) == 0L) {
    return(NA_character_)
  }
  top <- which(as.vector(tally) == max(tally))
  # sample() is only reached for length > 1, so it never expands a single
  # index into 1:index.
  if (length(top) > 1L) {
    top <- sample(top, size = 1L)
  }
  names(tally)[top]
}
#' Mixes cluster colors proportionally to the cluster sizes in one hex bin.
#'
#' @param df data.frame (or tibble) of cells with clusters in `color_by` and
#' assigned hex bin in `hexbin`.
#' @param colors colors to be mixed; must be named by cluster label.
#' @param cell Which hexbin to mix colors in.
#' @param color_by Column name where the clusters/groups are stored in `df`.
#'
#' @returns
#' Mixed color as hex code.
#'
mix_rgb <- function(df, colors, cell, color_by) {
  # 3 x n matrix of RGB channels, one column per entry of `colors`.
  rgbcols <- col2rgb(colors)
  sel <- which(df$hexbin == cell)
  # `[[` extracts the column as a vector for both data.frames and tibbles,
  # so no tibble-specific helper is needed.
  n_clust <- table(as.character(df[[color_by]][sel]))
  prop <- as.numeric(n_clust)
  names(prop) <- names(n_clust)
  prop <- prop / sum(prop)
  # Weight each cluster's color column by its share and sum per channel.
  weighted <- sweep(
    rgbcols[, names(prop), drop = FALSE],
    MARGIN = 2,
    FUN = "*",
    prop
  )
  # Round to whole channel values so the result does not depend on how
  # rgb() converts fractional input.
  channels <- round(rowSums(weighted))
  rgb(
    red = channels["red"],
    green = channels["green"],
    blue = channels["blue"],
    maxColorValue = 255
  )
}
###################
# Toy data: 1000 random 2D points, evenly split across five clusters.
umap_coords <- tibble(
  x = rnorm(1000),
  y = rnorm(1000),
  cluster = rep(c(1, 2, 3, 4, 5), 200)
)

# One base color per cluster, named by cluster id.
colors <- c("#8DD3C7", "#FFFFB3", "#BEBADA", "#FB8072", "#80B1D3")
names(colors) <- 1:5

# Bin the points into hexagons. IDs = TRUE stores the bin id of every input
# point in hexb@cID, which is what links points back to their hexagon.
hexb <- hexbin::hexbin(
  umap_coords$x,
  umap_coords$y,
  xbins = 10,
  xbnds = range(umap_coords$x),
  ybnds = range(umap_coords$y),
  IDs = TRUE
)

# One row per non-empty hexagon: center (x, y), point count, bin id and
# center of mass (xo, yo).
gghex <- data.frame(
  hexbin::hcell2xy(hexb),
  count = hexb@count,
  cell = hexb@cell,
  xo = hexb@xcm,
  yo = hexb@ycm
)

# Record for every point which hexagon it fell into; mix_rgb() reads this
# `hexbin` column.
umap_coords$hexbin <- hexb@cID

# Majority cluster of the points inside each hexagon.
gghex$hexclust <- vapply(
  gghex$cell,
  function(cell_id) get_majority(umap_coords$cluster[hexb@cID == cell_id]),
  character(1)
)

# One individually mixed color per hexagon, weighted by cluster composition.
hex_colors <- vapply(
  gghex$cell,
  function(cell_id) {
    mix_rgb(umap_coords, colors = colors, cell = cell_id, color_by = "cluster")
  },
  character(1)
)
gghex$colors <- hex_colors
# Map the per-hexagon hex codes to `fill` and use them verbatim via
# scale_fill_identity().
ggplot2::ggplot() +
  ggplot2::geom_hex(
    data = gghex,
    mapping = ggplot2::aes(x = x, y = y, fill = colors),
    alpha = 0.8,
    stat = "identity"
  ) +
  ggplot2::scale_fill_identity()
The resulting plot looks as follows:
CodePudding user response:
If you wish to fill each hexagon according to the color
column, you can use scale_fill_identity
:
# Use the hex codes stored in `colors` directly as fill colors.
ggplot(gghex, aes(x, y, fill = colors)) +
  geom_hex(stat = "identity") +
  scale_fill_identity()
We can see that all the colors are the desired ones and match the designated cluster by adding their cluster and color value as strings on the hexagons:
# Same plot, with each hexagon labelled by its color code and cluster so the
# assignment can be checked visually.
ggplot(gghex, aes(x, y, fill = colors)) +
  geom_hex(stat = "identity") +
  geom_text(aes(label = paste(colors, hexclust, sep = "\n")), size = 2.5) +
  scale_fill_identity()
Update
For the edited version of the data, this is where the group = 1
is needed:
# group = 1 puts all hexagons in a single group, even though every hexagon
# has its own fill value.
ggplot(gghex, aes(x, y, fill = colors, group = 1)) +
  geom_hex(stat = "identity") +
  scale_fill_identity()