Create Flow chart in R-CodePudding

I am trying to make either a multi node Sankey or Alluvial plot whichever is more appropriate.

The output would be similar to this which is from the ggalluvial packgage vignette here

The difference would be that my time_period would be on the x axis and my source column would replace the survey responses. I also tried the Sankey plot from the networkD3 package for a single time period to get results similar to below from the vignette here

Which would be a good compromise if I cant visualize all the time periods but it did not work either. My sample data and code is below. Thanks

Data

  dat = structure(list(time_period = c("1 -> 2", "1 -> 2", "1 -> 2", 
"1 -> 2", "1 -> 2", "1 -> 2", "1 -> 2", "1 -> 2", "1 -> 2", "1 -> 2", 
"1 -> 2", "1 -> 2", "1 -> 2", "1 -> 2", "1 -> 2", "1 -> 2", "2 -> 3", 
"2 -> 3", "2 -> 3", "2 -> 3", "2 -> 3", "2 -> 3", "2 -> 3", "2 -> 3", 
"2 -> 3", "2 -> 3", "2 -> 3", "2 -> 3", "2 -> 3", "2 -> 3", "2 -> 3", 
"2 -> 3", "3 -> 4", "3 -> 4", "3 -> 4", "3 -> 4", "3 -> 4", "3 -> 4", 
"3 -> 4", "3 -> 4", "3 -> 4", "3 -> 4", "3 -> 4", "3 -> 4", "3 -> 4", 
"3 -> 4", "3 -> 4", "3 -> 4", "4 -> 5", "4 -> 5", "4 -> 5", "4 -> 5", 
"4 -> 5", "4 -> 5", "4 -> 5", "4 -> 5", "4 -> 5", "4 -> 5", "4 -> 5", 
"4 -> 5", "4 -> 5", "4 -> 5", "4 -> 5", "4 -> 5"), source = c("A", 
"A", "A", "A", "B", "B", "B", "B", "C", "C", "C", "C", "D", "D", 
"D", "D", "A", "A", "A", "A", "B", "B", "B", "B", "C", "C", "C", 
"C", "D", "D", "D", "D", "A", "A", "A", "A", "B", "B", "B", "B", 
"C", "C", "C", "C", "D", "D", "D", "D", "A", "A", "A", "A", "B", 
"B", "B", "B", "C", "C", "C", "C", "D", "D", "D", "D"), target = c("A", 
"B", "C", "D", "A", "B", "C", "D", "A", "B", "C", "D", "A", "B", 
"C", "D", "A", "B", "C", "D", "A", "B", "C", "D", "A", "B", "C", 
"D", "A", "B", "C", "D", "A", "B", "C", "D", "A", "B", "C", "D", 
"A", "B", "C", "D", "A", "B", "C", "D", "A", "B", "C", "D", "A", 
"B", "C", "D", "A", "B", "C", "D", "A", "B", "C", "D"), count = c(200573, 
27490, 869, 11330, 22136, 208721, 243, 921, 1552, 266, 97647, 
489, 9644, 743, 491, 62900, 179754, 23188, 1111, 9760, 27824, 
193337, 228, 769, 858, 159, 83213, 330, 10410, 869, 474, 54946, 
188765, 30850, 973, 9485, 22181, 196101, 218, 1012, 1482, 292, 
91553, 392, 9989, 724, 431, 50766, 201313, 25308, 1095, 10801, 
25842, 206138, 246, 836, 1199, 210, 94152, 362, 8414, 624, 457, 
55365)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-64L))

Code:

library(networkD3)
library(tidyverse)

  
 dat  = dat %>% filter(time_period == '4 -> 5') 
 
 
 nodes <- data.frame(name=c(as.character(dat$source), as.character(dat$target)) %>% unique())
  
 
 dat$IDsource=match(dat$source, nodes$name)-1 
 dat$IDtarget=match(dat$target, nodes$name)-1
  
      
  # Make the Network
  sankeyNetwork(Links = dat, Nodes = nodes,
                Source = "IDsource", Target = "IDtarget",
                Value = "count", NodeID = "name", 
                sinksRight=FALSE, nodeWidth=40, fontSize=13, nodePadding=20)

CodePudding user response：

I gave it a go using ggalluvial. Each section is plotted quite independently. The height difference between borders can be fixed by adding empty rows.

library(ggalluvial)
library(dplyr)

head(dat)
# time_period source target  count
#  <chr>       <chr>  <chr>   <dbl>
#1 1 -> 2      A      A      200573
#2 1 -> 2      A      B       27490
#3 1 -> 2      A      C         869
#4 1 -> 2      A      D       11330
#5 1 -> 2      B      A       22136
#6 1 -> 2      B      B      208721

# re-format into long data format
dat2 = NULL
for (i in 1:4) {
  tem <- dat %>%
    filter(time_period == paste0(i," -> ", i 1)) %>%
    {data.frame(
      alluvium = paste(i, c(seq(nrow(.)), 
                            seq(nrow(.)))), # same alluvium for each pair
      time = c(rep(i ifelse(i>1,0.0001,0), nrow(.)), 
               rep(i 1, nrow(.))), #  0.0001 to avoid overlapping with previous section
      count = c(.$count, .$count),
      group = c(.$source, .$target)
    )
    }
  dat2 <- rbind(dat2, tem)
}

# padding missing values to match borders between sections
for (i in 1:4) {
  # right side missing
  if (i < 4) {
    count_this <- dat %>%
      filter(time_period == paste0(i," -> ", i 1)) %>%
      group_by(target) %>%
      summarise(count = sum(count)) %>%
      .$count
    
    count_right <- dat %>%
      filter(time_period == paste0(i 1," -> ", i 2)) %>%
      group_by(source) %>%
      summarise(count = sum(count)) %>%
      .$count
    
    count_max <- pmax(count_this, count_right)
    
    dat2 = dat2 %>%
      rbind(., data.frame(
        alluvium = paste(i, "right_missing", seq(4)),
        time = c(rep(i 1, 4)),
        count = count_max - count_this, # padding if less count than the right side
        group = c("A", "B", "C", "D")
      )
      )
  }

  # left side missing
  if (i > 1) {
    count_left <- dat %>%
      filter(time_period == paste0(i-1," -> ", i)) %>%
      group_by(target) %>%
      summarise(count = sum(count)) %>%
      .$count
    
    count_this <- dat %>%
      filter(time_period == paste0(i," -> ", i 1)) %>%
      group_by(source) %>%
      summarise(count = sum(count)) %>%
      .$count
    
    count_max <- pmax(count_left, count_this)
    
    dat2 = dat2 %>%
      rbind(., data.frame(
        alluvium = paste(i, "left_missing", seq(4)),
        time = c(rep(i 0.0001, 4)),
        count = count_max - count_this,  # padding if less count than the left side
        group = c("A", "B", "C", "D")
      )
      )
  }
}

dat2 %>%
  filter(count>0) %>%
  ggplot(aes(x = time, y = count, stratum = group, alluvium = alluvium))  
  geom_flow(aes(fill = group))  
  geom_stratum(data = dat2[dat2$time%%1==0,], aes(fill = group), alpha = 0.3)  
  geom_text(data = dat2[dat2$time%%1==0,], stat = "stratum", aes(label = after_stat(stratum)))  
  theme_bw()