I am trying to create a Sankey diagram, but the connections turn out all messed up. Here is my data:
#> A tibble: 61 x 3
#> # Groups: id_2 [55]
#> id_2 origin target
#> <int> <fct> <chr>
#> 1 4 Pendlerkort i app Rejsekort
#> 2 38 Ungdomskort Rejsekort
#> 3 84 Rejsekort Rejsekort
#> 4 89 Rejsekort Pendlerkort i app
#> 5 95 Rejsekort Rejsekort
#> 6 112 Rejsekort Pendlerkort Rejsekort
#> 7 118 Pendler20 Rejsekort med pendler kombi
#> 8 140 Pendlerkort i app Rejsekort
#> 9 167 DSB Orange Pendler20
#> 10 193 Rejsekort Rejsekort
#> # ... with 51 more rows
# Sample of the data, built column by column: an integer id (id_2) plus the
# `origin` and `target` label of each row.
sankey_plot <- tibble::tibble(
  id_2 = c(4L, 38L, 84L, 89L, 95L, 112L),
  origin = c(
    "Pendlerkort i app",
    "Ungdomskort",
    "Rejsekort",
    "Rejsekort",
    "Rejsekort",
    "Rejsekort Pendlerkort"
  ),
  target = c(
    "Rejsekort",
    "Rejsekort",
    "Rejsekort",
    "Pendlerkort i app",
    "Rejsekort",
    "Rejsekort"
  )
)
Then I run the following code:
# Node table: every distinct label that appears as an origin or a target
nodes <- data.frame(
  name = unique(
    c(as.character(sankey_plot$origin), as.character(sankey_plot$target))
  )
)

# Collapse to one row per origin/target pair; `values` holds the row count
sankey_plot <- sankey_plot %>%
  group_by(origin, target) %>%
  summarise(values = n()) %>%
  dplyr::ungroup()

# Zero-based position of each link's endpoints within `nodes`
sankey_plot[["IDsource"]] <- match(sankey_plot[["origin"]], nodes[["name"]]) - 1
sankey_plot[["IDtarget"]] <- match(sankey_plot[["target"]], nodes[["name"]]) - 1

# Draw the diagram
sankeyNetwork(
  Links = sankey_plot,
  Nodes = nodes,
  Source = "IDsource",
  Target = "IDtarget",
  Value = "values",
  NodeID = "name",
  # fontSize = 10
  nodeWidth = 20,
  sinksRight = TRUE
)
This gives me a messy plot looking like this:
I have no idea what I am doing wrong. I have tried to create the nodes in a few different ways. I have also tried skipping group_by()
and simply letting each row represent one movement (instead of having a count), but the plot ends up looking the same.
CodePudding user response:
Given the sample data you provided, the loops are expected. For instance, row 3 of the links table below is a link that starts and ends at the same node ("Rejsekort"), and rows 1 and 2 form a cycle: row 2 links "Rejsekort" to "Pendlerkort i app", and row 1 links "Pendlerkort i app" back to "Rejsekort" again.
# Links table as built by the question's code. Row 3 has the same IDsource and
# IDtarget (a self-loop), and rows 1 and 2 connect nodes 0 and 2 in opposite
# directions (a cycle).
sankey_plot
#> # A tibble: 5 × 5
#> origin target values IDsource IDtarget
#> <fct> <chr> <int> <dbl> <dbl>
#> 1 Pendlerkort i app Rejsekort 1 0 2
#> 2 Rejsekort Pendlerkort i app 1 2 0
#> 3 Rejsekort Rejsekort 2 2 2
#> 4 Rejsekort Pendlerkort Rejsekort 1 3 2
#> 5 Ungdomskort Rejsekort 1 1 2
If what you're expecting is that there are different nodes that all have the same name "Rejsekort", then you will have to distinguish those nodes in your data.
library(dplyr)
library(networkD3)

# Sample links: one row per id_2, going from `origin` to `target`
sankey_plot <-
  tibble::tribble(
    ~id_2, ~origin,                 ~target,
    4L,    "Pendlerkort i app",     "Rejsekort",
    38L,   "Ungdomskort",           "Rejsekort",
    84L,   "Rejsekort",             "Rejsekort",
    89L,   "Rejsekort",             "Pendlerkort i app",
    95L,   "Rejsekort",             "Rejsekort",
    112L,  "Rejsekort Pendlerkort", "Rejsekort"
  )

# Give every "Rejsekort" target its own node by appending a running index
# ("Rejsekort_1", "Rejsekort_2", ...), so it no longer shares a node with the
# "Rejsekort" origin.
is_rejsekort <- sankey_plot$target == "Rejsekort"
sankey_plot$target[is_rejsekort] <-
  paste0(sankey_plot$target[is_rejsekort], "_", seq_len(sum(is_rejsekort)))

# Node table: each distinct label, origins first, then any new target labels
nodes <- data.frame(
  name = unique(
    c(as.character(sankey_plot$origin), as.character(sankey_plot$target))
  )
)

# Each row counts as one unit of flow; match() is 1-based, so subtract 1 to
# get zero-based positions of the link endpoints within `nodes`.
sankey_plot <- sankey_plot %>%
  mutate(
    values = 1,
    IDsource = match(origin, nodes$name) - 1,
    IDtarget = match(target, nodes$name) - 1
  )

sankey_plot
#> # A tibble: 6 × 6
#> id_2 origin target values IDsource IDtarget
#> <int> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 4 Pendlerkort i app Rejsekort_1 1 0 4
#> 2 38 Ungdomskort Rejsekort_2 1 1 5
#> 3 84 Rejsekort Rejsekort_3 1 2 6
#> 4 89 Rejsekort Pendlerkort i app 1 2 0
#> 5 95 Rejsekort Rejsekort_4 1 2 7
#> 6 112 Rejsekort Pendlerkort Rejsekort_5 1 3 8

# Draw the diagram
sankeyNetwork(
  Links = sankey_plot,
  Nodes = nodes,
  Source = "IDsource",
  Target = "IDtarget",
  Value = "values",
  NodeID = "name",
  # fontSize = 10
  nodeWidth = 20,
  sinksRight = TRUE
)