Home > Software engineering >  Is there a clean dplyr-way of doing multiple left-(self)joins?
Is there a clean dplyr-way of doing multiple left-(self)joins?

Time:11-24

I have the following, working, code:

test_hierarchie <- tribble(~child, ~parent,
                "A", "B",
                "B", "C",
                "D", "E"
                )

test_hierarchie_transformed <- test_hierarchie %>%
    left_join(test_hierarchie, by = c("parent" = "child"), suffix = c("", "_grant")) %>%
    left_join(test_hierarchie, by = c("parent_grant" = "child"), suffix = c("", "_grant")) %>%
    left_join(test_hierarchie, by = c("parent_grant_grant" = "child"), suffix = c("", "_grant")) %>%
    left_join(test_hierarchie, by = c("parent_grant_grant_grant" = "child"), suffix = c("", "_grant")) %>%
    left_join(test_hierarchie, by = c("parent_grant_grant_grant_grant" = "child"), suffix = c("", "_grant")) %>%
    pivot_longer(names_to = "relation", cols = contains("parent"), values_to = "parent") %>%
    filter(!is.na(parent))

With result:

# A tibble: 4 x 3
  child relation     parent
  <chr> <chr>        <chr> 
1 A     parent       B     
2 A     parent_grant C     
3 B     parent       C     
4 D     parent       E 

This is the desired result, the large amount of left_joins are there because I'm for the real data not sure what is the maximum hierarchy.

My question is: is there a way to do this more succinct and dynamic? Thanks!

EDIT 1: Yes, I do mean 'grand' instead of 'grant', haha EDIT 2: Great solution, exactly what I was looking for! Thanks everyone for pitching in, the other day I was thinking about another project and iGraph does seem very helpful for that.

CodePudding user response:

Following the suggestion by @zx8754 one option to achieve your desired result would be to do the left_joins via a recursive function which stops when there are no more matches:

library(dplyr)
library(tidyr)

test_hierarchie <- tribble(
  ~child, ~parent,
  "A", "B",
  "B", "C",
  "D", "E"
)

left_join_recursive <- function(x, by) {
  x <- left_join(x, test_hierarchie, by = setNames("child", by), suffix = c("", "_grant"))
  byby <- paste0(by, "_grant")
  if (!all(is.na(x[[byby]]))) {
    left_join_recursive(x, byby)  
  } else {
    x
  }
}

test_hierarchie_transformed <- left_join_recursive(test_hierarchie, "parent") %>%
  pivot_longer(names_to = "relation", cols = contains("parent"), values_to = "parent") %>%
  filter(!is.na(parent))

test_hierarchie_transformed
#> # A tibble: 4 × 3
#>   child relation     parent
#>   <chr> <chr>        <chr> 
#> 1 A     parent       B     
#> 2 A     parent_grant C     
#> 3 B     parent       C     
#> 4 D     parent       E

To check wether the approach works in a more general case I added another row to your example data:

test_hierarchie <- add_row(test_hierarchie, child = "C", parent = "D")

test_hierarchie_transformed <- left_join_recursive(test_hierarchie, "parent") %>%
  pivot_longer(names_to = "relation", cols = contains("parent"), values_to = "parent") %>%
  filter(!is.na(parent))

test_hierarchie_transformed
#> # A tibble: 10 × 3
#>    child relation                 parent
#>    <chr> <chr>                    <chr> 
#>  1 A     parent                   B     
#>  2 A     parent_grant             C     
#>  3 A     parent_grant_grant       D     
#>  4 A     parent_grant_grant_grant E     
#>  5 B     parent                   C     
#>  6 B     parent_grant             D     
#>  7 B     parent_grant_grant       E     
#>  8 D     parent                   E     
#>  9 C     parent                   D     
#> 10 C     parent_grant             E

CodePudding user response:

As was mentioned you can use the igraph package, but it probably only pays off for more complex cases:

library(tidyverse)
library(igraph)

test_hierarchie <- tribble(~child, ~parent,
                           "A", "B",
                           "B", "C",
                           "D", "E"
)

g <- graph_from_data_frame(test_hierarchie)
finals <- V(g)[degree(g, mode = "out") == 0]
starts <- V(g)[!V(g) %in% finals]
#starts <- V(g)[degree(g, mode = "in") == 0] # use this to avoid sub-paths
imap_dfr(starts, 
         ~enframe(all_simple_paths(g, from = starts[[.y]], to = finals)[[1]],
                  name = "parent") %>%
           mutate(child = .y)) %>%
  filter(child != parent) %>%
  select(-value) %>%
  group_by(child) %>%
  mutate(nr = row_number() - 1) %>%
  ungroup() %>%
  mutate(relation = map_chr(nr, ~str_c("parent", str_c(rep("_grant", .x), collapse = "")))) %>%
  select(child, relation, parent)

# # A tibble: 4 x 3
# child relation     parent
# <chr> <chr>        <chr> 
# 1 A     parent       B     
# 2 A     parent_grant C     
# 3 B     parent       C     
# 4 D     parent       E   
  • Related