Home > other >  Order a data frame based on a column that points to the next record
Order a data frame based on a column that points to the next record

Time:09-10

I have a data frame with a column that points to the next record, sample dataframe below.

OG_Data <- data.frame(
  Record = c("aaaa", "NNNN", "rrrr", "tttt", "pppp", "ssss", "bbbb"),
  NextRecord = c("pppp", "tttt", "bbbb", "N/A" , "NNNN", "rrrr", "N/A") 
)
# Record NextRecord 
#   aaaa       pppp
#   NNNN       tttt       
#   rrrr       bbbb       
#   tttt        N/A        
#   pppp       NNNN       
#   ssss       rrrr       
#   bbbb        N/A        

I want to order this data frame based on a predefined sequence determined by column B (NextRecord) that points to the next record's column A (Record) to get the sequence order and line group.

Desired Output:

# Record  NextRecord  Sequence  Line 
#   aaaa        pppp         1     1    
#   pppp        NNNN         2     1    
#   NNNN        tttt         3     1    
#   tttt         N/A         4     1    
#   ssss        rrrr         1     2    
#   rrrr        bbbb         2     2    
#   bbbb         N/A         3     2    

I was thinking of something like this:

OG_Data[1,] %>%
  add_row(OG_Data, filter(OG_Data, OG_Data$Record == NextRecord))

But that doesn't work and is not scalable. Also, I am not sure where to start to find the beginning of the line groups.

CodePudding user response:

I bet there are simpler ways, but at least it's fun to approach it as a graph problem.

library(igraph)
g = graph_from_data_frame(OG_Data)
g2 = sapply(V(g)[degree(g, mode = 'in') == 0], function(v) all_simple_paths(g, v, "N/A"))
d2 = OG_Data[{x = unlist(g2); x[!endsWith(names(x), ".N/A")]},]
d2$Line = rep.int(seq_along(g2), lengths(g2) - 1)

  Record NextRecord Line
1   aaaa       pppp    1
5   pppp       NNNN    1
2   NNNN       tttt    1
4   tttt        N/A    1
6   ssss       rrrr    2
3   rrrr       bbbb    2
7   bbbb        N/A    2

Then enter image description here

g2
# $aaaa
#   5/8 vertices, named, from b21b8d2:
# [1] aaaa pppp NNNN tttt N/A 

# $ssss
#   4/8 vertices, named, from b21b8d2:
# [1] ssss rrrr bbbb N/A

CodePudding user response:

With cumsum and lag:

library(dplyr)
OG_Data %>%
  mutate(NextRecord = na_if(NextRecord, "N/A"),
         Line = cumsum(lag(is.na(NextRecord), default = T))) %>% 
  group_by(Line) %>% 
  mutate(Sequence = row_number())

output

  Record NextRecord  Line Sequence
  <chr>  <chr>      <int>    <int>
1 aaaa   pppp           1        1
2 NNNN   tttt           1        2
3 rrrr   bbbb           1        3
4 tttt   NA             1        4
5 pppp   NNNN           2        1
6 ssss   rrrr           2        2
7 bbbb   NA             2        3

CodePudding user response:

A fast and scalable approach:

library(data.table)

seqGroups <- function(firstSeq, nextMatch) {
  len <- length(nextMatch)
  idxOut <- seqOut <- lineOut <- integer(len)
  irow <- 0L
  for (i in seq_along(firstSeq)) {
    idxOut[irow <- irow   1L] <- firstSeq[i]
    seqOut[irow] <- 1L
    lineOut[irow] <- i
    while (nextMatch[idxOut[irow]]) {
      idxOut[irow <- irow   1L] <- nextMatch[idxOut[irow]]
      seqOut[irow] <- seqOut[irow - 1L]   1L
      lineOut[irow] <- i
      if (irow > len) stop(paste("cycle detected at row:", idxOut[irow]))
    }
  }
  list(idx = idxOut, seqLine = list(seqOut, lineOut))
}

with(
  with(
    OG_Data,
    seqGroups(which(!Record %chin% NextRecord), match(NextRecord, Record, 0L))
  ),
  setDT(OG_Data)[idx][, c("Sequence", "Line") := seqLine]
)[]
#>    Record NextRecord Sequence Line
#> 1:   aaaa       pppp        1    1
#> 2:   pppp       NNNN        2    1
#> 3:   NNNN       tttt        3    1
#> 4:   tttt        N/A        4    1
#> 5:   ssss       rrrr        1    2
#> 6:   rrrr       bbbb        2    2
#> 7:   bbbb        N/A        3    2

Timing a much larger table:

OG_Data <- data.table(
  Record = paste0(rep(c("aaaa", "NNNN", "rrrr", "tttt", "pppp", "ssss", "bbbb"), 1e6), rep(1:1e6, each = 7)),
  NextRecord = paste0(rep(c("pppp", "tttt", "bbbb", "N/A" , "NNNN", "rrrr", "N/A"), 1e6), rep(1:1e6, each = 7))
)
OG_Data$NextRecord[c(seq(4, 7e6, 7), seq(7, 7e6, 7))] <- "N/A"

system.time({
  with(
    with(
      OG_Data,
      seqGroups(which(!Record %chin% NextRecord), match(NextRecord, Record, 0L))
    ),
    OG_Data[idx][, c("Sequence", "Line") := seqLine]
  )
})
#>    user  system elapsed 
#>    1.96    0.10    2.06
  • Related