I have a data frame with a column that points to the next record, sample dataframe below.
OG_Data <- data.frame(
Record = c("aaaa", "NNNN", "rrrr", "tttt", "pppp", "ssss", "bbbb"),
NextRecord = c("pppp", "tttt", "bbbb", "N/A" , "NNNN", "rrrr", "N/A")
)
# Record NextRecord
# aaaa pppp
# NNNN tttt
# rrrr bbbb
# tttt N/A
# pppp NNNN
# ssss rrrr
# bbbb N/A
I want to order this data frame based on a predefined sequence determined by column B (NextRecord) that points to the next record's column A (Record) to get the sequence order and line group.
Desired Output:
# Record NextRecord Sequence Line
# aaaa pppp 1 1
# pppp NNNN 2 1
# NNNN tttt 3 1
# tttt N/A 4 1
# ssss rrrr 1 2
# rrrr bbbb 2 2
# bbbb N/A 3 2
I was thinking of something like this:
OG_Data[1,] %>%
add_row(OG_Data, filter(OG_Data, OG_Data$Record == NextRecord))
But that doesn't work and is not scalable. Also, I am not sure where to start to find the beginning of the line groups.
CodePudding user response:
I bet there are simpler ways, but at least it's fun to approach it as a graph problem.
library(igraph)
g = graph_from_data_frame(OG_Data)
g2 = sapply(V(g)[degree(g, mode = 'in') == 0], function(v) all_simple_paths(g, v, "N/A"))
d2 = OG_Data[{x = unlist(g2); x[!endsWith(names(x), ".N/A")]},]
d2$Line = rep.int(seq_along(g2), lengths(g2) - 1)
Record NextRecord Line
1 aaaa pppp 1
5 pppp NNNN 1
2 NNNN tttt 1
4 tttt N/A 1
6 ssss rrrr 2
3 rrrr bbbb 2
7 bbbb N/A 2
g2
# $aaaa
# 5/8 vertices, named, from b21b8d2:
# [1] aaaa pppp NNNN tttt N/A
# $ssss
# 4/8 vertices, named, from b21b8d2:
# [1] ssss rrrr bbbb N/A
CodePudding user response:
With cumsum
and lag
:
library(dplyr)
OG_Data %>%
mutate(NextRecord = na_if(NextRecord, "N/A"),
Line = cumsum(lag(is.na(NextRecord), default = T))) %>%
group_by(Line) %>%
mutate(Sequence = row_number())
output
Record NextRecord Line Sequence
<chr> <chr> <int> <int>
1 aaaa pppp 1 1
2 NNNN tttt 1 2
3 rrrr bbbb 1 3
4 tttt NA 1 4
5 pppp NNNN 2 1
6 ssss rrrr 2 2
7 bbbb NA 2 3
CodePudding user response:
A fast and scalable approach:
library(data.table)
seqGroups <- function(firstSeq, nextMatch) {
len <- length(nextMatch)
idxOut <- seqOut <- lineOut <- integer(len)
irow <- 0L
for (i in seq_along(firstSeq)) {
idxOut[irow <- irow 1L] <- firstSeq[i]
seqOut[irow] <- 1L
lineOut[irow] <- i
while (nextMatch[idxOut[irow]]) {
idxOut[irow <- irow 1L] <- nextMatch[idxOut[irow]]
seqOut[irow] <- seqOut[irow - 1L] 1L
lineOut[irow] <- i
if (irow > len) stop(paste("cycle detected at row:", idxOut[irow]))
}
}
list(idx = idxOut, seqLine = list(seqOut, lineOut))
}
with(
with(
OG_Data,
seqGroups(which(!Record %chin% NextRecord), match(NextRecord, Record, 0L))
),
setDT(OG_Data)[idx][, c("Sequence", "Line") := seqLine]
)[]
#> Record NextRecord Sequence Line
#> 1: aaaa pppp 1 1
#> 2: pppp NNNN 2 1
#> 3: NNNN tttt 3 1
#> 4: tttt N/A 4 1
#> 5: ssss rrrr 1 2
#> 6: rrrr bbbb 2 2
#> 7: bbbb N/A 3 2
Timing a much larger table:
OG_Data <- data.table(
Record = paste0(rep(c("aaaa", "NNNN", "rrrr", "tttt", "pppp", "ssss", "bbbb"), 1e6), rep(1:1e6, each = 7)),
NextRecord = paste0(rep(c("pppp", "tttt", "bbbb", "N/A" , "NNNN", "rrrr", "N/A"), 1e6), rep(1:1e6, each = 7))
)
OG_Data$NextRecord[c(seq(4, 7e6, 7), seq(7, 7e6, 7))] <- "N/A"
system.time({
with(
with(
OG_Data,
seqGroups(which(!Record %chin% NextRecord), match(NextRecord, Record, 0L))
),
OG_Data[idx][, c("Sequence", "Line") := seqLine]
)
})
#> user system elapsed
#> 1.96 0.10 2.06