Filtering conditional on lag values in R-CodePudding

df is a dataframe where each row is a pair of items (from item1 & item2).

I want to keep the 1st row of the dataframe, and then keep only the 1st rows where the previous value of item2 is the current value of item1. So I except my data to look like output.

I would prefer a tidy(or purrr) way of doing so but open to any suggestions.

df <- structure(list(item1 = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 
  2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 6L, 6L, 7L), 
  item2 = c(4L, 5L, 6L, 7L, 8L, 4L, 5L, 6L, 7L, 8L, 4L, 5L, 
  6L, 7L, 8L, 5L, 6L, 7L, 8L, 7L, 8L, 7L, 8L, 8L)), row.names = c(NA, 
  -24L), class = c("tbl_df", "tbl", "data.frame"))
df
#>    item1 item2
#> 1      1     4
#> 2      1     5
#> 3      1     6
#> 4      1     7
#> 5      1     8
#> 6      2     4
#> 7      2     5
#> 8      2     6
#> 9      2     7
#> 10     2     8
#> 11     3     4
#> 12     3     5
#> 13     3     6
#> 14     3     7
#> 15     3     8
#> 16     4     5
#> 17     4     6
#> 18     4     7
#> 19     4     8
#> 20     5     7
#> 21     5     8
#> 22     6     7
#> 23     6     8
#> 24     7     8

output <- data.frame(item1 = c(1,4,5,7),
           item2 = c(4,5,7,8))
output
#>   item1 item2
#> 1     1     4
#> 2     4     5
#> 3     5     7
#> 4     7     8

^{Created on 2022-09-22 by the reprex package (v2.0.1)}

CodePudding user response：

Here's a solution using the tidyverse.

Using a lag(..., default = 1) ensures we also output the first row.

library(tidyverse)

df <- tibble(
  item1 = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 6L, 6L, 7L), 
  item2 = c(4L, 5L, 6L, 7L, 8L, 4L, 5L, 6L, 7L, 8L, 4L, 5L, 6L, 7L, 8L, 5L, 6L, 7L, 8L, 7L, 8L, 7L, 8L, 8L)
)

df %>%
  group_by(item1) %>%
  summarize(item2 = first(item2)) %>%
  filter(item1 == lag(item2, default = 1))
#> # A tibble: 4 × 2
#>   item1 item2
#>   <int> <int>
#> 1     1     4
#> 2     4     5
#> 3     5     7
#> 4     7     8

^{Created on 2022-09-22 by the reprex package (v2.0.1)}

CodePudding user response：

This is probably not what you were looking for (not a very tidy solution), but it yilds the desired output.

library(tidyverse)

df <- data.frame(
  item1 = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 
            2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 6L, 6L, 7L), 
  item2 = c(4L, 5L, 6L, 7L, 8L, 4L, 5L, 6L, 7L, 8L, 4L, 5L, 
            6L, 7L, 8L, 5L, 6L, 7L, 8L, 7L, 8L, 7L, 8L, 8L)
)

my_filter <- function(df_to_find, df_orig){
  value_to_find <- tail(df_to_find, 1)$item2
  df_found <- df_orig %>%
    filter(item1 == value_to_find) %>%
    head(1)
  
  if(nrow(df_found) > 0){
    # if something found, recall this function
    # with the newly found data appended to the old results
    return(Recall(bind_rows(df_to_find, df_found), df_orig))
  } else{
    # once you reach a state when nothing else is found return the results so far
    # this is called recursion in programming
    return(bind_rows(df_to_find))
  }
  
}

^{Created on 2022-09-22 by the reprex package (v2.0.1)}

CodePudding user response：

Here is another untidy and recursive solution:

last2current = function (x) {
  first = x[1, ]
  first_match = with(x, match(item2[1], item1))
  if (is.na(first_match)) return(first)
  other = x[first_match:nrow(x), ]
  rbind(first, last2current(other))
}

last2current(df)

   item1 item2
1      1     4
16     4     5
20     5     7
24     7     8

Explanation:

This is a recursive function, this meaning that it calls itself. It stores the first row, then looks for the first match of item2[1] on item1 and stores the row number in first_match. If there is no first_match it means we are done, so return(). If there is a match then it does the same procedure on the rows from the first match to the end of the data frame. Finally it cbinds all the rows.

Note that this will fail if there is a row where item1 == item2 since item1[1] is included in match.

CodePudding user response：

This won't be directly vectorizable--I would do it with a simple for loop. This will almost certainly be faster than a recursive solution for any sizable data.

keep = logical(length = nrow(df)) 
keep[1] = TRUE
target = df$item2[1]
for(i in 2:nrow(df)) {
  if(df$item1[i] == target) {
    keep[i] = TRUE
    target = df$item2[i]
  }
}
result = df[keep, ]
result
# # A tibble: 4 × 2
#   item1 item2
#   <int> <int>
# 1     1     4
# 2     4     5
# 3     5     7
# 4     7     8

CodePudding user response：

A base R recursion:

relation <- function(df, row){
  if(is.na(row)) head(row, -1)
  else c(row, relation(df, match(df[row, 2], df[,1]))) 
}

# Starting at row 1
df[relation(df, 1), ]

  item1 item2
1      1     4
16     4     5
20     5     7
24     7     8

# Starting at row 2
df[relation(df, 2), ]
   item1 item2
2      1     5
20     5     7
24     7     8

# Starting at row 4
df[relation(df, 4), ]
   item1 item2
4      1     7
24     7     8