How do I filter on a rolling basis?-CodePudding

How do I conditionally filter/select relevant observations on a rolling basis?

Groups 1 to 52 are the baseline.

Then in Groups 53, I want to filter out all the IDs that appeared in Groups 1 to 52
Then for Groups 54, I want to filter out all the IDs that appear in Groups 2 to 53
Then for Groups 55, I want to filter out all IDs that appear from Groups 3 to 54
And so on and so forth.basically the data set has Groups and an Id, and im trying to select the relevant IDs.

The code below manually creates an example dataset in which final_example_data is the starting output and expected_output is the expected output.

 
example_data <- data.frame(Groups = 1:55,
                           ID = 1)
`%!in%` = Negate(`%in%`)
example_data <-
  example_data %>%
  filter(Groups %in% c(1,4, 7 , 10, 11, 15, 44, 52))
 
example_data2 <- data.frame(Groups = 1:55,
                            ID = 2)
 
example_data2 <-
  example_data2 %>%
  filter(Groups %in% c(1,3,5,7,8,11,15,44,33,55,41))
 
example_data3 <- data.frame(Groups = 1:55,
                            ID = 7)
 
example_data3 <-
  example_data3 %>%
  filter(Groups %in% c(53))
 
example_data4 <-
  data.frame(Groups = 1:55,
             ID = 4) %>%
  filter(Groups == 54)
 
example_data5 <-
  data.frame(Groups = c(1:55), ID = 0) %>%
  filter(Groups %in% c(53,54,55))
 
final_example_data <- rbind(example_data,
                            example_data2,
                            example_data3,
                            example_data4,
                            example_data5)
 
# so this would show that ID 1 is present from Groups 1 to 52, ID 2 is present from Groups 1 to 52, and ID 3 is NOT present from Groups 1 to 52...
 
no_present_in_1_52 <-
  final_example_data %>%
  filter(ID %in% c(7, 0)) %>%
  filter(Groups <= 53)
 
# now which are not present in 2 to 53 but are present in 54
not_present_in_Groups_2_53 <-
  final_example_data %>%
  filter(ID == 4)
 
not_present_in_Groups3_to_54 <-
  final_example_data %>%
  filter(Groups > 54) #but you can see they are present in Groups 3 to 54 visually so they are not included, so nothing for final output for Groups 55
 
expected_output <- rbind(not_present_in_Groups_2_53,no_present_in_1_52)

CodePudding user response：

You may try this tidyverse approach -

library(dplyr)
library(purrr)

baseline <- 52
map_df((baseline   1):max(final_example_data$Groups), ~final_example_data %>%
      filter(!ID %in% ID[Groups < .x], Groups <= .x)) 

#  Groups ID
#1     53  7
#2     53  0
#3     54  4

where

(baseline   1):max(final_example_data$Groups) #returns
#[1] 53 54 55

CodePudding user response：

I renamed final_example_data as fed for clarity:

data.table

library(data.table)

setDT(fed)[,.SD[ID %in% fed[Groups<.BY$Groups, ID]], by=Groups][Groups>52]

   Groups ID
1:     53  7
2:     53  0
3:     54  4

Or base R

Identify the group values beyond your baseline

target_groups = unique(fed$Groups[fed$Groups>52])

loop over them, each time checking if the IDs for that group are in the IDs for any group less than that group; row bind the resulting list of data.frames

do.call(rbind, (lapply(target_groups, function(x) {
  id <- fed$ID[fed$Groups==x]
  id <- id[!id %in% fed$ID[fed$Groups<x]]
  if(length(id)>0) return(data.frame(Group = x,ID = id))
})))

Output: