Select groups with only consecutive runs of a certain value-CodePudding

I have data grouped by 'id', and a column 'x' that can be "yes", "no" or NA.

I want to keep only those 'id' where 'x' (1) contains two "yes", and (2) there are no "no" values between the "yes". NA between the two "yes" is fine.

Some toy data:

data <- data.frame(id = c(1,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,5,5,5,5,5),
                   x = c(NA,'yes',NA,'yes',NA,NA,NA,NA,'yes','yes',NA,'no', 'no',NA,NA,'yes',
                       'no','yes','no','yes','no', 'yes',NA, 'no','yes', 'no'))

   id    x
1   1 <NA>
2   1  yes # 1st yes
3   1 <NA>
4   1  yes # 2nd yes, only NA between, yes is considered as consecutive -> keep group 1 
5   1 <NA>
6   1 <NA>
7   2 <NA>
8   2 <NA>
9   2  yes  # 1st yes
10  2  yes  # 2nd yes, yes is consecutive -> keep group 2  
11  2 <NA>
12  3   no
13  3  yes  # 1st yes
14  3 <NA>
15  3 <NA>
16  3  yes  # 2nd yes -> keep group 3
17  4   no
18  4  yes # 1st yes
19  4   no # "no"
20  4  yes # 2nd yes. a "no" between the two 'yes' -> remove group
21  4   no
22  5  yes  # 1st yes
23  5 <NA>
24  5   no # "no"
25  5  yes # 2nd yes. a "no" between the two 'yes' -> remove group 
26  5   no

Desired Output

1   1 <NA>
2   1  yes
3   1 <NA>
4   1  yes
5   1 <NA>
6   1 <NA>
7   2 <NA>
8   2 <NA>
9   2  yes
10  2  yes
11  2 <NA>
12  3   no
13  3  yes
14  3 <NA>
15  3 <NA>
16  3  yes

id 4 and id 5 should be removed as they do not meet the criteria of two consecutive "yes" values for column 'x' per group 'id', irrespective of NA values between two yes values.

I tried using

data1<-data %>% group_by(id) %>% 
  mutate(x_lag = lag(x), 
         is_two_yes = x == 'yes' & x_lag == 'yes') %>% 
  filter(any(is_two_yes)) %>% 
  select(-is_two_yes,-x_lag)

CodePudding user response：

This relies only on lag and lead. To me it makes sense, since you're only aiming at filtering out id's where a no is lead and followed by two yes.

uneligible <- data %>% filter(!is.na(x)) %>% group_by(id) %>% 
  mutate(prev_x=dplyr::lag(x, default="none"),
         next_x=dplyr::lead(x, default="none"),
         is_uneligible=any(x=="no"&prev_x=="yes"&next_x=="yes")) %>% 
           dplyr::filter(is_uneligible) %>% 
           select(id) %>% unique 

# A tibble: 2 x 1
# Groups:   id [2]
id
<dbl>
  4
  5

result <- data %>% filter(!id %in% uneligible$id)

   id    x
1   1 <NA>
2   1  yes
3   1 <NA>
4   1  yes
5   1 <NA>
6   1 <NA>
7   2 <NA>
8   2 <NA>
9   2  yes
10  2  yes
11  2 <NA>
12  3   no
13  3   no
14  3 <NA>
15  3 <NA>
16  3  yes

CodePudding user response：

data <- data.frame(id = rep(1:5, each = 5),
                   x = c(NA, 'yes', NA, 'yes', NA,
                         NA, NA, NA, 'yes', 'yes',
                         NA, 'no', "yes", NA, 'yes', 
                         'no', 'yes', 'no', 'yes', NA, 
                         'yes', NA, 'no','yes', 'no'))

twoYes <- function(x){
  v <- c()
  cum <- 0
  for (i in x){
    if (i == "yes" & !is.na(i)){
      cum <- cum   1      # if met "yes",  cumulatively   1
      v <- c(v, cum)
    }else{
      if(i == "no" & !is.na(i)){
        cum <- 0          # if met "no",  restore to zero
        v <- c(v, cum)
      }else{
        v <- c(v, cum)    # if met "NA", retain value
      }
    }
  }
  return(v)    # therefore, v > 1 means two continuous "yes" met
}

df <- data |> 
  group_by(id) |> 
  mutate(v = twoYes(x)) |> 
  filter(v > 1)

unique(df$id)       # id: 1, 2, 3 have two continuous "yes"

[1] 1 2 3

CodePudding user response：

You can use data.table's inrange function. Here I chose to do it within dplyr because I'm not a user of data.table

df |> 
  group_by(id) |> 
  filter(any(data.table::inrange(which(x == "yes"), 
                                     which.min(x == "no"), 
                                     which.max(x == "no"))))   |> 
  ungroup()


      id x    
   <dbl> <chr>
 1     1 NA   
 2     1 yes  
 3     1 NA   
 4     1 yes  
 5     1 NA   
 6     1 NA   
 7     2 NA   
 8     2 NA   
 9     2 yes  
10     2 yes  
11     2 NA   
12     3 no   
13     3 no   
14     3 NA   
15     3 NA   
16     3 yes