Home > Software design >  Filter those have consecutive values at first
Filter those have consecutive values at first

Time:09-29

I want to extract data that have two consecutive unsuppressed for status and regemintype =1. And then I need to calculate the time taken from second Unsuppressed status to first Suppressed status if there is or up to end status in not observed Suppressed after second Unsuppressed status within group id.

df<-data.frame(id=c(42, 42, 42, 42, 215, 215, 215, 215, 297, 297, 297, 297, 297, 310, 310, 310, 317, 317, 317, 317, 318,318,318),
count = c(11411, 16411,  847, 14429,19083, 4487, 21418, 21418, 161711, 161.711,137399, 34326, 150,
            11183, 54927, 236, 44314, 47636, 55128, 128, 800, 1500, 2100),
FollowUpDate = c("17/07/2017", "18/10/2017", "15/02/2018", "19/07/2019", "26/10/2019", "23/12/2019", "01/05/2021",
"05/06/2021","25/08/2017", "08/10/2018", "13/02/2019", "24/11/2020", "10/03/2021", "18/02/2019","27/10/2019",
"23/02/2020","30/07/2019", "26/11/2019", "15/10/2020", "15/12/2020", "15/02/2018", "27/10/2019","15/10/2020" ),
regemintype= c(1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2,2, 1,1,2),
status =c("Unsuppressed", "Unsuppressed", "Suppressed", "Unsuppressed", "Unsuppressed", "Unsuppressed",
          "Unsuppressed","Unsuppressed", "Unsuppressed", "Suppressed","Unsuppressed", "Unsuppressed", "Suppressed",
          "Unsuppressed","Unsuppressed", "Suppressed", "Unsuppressed", "Unsuppressed", "Unsuppressed", "Suppressed",
          "Suppressed","Unsuppressed", "Unsuppressed" ))

Desired Output:

    id    count FollowUpDate regemintype       status   time
  42  11411.000   17/07/2017           1 Unsuppressed
  42  16411.000   18/10/2017           1 Unsuppressed   **18/10/2017-15/02/2018**
  42    847.000   15/02/2018           1   Suppressed
 215  19083.000   26/10/2019           1 Unsuppressed
 215   4487.000   23/12/2019           1 Unsuppressed   **23/12/2019-05/06/2021**
 215  21418.000   01/05/2021           2 Unsuppressed
 215  21418.000   05/06/2021           2 Unsuppressed
 310  11183.000   18/02/2019           1 Unsuppressed
 310  54927.000   27/10/2019           1 Unsuppressed    **27/10/2019-23/02/2020**
 310    236.000   23/02/2020           1   Suppressed
 317  44314.000   30/07/2019           1 Unsuppressed
 317  47636.000   26/11/2019           1 Unsuppressed    **26/11/2019-15/12/2020**
 317  55128.000   15/10/2020           2 Unsuppressed
 317  128.000       15/12/2020         2  Suppressed

Please help ???

CodePudding user response:

A tidyverse option.

library(dplyr)
library(stringr)

df %>%
  group_by(id) %>%
  mutate(row = row_number(),
         stat = cumsum(status == 'Unsuppressed'),
         check = stat == 2 & row == 2,
         keep = T %in% check,
         X = if_else(row == 2 | row == max(row), FollowUpDate, NA_character_),
         Y = str_c(na.omit(X), collapse = '-'),
         time = if_else(row == 2, str_c('**', Y, '**'), NA_character_)) %>%
  ungroup() %>%
  filter(keep == T) %>%
  select(-row, -X, -Y, stat, check, keep)

# A tibble: 15 x 9
#       id count FollowUpDate regemintype status        stat check keep  time                     
#    <dbl> <dbl> <chr>              <dbl> <chr>        <int> <lgl> <lgl> <chr>                    
# 1     42 11411 17/07/2017             1 Unsuppressed     1 FALSE TRUE  NA                       
# 2     42 16411 18/10/2017             1 Unsuppressed     2 TRUE  TRUE  **18/10/2017-19/07/2019**
# 3     42   847 15/02/2018             1 Suppressed       2 FALSE TRUE  NA                       
# 4     42 14429 19/07/2019             1 Unsuppressed     3 FALSE TRUE  NA                       
# 5    215 19083 26/10/2019             1 Unsuppressed     1 FALSE TRUE  NA                       
# 6    215  4487 23/12/2019             1 Unsuppressed     2 TRUE  TRUE  **23/12/2019-05/06/2021**
# 7    215 21418 01/05/2021             2 Unsuppressed     3 FALSE TRUE  NA                       
# 8    215 21418 05/06/2021             2 Unsuppressed     4 FALSE TRUE  NA                       
# 9    310 11183 18/02/2019             1 Unsuppressed     1 FALSE TRUE  NA                       
# 10   310 54927 27/10/2019             1 Unsuppressed     2 TRUE  TRUE  **27/10/2019-23/02/2020**
# 11   310   236 23/02/2020             1 Suppressed       2 FALSE TRUE  NA                       
# 12   317 44314 30/07/2019             1 Unsuppressed     1 FALSE TRUE  NA                       
# 13   317 47636 26/11/2019             1 Unsuppressed     2 TRUE  TRUE  **26/11/2019-15/12/2020**
# 14   317 55128 15/10/2020             2 Unsuppressed     3 FALSE TRUE  NA                       
# 15   317   128 15/12/2020             2 Suppressed       3 FALSE TRUE  NA  

CodePudding user response:

A different approach, also using tidyverse

library(tidyverse)

df %>% 
  group_by(id) %>%
  mutate(
    FollowUpDate = as.Date(FollowUpDate, '%d/%m/%Y'),
    status_lag = lag(status), 
    ident = status == 'Unsuppressed' & status_lag == 'Unsuppressed' & regemintype == 1, 
    time = ifelse(ident, FollowUpDate - lag(FollowUpDate), NA)
  ) %>%
  filter(any(ident)) %>%
  ungroup() %>%
  select(-status_lag, -ident)
  •  Tags:  
  • r
  • Related