grouped data new column before and after-CodePudding

I have a grouped dataset with patients and a condition column (Caffeinefactor) with yes and no. I now want to create a new column that basically divides Patient in before and after the condition was yes for the first time. So if the Patient has the condition no,no,no,no,yes,yes,no,no,yes and I would like to ne column to look like this: before, before, before, before, after, after, after, after, after.

the first rows of my datagram look like this:

DF = structure(list(PATIENT.ID = c(210625L, 210625L, 210625L, 210625L, 
210625L, 210625L, 210625L, 210625L, 210625L, 210625L, 210625L, 
210625L, 210625L, 210625L, 210625L, 210625L, 210625L, 220909L, 
220909L, 220909L, 220909L, 220909L, 220909L, 220909L, 220909L, 
220909L, 220909L, 221179L, 221179L, 221179L, 221179L, 221179L, 
221179L, 221179L, 221179L, 221179L, 221179L, 221179L, 221179L, 
221179L, 221179L, 301705L, 301705L, 301705L, 301705L, 301705L, 
301705L, 301705L, 301705L, 301705L, 301705L, 301705L, 301705L, 
301705L, 301705L, 301705L, 303926L, 303926L, 303926L, 303926L
), PATIENT.TREATMENT.NUMBER = c(1L, 2L, 4L, 5L, 6L, 7L, 8L, 9L, 
10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 1L, 2L, 3L, 4L, 
5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 
10L, 11L, 12L, 13L, 17L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 
10L, 12L, 13L, 14L, 15L, 16L, 1L, 2L, 3L, 4L), Caffeinefactor = c("no", 
"no", "no", "no", "yes", "yes", "yes", "no", "yes", "yes", "yes", 
"yes", "yes", "no", "no", "yes", "yes", "yes", "yes", "yes", 
"yes", "yes", "yes", "yes", "yes", "yes", "no", "no", "no", "no", 
"no", "no", "no", "no", "no", "no", "yes", "yes", "yes", "yes", 
"yes", "no", "no", "no", "no", "no", "no", "yes", "no", "yes", 
"yes", "yes", "yes", "yes", "yes", "yes", "no", "no", "no", "no"
)), row.names = c(NA, -60L), class = c("data.table", "data.frame"
), .internal.selfref = <pointer: 0x7fe7f7002ee0>)

I would have started like this:

daf1 <- df %>% 
  setDT(df) %>% 
  dplyr::mutate(row_id = row_number()) %>% 
  dplyr::group_by(PATIENT.ID) %>% 
  dplyr::mutate(first_yes = cumsum(Caffeinefactor == "yes"), 
         last_no = (Caffeinefactor == "no") * row_number(), 
         last_no = ifelse(first_yes == 0, last_no, 0))

CodePudding user response：

You could do:

library(dplyr)

DF %>% 
  group_by(PATIENT.ID) %>%
  mutate(new_col = c("before", "after")[sign(cumsum(Caffeinefactor == "yes"))   1])
#> # A tibble: 60 x 4
#> # Groups:   PATIENT.ID [5]
#>    PATIENT.ID PATIENT.TREATMENT.NUMBER Caffeinefactor new_col  
#>         <int>                    <int> <chr>          <chr> 
#>  1     210625                        1 no             before
#>  2     210625                        2 no             before
#>  3     210625                        4 no             before
#>  4     210625                        5 no             before
#>  5     210625                        6 yes            after 
#>  6     210625                        7 yes            after 
#>  7     210625                        8 yes            after 
#>  8     210625                        9 no             after 
#>  9     210625                       10 yes            after 
#> 10     210625                       11 yes            after 
#> # ... with 50 more rows

^{Created on 2022-03-23 by the reprex package (v2.0.1)}

CodePudding user response：

library(purrr)

DF |>
  group_by(PATIENT.ID) |>
  mutate(brk = tail(accumulate(Caffeinefactor, ~{
    if (.y == "yes")
      "after"
    else .x
  }, .init = "before"),
  -1))

##># Groups:   PATIENT.ID [5]
##>   PATIENT.ID PATIENT.TREATMENT.NUMBER Caffeinefactor brk   
##>        <int>                    <int> <chr>          <chr> 
##> 1     210625                        1 no             before
##> 2     210625                        2 no             before
##> 3     210625                        4 no             before
##> 4     210625                        5 no             before
##> 5     210625                        6 yes            after 
##> 6     210625                        7 yes            after 
##> 7     210625                        8 yes            after 
##> 8     210625                        9 no             after 
##> 9     210625                       10 yes            after 
##>10     210625                       11 yes            after 
##># … with 50 more rows