I am trying to identify the first occurrence of a default episode for a borrower using r. In the example below I have a borrower in dataset called ID. The final column called STAET_LAST_DEFAULT_ID is the output required as a new column in the data. I would appreciate some assistance.
ACCOUNT_ID PERIOD Arrears_Days DPD90 DEFAULT_DATE START_LAST_DEFAULT_ID
#100100 202001 0 0 0
#100100 202002 12 0 0
#100100 202003 0 0 0
#100100 202004 25 0 0
#100100 202005 31 0 0
#100100 202006 65 0 0
#100100 202007 91 1 202007 0
#100100 202008 120 1 0
#100100 202009 90 1 0
#100100 202010 89 0 0
#100100 202011 0 0 0
#100100 202012 0 0 0
#100100 202101 0 0 0
#100100 202102 31 0 0
#100100 202103 61 0 0
#100100 202104 121 1 202104 1 #This is the answer
#100100 202105 130 1 0
#100100 202106 130 1 0
#100100 202107 150 1 0
#100100 202108 120 1 0
#100100 202109 0 0 0
CodePudding user response:
A possible solution:
library(tidyverse)
df <- data.frame(
ACCOUNT_ID = c(100100L,
100100L,100100L,100100L,100100L,100100L,
100100L,100100L,100100L,100100L,100100L,
100100L,100100L,100100L,100100L,100100L,
100100L,100100L,100100L,100100L,100100L),
PERIOD = c(202001L,
202002L,202003L,202004L,202005L,202006L,
202007L,202008L,202009L,202010L,202011L,
202012L,202101L,202102L,202103L,202104L,
202105L,202106L,202107L,202108L,202109L),
Arrears_Days = c(0L,12L,
0L,25L,31L,65L,91L,120L,90L,89L,0L,0L,
0L,31L,61L,121L,130L,130L,150L,120L,
0L),
DPD90 = c(0L,0L,
0L,0L,0L,0L,1L,1L,1L,0L,0L,0L,0L,0L,
0L,1L,1L,1L,1L,1L,0L),
DEFAULT_DATE = c(NA,NA,
NA,NA,NA,NA,202007L,NA,NA,NA,NA,NA,NA,
NA,NA,202104L,NA,NA,NA,NA,NA),
START_LAST_DEFAULT_ID = c(0L,0L,
0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,
0L,1L,0L,0L,0L,0L,0L)
)
df %>%
mutate(aux = data.table::rleid(DPD90)) %>%
group_by(ACCOUNT_ID, aux) %>%
mutate(DEFDATE = if_else(DPD90 == 1, first(PERIOD), NA_integer_)) %>%
group_by(aux) %>%
mutate(DEFDATE = if_else(row_number() == 1, DEFDATE, NA_integer_)) %>%
group_by(ACCOUNT_ID) %>%
mutate(Z=if_else(!is.na(DEFDATE) & DEFDATE==max(DEFDATE, na.rm = T),1,0)) %>%
ungroup %>% select(-aux)
#> ACCOUNT_ID PERIOD Arrears_Days DPD90 DEFAULT_DATE START_LAST_DEFAULT_ID
#> 1 100100 202001 0 0 NA 0
#> 2 100100 202002 12 0 NA 0
#> 3 100100 202003 0 0 NA 0
#> 4 100100 202004 25 0 NA 0
#> 5 100100 202005 31 0 NA 0
#> 6 100100 202006 65 0 NA 0
#> 7 100100 202007 91 1 202007 0
#> 8 100100 202008 120 1 NA 0
#> 9 100100 202009 90 1 NA 0
#> 10 100100 202010 89 0 NA 0
#> 11 100100 202011 0 0 NA 0
#> 12 100100 202012 0 0 NA 0
#> 13 100100 202101 0 0 NA 0
#> 14 100100 202102 31 0 NA 0
#> 15 100100 202103 61 0 NA 0
#> 16 100100 202104 121 1 202104 1
#> 17 100100 202105 130 1 NA 0
#> 18 100100 202106 130 1 NA 0
#> 19 100100 202107 150 1 NA 0
#> 20 100100 202108 120 1 NA 0
#> 21 100100 202109 0 0 NA 0
#> DEFDATE Z
#> 1 NA 0
#> 2 NA 0
#> 3 NA 0
#> 4 NA 0
#> 5 NA 0
#> 6 NA 0
#> 7 202007 0
#> 8 NA 0
#> 9 NA 0
#> 10 NA 0
#> 11 NA 0
#> 12 NA 0
#> 13 NA 0
#> 14 NA 0
#> 15 NA 0
#> 16 202104 1
#> 17 NA 0
#> 18 NA 0
#> 19 NA 0
#> 20 NA 0
#> 21 NA 0