Home > Mobile >  Finding the start of the last occurence of default for a borrower in a data frame using r
Finding the start of the last occurence of default for a borrower in a data frame using r

Time:12-05

I am trying to identify the first occurrence of a default episode for a borrower using r. In the example below I have a borrower in dataset called ID. The final column called STAET_LAST_DEFAULT_ID is the output required as a new column in the data. I would appreciate some assistance.

ACCOUNT_ID  PERIOD  Arrears_Days    DPD90   DEFAULT_DATE    START_LAST_DEFAULT_ID
#100100     202001  0               0                       0
#100100     202002  12              0                       0
#100100     202003  0               0                       0
#100100     202004  25              0                       0
#100100     202005  31              0                       0
#100100     202006  65              0                       0
#100100     202007  91              1       202007          0
#100100     202008  120             1                       0
#100100     202009  90              1                       0
#100100     202010  89              0                       0
#100100     202011  0               0                       0
#100100     202012  0               0                       0
#100100     202101  0               0                       0
#100100     202102  31              0                       0
#100100     202103  61              0                       0
#100100     202104  121             1       202104          1 #This is the answer
#100100     202105  130             1                       0
#100100     202106  130             1                       0
#100100     202107  150             1                       0
#100100     202108  120             1                       0
#100100     202109  0               0                       0

CodePudding user response:

A possible solution:

library(tidyverse)

df <- data.frame(
  ACCOUNT_ID = c(100100L,
                 100100L,100100L,100100L,100100L,100100L,
                 100100L,100100L,100100L,100100L,100100L,
                 100100L,100100L,100100L,100100L,100100L,
                 100100L,100100L,100100L,100100L,100100L),
  PERIOD = c(202001L,
             202002L,202003L,202004L,202005L,202006L,
             202007L,202008L,202009L,202010L,202011L,
             202012L,202101L,202102L,202103L,202104L,
             202105L,202106L,202107L,202108L,202109L),
  Arrears_Days = c(0L,12L,
                   0L,25L,31L,65L,91L,120L,90L,89L,0L,0L,
                   0L,31L,61L,121L,130L,130L,150L,120L,
                   0L),
  DPD90 = c(0L,0L,
            0L,0L,0L,0L,1L,1L,1L,0L,0L,0L,0L,0L,
            0L,1L,1L,1L,1L,1L,0L),
  DEFAULT_DATE = c(NA,NA,
                   NA,NA,NA,NA,202007L,NA,NA,NA,NA,NA,NA,
                   NA,NA,202104L,NA,NA,NA,NA,NA),
  START_LAST_DEFAULT_ID = c(0L,0L,
                            0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,
                            0L,1L,0L,0L,0L,0L,0L)
)

df %>% 
  mutate(aux = data.table::rleid(DPD90)) %>% 
  group_by(ACCOUNT_ID, aux) %>% 
  mutate(DEFDATE = if_else(DPD90 == 1, first(PERIOD), NA_integer_)) %>% 
  group_by(aux) %>%
  mutate(DEFDATE = if_else(row_number() == 1, DEFDATE, NA_integer_)) %>%
  group_by(ACCOUNT_ID) %>% 
  mutate(Z=if_else(!is.na(DEFDATE) & DEFDATE==max(DEFDATE, na.rm = T),1,0)) %>%
  ungroup %>% select(-aux) 

#>    ACCOUNT_ID PERIOD Arrears_Days DPD90 DEFAULT_DATE START_LAST_DEFAULT_ID
#> 1      100100 202001            0     0           NA                     0
#> 2      100100 202002           12     0           NA                     0
#> 3      100100 202003            0     0           NA                     0
#> 4      100100 202004           25     0           NA                     0
#> 5      100100 202005           31     0           NA                     0
#> 6      100100 202006           65     0           NA                     0
#> 7      100100 202007           91     1       202007                     0
#> 8      100100 202008          120     1           NA                     0
#> 9      100100 202009           90     1           NA                     0
#> 10     100100 202010           89     0           NA                     0
#> 11     100100 202011            0     0           NA                     0
#> 12     100100 202012            0     0           NA                     0
#> 13     100100 202101            0     0           NA                     0
#> 14     100100 202102           31     0           NA                     0
#> 15     100100 202103           61     0           NA                     0
#> 16     100100 202104          121     1       202104                     1
#> 17     100100 202105          130     1           NA                     0
#> 18     100100 202106          130     1           NA                     0
#> 19     100100 202107          150     1           NA                     0
#> 20     100100 202108          120     1           NA                     0
#> 21     100100 202109            0     0           NA                     0
#>    DEFDATE Z
#> 1       NA 0
#> 2       NA 0
#> 3       NA 0
#> 4       NA 0
#> 5       NA 0
#> 6       NA 0
#> 7   202007 0
#> 8       NA 0
#> 9       NA 0
#> 10      NA 0
#> 11      NA 0
#> 12      NA 0
#> 13      NA 0
#> 14      NA 0
#> 15      NA 0
#> 16  202104 1
#> 17      NA 0
#> 18      NA 0
#> 19      NA 0
#> 20      NA 0
#> 21      NA 0
  •  Tags:  
  • r
  • Related