I have a data frame consisting of three columns and the unique values for status are as follows "X" "0" "C" "1" "2" "3" "4" "5". In the beginning, I do not know how to group by each id and create several columns according to the conditions, for instance, a target column that is 1 if the status is 2, 3, 4, 5, and else is zero.
month_balance represents (The month of the extracted data is the starting point, backwards, 0 is the current month, -1 is the previous month, and so on)
status represents (0: 1-29 days past due, 1: 30-59 days past due, 2: 60-89 days overdue, 3: 90-119 days overdue, 4: 120-149 days overdue, 5: Overdue or bad debts write-offs for more than 150 days C: paid off that month, X: No loan for the month)
df <- data.frame (id = c("5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805"),
month_balance = c("0","-1","-2","-3","-4","-5","-6","-7","-8","-9","-10","-11","-12","-13","-14","-15","0","-1","-2","-3","-4","-5","-6","-7","-8","-9","-10","-11","-12","-13","-14"),
status = c("C","C","C","C","C","C","C","C","C","C","C","C","C","1","0","X","C","C","C","C","C","C","C","C","C","C","C","C","1","0","X")
)
In the end, I want to reach output as below:
df1 <- data.frame (id = c("5008804","5008805"),
month_begin = c("16","15"),
paid_off = c("13","12"),
num_of_pastdues = c("2","2"),
no_loan = c("1","1"),
target = c("0","0"))
CodePudding user response:
A base R solution can be to create a custom function and apply it on each group, i.e.
MyFunction <- function(x){
month_begin = length(x)
paid_off = sum(x == 'C')
num_of_pastdues = sum(x %in% 0:5)
no_loan = sum(x == 'X')
target = ifelse(any(x %in% 2:5), 1, 0)
return(c(month_begin=month_begin, paid_off=paid_off, num_of_pastdues=num_of_pastdues, no_loan=no_loan, target=target))
}
res <- t(sapply(split(df$status, df$id), MyFunction))
month_begin paid_off num_of_pastdues no_loan target
# 5008804 16 13 2 1 0
# 5008805 15 12 2 1 0
To make it a data frame with the column id then,
res_df <- data.frame(res)
res_df$id <- rownames(res_df)
rownames(res_df) <- NULL
res_df
#month_begin paid_off num_of_pastdues no_loan target id
#1 16 13 2 1 0 5008804
#2 15 12 2 1 0 5008805
CodePudding user response:
Not quite sure how to code for target
, as the status for each id appeared a both target 0 and 1 appeared multiple times.
Here's how I constructed for the other variables:
df %>%
group_by(id) %>%
summarise(
month_begin=max(abs(as.numeric(month_balance))) 1,
paid_off=sum(status=="C"),
num_of_pastdues=sum(status %in% 0:5),
no_loan=sum(status=="X"))
# A tibble: 2 x 5
id month_begin paid_off num_of_pastdues no_loan
<chr> <dbl> <int> <int> <int>
1 5008804 16 13 2 1
2 5008805 15 12 2 1
CodePudding user response:
library(tidyverse)
df <- data.frame (id = c("5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805"),
month_balance = c("0","-1","-2","-3","-4","-5","-6","-7","-8","-9","-10","-11","-12","-13","-14","-15","0","-1","-2","-3","-4","-5","-6","-7","-8","-9","-10","-11","-12","-13","-14"),
status = c("C","C","C","C","C","C","C","C","C","C","C","C","C","1","0","X","C","C","C","C","C","C","C","C","C","C","C","C","1","0","X")
) %>%
as_tibble()
df %>%
mutate(target = case_when(status %in% c(2, 3, 4, 5) ~ 1,
TRUE ~ 0),
paid_off = case_when(status == "C" ~ 1,
TRUE ~ 0),
no_loan = case_when(status == "X" ~ 1,
TRUE ~ 0)) %>%
group_by(id) %>%
summarise(month_begin = n(),
across(c(paid_off, no_loan, target), sum))
#> # A tibble: 2 x 5
#> id month_begin paid_off no_loan target
#> <chr> <int> <dbl> <dbl> <dbl>
#> 1 5008804 16 13 1 0
#> 2 5008805 15 12 1 0
Created on 2022-06-29 by the reprex package (v2.0.1)
CodePudding user response:
You can try using dplyr. First you can create the variables with the conditions you want, then you can use summarize to count the times your condition was met per group.
df <- df %>%
mutate(num_of_pastdues = case_when(
status %in% c(2,3,4,5) ~ 1,
TRUE ~ 0
)) %>%
mutate(no_loan = case_when(
status == "X" ~ 1,
TRUE ~ 0
)) %>%
mutate(paid_off = case_when(
status == "C" ~ 1,
TRUE ~ 0
)) %>%
group_by(id) %>%
summarise(num_of_pastdues = sum(num_of_pastdues), no_loan = sum(no_loan), paid_off = sum(paid_off))