I have DF, with unique ID's and TYPE, and I intend, from an interval between dates (DT_STARD and DT_END), to search and count how many times they appear. This is my DF_1
:
ID <- c(111,222,222,333,444,444)
TYPE <- c('A1','A1','B1','B1','A1','B1')
DT_START <- c(as.Date(c('2022/07/20','2022/07/18','2022/07/10','2022/07/05','2022/07/03','2022/07/01') ,"%Y/%m/%d"))
DT_END <- c(as.Date(c('2021/07/20','2021/07/18','2021/07/10','2021/07/05','2021/07/03','2021/07/01') ,"%Y/%m/%d"))
DF_1 <- data.frame(ID,TYPE,DT_START,DT_END)
And I want to look up the date range in that DF_2
:
ID <- c(111,111,111,111,222,222,444,444,444)
TYPE <- c('A1','A1','A1','A1','A1','A1','A1','B1','B1')
DT <- c(as.Date(c('2022/06/01','2022/05/15','2022/01/01','2021/06/01','2022/03/02','2021/12/21','2021/12/29','2022/06/30','2022/06/15') ,"%Y/%m/%d"))
DF_2 <- data.frame(ID,TYPE,DT)
And this is the result I want to get:
ID <- c(111,222,222,333,444,444)
TYPE <- c('A1','A1','B1','B1','A1','B1')
DT_START <- c(as.Date(c('2022/07/20','2022/07/18','2022/07/10','2022/07/05','2022/07/03','2022/07/01') ,"%Y/%m/%d"))
DT_END <- c(as.Date(c('2021/07/20','2021/07/18','2021/07/10','2021/07/05','2021/07/03','2021/07/01') ,"%Y/%m/%d"))
COUNT <- c(3,2,0,0,1,2)
DF_3 <- data.frame(ID,TYPE,DT_START,DT_END,COUNT)
How do I look up and count a from a date range?
CodePudding user response:
You can use left_join
from package dplyr
for this, then creating a indicator and summarise that:
library(dplyr)
ID <- c(111,222,222,333,444,444)
TYPE <- c('A1','A1','B1','B1','A1','B1')
DT_END <- c(as.Date(c('2022/07/20','2022/07/18','2022/07/10','2022/07/05','2022/07/03','2022/07/01') ,"%Y/%m/%d"))
DT_START <- c(as.Date(c('2021/07/20','2021/07/18','2021/07/10','2021/07/05','2021/07/03','2021/07/01') ,"%Y/%m/%d"))
DF_1 <- data.frame(ID,TYPE,DT_START,DT_END)
ID <- c(111,111,111,111,222,222,444,444,444)
TYPE <- c('A1','A1','A1','A1','A1','A1','A1','B1','B1')
DT <- c(as.Date(c('2022/06/01','2022/05/15','2022/01/01','2021/06/01','2022/03/02','2021/12/21','2021/12/29','2022/06/30','2022/06/15') ,"%Y/%m/%d"))
DF_2 <- data.frame(ID,TYPE,DT)
#left join the data frames
left_join(DF_1, DF_2) %>%
#determine if DT is outside of DT_START and DT_END. If so, or NA, assign 0, else assign 1
mutate(in_range = ifelse(is.na(DT) | (DT <= DT_START & DT >= DT_END), yes = 0, no = 1)) %>%
#group by and summarise
group_by(ID, TYPE, DT_START, DT_END) %>%
summarise(COUNT = sum(in_range)) %>%
ungroup %>%
as.data.frame
#> ID TYPE DT_START DT_END COUNT
#> 1 111 A1 2021-07-20 2022-07-20 4
#> 2 222 A1 2021-07-18 2022-07-18 2
#> 3 222 B1 2021-07-10 2022-07-10 0
#> 4 333 B1 2021-07-05 2022-07-05 0
#> 5 444 A1 2021-07-03 2022-07-03 1
#> 6 444 B1 2021-07-01 2022-07-01 2
CodePudding user response:
Here is another solution. The main difference with @VvdL solution is the use of drop
argument inside group_by
to avoid dropping groups without observations when performing the filter
. These groups will later contain zeros in the COUNT
column.
library(dplyr)
DF_1 |>
# Join with DF1 using ID and TYPE columns
left_join(DF_2,
by =c("ID", "TYPE")) |>
# Group by ID and TYPE, DT_End and DT_START so these columns
# are mantained in the summarise
# Add drop = F to avoid removing groups without any observation
# after the filter
group_by(ID, TYPE, DT_END,DT_START, .drop = F) |>
# Filter for condition of interest
filter(DT >= DT_END & DT <= DT_START) |>
# Count
summarise(COUNT = n())
# A tibble: 6 x 5
# Groups: ID, TYPE, DT_END [6]
# ID TYPE DT_END DT_START COUNT
# <dbl> <chr> <date> <date> <int>
#1 111 A1 2021-07-20 2022-07-20 3
#2 222 A1 2021-07-18 2022-07-18 2
#3 222 B1 2021-07-10 2022-07-10 0
#4 333 B1 2021-07-05 2022-07-05 0
#5 444 A1 2021-07-03 2022-07-03 1
#6 444 B1 2021-07-01 2022-07-01 2