I have two df.
- 't_dates' has sequence of dates.
- 'client_for_gg' has client_id and a start and end date for each client.
The output would tell me for each day in my t_dates$date how many clients had that date fall within their start and end date.
library(tidyverse)
library(lubridate)
t_dates <- seq.Date(from = as.Date('2022-11-01'),
to = as.Date('2022-11-15'),
by = "day") %>%
data.frame(date = .)
client_for_gg <- data.frame(client_id = c("x_555", "x_666", "x_777", "x_888", "x_999")
, start = c("2022-01-01", "2022-01-01", "2022-11-05", "2022-11-07", "2022-11-10")
, end = c("2022-11-03", "2022-11-12", "2022-12-01", "2022-12-01", "2022-12-01")) %>%
mutate(start = as.Date(start)
, end = as.Date(end))
df <- t_dates %>%
mutate(count = sum(as.Date(t_dates$date) %within%
lubridate::interval(client_for_gg$start, client_for_gg$end)))
However, in my output my counts all come to 10 on each day. Please help - thank you.
CodePudding user response:
library(dplyr)
client_for_gg %>%
rowwise() %>%
mutate(count = sum(t_dates$date >= start & t_dates$date <= end ))
# A tibble: 5 x 4
# Rowwise:
client_id start end count
<chr> <date> <date> <int>
1 x_555 2022-01-01 2022-11-03 3
2 x_666 2022-01-01 2022-11-12 12
3 x_777 2022-11-05 2022-12-01 11
4 x_888 2022-11-07 2022-12-01 9
5 x_999 2022-11-10 2022-12-01 6
CodePudding user response:
This worked.
> t_dates %>%
rowwise() %>%
mutate(count = sum(client_for_gg$start <= date & client_for_gg$end >= date))
# A tibble: 15 x 2
# Rowwise:
date count
<date> <int>
1 2022-11-01 2
2 2022-11-02 2
3 2022-11-03 2
4 2022-11-04 1
5 2022-11-05 2
6 2022-11-06 2
7 2022-11-07 3
8 2022-11-08 3
9 2022-11-09 3
10 2022-11-10 4
11 2022-11-11 4
12 2022-11-12 4
13 2022-11-13 3
14 2022-11-14 3
15 2022-11-15 3
CodePudding user response:
library(data.table)
setDT(client_for_gg)[setDT(t_dates)[,d:=date], on =.(start<=d, end>=d)][, .N, date]
Output:
date N
1: 2022-11-01 2
2: 2022-11-02 2
3: 2022-11-03 2
4: 2022-11-04 1
5: 2022-11-05 2
6: 2022-11-06 2
7: 2022-11-07 3
8: 2022-11-08 3
9: 2022-11-09 3
10: 2022-11-10 4
11: 2022-11-11 4
12: 2022-11-12 4
13: 2022-11-13 3
14: 2022-11-14 3
15: 2022-11-15 3
CodePudding user response:
In the devel
version of dplyr
, we can use join_by
which can also do non-equi joins
library(dplyr)
t_dates %>%
full_join(client_for_gg, by =join_by(date >= start, date <= end)) %>%
count(client_id, start, end, name = 'count')
-output
client_id start end count
1 x_555 2022-01-01 2022-11-03 3
2 x_666 2022-01-01 2022-11-12 12
3 x_777 2022-11-05 2022-12-01 11
4 x_888 2022-11-07 2022-12-01 9
5 x_999 2022-11-10 2022-12-01 6