Suppose I have a data frame with the following synthetic data:
> set.seed(1)
>
> patient_id = seq(101, 150)
> admit_date = sample(seq(as.Date('2021/01/01'), as.Date('2021/01/10'), by="day"), 50, replace = T)
> release_date = sample(seq(as.Date('2021/01/11'), as.Date('2021/01/31'), by="day"), 50, replace = T)
> all = data.frame(patient_id, admit_date, release_date)
>
> all
patient_id admit_date release_date
1 101 2021-01-03 2021-01-21
2 102 2021-01-04 2021-01-29
.
.
.
49 149 2021-01-08 2021-01-28
50 150 2021-01-07 2021-01-23
In the example above, the release date is after the admitted data.
I would like to efficiently create a table which counts the number of patients that are present on a given date?
I have the following solution which I would like to improve:
> date_range = sort(unique(c(all$admit_date, all$release_date)))
> count_vector = vector()
>
> for(i in 1:length(date_range)){
> current_date = date_range
> count_vector[i] = length(which((all$admit_date <= date_range[i]) &
> (all$release_date >= date_range[i])) )
> }
>
> date_count = data.frame(date_range, count_vector)
> date_count
date_range count_vector
1 2021-01-01 3
2 2021-01-02 7
3 2021-01-03 12
.
.
.
27 2021-01-29 7
28 2021-01-30 2
29 2021-01-31 1
This solution is fine for small data sets, however is it possible to improve on my existing solution to efficiently obtain the counts required for large data sets?
CodePudding user response:
You can create a vector with sequences of dates for each patient and pivot it (unnest):
set.seed(1)
patient_id = seq(101, 150)
admit_date = sample(seq(as.Date('2021/01/01'),
as.Date('2021/01/10'),
by="day"), 50, replace = T)
release_date = sample(seq(as.Date('2021/01/11'),
as.Date('2021/01/31'),
by="day"), 50, replace = T)
all = data.frame(patient_id, admit_date, release_date)
library(tidyverse)
date_pivot <- all %>%
as_tibble() %>%
mutate(seq_dates = purrr::map2(admit_date,
release_date,
function(x,y)
seq(x,y, by = 1))) %>%
tidyr::unnest(cols = seq_dates)
> date_pivot
# A tibble: 801 × 4
patient_id admit_date release_date seq_dates
<int> <date> <date> <date>
1 101 2021-01-05 2021-01-22 2021-01-05
2 101 2021-01-05 2021-01-22 2021-01-06
3 101 2021-01-05 2021-01-22 2021-01-07
4 101 2021-01-05 2021-01-22 2021-01-08
5 101 2021-01-05 2021-01-22 2021-01-09
6 101 2021-01-05 2021-01-22 2021-01-10
7 101 2021-01-05 2021-01-22 2021-01-11
8 101 2021-01-05 2021-01-22 2021-01-12
9 101 2021-01-05 2021-01-22 2021-01-13
Then it's easier to count number of patients each day as it's one patient per day inline:
count_dates <- date_pivot %>%
group_by(d_date = seq_dates) %>%
summarise(n_pat = n_distinct(patient_id))
> count_dates
# A tibble: 31 × 2
d_date n_pat
<date> <int>
1 2021-01-01 3
2 2021-01-02 6
3 2021-01-03 9
4 2021-01-04 13
5 2021-01-05 19
6 2021-01-06 25
7 2021-01-07 31
8 2021-01-08 34
9 2021-01-09 42
This gives the same output as your code:
> date_range = sort(unique(c(all$admit_date, all$release_date)))
> count_vector = vector()
> for(i in 1:length(date_range)){
current_date = date_range
count_vector[i] = length(which((all$admit_date <= date_range[i]) &
(all$release_date >= date_range[i])) )
}
> count_vector[1:9]
[1] 3 6 9 13 19 25 31 34 42
You can take or not into account admit_date and release_date in your count:
count_dates_inside_visit <- date_pivot %>%
filter(seq_dates > admit_date, seq_dates < release_date) %>%
group_by(d_date = seq_dates) %>%
summarise(n_pat = n_distinct(patient_id))
CodePudding user response:
You can join the two datasets with fuzzyjoin on range and count
occurrence of each date.
library(fuzzyjoin)
library(dplyr)
date_range = sort(unique(c(all$admit_date, all$release_date)))
data.frame(date_range) %>%
fuzzy_inner_join(all,
by = c('date_range' = 'admit_date',
'date_range' = 'release_date'),
match_fun = c(`>=`, `<=`)) %>%
count(date_range)
# date_range n
#1 2021-01-01 3
#2 2021-01-02 6
#3 2021-01-03 9
#4 2021-01-04 13
#5 2021-01-05 19
#6 2021-01-06 25
#7 2021-01-07 31
#8 2021-01-08 34
#9 2021-01-09 42
#10 2021-01-10 50
#11 2021-01-11 50
#...
#...