Calculating overlapping dates in R (dplyr)-CodePudding

I have a dataset of individuals (CSN), each of whom has had anywhere from zero to multiple interventions performed during a hospital admission (in this case, central lines placed), each with a start and an end date. I am trying to generate a new date range that will calculate any overlapping dates. In other words, I'm trying to calculate the total date range when an individual had a central line in place.

Data for example:

structure(list(CSN_id = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), First_day = structure(c(1514937600, 
1514937600, 1515024000, 1515024000, 1515110400, 1515974400, 1516147200, 
1516147200, 1516147200, 1516233600, 1516233600, 1517097600, 1517097600, 
1517702400, 1517356800, 1518220800, 1519257600, 1519948800, 1520812800, 
1521504000, 1522022400), tzone = "UTC", class = c("POSIXct", 
"POSIXt")), Last_day = structure(c(1515628800, 1515110400, 1515542400, 
1515542400, 1515628800, 1516579200, 1516320000, 1517184000, 1516233600, 
1517184000, 1517702400, 1517184000, 1517616000, 1517702400, 1518220800, 
1518825600, 1519689600, 1520812800, 1521763200, 1522108800, 1522108800
), tzone = "UTC", class = c("POSIXct", "POSIXt"))), row.names = c(NA, 
-21L), class = c("tbl_df", "tbl", "data.frame"))

Ideally, the output would return a single date range for all overlapping dates, but if there were a stretch of days that are missed by each, then a new interval would be created. So, for group 1, rows 1-5 would all have start = 2018-01-03 and end = 2018-01-11, but then row 6 would have start = 2018-01-15 and end = 2018-01-22.

I've tried to do the following:

df %>% 
  arrange(CSN_id, First_day) %>% 
  mutate(First_day = ymd(First_day),
         Last_day = ymd(Last_day),
         start = ymd("1970-01-01"),
         end = ymd("1970-01-01")) %>% 
  group_by(CSN_id) %>% 
  rowwise() %>% 
  mutate(test = if_else(row_number() == 1, interval(First_day, Last_day), interval(lag(start), lag(end))),
         start = if_else(row_number() == 1, First_day,
                         if_else(First_day <= lag(end), lag(First_day), First_day)),
         end = if_else(row_number() == 1, Last_day,
                       if_else(Last_day %within% lag(test) == TRUE, lag(end), Last_day)))

However, I don't think the lag function is working as intended, and it always returns Last_day for some reason. I tried getting rid of rowwise, but then the intervals get messed up (persistently stuck in 1970s).

The output I'm getting is:

structure(list(CSN_id = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), First_day = structure(c(17534, 
17534, 17535, 17535, 17536, 17546, 17548, 17548, 17548, 17549, 
17549, 17559, 17559, 17566, 17562, 17572, 17584, 17592, 17602, 
17610, 17616), class = "Date"), Last_day = structure(c(17542, 
17536, 17541, 17541, 17542, 17553, 17550, 17560, 17549, 17560, 
17566, 17560, 17565, 17566, 17572, 17579, 17589, 17602, 17613, 
17617, 17617), class = "Date"), start = structure(c(17534, 17534, 
17535, 17535, 17536, 17546, 17548, 17548, 17548, 17549, 17549, 
17559, 17559, 17566, 17562, 17572, 17584, 17592, 17602, 17610, 
17616), class = "Date"), end = structure(c(17542, 17536, 17541, 
17541, 17542, 17553, 17550, 17560, 17549, 17560, 17566, 17560, 
17565, 17566, 17572, 17579, 17589, 17602, 17613, 17617, 17617
), class = "Date"), test = new("Interval", .Data = c(691200, 
172800, 518400, 518400, 518400, 604800, 172800, 1036800, 86400, 
950400, 1468800, 86400, 518400, 0, 864000, 604800, 432000, 864000, 
950400, 604800, 86400), start = structure(c(1514937600, 1514937600, 
1515024000, 1515024000, 1515110400, 1515974400, 1516147200, 1516147200, 
1516147200, 1516233600, 1516233600, 1517097600, 1517097600, 1517702400, 
1517356800, 1518220800, 1519257600, 1519948800, 1520812800, 1521504000, 
1522022400), class = c("POSIXct", "POSIXt"), tzone = "UTC"), 
    tzone = "UTC")), class = c("rowwise_df", "tbl_df", "tbl", 
"data.frame"), row.names = c(NA, -21L), groups = structure(list(
    CSN_id = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 3L, 3L, 3L, 3L, 
    3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .rows = structure(list(
        1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 
        14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, -21L), class = c("tbl_df", 
"tbl", "data.frame")))

Is there something obvious I'm missing? Any help would be much appreciated!

CodePudding user response：

I'm not exactly sure what your desired output is, but you can try this approach:

dat %>% 
  arrange(CSN_id,First_day,Last_day) %>% 
  group_by(CSN_id, First_day) %>%
  summarize(Last_day=max(Last_day,na.rm=T)) %>% 
  mutate(interval=as.numeric(First_day- lag(Last_day))>0,
         interval=cumsum(if_else(is.na(interval),FALSE,interval)) 1) %>% 
  group_by(CSN_id,interval) %>% 
  summarize(start = min(First_day),
         end = max(Last_day))

Output:

  CSN_id interval start               end                
   <int>    <dbl> <dttm>              <dttm>             
1      1        1 2018-01-03 00:00:00 2018-01-11 00:00:00
2      1        2 2018-01-15 00:00:00 2018-01-22 00:00:00
3      2        1 2018-01-17 00:00:00 2018-01-19 00:00:00
4      3        1 2018-01-17 00:00:00 2018-02-04 00:00:00
5      3        2 2018-02-04 00:00:00 2018-02-04 00:00:00
6      4        1 2018-01-31 00:00:00 2018-02-17 00:00:00
7      4        2 2018-02-22 00:00:00 2018-02-27 00:00:00
8      4        3 2018-03-02 00:00:00 2018-03-27 00:00:00

If you prefer to retain all the original rows, and all the dates are dates and not datetimes, you could also do something like this:

dat %>% 
  mutate(across(First_day:Last_day, ~as.Date(.x))) %>% 
  arrange(CSN_id,First_day,Last_day) %>% 
  group_by(CSN_id) %>%
  mutate(interval=as.numeric(First_day- lag(Last_day))>0,
         interval=cumsum(if_else(is.na(interval),FALSE,interval)) 1) %>% 
  group_by(CSN_id,interval) %>% 
  mutate(start = min(First_day),
            end = max(Last_day))

Output:

   CSN_id First_day  Last_day   interval start      end       
    <int> <date>     <date>        <dbl> <date>     <date>    
 1      1 2018-01-03 2018-01-05        1 2018-01-03 2018-01-11
 2      1 2018-01-03 2018-01-11        1 2018-01-03 2018-01-11
 3      1 2018-01-04 2018-01-10        1 2018-01-03 2018-01-11
 4      1 2018-01-04 2018-01-10        1 2018-01-03 2018-01-11
 5      1 2018-01-05 2018-01-11        1 2018-01-03 2018-01-11
 6      1 2018-01-15 2018-01-22        2 2018-01-15 2018-01-22
 7      2 2018-01-17 2018-01-19        1 2018-01-17 2018-01-19
 8      3 2018-01-17 2018-01-18        1 2018-01-17 2018-02-04
 9      3 2018-01-17 2018-01-29        1 2018-01-17 2018-02-04
10      3 2018-01-18 2018-01-29        1 2018-01-17 2018-02-04
# ... with 11 more rows

CodePudding user response：

Here is another option using the IRanges package on Bioconductor. The collapse_date_ranges function is taken from here, and I just adjusted according

library(data.table)
library(tidyverse)

collapse_date_ranges <- function(w, min.gapwidth = 1L) {
  IRanges::IRanges(start = as.integer(as.Date(w$First_day)), 
                   end = as.integer(as.Date(w$Last_day))) %>% 
    IRanges::reduce(min.gapwidth = min.gapwidth) %>% 
    as.data.table() %>% 
    .[, lapply(.SD, lubridate::as_date),
      .SDcols = c("start", "end")]
}


split(df, df$CSN_id) %>% 
  map(., ~collapse_date_ranges(., 0L)) %>% 
  bind_rows(., .id = 'id')

Output

   id      start        end
1:  1 2018-01-03 2018-01-11
2:  1 2018-01-15 2018-01-22
3:  2 2018-01-17 2018-01-19
4:  3 2018-01-17 2018-02-04
5:  4 2018-01-31 2018-02-17
6:  4 2018-02-22 2018-02-27
7:  4 2018-03-02 2018-03-27

If you want to have this in the original dataframe, then we can join the data back to the original dataframe, then use fill to add the dates to each row.

split(df, df$CSN_id) %>% 
  map(., ~collapse_date_ranges(., 0L)) %>% 
  bind_rows(., .id = 'CSN_id2') %>% 
  data.frame %>% 
  mutate(CSN_id2 = as.integer(CSN_id2)) %>% 
  full_join(df, ., by = c("CSN_id" = "CSN_id2", "First_day" = "start"), keep = TRUE) %>% 
  select(-CSN_id2) %>% 
  group_by(CSN_id) %>% 
  fill(start, end, .direction = "down")

Output

   CSN_id First_day           Last_day            start      end       
    <int> <dttm>              <dttm>              <date>     <date>    
 1      1 2018-01-03 00:00:00 2018-01-11 00:00:00 2018-01-03 2018-01-11
 2      1 2018-01-03 00:00:00 2018-01-05 00:00:00 2018-01-03 2018-01-11
 3      1 2018-01-04 00:00:00 2018-01-10 00:00:00 2018-01-03 2018-01-11
 4      1 2018-01-04 00:00:00 2018-01-10 00:00:00 2018-01-03 2018-01-11
 5      1 2018-01-05 00:00:00 2018-01-11 00:00:00 2018-01-03 2018-01-11
 6      1 2018-01-15 00:00:00 2018-01-22 00:00:00 2018-01-15 2018-01-22
 7      2 2018-01-17 00:00:00 2018-01-19 00:00:00 2018-01-17 2018-01-19
 8      3 2018-01-17 00:00:00 2018-01-29 00:00:00 2018-01-17 2018-02-04
 9      3 2018-01-17 00:00:00 2018-01-18 00:00:00 2018-01-17 2018-02-04
10      3 2018-01-18 00:00:00 2018-01-29 00:00:00 2018-01-17 2018-02-04
# … with 11 more rows