R (dplyr): find all rows in row-specific range-CodePudding

I have a dataset where each row is identified by a hospitalization id. Each row contains information on the hospitalization id, hospital id where it took place as well as date of admission and date of discharge of the given hospitalization.

I would like to know, for each hospitalization, the id of all other hospitalizations concluded in the 30 days before the beginning of the given hospitalization in the given hospital where it took place.

Below is a simple example of 6 hospitalizations taking place in 2 hospitals.

library("tidyverse")

df <- data.frame(hospitalization_id = c(1, 2, 3, 
                                        4, 5, 6),
                 hospital_id = c("A", "A", "A", 
                                 "B", "B", "B"),
                 date_start = as.Date(c("2000-01-01", "2000-01-12", "2000-01-20",
                                        "2000-02-10", "2000-02-12", "2000-02-12")),
                 date_end = as.Date(c("2000-01-03", "2000-01-18", "2000-01-22",
                                      "2000-02-11", "2000-02-14", "2000-01-17")))

I created the dates in the 30-day interval prior to the beginning of each given hospitalization.

df_with_date_range <- df %>%
  mutate(date_range1 = date_start - 31,
         date_range2 = date_start - 1)

I am trying to write a code that adds a column with all the hospitalizations that took place in this 30-day interval range. My desired output is below:

df_final <- df_with_date_range %>%
  filter(hospitalization_id == 3) %>%
  bind_rows(df_with_date_range) %>%
  arrange(hospitalization_id) %>%
  mutate(hospitalization_id_in_range = c(NA, 1, 1, 2, NA, 4, 4))

In hospital A:

there was no hospitalization in the 30-day period before hospitalization 1
hospitalization 1 took place in the 30-day period before hospitalization 2
hospitalization 1 & 2 took place in the 30-day period before hospitalization 3

In hospital B:

there was no hospitalization in the 30-day period before hospitalization 4
hospitalization 4 took place in the 30-day period before hospitalization 5
hospitalization 4 took place in the 30-day period before hospitalization 6

CodePudding user response：

A tidy solution involving rowwise filtering of the original data frame and the between function:

df_with_date_range %>%
  rowwise() %>%
  mutate(hospital_id_in_range = pmap(list(date_range1, date_range2, hospital_id),
                   function(x, y, z) ungroup(filter(rowwise(df_with_date_range),
                                                 between(date_start, x, y),
                                                 hospital_id == z))$hospitalization_id)) %>%
  unnest(hospital_id_in_range, keep_empty = TRUE)

# # A tibble: 7 × 7
#   hospitalization_id hospital_id date_start date_end   date_range1 date_range2 hospital_id_in_range
#                <dbl> <chr>       <date>     <date>     <date>      <date>                     <dbl>
# 1                  1 A           2000-01-01 2000-01-03 1999-12-01  1999-12-31                    NA
# 2                  2 A           2000-01-12 2000-01-18 1999-12-12  2000-01-11                     1
# 3                  3 A           2000-01-20 2000-01-22 1999-12-20  2000-01-19                     1
# 4                  3 A           2000-01-20 2000-01-22 1999-12-20  2000-01-19                     2
# 5                  4 B           2000-02-10 2000-02-11 2000-01-10  2000-02-09                    NA
# 6                  5 B           2000-02-12 2000-02-14 2000-01-12  2000-02-11                     4
# 7                  6 B           2000-02-12 2000-01-17 2000-01-12  2000-02-11                     4

CodePudding user response：

With data.table:

library(data.table)

setDT(df)
df[,from:=date_start-30]

df[df,.(hospital_id,
        hospitalization_id = i.hospitalization_id, 
        date_start = i.date_start,
        date_end = i.date_end,
        prev_hospitalization_id = x.hospitalization_id,
        prev_date_start = x.date_start),
      on=.(hospital_id,date_start>=from,date_start<date_start)]

   hospital_id hospitalization_id date_start   date_end prev_hospitalization_id prev_date_start
        <char>              <num>     <Date>     <Date>                   <num>          <Date>
1:           A                  1 2000-01-01 2000-01-03                      NA            <NA>
2:           A                  2 2000-01-12 2000-01-18                       1      2000-01-01
3:           A                  3 2000-01-20 2000-01-22                       1      2000-01-01
4:           A                  3 2000-01-20 2000-01-22                       2      2000-01-12
5:           B                  4 2000-02-10 2000-02-11                      NA            <NA>
6:           B                  5 2000-02-12 2000-02-14                       4      2000-02-10
7:           B                  6 2000-02-12 2000-01-17                       4      2000-02-10

The x and i prefixes refer to the X[i,j,by] syntax specific to data.table.
This means in the example above that x is the outer df and i the inner df.

CodePudding user response：

A solution using dplyr and base. Relies on cur_data() and uncount.

# functions ---------------------------------------------------------------

get_previous_ids <- function(df) {
  sapply(1:nrow(df), function(i) df$hospitalization_id[df$date_start>=df$date_range1[i] & df$date_start<=df$date_range2[i]])
}

add_nas <- function(mylist) {
  lapply(mylist, function(x) {
    res <- x
    if(length(x)==0) res <- NA
    res
  })
}

# mutations ---------------------------------------------------------------

res <- df_with_date_range %>% group_by(hospital_id) %>%
  mutate(prev_id = add_nas(get_previous_ids(cur_data())), 
         nb_ids= sapply(prev_id, length))
         
res %>% ungroup() %>% uncount(nb_ids) %>% group_by(hospitalization_id) %>% 
  mutate(rowid = row_number(), hospitalization_id_in_range=prev_id[[1]][rowid])

    # A tibble: 7 x 9
    # Groups:   hospitalization_id [6]
      hospitalization_id hospital_id date_start date_end   date_range1 date_range2 prev_id   rowid hospitalization_id~
                   <dbl> <chr>       <date>     <date>     <date>      <date>      <list>    <int>               <dbl>
    1                  1 A           2000-01-01 2000-01-03 1999-12-01  1999-12-31  <lgl [1]>     1                  NA
    2                  2 A           2000-01-12 2000-01-18 1999-12-12  2000-01-11  <dbl [1]>     1                   1
    3                  3 A           2000-01-20 2000-01-22 1999-12-20  2000-01-19  <dbl [2]>     1                   1
    4                  3 A           2000-01-20 2000-01-22 1999-12-20  2000-01-19  <dbl [2]>     2                   2
    5                  4 B           2000-02-10 2000-02-11 2000-01-10  2000-02-09  <lgl [1]>     1                  NA
    6                  5 B           2000-02-12 2000-02-14 2000-01-12  2000-02-11  <dbl [1]>     1                   4
    7                  6 B           2000-02-12 2000-01-17 2000-01-12  2000-02-11  <dbl [1]>     1                   4