Suppose I have a dataframe that looks like this:
ID ParameterID Time value group_end_time group_start_time
<dbl> <dbl> <dttm> <dbl> <dttm> <dttm>
1 1 1 2022-01-01 10:05:00 1 2022-01-01 10:20:00 2022-01-01 10:05:00
2 1 2 2022-01-01 10:05:00 1 2022-01-01 10:20:00 2022-01-01 09:50:00
3 1 1 2022-01-01 10:10:00 2 2022-01-01 10:20:00 2022-01-01 10:05:00
4 1 2 2022-01-01 10:10:00 2 2022-01-01 10:20:00 2022-01-01 09:50:00
5 1 1 2022-01-01 10:15:00 3 2022-01-01 10:20:00 2022-01-01 10:05:00
6 1 1 2022-01-01 10:20:00 4 2022-01-01 10:20:00 2022-01-01 10:05:00
7 1 1 2022-01-01 10:25:00 5 2022-01-01 10:35:00 2022-01-01 10:20:00
8 1 1 2022-01-01 10:30:00 6 2022-01-01 10:35:00 2022-01-01 10:20:00
9 1 2 2022-01-01 10:30:00 3 2022-01-01 10:35:00 2022-01-01 10:05:00
10 1 2 2022-01-01 11:36:00 4 2022-01-01 11:50:00 2022-01-01 11:20:00
I now want to compute the mean over each ParameterID, with the following meaning: for each group_end_time of a parameter, the mean of value should be computed over all observations of that ParameterID with Time >= group_start_time & Time < group_end_time. I have a working approach using a custom summarise function:
aggregation_function <- function(id, par_id, end_time, start_time, full_data) {
  # mean of all observations of this ID/ParameterID with
  # Time in [start_time, end_time)
  ret <- full_data %>%
    filter(ID == id[[1]] & ParameterID == par_id[[1]] &
             Time < end_time[[1]] & Time >= start_time[[1]]) %>%
    group_by(ID, ParameterID) %>%
    summarise(mean = mean(value, na.rm = TRUE))
  return(ret)
}
df %>%
  group_by(ID, ParameterID, group_end_time) %>%
  summarise(aggregation_function(ID, ParameterID, group_end_time, group_start_time, .))
So the output is this:
ret
# A tibble: 5 × 4
# Groups: ID, ParameterID [2]
ID ParameterID group_end_time mean
<dbl> <dbl> <dttm> <dbl>
1 1 1 2022-01-01 10:20:00 2
2 1 2 2022-01-01 10:20:00 1.5
3 1 1 2022-01-01 10:35:00 5
4 1 2 2022-01-01 10:35:00 2
5 1 2 2022-01-01 11:50:00 4
While this works, it's extremely slow for huge datasets, so my approach is not practical. Do you have any idea how to speed things up?
CodePudding user response:
Using df, shown reproducibly in the Note at the end, and SQL, the benchmark below shows the query running about 20x faster: it replaces the per-group filtering with a single non-equi self-join. This may or may not hold on larger data sets, but you can try it. Just copy and paste the Note into a fresh instance of R and then copy and paste the code below.
library(dplyr)
library(microbenchmark)
library(sqldf)
aggregation_function <- function(id, par_id, end_time, start_time, full_data) {
ret <- full_data %>%
filter(ID == id[[1]] & ParameterID == par_id[[1]] &
Time < end_time[[1]] & Time >= start_time[[1]]) %>%
group_by(ID, ParameterID) %>%
summarise(mean = mean(value, na.rm = TRUE)
)
return(ret)
}
SQL <- "select A.ID, A.ParameterID, A.[group_end_time], avg(B.value) mean
from df A
left join df B on A.ID = B.ID and
A.ParameterID = B.ParameterID and
B.Time >= A.[group_start_time] and B.Time < A.[group_end_time]
group by A.ID, A.ParameterID, A.[group_end_time]
order by A.[group_end_time], A.ParameterID"
microbenchmark(times = 10,
sql = sqldf(SQL),
orig = df %>%
group_by(ID, ParameterID, group_end_time) %>%
summarise(aggregation_function(ID, ParameterID, group_end_time, group_start_time, .))
)
## Unit: milliseconds
## expr min lq mean median uq max neval cld
## sql 34.9131 35.3101 59.1878 36.72465 41.7658 199.0033 10 a
## orig 1237.8522 1249.5308 1328.3235 1293.59445 1352.1843 1665.4883 10 b
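As an aside, since dplyr 1.1.0 the same non-equi join can be written directly in dplyr via join_by(). A minimal sketch of the idea, assuming dplyr >= 1.1.0 and the df from the Note (not benchmarked here):
library(dplyr)  # >= 1.1.0 for non-equi joins via join_by()

# one row per aggregation window
windows <- df %>%
  distinct(ID, ParameterID, group_start_time, group_end_time)

windows %>%
  # attach every observation of the same ID/ParameterID whose Time
  # falls into [group_start_time, group_end_time)
  left_join(df %>% select(ID, ParameterID, Time, value),
            by = join_by(ID, ParameterID,
                         group_start_time <= Time,
                         group_end_time > Time)) %>%
  group_by(ID, ParameterID, group_end_time) %>%
  summarise(mean = mean(value, na.rm = TRUE), .groups = "drop")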
Note
df <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
ParameterID = c(1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L),
Time = structure(c(1641049500, 1641049500, 1641049800, 1641049800,
1641050100, 1641050400, 1641050700, 1641051000, 1641051000,
1641054960), class = c("POSIXct", "POSIXt"), tzone = ""),
value = c(1L, 1L, 2L, 2L, 3L, 4L, 5L, 6L, 3L, 4L), group_end_time = structure(c(1641050400,
1641050400, 1641050400, 1641050400, 1641050400, 1641050400,
1641051300, 1641051300, 1641051300, 1641055800), class = c("POSIXct",
"POSIXt"), tzone = ""), group_start_time = structure(c(1641049500,
1641048600, 1641049500, 1641048600, 1641049500, 1641049500,
1641050400, 1641050400, 1641049500, 1641054000), class = c("POSIXct",
"POSIXt"), tzone = "")), row.names = c("1", "2", "3", "4",
"5", "6", "7", "8", "9", "10"), class = "data.frame")
CodePudding user response:
We could do it with apply from base R.
library(dplyr)
# one row per aggregation window
intervals_times <-
  df |>
  select(ID, ParameterID, group_start_time, group_end_time) |>
  dplyr::distinct()

foo <- function(a, x) {
  # a is one row of intervals_times; apply() passes it as a character
  # vector, so the comparisons below rely on R's implicit coercion
  y <- x[with(x,
              ID == a[1] &
                ParameterID == a[2] &
                Time >= a[3] &
                Time < a[4]), ]
  mean(y$value)
}

bind_cols(intervals_times, mean = apply(intervals_times, 1, foo, x = df))
#> ID ParameterID group_start_time group_end_time mean
#> 1 1 1 01/01/2022 10:05 01/01/2022 10:20 2.0
#> 2 1 1 01/01/2022 10:20 01/01/2022 10:35 5.0
#> 3 1 2 01/01/2022 09:50 01/01/2022 10:20 1.5
#> 4 1 2 01/01/2022 10:05 01/01/2022 10:35 2.0
#> 5 1 2 01/01/2022 11:20 01/01/2022 11:50 4.0
Data
df <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
ParameterID = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L),
Time = c("01/01/2022 10:05", "01/01/2022 10:10", "01/01/2022 10:15",
"01/01/2022 10:20", "01/01/2022 10:25", "01/01/2022 10:30",
"01/01/2022 10:05", "01/01/2022 10:10", "01/01/2022 10:30",
"01/01/2022 11:36"), value = c(1L, 2L, 3L, 4L, 5L, 6L, 1L,
2L, 3L, 4L), group_start_time = c("01/01/2022 10:05", "01/01/2022 10:05",
"01/01/2022 10:05", "01/01/2022 10:05", "01/01/2022 10:20",
"01/01/2022 10:20", "01/01/2022 09:50", "01/01/2022 09:50",
"01/01/2022 10:05", "01/01/2022 11:20"), group_end_time = c("01/01/2022 10:20",
"01/01/2022 10:20", "01/01/2022 10:20", "01/01/2022 10:20",
"01/01/2022 10:35", "01/01/2022 10:35", "01/01/2022 10:20",
"01/01/2022 10:20", "01/01/2022 10:35", "01/01/2022 11:50"
)), class = "data.frame", row.names = c(NA, -10L))
Comparing the original function with apply:
aggregation_function <- function(id, par_id, end_time, start_time, full_data) {
ret <- full_data %>%
filter(ID == id[[1]] & ParameterID == par_id[[1]] &
Time < end_time[[1]] & Time >= start_time[[1]]) %>%
group_by(ID, ParameterID) %>%
summarise(mean = mean(value, na.rm = TRUE)
)
return(ret)
}
microbenchmark::microbenchmark(times = 10,
op = df %>%
group_by(ID, ParameterID, group_end_time) %>%
summarise(aggregation_function(ID, ParameterID, group_end_time, group_start_time, .)),
apply = bind_cols(intervals_times,mean = apply(intervals_times, 1, foo, x = df ))
)
#> Unit: microseconds
#> expr min lq mean median uq max neval cld
#> op 394028.1 397951.0 416450.02 406269.5 410794.9 482036.4 10 b
#> apply 778.1 808.1 898.77 855.0 870.7 1374.0 10 a
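One caveat: apply() coerces each row of intervals_times to a character vector, so foo relies on R's implicit coercion in the comparisons (and, with the character timestamps in the Data above, on the strings sorting correctly within a single day). A minimal variant using Map() over the columns keeps the same one-filter-per-window logic while preserving the original column types:
# pass the window columns through Map() with their original types,
# instead of apply()'s character-matrix coercion
means <- unlist(Map(
  function(id, par_id, start_time, end_time) {
    keep <- df$ID == id & df$ParameterID == par_id &
      df$Time >= start_time & df$Time < end_time
    mean(df$value[keep])
  },
  intervals_times$ID, intervals_times$ParameterID,
  intervals_times$group_start_time, intervals_times$group_end_time
))
bind_cols(intervals_times, mean = means)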