Suppose I have a dataframe that looks like this:
ID ParameterID Time value group_end_time group_start_time
<dbl> <dbl> <dttm> <dbl> <dttm> <dttm>
1 1 1 2022-01-01 10:05:00 1 2022-01-01 10:20:00 2022-01-01 10:05:00
2 1 2 2022-01-01 10:05:00 1 2022-01-01 10:20:00 2022-01-01 09:50:00
3 1 1 2022-01-01 10:10:00 2 2022-01-01 10:20:00 2022-01-01 10:05:00
4 1 2 2022-01-01 10:10:00 2 2022-01-01 10:20:00 2022-01-01 09:50:00
5 1 1 2022-01-01 10:15:00 3 2022-01-01 10:20:00 2022-01-01 10:05:00
6 1 1 2022-01-01 10:20:00 4 2022-01-01 10:20:00 2022-01-01 10:05:00
7 1 1 2022-01-01 10:25:00 5 2022-01-01 10:35:00 2022-01-01 10:20:00
8 1 1 2022-01-01 10:30:00 6 2022-01-01 10:35:00 2022-01-01 10:20:00
9 1 2 2022-01-01 10:30:00 3 2022-01-01 10:35:00 2022-01-01 10:05:00
10 1 2 2022-01-01 11:36:00 4 2022-01-01 11:50:00 2022-01-01 11:20:00
I now want to compute the mean over each ParameterID, with the following meaning: for each group_end_time of a parameter, the mean of value should be computed over all observations of that ParameterID with Time >= group_start_time & Time < group_end_time. I have a working approach using a custom summarise function:
aggregation_function <- function(id, par_id, end_time, start_time, full_data) {
  # mean of all observations of this ID/ParameterID with
  # Time in [start_time, end_time)
  ret <- full_data %>%
    filter(ID == id[[1]] & ParameterID == par_id[[1]] &
             Time < end_time[[1]] & Time >= start_time[[1]]) %>%
    group_by(ID, ParameterID) %>%
    summarise(mean = mean(value, na.rm = TRUE))
  return(ret)
}
df %>%
  group_by(ID, ParameterID, group_end_time) %>%
  summarise(aggregation_function(ID, ParameterID, group_end_time, group_start_time, .))
So the output is this:
ret
# A tibble: 5 × 4
# Groups: ID, ParameterID [2]
ID ParameterID group_end_time mean
<dbl> <dbl> <dttm> <dbl>
1 1 1 2022-01-01 10:20:00 2
2 1 2 2022-01-01 10:20:00 1.5
3 1 1 2022-01-01 10:35:00 5
4 1 2 2022-01-01 10:35:00 2
5 1 2 2022-01-01 11:50:00 4
While this works, it's extremely slow for huge datasets, so my approach is not practical. Do you have any idea how to speed things up?
CodePudding user response:
Using df, shown reproducibly in the Note at the end, and SQL, the benchmark below shows the query running about 20x faster: it replaces the per-group filtering with a single non-equi self-join. This may or may not hold on larger data sets, but you can try it. Just copy and paste the Note into a fresh instance of R and then copy and paste the code below.
library(dplyr)
library(microbenchmark)
library(sqldf)
aggregation_function <- function(id, par_id, end_time, start_time, full_data) {
ret <- full_data %>%
filter(ID == id[[1]] & ParameterID == par_id[[1]] &
Time < end_time[[1]] & Time >= start_time[[1]]) %>%
group_by(ID, ParameterID) %>%
summarise(mean = mean(value, na.rm = TRUE)
)
return(ret)
}
SQL <- "select A.ID, A.ParameterID, A.[group_end_time], avg(B.value) mean
from df A
left join df B on A.ID = B.ID and
A.ParameterID = B.ParameterID and
B.Time >= A.[group_start_time] and B.Time < A.[group_end_time]
group by A.ID, A.ParameterID, A.[group_end_time]
order by A.[group_end_time], A.ParameterID"
microbenchmark(times = 10,
sql = sqldf(SQL),
orig = df %>%
group_by(ID, ParameterID, group_end_time) %>%
summarise(aggregation_function(ID, ParameterID, group_end_time, group_start_time, .))
)
## Unit: milliseconds
## expr min lq mean median uq max neval cld
## sql 34.9131 35.3101 59.1878 36.72465 41.7658 199.0033 10 a
## orig 1237.8522 1249.5308 1328.3235 1293.59445 1352.1843 1665.4883 10 b
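As an aside, since dplyr 1.1.0 the same non-equi join can be written directly in dplyr via join_by(). A minimal sketch of the idea, assuming dplyr >= 1.1.0 and the df from the Note (not benchmarked here):
library(dplyr)  # >= 1.1.0 for non-equi joins via join_by()

# one row per aggregation window
windows <- df %>%
  distinct(ID, ParameterID, group_start_time, group_end_time)

windows %>%
  # attach every observation of the same ID/ParameterID whose Time
  # falls into [group_start_time, group_end_time)
  left_join(df %>% select(ID, ParameterID, Time, value),
            by = join_by(ID, ParameterID,
                         group_start_time <= Time,
                         group_end_time > Time)) %>%
  group_by(ID, ParameterID, group_end_time) %>%
  summarise(mean = mean(value, na.rm = TRUE), .groups = "drop")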
Note
df <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
ParameterID = c(1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L),
Time = structure(c(1641049500, 1641049500, 1641049800, 1641049800,
1641050100, 1641050400, 1641050700, 1641051000, 1641051000,
1641054960), class = c("POSIXct", "POSIXt"), tzone = ""),
value = c(1L, 1L, 2L, 2L, 3L, 4L, 5L, 6L, 3L, 4L), group_end_time = structure(c(1641050400,
1641050400, 1641050400, 1641050400, 1641050400, 1641050400,
1641051300, 1641051300, 1641051300, 1641055800), class = c("POSIXct",
"POSIXt"), tzone = ""), group_start_time = structure(c(1641049500,
1641048600, 1641049500, 1641048600, 1641049500, 1641049500,
1641050400, 1641050400, 1641049500, 1641054000), class = c("POSIXct",
"POSIXt"), tzone = "")), row.names = c("1", "2", "3", "4",
"5", "6", "7", "8", "9", "10"), class = "data.frame")
CodePudding user response:
We could do it with apply from base R.
library(dplyr)
# one row per aggregation window
intervals_times <-
  df |>
  select(ID, ParameterID, group_start_time, group_end_time) |>
  dplyr::distinct()

foo <- function(a, x) {
  # a is one row of intervals_times; apply() passes it as a character
  # vector, so the comparisons below rely on R's implicit coercion
  y <- x[with(x,
              ID == a[1] &
                ParameterID == a[2] &
                Time >= a[3] &
                Time < a[4]), ]
  mean(y$value)
}

bind_cols(intervals_times, mean = apply(intervals_times, 1, foo, x = df))
#> ID ParameterID group_start_time group_end_time mean
#> 1 1 1 01/01/2022 10:05 01/01/2022 10:20 2.0
#> 2 1 1 01/01/2022 10:20 01/01/2022 10:35 5.0
#> 3 1 2 01/01/2022 09:50 01/01/2022 10:20 1.5
#> 4 1 2 01/01/2022 10:05 01/01/2022 10:35 2.0
#> 5 1 2 01/01/2022 11:20 01/01/2022 11:50 4.0
Data
df <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
ParameterID = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L),
Time = c("01/01/2022 10:05", "01/01/2022 10:10", "01/01/2022 10:15",
"01/01/2022 10:20", "01/01/2022 10:25", "01/01/2022 10:30",
"01/01/2022 10:05", "01/01/2022 10:10", "01/01/2022 10:30",
"01/01/2022 11:36"), value = c(1L, 2L, 3L, 4L, 5L, 6L, 1L,
2L, 3L, 4L), group_start_time = c("01/01/2022 10:05", "01/01/2022 10:05",
"01/01/2022 10:05", "01/01/2022 10:05", "01/01/2022 10:20",
"01/01/2022 10:20", "01/01/2022 09:50", "01/01/2022 09:50",
"01/01/2022 10:05", "01/01/2022 11:20"), group_end_time = c("01/01/2022 10:20",
"01/01/2022 10:20", "01/01/2022 10:20", "01/01/2022 10:20",
"01/01/2022 10:35", "01/01/2022 10:35", "01/01/2022 10:20",
"01/01/2022 10:20", "01/01/2022 10:35", "01/01/2022 11:50"
)), class = "data.frame", row.names = c(NA, -10L))
Comparing the original function with apply:
aggregation_function <- function(id, par_id, end_time, start_time, full_data) {
ret <- full_data %>%
filter(ID == id[[1]] & ParameterID == par_id[[1]] &
Time < end_time[[1]] & Time >= start_time[[1]]) %>%
group_by(ID, ParameterID) %>%
summarise(mean = mean(value, na.rm = TRUE)
)
return(ret)
}
microbenchmark::microbenchmark(times = 10,
op = df %>%
group_by(ID, ParameterID, group_end_time) %>%
summarise(aggregation_function(ID, ParameterID, group_end_time, group_start_time, .)),
apply = bind_cols(intervals_times,mean = apply(intervals_times, 1, foo, x = df ))
)
#> Unit: microseconds
#> expr min lq mean median uq max neval cld
#> op 394028.1 397951.0 416450.02 406269.5 410794.9 482036.4 10 b
#> apply 778.1 808.1 898.77 855.0 870.7 1374.0 10 a
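One caveat: apply() coerces each row of intervals_times to a character vector, so foo relies on R's implicit coercion in the comparisons (and, with the character timestamps in the Data above, on the strings sorting correctly within a single day). A minimal variant using Map() over the columns keeps the same one-filter-per-window logic while preserving the original column types:
# pass the window columns through Map() with their original types,
# instead of apply()'s character-matrix coercion
means <- unlist(Map(
  function(id, par_id, start_time, end_time) {
    keep <- df$ID == id & df$ParameterID == par_id &
      df$Time >= start_time & df$Time < end_time
    mean(df$value[keep])
  },
  intervals_times$ID, intervals_times$ParameterID,
  intervals_times$group_start_time, intervals_times$group_end_time
))
bind_cols(intervals_times, mean = means)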