I have the following sample dataframe called df (dput
below):
group date indicator
1 A 2022-11-01 01:00:00 FALSE
2 A 2022-11-01 03:00:00 FALSE
3 A 2022-11-01 04:00:00 TRUE
4 A 2022-11-01 05:00:00 FALSE
5 A 2022-11-01 06:00:00 TRUE
6 A 2022-11-01 07:00:00 FALSE
7 A 2022-11-01 10:00:00 FALSE
8 A 2022-11-01 12:00:00 FALSE
9 B 2022-11-01 01:00:00 FALSE
10 B 2022-11-01 02:00:00 FALSE
11 B 2022-11-01 03:00:00 FALSE
12 B 2022-11-01 06:00:00 TRUE
13 B 2022-11-01 07:00:00 FALSE
14 B 2022-11-01 08:00:00 FALSE
15 B 2022-11-01 11:00:00 TRUE
16 B 2022-11-01 13:00:00 FALSE
I would like to calculate the difference in hours between dates with their nearest conditioned rows which have indicator == TRUE
per group. Also, the rows with TRUE should return 0 as output. Here you can see the desired output called df_desired:
group date indicator diff_hours
1 A 2022-11-01 01:00:00 FALSE 3
2 A 2022-11-01 03:00:00 FALSE 1
3 A 2022-11-01 04:00:00 TRUE 0
4 A 2022-11-01 05:00:00 FALSE 1
5 A 2022-11-01 06:00:00 TRUE 0
6 A 2022-11-01 07:00:00 FALSE 1
7 A 2022-11-01 10:00:00 FALSE 4
8 A 2022-11-01 12:00:00 FALSE 6
9 B 2022-11-01 01:00:00 FALSE 5
10 B 2022-11-01 02:00:00 FALSE 4
11 B 2022-11-01 03:00:00 FALSE 3
12 B 2022-11-01 06:00:00 TRUE 0
13 B 2022-11-01 07:00:00 FALSE 1
14 B 2022-11-01 08:00:00 FALSE 2
15 B 2022-11-01 11:00:00 TRUE 0
16 B 2022-11-01 13:00:00 FALSE 2
So I was wondering if anyone knows how to calculate the difference between dates in hours with respect to their nearest conditioned row per group?
Here dput
of df and df_desired:
df <- structure(list(group = c("A", "A", "A", "A", "A", "A", "A", "A",
"B", "B", "B", "B", "B", "B", "B", "B"), date = structure(c(1667260800,
1667268000, 1667271600, 1667275200, 1667278800, 1667282400, 1667293200,
1667300400, 1667260800, 1667264400, 1667268000, 1667278800, 1667282400,
1667286000, 1667296800, 1667304000), class = c("POSIXct", "POSIXt"
), tzone = ""), indicator = c(FALSE, FALSE, TRUE, FALSE, TRUE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE,
TRUE, FALSE)), class = "data.frame", row.names = c(NA, -16L))
df_desired <- structure(list(group = c("A", "A", "A", "A", "A", "A", "A", "A",
"B", "B", "B", "B", "B", "B", "B", "B"), date = structure(c(1667260800,
1667268000, 1667271600, 1667275200, 1667278800, 1667282400, 1667293200,
1667300400, 1667260800, 1667264400, 1667268000, 1667278800, 1667282400,
1667286000, 1667296800, 1667304000), class = c("POSIXct", "POSIXt"
), tzone = ""), indicator = c(FALSE, FALSE, TRUE, FALSE, TRUE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE,
TRUE, FALSE), diff_hours = c(3, 1, 0, 1, 0, 1, 4, 6, 5, 4, 3,
0, 1, 2, 0, 2)), class = "data.frame", row.names = c(NA, -16L
))
CodePudding user response:
With map_dbl
:
library(dplyr)
library(purrr)
df %>%
group_by(group) %>%
mutate(diff_hours = map_dbl(date, ~ min(abs(.x - date[indicator]))))
output
# A tibble: 16 × 4
# Groups: group [2]
group date indicator diff_hours
<chr> <dttm> <lgl> <dbl>
1 A 2022-11-01 01:00:00 FALSE 3
2 A 2022-11-01 03:00:00 FALSE 1
3 A 2022-11-01 04:00:00 TRUE 0
4 A 2022-11-01 05:00:00 FALSE 1
5 A 2022-11-01 06:00:00 TRUE 0
6 A 2022-11-01 07:00:00 FALSE 1
7 A 2022-11-01 10:00:00 FALSE 4
8 A 2022-11-01 12:00:00 FALSE 6
9 B 2022-11-01 01:00:00 FALSE 5
10 B 2022-11-01 02:00:00 FALSE 4
11 B 2022-11-01 03:00:00 FALSE 3
12 B 2022-11-01 06:00:00 TRUE 0
13 B 2022-11-01 07:00:00 FALSE 1
14 B 2022-11-01 08:00:00 FALSE 2
15 B 2022-11-01 11:00:00 TRUE 0
16 B 2022-11-01 13:00:00 FALSE 2
CodePudding user response:
You can try data.table
like below (there should be options more efficient than mine)
- Using
findInterval
orroll = "nearest"
setDT(df)[
,
diff_hours := abs(
difftime(date,
date[indicator][pmax(1, findInterval(date, date[indicator]))],
units = "hours"
)
),
group
][]
or
setDT(df)[
,
diffhours := abs(
difftime(date,
.SD[!!indicator][.SD,
date,
by = group,
on = "date",
roll = "nearest",
mult = "first"
][, date],
units = "hours"
)
)
][]
which gives
group date indicator diff_hours
1: A 2022-11-01 01:00:00 FALSE 3 hours
2: A 2022-11-01 03:00:00 FALSE 1 hours
3: A 2022-11-01 04:00:00 TRUE 0 hours
4: A 2022-11-01 05:00:00 FALSE 1 hours
5: A 2022-11-01 06:00:00 TRUE 0 hours
6: A 2022-11-01 07:00:00 FALSE 1 hours
7: A 2022-11-01 10:00:00 FALSE 4 hours
8: A 2022-11-01 12:00:00 FALSE 6 hours
9: B 2022-11-01 01:00:00 FALSE 5 hours
10: B 2022-11-01 02:00:00 FALSE 4 hours
11: B 2022-11-01 03:00:00 FALSE 3 hours
12: B 2022-11-01 06:00:00 TRUE 0 hours
13: B 2022-11-01 07:00:00 FALSE 1 hours
14: B 2022-11-01 08:00:00 FALSE 2 hours
15: B 2022-11-01 11:00:00 TRUE 0 hours
16: B 2022-11-01 13:00:00 FALSE 2 hours
- Using
outer
(not efficient due to the use ofapply
)
setDT(df)[
,
diff_hours := apply(abs(outer(date, date[indicator], `-`)), 1, min) / 3600,
group
][]
and you will see
group date indicator diff_hours
1: A 2022-11-01 01:00:00 FALSE 3
2: A 2022-11-01 03:00:00 FALSE 1
3: A 2022-11-01 04:00:00 TRUE 0
4: A 2022-11-01 05:00:00 FALSE 1
5: A 2022-11-01 06:00:00 TRUE 0
6: A 2022-11-01 07:00:00 FALSE 1
7: A 2022-11-01 10:00:00 FALSE 4
8: A 2022-11-01 12:00:00 FALSE 6
9: B 2022-11-01 01:00:00 FALSE 5
10: B 2022-11-01 02:00:00 FALSE 4
11: B 2022-11-01 03:00:00 FALSE 3
12: B 2022-11-01 06:00:00 TRUE 0
13: B 2022-11-01 07:00:00 FALSE 1
14: B 2022-11-01 08:00:00 FALSE 2
15: B 2022-11-01 11:00:00 TRUE 0
16: B 2022-11-01 13:00:00 FALSE 2
CodePudding user response:
Solution based on tidyr::fill()
:
library(dplyr)
library(tidyr)
df %>%
arrange(group, date) %>%
mutate(
ind_prev = if_else(indicator, date, as.POSIXct(NA)),
ind_next = ind_prev
) %>%
group_by(group) %>%
fill(ind_prev, .direction = "down") %>%
fill(ind_next, .direction = "up") %>%
ungroup() %>%
mutate(
across(
ind_prev:ind_next,
~ abs(as.numeric(date - .x, unit = "hours"))
),
diff_hours = pmin(ind_prev, ind_next, na.rm = TRUE)
) %>%
select(!ind_prev:ind_next)
# A tibble: 16 × 4
group date indicator diff_hours
<chr> <dttm> <lgl> <dbl>
1 A 2022-11-01 00:00:00 FALSE 3
2 A 2022-11-01 02:00:00 FALSE 1
3 A 2022-11-01 03:00:00 TRUE 0
4 A 2022-11-01 04:00:00 FALSE 1
5 A 2022-11-01 05:00:00 TRUE 0
6 A 2022-11-01 06:00:00 FALSE 1
7 A 2022-11-01 09:00:00 FALSE 4
8 A 2022-11-01 11:00:00 FALSE 6
9 B 2022-11-01 00:00:00 FALSE 5
10 B 2022-11-01 01:00:00 FALSE 4
11 B 2022-11-01 02:00:00 FALSE 3
12 B 2022-11-01 05:00:00 TRUE 0
13 B 2022-11-01 06:00:00 FALSE 1
14 B 2022-11-01 07:00:00 FALSE 2
15 B 2022-11-01 10:00:00 TRUE 0
16 B 2022-11-01 12:00:00 FALSE 2
CodePudding user response:
Here are a few of the previous approaches in base R:
#Maël answer in base R
by(df, df$group, \(d) transform(
d, diff_hours = sapply(d$date, \(x) min(abs(x - d$date[d[["indicator"]]])))
)) |>
do.call(what = rbind.data.frame)
#> group date indicator diff_hours
#> A.1 A 2022-10-31 20:00:00 FALSE 3
#> A.2 A 2022-10-31 22:00:00 FALSE 1
#> A.3 A 2022-10-31 23:00:00 TRUE 0
#> A.4 A 2022-11-01 00:00:00 FALSE 1
#> A.5 A 2022-11-01 01:00:00 TRUE 0
#> A.6 A 2022-11-01 02:00:00 FALSE 1
#> A.7 A 2022-11-01 05:00:00 FALSE 4
#> A.8 A 2022-11-01 07:00:00 FALSE 6
#> B.9 B 2022-10-31 20:00:00 FALSE 5
#> B.10 B 2022-10-31 21:00:00 FALSE 4
#> B.11 B 2022-10-31 22:00:00 FALSE 3
#> B.12 B 2022-11-01 01:00:00 TRUE 0
#> B.13 B 2022-11-01 02:00:00 FALSE 1
#> B.14 B 2022-11-01 03:00:00 FALSE 2
#> B.15 B 2022-11-01 06:00:00 TRUE 0
#> B.16 B 2022-11-01 08:00:00 FALSE 2
#ThomasIsCoding answer in base
transform(df, diff_hours = apply(abs(outer(df$date, df$date[df$indicator], `-`))/3600, 1, min))
#> group date indicator diff_hours
#> 1 A 2022-10-31 20:00:00 FALSE 3
#> 2 A 2022-10-31 22:00:00 FALSE 1
#> 3 A 2022-10-31 23:00:00 TRUE 0
#> 4 A 2022-11-01 00:00:00 FALSE 1
#> 5 A 2022-11-01 01:00:00 TRUE 0
#> 6 A 2022-11-01 02:00:00 FALSE 1
#> 7 A 2022-11-01 05:00:00 FALSE 1
#> 8 A 2022-11-01 07:00:00 FALSE 1
#> 9 B 2022-10-31 20:00:00 FALSE 3
#> 10 B 2022-10-31 21:00:00 FALSE 2
#> 11 B 2022-10-31 22:00:00 FALSE 1
#> 12 B 2022-11-01 01:00:00 TRUE 0
#> 13 B 2022-11-01 02:00:00 FALSE 1
#> 14 B 2022-11-01 03:00:00 FALSE 2
#> 15 B 2022-11-01 06:00:00 TRUE 0
#> 16 B 2022-11-01 08:00:00 FALSE 2
CodePudding user response:
A base R solution using the apply
function family. First split
the set by groups, then compare all dates with indicator == TRUE dates, then pick the min
.
cbind(df, diff_hours = unlist(lapply(split(df, df$group), function(grp)
apply(sapply(grp[grp$indicator == T, "date"], function(date_T)
abs(difftime(date_T, grp$date, u = "hour"))), 1, min))))
group date indicator diff_hours
A1 A 2022-11-01 01:00:00 FALSE 3
A2 A 2022-11-01 03:00:00 FALSE 1
A3 A 2022-11-01 04:00:00 TRUE 0
A4 A 2022-11-01 05:00:00 FALSE 1
A5 A 2022-11-01 06:00:00 TRUE 0
A6 A 2022-11-01 07:00:00 FALSE 1
A7 A 2022-11-01 10:00:00 FALSE 4
A8 A 2022-11-01 12:00:00 FALSE 6
B1 B 2022-11-01 01:00:00 FALSE 5
B2 B 2022-11-01 02:00:00 FALSE 4
B3 B 2022-11-01 03:00:00 FALSE 3
B4 B 2022-11-01 06:00:00 TRUE 0
B5 B 2022-11-01 07:00:00 FALSE 1
B6 B 2022-11-01 08:00:00 FALSE 2
B7 B 2022-11-01 11:00:00 TRUE 0
B8 B 2022-11-01 13:00:00 FALSE 2