I would like to calculate the time since a condition is met, but each time the condition is not met, the time should be back to 0 again. It would be great to achieve this with dplyr
but I am open to any suggestions.
With code it's easier to see:
library(dplyr)
d <- structure(list(date = structure(c(17105, 17182, 17275, 17359,
17437, 17472, 17500, 17539,
17624, 17658, 17693, 17742,
17828, 17877, 18004, 18053,
18087, 18130, 18186, 18214,
18298, 18415, 18527, 18583,
18610),
class = "Date"),
condition = c(FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE,
TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE,
TRUE, FALSE, TRUE, TRUE)),
class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,-25L))
# I can easily calculate the time since the last observation when the
# condition has been met:
dd <- d %>%
mutate(time_condition = case_when(
condition == FALSE ~ date - date, #So it is 0
condition == TRUE & lag(condition) == FALSE ~ date - date, # again, we want 0
condition == TRUE & lag(condition) == TRUE ~ date - lag(date)),
time_condition = as.numeric(time_condition))
# This is how it looks now
dd %>%
print(n = 25)
#> # A tibble: 25 × 3
#> date condition time_condition
#> <date> <lgl> <dbl>
#> 1 2016-10-31 FALSE 0
#> 2 2017-01-16 FALSE 0
#> 3 2017-04-19 FALSE 0
#> 4 2017-07-12 FALSE 0
#> 5 2017-09-28 TRUE 0
#> 6 2017-11-02 TRUE 35
#> 7 2017-11-30 TRUE 28
#> 8 2018-01-08 TRUE 39
#> 9 2018-04-03 TRUE 85
#> 10 2018-05-07 FALSE 0
#> 11 2018-06-11 FALSE 0
#> 12 2018-07-30 FALSE 0
#> 13 2018-10-24 TRUE 0
#> 14 2018-12-12 FALSE 0
#> 15 2019-04-18 TRUE 0
#> 16 2019-06-06 TRUE 49
#> 17 2019-07-10 TRUE 34
#> 18 2019-08-22 TRUE 43
#> 19 2019-10-17 TRUE 56
#> 20 2019-11-14 TRUE 28
#> 21 2020-02-06 FALSE 0
#> 22 2020-06-02 TRUE 0
#> 23 2020-09-22 FALSE 0
#> 24 2020-11-17 TRUE 0
#> 25 2020-12-14 TRUE 27
What I want is a sort of cumsum()
that resets to 0 when the condition is not met anymore. The data should look like this:
should_be <- c(0, 0, 0, 0, 0, 35, 35 28, 35 28 39, 35 28 39 85,
0, 0, 0, 0, 0, 0, 49, 49 34, 49 34 43, 49 34 43 56,
49 34 43 56 28, 0, 0, 0, 0, 27)
dd %>%
mutate(time_condition_wanted = should_be) %>%
print(n = 25)
#> # A tibble: 25 × 4
#> date condition time_condition time_condition_wanted
#> <date> <lgl> <dbl> <dbl>
#> 1 2016-10-31 FALSE 0 0
#> 2 2017-01-16 FALSE 0 0
#> 3 2017-04-19 FALSE 0 0
#> 4 2017-07-12 FALSE 0 0
#> 5 2017-09-28 TRUE 0 0
#> 6 2017-11-02 TRUE 35 35
#> 7 2017-11-30 TRUE 28 63
#> 8 2018-01-08 TRUE 39 102
#> 9 2018-04-03 TRUE 85 187
#> 10 2018-05-07 FALSE 0 0
#> 11 2018-06-11 FALSE 0 0
#> 12 2018-07-30 FALSE 0 0
#> 13 2018-10-24 TRUE 0 0
#> 14 2018-12-12 FALSE 0 0
#> 15 2019-04-18 TRUE 0 0
#> 16 2019-06-06 TRUE 49 49
#> 17 2019-07-10 TRUE 34 83
#> 18 2019-08-22 TRUE 43 126
#> 19 2019-10-17 TRUE 56 182
#> 20 2019-11-14 TRUE 28 210
#> 21 2020-02-06 FALSE 0 0
#> 22 2020-06-02 TRUE 0 0
#> 23 2020-09-22 FALSE 0 0
#> 24 2020-11-17 TRUE 0 0
#> 25 2020-12-14 TRUE 27 27
Created on 2021-11-12 by the reprex package (v2.0.1)
CodePudding user response:
Here is another version -
library(dplyr)
dd %>%
group_by(grp = cumsum(time_condition == 0)) %>%
mutate(result = cumsum(time_condition)) %>%
ungroup %>%
select(-grp)
This returns -
# date condition time_condition result
#1 2016-10-31 FALSE 0 0
#2 2017-01-16 FALSE 0 0
#3 2017-04-19 FALSE 0 0
#4 2017-07-12 FALSE 0 0
#5 2017-09-28 TRUE 0 0
#6 2017-11-02 TRUE 35 35
#7 2017-11-30 TRUE 28 63
#8 2018-01-08 TRUE 39 102
#9 2018-04-03 TRUE 85 187
#10 2018-05-07 FALSE 0 0
#11 2018-06-11 FALSE 0 0
#12 2018-07-30 FALSE 0 0
#13 2018-10-24 TRUE 0 0
#14 2018-12-12 FALSE 0 0
#15 2019-04-18 TRUE 0 0
#16 2019-06-06 TRUE 49 49
#17 2019-07-10 TRUE 34 83
#18 2019-08-22 TRUE 43 126
#19 2019-10-17 TRUE 56 182
#20 2019-11-14 TRUE 28 210
#21 2020-02-06 FALSE 0 0
#22 2020-06-02 TRUE 0 0
#23 2020-09-22 FALSE 0 0
#24 2020-11-17 TRUE 0 0
#25 2020-12-14 TRUE 27 27
CodePudding user response:
I just came up with (a bit complicated) solution for the problem and post it here for posterity. It would be great if there was an easier solution.
library(dplyr)
d <- structure(list(date = structure(c(17105, 17182, 17275, 17359,
17437, 17472, 17500, 17539,
17624, 17658, 17693, 17742,
17828, 17877, 18004, 18053,
18087, 18130, 18186, 18214,
18298, 18415, 18527, 18583,
18610),
class = "Date"),
condition = c(FALSE, FALSE, FALSE, FALSE,
TRUE, TRUE, TRUE, TRUE,
TRUE, FALSE, FALSE, FALSE,
TRUE, FALSE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, FALSE,
TRUE, FALSE, TRUE, TRUE)),
class = c("tbl_df", "tbl", "data.frame"),
row.names = c(NA,-25L))
dd <- d %>%
mutate(time_condition = case_when(
condition == FALSE ~ date - date, #So it is 0
condition == TRUE & lag(condition) == FALSE ~ date - date, # again, we want 0
condition == TRUE & lag(condition) == TRUE ~ date - lag(date)),
time_condition = as.numeric(time_condition))
should_be <- c(0, 0, 0, 0, 0, 35, 35 28, 35 28 39, 35 28 39 85,
0, 0, 0, 0, 0, 0, 49, 49 34, 49 34 43, 49 34 43 56,
49 34 43 56 28, 0, 0, 0, 0, 27)
I can create a change
variable, that increases each time the status changes. After that, cumsum()
can be used over the dataset group_by
that change
variable.
dd %>%
mutate(time_condition_wanted = should_be) %>%
# create that increases at each status change
mutate(change = if_else(condition != lag(condition), 1, 0),
change = if_else(is.na(change), 0, change),
change = cumsum(change)) %>%
group_by(change) %>%
mutate(y = cumsum(time_condition)) %>%
print(n = 25)
#> # A tibble: 25 × 6
#> # Groups: change [10]
#> date condition time_condition time_condition_wanted change y
#> <date> <lgl> <dbl> <dbl> <dbl> <dbl>
#> 1 2016-10-31 FALSE 0 0 0 0
#> 2 2017-01-16 FALSE 0 0 0 0
#> 3 2017-04-19 FALSE 0 0 0 0
#> 4 2017-07-12 FALSE 0 0 0 0
#> 5 2017-09-28 TRUE 0 0 1 0
#> 6 2017-11-02 TRUE 35 35 1 35
#> 7 2017-11-30 TRUE 28 63 1 63
#> 8 2018-01-08 TRUE 39 102 1 102
#> 9 2018-04-03 TRUE 85 187 1 187
#> 10 2018-05-07 FALSE 0 0 2 0
#> 11 2018-06-11 FALSE 0 0 2 0
#> 12 2018-07-30 FALSE 0 0 2 0
#> 13 2018-10-24 TRUE 0 0 3 0
#> 14 2018-12-12 FALSE 0 0 4 0
#> 15 2019-04-18 TRUE 0 0 5 0
#> 16 2019-06-06 TRUE 49 49 5 49
#> 17 2019-07-10 TRUE 34 83 5 83
#> 18 2019-08-22 TRUE 43 126 5 126
#> 19 2019-10-17 TRUE 56 182 5 182
#> 20 2019-11-14 TRUE 28 210 5 210
#> 21 2020-02-06 FALSE 0 0 6 0
#> 22 2020-06-02 TRUE 0 0 7 0
#> 23 2020-09-22 FALSE 0 0 8 0
#> 24 2020-11-17 TRUE 0 0 9 0
#> 25 2020-12-14 TRUE 27 27 9 27
Created on 2021-11-12 by the reprex package (v2.0.1)