Remove the cumulative sum by combination of dates and breaks-CodePudding

I have a data set (ds) with the variables ``,date, values and break. values is a cumulative sum until break == "True" and after the data start the process of cumulative sum again. In my example:

# Packages
library(dplyr)
library(lubridate)


# My data set
ds <- read.csv("https://raw.githubusercontent.com/Leprechault/trash/main/accumulated_values_table")
ds$date <- ymd(ds$date)
str(ds)

# 'data.frame': 4458 obs. of  4 variables:
# $ id    : int  0 1 2 2 1 0 3 2 4 1 ...
# $ date  : Date, format: "2020-11-26" "2020-11-26" "2020-11-26" "2020-11-27" ...
# $ values: int  36 33 27 22 34 28 5 18 28 24 ...
# $ break.: chr  "False" "False" "False" "False" ...

I'd like to find any way to remove the cumulative sum and the desired output is the daily number of values by id. I need some help for any function to make the subtraction of day 1 - day until my column break == "True" for each id. For example to my id=3:

# Actual dataset
ds.3 <- ds %>% filter(id==3)
head(ds.3)
#   id       date values break.
# 1  3 2020-11-27      5   True
# 2  3 2020-12-03     16  False
# 3  3 2020-12-05     18  False
# 4  3 2020-12-08      7   True
# 5  3 2020-12-09     27  False
# 6  3 2020-12-19     18  False

# Desirable dataset
#   id       date values break.
# 1  3 2020-11-27      5   True
# 2  3 2020-12-03     16  False
# 3  3 2020-12-05      2  False
# 4  3 2020-12-08      5   True
# 5  3 2020-12-09     27  False
# 6  3 2020-12-19      0  False * PS: if the sum is negative, the the value is equal to 0

Please any ideas?

CodePudding user response：

I'm inferring that we want to group by rows between "True" (by id too) and reassign values only where break. is not "True".

dplyr

library(dplyr)
out <- ds %>%
  mutate(break. = break. == "True") %>%
  group_by(id) %>%
  mutate(grp = cumsum(break.)) %>%
  group_by(id, grp, break.) %>%
  mutate(values2 = if (break.[1]) values else pmax(0, c(values[1], diff(values)))) %>%
  ungroup()
out
# # A tibble: 4,458 x 6
#       id date       values break.   grp values2
#    <int> <chr>       <int> <lgl>  <int>   <dbl>
#  1     0 2020-11-26     36 FALSE      0      36
#  2     1 2020-11-26     33 FALSE      0      33
#  3     2 2020-11-26     27 FALSE      0      27
#  4     2 2020-11-27     22 FALSE      0       0
#  5     1 2020-11-27     34 TRUE       1      34
#  6     0 2020-11-27     28 FALSE      0       0
#  7     3 2020-11-27      5 TRUE       1       5
#  8     2 2020-11-28     18 FALSE      0       0
#  9     4 2020-11-28     28 FALSE      0      28
# 10     1 2020-11-28     24 FALSE      1      24
# # ... with 4,448 more rows

And we can look at just id == 3:

filter(out, id == 3)
# # A tibble: 17 x 6
#       id date       values break.   grp values2
#    <int> <chr>       <int> <lgl>  <int>   <dbl>
#  1     3 2020-11-27      5 TRUE       1       5
#  2     3 2020-12-03     16 FALSE      1      16
#  3     3 2020-12-05     18 FALSE      1       2
#  4     3 2020-12-08      7 TRUE       2       7
#  5     3 2020-12-09     27 FALSE      2      27
#  6     3 2020-12-19     18 FALSE      2       0
#  7     3 2020-12-21      2 FALSE      2       0
#  8     3 2020-12-23     18 FALSE      2      16
#  9     3 2020-12-27     15 FALSE      2       0
# 10     3 2020-12-29     11 FALSE      2       0
# 11     3 2021-01-10     11 FALSE      2       0
# 12     3 2021-01-12     14 FALSE      2       3
# 13     3 2021-01-27     10 FALSE      2       0
# 14     3 2021-01-29     15 FALSE      2       5
# 15     3 2021-01-31     15 FALSE      2       0
# 16     3 2021-02-08     16 FALSE      2       1
# 17     3 2021-02-09      8 FALSE      2       0

base R

ds$break. <- ds$break. == "True"
ds$grp <- ave(ds$break., ds$id, FUN = cumsum)
ds$values2 <- ifelse(
  ds$break. == "True", ds$values,
  ave(ds$values, ds[c("id","grp","break.")],
      FUN = function(z) pmax(0, c(z[1], diff(z))))
  )
subset(ds, id == 3)
#     id       date values break. grp values2
# 7    3 2020-11-27      5   TRUE   1       5
# 35   3 2020-12-03     16  FALSE   1      16
# 48   3 2020-12-05     18  FALSE   1       2
# 64   3 2020-12-08      7   TRUE   2       7
# 74   3 2020-12-09     27  FALSE   2      27
# 182  3 2020-12-19     18  FALSE   2       0
# 201  3 2020-12-21      2  FALSE   2       0
# 220  3 2020-12-23     18  FALSE   2      16
# 249  3 2020-12-27     15  FALSE   2       0
# 277  3 2020-12-29     11  FALSE   2       0
# 380  3 2021-01-10     11  FALSE   2       0
# 390  3 2021-01-12     14  FALSE   2       3
# 509  3 2021-01-27     10  FALSE   2       0
# 524  3 2021-01-29     15  FALSE   2       5
# 539  3 2021-01-31     15  FALSE   2       0
# 608  3 2021-02-08     16  FALSE   2       1
# 619  3 2021-02-09      8  FALSE   2       0