Home > front end >  Remove the cumulative sum by combination of dates and breaks
Remove the cumulative sum by combination of dates and breaks

Time:09-24

I have a data set (ds) with the variables ``,date, values and break. values is a cumulative sum until break == "True" and after the data start the process of cumulative sum again. In my example:

# Packages
library(dplyr)
library(lubridate)


# My data set
ds <- read.csv("https://raw.githubusercontent.com/Leprechault/trash/main/accumulated_values_table")
ds$date <- ymd(ds$date)
str(ds)

# 'data.frame': 4458 obs. of  4 variables:
# $ id    : int  0 1 2 2 1 0 3 2 4 1 ...
# $ date  : Date, format: "2020-11-26" "2020-11-26" "2020-11-26" "2020-11-27" ...
# $ values: int  36 33 27 22 34 28 5 18 28 24 ...
# $ break.: chr  "False" "False" "False" "False" ...

I'd like to find any way to remove the cumulative sum and the desired output is the daily number of values by id. I need some help for any function to make the subtraction of day 1 - day until my column break == "True" for each id. For example to my id=3:

# Actual dataset
ds.3 <- ds %>% filter(id==3)
head(ds.3)
#   id       date values break.
# 1  3 2020-11-27      5   True
# 2  3 2020-12-03     16  False
# 3  3 2020-12-05     18  False
# 4  3 2020-12-08      7   True
# 5  3 2020-12-09     27  False
# 6  3 2020-12-19     18  False

# Desirable dataset
#   id       date values break.
# 1  3 2020-11-27      5   True
# 2  3 2020-12-03     16  False
# 3  3 2020-12-05      2  False
# 4  3 2020-12-08      5   True
# 5  3 2020-12-09     27  False
# 6  3 2020-12-19      0  False * PS: if the sum is negative, the the value is equal to 0

Please any ideas?

CodePudding user response:

I'm inferring that we want to group by rows between "True" (by id too) and reassign values only where break. is not "True".

dplyr

library(dplyr)
out <- ds %>%
  mutate(break. = break. == "True") %>%
  group_by(id) %>%
  mutate(grp = cumsum(break.)) %>%
  group_by(id, grp, break.) %>%
  mutate(values2 = if (break.[1]) values else pmax(0, c(values[1], diff(values)))) %>%
  ungroup()
out
# # A tibble: 4,458 x 6
#       id date       values break.   grp values2
#    <int> <chr>       <int> <lgl>  <int>   <dbl>
#  1     0 2020-11-26     36 FALSE      0      36
#  2     1 2020-11-26     33 FALSE      0      33
#  3     2 2020-11-26     27 FALSE      0      27
#  4     2 2020-11-27     22 FALSE      0       0
#  5     1 2020-11-27     34 TRUE       1      34
#  6     0 2020-11-27     28 FALSE      0       0
#  7     3 2020-11-27      5 TRUE       1       5
#  8     2 2020-11-28     18 FALSE      0       0
#  9     4 2020-11-28     28 FALSE      0      28
# 10     1 2020-11-28     24 FALSE      1      24
# # ... with 4,448 more rows

And we can look at just id == 3:

filter(out, id == 3)
# # A tibble: 17 x 6
#       id date       values break.   grp values2
#    <int> <chr>       <int> <lgl>  <int>   <dbl>
#  1     3 2020-11-27      5 TRUE       1       5
#  2     3 2020-12-03     16 FALSE      1      16
#  3     3 2020-12-05     18 FALSE      1       2
#  4     3 2020-12-08      7 TRUE       2       7
#  5     3 2020-12-09     27 FALSE      2      27
#  6     3 2020-12-19     18 FALSE      2       0
#  7     3 2020-12-21      2 FALSE      2       0
#  8     3 2020-12-23     18 FALSE      2      16
#  9     3 2020-12-27     15 FALSE      2       0
# 10     3 2020-12-29     11 FALSE      2       0
# 11     3 2021-01-10     11 FALSE      2       0
# 12     3 2021-01-12     14 FALSE      2       3
# 13     3 2021-01-27     10 FALSE      2       0
# 14     3 2021-01-29     15 FALSE      2       5
# 15     3 2021-01-31     15 FALSE      2       0
# 16     3 2021-02-08     16 FALSE      2       1
# 17     3 2021-02-09      8 FALSE      2       0

base R

ds$break. <- ds$break. == "True"
ds$grp <- ave(ds$break., ds$id, FUN = cumsum)
ds$values2 <- ifelse(
  ds$break. == "True", ds$values,
  ave(ds$values, ds[c("id","grp","break.")],
      FUN = function(z) pmax(0, c(z[1], diff(z))))
  )
subset(ds, id == 3)
#     id       date values break. grp values2
# 7    3 2020-11-27      5   TRUE   1       5
# 35   3 2020-12-03     16  FALSE   1      16
# 48   3 2020-12-05     18  FALSE   1       2
# 64   3 2020-12-08      7   TRUE   2       7
# 74   3 2020-12-09     27  FALSE   2      27
# 182  3 2020-12-19     18  FALSE   2       0
# 201  3 2020-12-21      2  FALSE   2       0
# 220  3 2020-12-23     18  FALSE   2      16
# 249  3 2020-12-27     15  FALSE   2       0
# 277  3 2020-12-29     11  FALSE   2       0
# 380  3 2021-01-10     11  FALSE   2       0
# 390  3 2021-01-12     14  FALSE   2       3
# 509  3 2021-01-27     10  FALSE   2       0
# 524  3 2021-01-29     15  FALSE   2       5
# 539  3 2021-01-31     15  FALSE   2       0
# 608  3 2021-02-08     16  FALSE   2       1
# 619  3 2021-02-09      8  FALSE   2       0
  • Related