I would ultimately like to have df2 with certain dates and the cumulative sum of values connected to those date ranges from df1.
df1 = data.frame("date"=c("10/01/2020","10/02/2020","10/03/2020","10/04/2020","10/05/2020",
"10/06/2020","10/07/2020","10/08/2020","10/09/2020","10/10/2020"),
"value"=c(1:10))
df1
> df1
date value
1 10/01/2020 1
2 10/02/2020 2
3 10/03/2020 3
4 10/04/2020 4
5 10/05/2020 5
6 10/06/2020 6
7 10/07/2020 7
8 10/08/2020 8
9 10/09/2020 9
10 10/10/2020 10
df2 = data.frame("date"=c("10/05/2020","10/10/2020"))
df2
> df2
date
1 10/05/2020
2 10/10/2020
I realize this is incorrect, but I am not sure how to define df2$value as the sums of certain df1$value rows:
df2$value = filter(df1, c(sum(1:5),sum(6:10)))
df2
I would like the output to look like this:
> df2
date value
1 10/05/2020 15
2 10/10/2020 40
CodePudding user response:
Here is another approach using dplyr
and lubridate
:
library(lubridate)
library(dplyr)
df1 %>%
mutate(date = dmy(date)) %>%
mutate(date = if_else(date == "2020-05-10" |
date == "2020-10-10", date, NA_Date_)) %>%
fill(date, .direction = "up") %>%
group_by(date) %>%
summarise(value = sum(value))
date value
<date> <int>
1 2020-05-10 15
2 2020-10-10 40
CodePudding user response:
We may use a non-equi join after converting the 'date' columns to Date
class
library(lubridate)
library(data.table)
setDT(df1)[, date := mdy(date)]
setDT(df2)[, date := mdy(date)]
df2[, start_date := fcoalesce(shift(date) days(1), floor_date(date, 'month'))]
df1[df2,.(value = sum(value)), on = .( date >= start_date,
date <= date), by = .EACHI][, -1, with = FALSE]
date value
<Date> <int>
1: 2020-10-05 15
2: 2020-10-10 40
Or another option is creating a group with findInterval
and then do the group by sum
library(dplyr)
df1 %>%
group_by(grp = findInterval(date, df2$date, left.open = TRUE)) %>%
summarise(date = last(date), value = sum(value)) %>%
select(-grp)
-output
# A tibble: 2 × 2
date value
<date> <int>
1 2020-10-05 15
2 2020-10-10 40