I have the following dataframe called df (dput
below):
# A tibble: 14 × 5
group date indicator value diff_hours
<chr> <dttm> <lgl> <dbl> <dbl>
1 A 2022-11-01 01:00:00 FALSE 2 4
2 A 2022-11-01 02:00:00 FALSE 1 3
3 A 2022-11-01 03:00:00 FALSE 4 2
4 A 2022-11-01 04:00:00 FALSE 1 1
5 A 2022-11-01 05:00:00 TRUE 3 0
6 A 2022-11-01 06:00:00 FALSE 1 1
7 A 2022-11-01 07:00:00 FALSE 3 2
8 B 2022-11-01 01:00:00 FALSE 1 4
9 B 2022-11-01 02:00:00 FALSE 2 3
10 B 2022-11-01 03:00:00 FALSE 3 2
11 B 2022-11-01 04:00:00 FALSE 1 1
12 B 2022-11-01 05:00:00 TRUE 4 0
13 B 2022-11-01 06:00:00 FALSE 1 1
14 B 2022-11-01 07:00:00 FALSE 5 2
I would like to calculate the slope (lm(value ~ diff_hours)
for every n rows with respect to the conditioned rows indicator == TRUE
. The rows with TRUE should have a slope of NA. Here is the desired output called df_desired with n = 2 (dput
below):
# A tibble: 14 × 6
# Groups: group [2]
group date indicator value diff_hours slope
<chr> <dttm> <lgl> <dbl> <dbl> <dbl>
1 A 2022-11-01 01:00:00 FALSE 2 4 1
2 A 2022-11-01 02:00:00 FALSE 1 3 1
3 A 2022-11-01 03:00:00 FALSE 4 2 3
4 A 2022-11-01 04:00:00 FALSE 1 1 3
5 A 2022-11-01 05:00:00 TRUE 3 0 NA
6 A 2022-11-01 06:00:00 FALSE 1 1 2
7 A 2022-11-01 07:00:00 FALSE 3 2 2
8 B 2022-11-01 01:00:00 FALSE 1 4 -1
9 B 2022-11-01 02:00:00 FALSE 2 3 -1
10 B 2022-11-01 03:00:00 FALSE 3 2 2
11 B 2022-11-01 04:00:00 FALSE 1 1 2
12 B 2022-11-01 05:00:00 TRUE 4 0 NA
13 B 2022-11-01 06:00:00 FALSE 1 1 4
14 B 2022-11-01 07:00:00 FALSE 5 2 4
For example, lm(c(2,1)~c(4,3))=1
for rows 1 and 2. So I was wondering if anyone knows how to calculate the slope of every n rows with respect to the conditioned rows per group?
dput
of df and df_desired:
df <- structure(list(group = c("A", "A", "A", "A", "A", "A", "A", "B",
"B", "B", "B", "B", "B", "B"), date = structure(c(1667260800,
1667264400, 1667268000, 1667271600, 1667275200, 1667278800, 1667282400,
1667260800, 1667264400, 1667268000, 1667271600, 1667275200, 1667278800,
1667282400), class = c("POSIXct", "POSIXt"), tzone = ""), indicator = c(FALSE,
FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, TRUE, FALSE, FALSE), value = c(2, 1, 4, 1, 3, 1, 3, 1,
2, 3, 1, 4, 1, 5), diff_hours = c(4, 3, 2, 1, 0, 1, 2, 4, 3,
2, 1, 0, 1, 2)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -14L), groups = structure(list(group = c("A",
"B"), .rows = structure(list(1:7, 8:14), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -2L), .drop = TRUE))
df_desired <- structure(list(group = c("A", "A", "A", "A", "A", "A", "A", "B",
"B", "B", "B", "B", "B", "B"), date = structure(c(1667260800,
1667264400, 1667268000, 1667271600, 1667275200, 1667278800, 1667282400,
1667260800, 1667264400, 1667268000, 1667271600, 1667275200, 1667278800,
1667282400), class = c("POSIXct", "POSIXt"), tzone = ""), indicator = c(FALSE,
FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, TRUE, FALSE, FALSE), value = c(2, 1, 4, 1, 3, 1, 3, 1,
2, 3, 1, 4, 1, 5), diff_hours = c(4, 3, 2, 1, 0, 1, 2, 4, 3,
2, 1, 0, 1, 2), slope = c(1, 1, 3, 3, NA, 2, 2, -1, -1, 2, 2,
NA, 4, 4)), row.names = c(NA, -14L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), groups = structure(list(group = c("A",
"B"), .rows = structure(list(1:7, 8:14), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -2L), .drop = TRUE))
CodePudding user response:
No need to use lag
or cumsum
; using rep
suffices.
library(dplyr)
N <- 2
df %>%
ungroup() %>%
group_by(indicator) %>%
mutate(grp = rep(1:((n()/N)), each = N)) %>%
group_by(indicator, grp) %>%
mutate(slope = lm(c(value) ~ c(diff_hours))$coefficients[[2]])
#> # A tibble: 14 x 7
#> # Groups: indicator, grp [7]
#> group date indicator value diff_hours grp slope
#> <chr> <dttm> <lgl> <dbl> <dbl> <int> <dbl>
#> 1 A 2022-10-31 20:00:00 FALSE 2 4 1 1
#> 2 A 2022-10-31 21:00:00 FALSE 1 3 1 1
#> 3 A 2022-10-31 22:00:00 FALSE 4 2 2 3
#> 4 A 2022-10-31 23:00:00 FALSE 1 1 2 3
#> 5 A 2022-11-01 00:00:00 TRUE 3 0 1 NA
#> 6 A 2022-11-01 01:00:00 FALSE 1 1 3 2
#> 7 A 2022-11-01 02:00:00 FALSE 3 2 3 2
#> 8 B 2022-10-31 20:00:00 FALSE 1 4 4 -1.00
#> 9 B 2022-10-31 21:00:00 FALSE 2 3 4 -1.00
#> 10 B 2022-10-31 22:00:00 FALSE 3 2 5 2
#> 11 B 2022-10-31 23:00:00 FALSE 1 1 5 2
#> 12 B 2022-11-01 00:00:00 TRUE 4 0 1 NA
#> 13 B 2022-11-01 01:00:00 FALSE 1 1 6 4
#> 14 B 2022-11-01 02:00:00 FALSE 5 2 6 4
CodePudding user response:
n <- 2
df %>%
group_by(grp = cumsum(indicator | lag(indicator, def=0)))%>%
mutate(grp1 = (row_number() 1) %/% n)%>%
group_by(grp,grp1)%>%
mutate(slope = coef(lm(value~diff_hours))[2])
# A tibble: 14 × 8
# Groups: grp, grp1 [8]
group date indicator value diff_ho…¹ grp grp1 slope
<chr> <dttm> <lgl> <dbl> <dbl> <int> <dbl> <dbl>
1 A 2022-10-31 17:00:00 FALSE 2 4 0 1 1
2 A 2022-10-31 18:00:00 FALSE 1 3 0 1 1
3 A 2022-10-31 19:00:00 FALSE 4 2 0 2 3
4 A 2022-10-31 20:00:00 FALSE 1 1 0 2 3
5 A 2022-10-31 21:00:00 TRUE 3 0 1 1 NA
6 A 2022-10-31 22:00:00 FALSE 1 1 2 1 2
7 A 2022-10-31 23:00:00 FALSE 3 2 2 1 2
8 B 2022-10-31 17:00:00 FALSE 1 4 2 2 -1.00
9 B 2022-10-31 18:00:00 FALSE 2 3 2 2 -1.00
10 B 2022-10-31 19:00:00 FALSE 3 2 2 3 2
11 B 2022-10-31 20:00:00 FALSE 1 1 2 3 2
12 B 2022-10-31 21:00:00 TRUE 4 0 3 1 NA
13 B 2022-10-31 22:00:00 FALSE 1 1 4 1 4
14 B 2022-10-31 23:00:00 FALSE 5 2 4 1 4
# … with abbreviated variable name ¹diff_hours