Calculate Slope for every n rows with respect to a conditioned row per group-CodePudding

I have the following dataframe called df (dput below):

# A tibble: 14 × 5
   group date                indicator value diff_hours
   <chr> <dttm>              <lgl>     <dbl>      <dbl>
 1 A     2022-11-01 01:00:00 FALSE         2          4
 2 A     2022-11-01 02:00:00 FALSE         1          3
 3 A     2022-11-01 03:00:00 FALSE         4          2
 4 A     2022-11-01 04:00:00 FALSE         1          1
 5 A     2022-11-01 05:00:00 TRUE          3          0
 6 A     2022-11-01 06:00:00 FALSE         1          1
 7 A     2022-11-01 07:00:00 FALSE         3          2
 8 B     2022-11-01 01:00:00 FALSE         1          4
 9 B     2022-11-01 02:00:00 FALSE         2          3
10 B     2022-11-01 03:00:00 FALSE         3          2
11 B     2022-11-01 04:00:00 FALSE         1          1
12 B     2022-11-01 05:00:00 TRUE          4          0
13 B     2022-11-01 06:00:00 FALSE         1          1
14 B     2022-11-01 07:00:00 FALSE         5          2

I would like to calculate the slope (lm(value ~ diff_hours) for every n rows with respect to the conditioned rows indicator == TRUE. The rows with TRUE should have a slope of NA. Here is the desired output called df_desired with n = 2 (dput below):

# A tibble: 14 × 6
# Groups:   group [2]
   group date                indicator value diff_hours slope
   <chr> <dttm>              <lgl>     <dbl>      <dbl> <dbl>
 1 A     2022-11-01 01:00:00 FALSE         2          4     1
 2 A     2022-11-01 02:00:00 FALSE         1          3     1
 3 A     2022-11-01 03:00:00 FALSE         4          2     3
 4 A     2022-11-01 04:00:00 FALSE         1          1     3
 5 A     2022-11-01 05:00:00 TRUE          3          0    NA
 6 A     2022-11-01 06:00:00 FALSE         1          1     2
 7 A     2022-11-01 07:00:00 FALSE         3          2     2
 8 B     2022-11-01 01:00:00 FALSE         1          4    -1
 9 B     2022-11-01 02:00:00 FALSE         2          3    -1
10 B     2022-11-01 03:00:00 FALSE         3          2     2
11 B     2022-11-01 04:00:00 FALSE         1          1     2
12 B     2022-11-01 05:00:00 TRUE          4          0    NA
13 B     2022-11-01 06:00:00 FALSE         1          1     4
14 B     2022-11-01 07:00:00 FALSE         5          2     4

For example, lm(c(2,1)~c(4,3))=1 for rows 1 and 2. So I was wondering if anyone knows how to calculate the slope of every n rows with respect to the conditioned rows per group?

dput of df and df_desired:

df <- structure(list(group = c("A", "A", "A", "A", "A", "A", "A", "B", 
"B", "B", "B", "B", "B", "B"), date = structure(c(1667260800, 
1667264400, 1667268000, 1667271600, 1667275200, 1667278800, 1667282400, 
1667260800, 1667264400, 1667268000, 1667271600, 1667275200, 1667278800, 
1667282400), class = c("POSIXct", "POSIXt"), tzone = ""), indicator = c(FALSE, 
FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, TRUE, FALSE, FALSE), value = c(2, 1, 4, 1, 3, 1, 3, 1, 
2, 3, 1, 4, 1, 5), diff_hours = c(4, 3, 2, 1, 0, 1, 2, 4, 3, 
2, 1, 0, 1, 2)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -14L), groups = structure(list(group = c("A", 
"B"), .rows = structure(list(1:7, 8:14), ptype = integer(0), class = c("vctrs_list_of", 
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -2L), .drop = TRUE))

df_desired <- structure(list(group = c("A", "A", "A", "A", "A", "A", "A", "B", 
"B", "B", "B", "B", "B", "B"), date = structure(c(1667260800, 
1667264400, 1667268000, 1667271600, 1667275200, 1667278800, 1667282400, 
1667260800, 1667264400, 1667268000, 1667271600, 1667275200, 1667278800, 
1667282400), class = c("POSIXct", "POSIXt"), tzone = ""), indicator = c(FALSE, 
FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, TRUE, FALSE, FALSE), value = c(2, 1, 4, 1, 3, 1, 3, 1, 
2, 3, 1, 4, 1, 5), diff_hours = c(4, 3, 2, 1, 0, 1, 2, 4, 3, 
2, 1, 0, 1, 2), slope = c(1, 1, 3, 3, NA, 2, 2, -1, -1, 2, 2, 
NA, 4, 4)), row.names = c(NA, -14L), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), groups = structure(list(group = c("A", 
"B"), .rows = structure(list(1:7, 8:14), ptype = integer(0), class = c("vctrs_list_of", 
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -2L), .drop = TRUE))

CodePudding user response：

No need to use lag or cumsum; using rep suffices.

library(dplyr)

N <- 2

df %>% 
  ungroup() %>% 
  group_by(indicator) %>% 
  mutate(grp = rep(1:((n()/N)), each = N)) %>% 
  group_by(indicator, grp) %>% 
  mutate(slope = lm(c(value) ~ c(diff_hours))$coefficients[[2]])

#> # A tibble: 14 x 7
#> # Groups:   indicator, grp [7]
#>    group date                indicator value diff_hours   grp slope
#>    <chr> <dttm>              <lgl>     <dbl>      <dbl> <int> <dbl>
#>  1 A     2022-10-31 20:00:00 FALSE         2          4     1  1   
#>  2 A     2022-10-31 21:00:00 FALSE         1          3     1  1   
#>  3 A     2022-10-31 22:00:00 FALSE         4          2     2  3   
#>  4 A     2022-10-31 23:00:00 FALSE         1          1     2  3   
#>  5 A     2022-11-01 00:00:00 TRUE          3          0     1 NA   
#>  6 A     2022-11-01 01:00:00 FALSE         1          1     3  2   
#>  7 A     2022-11-01 02:00:00 FALSE         3          2     3  2   
#>  8 B     2022-10-31 20:00:00 FALSE         1          4     4 -1.00
#>  9 B     2022-10-31 21:00:00 FALSE         2          3     4 -1.00
#> 10 B     2022-10-31 22:00:00 FALSE         3          2     5  2   
#> 11 B     2022-10-31 23:00:00 FALSE         1          1     5  2   
#> 12 B     2022-11-01 00:00:00 TRUE          4          0     1 NA   
#> 13 B     2022-11-01 01:00:00 FALSE         1          1     6  4   
#> 14 B     2022-11-01 02:00:00 FALSE         5          2     6  4

CodePudding user response：

n <- 2
df %>%
  group_by(grp = cumsum(indicator | lag(indicator, def=0)))%>%
  mutate(grp1 = (row_number()   1) %/% n)%>%
  group_by(grp,grp1)%>%
  mutate(slope = coef(lm(value~diff_hours))[2])


# A tibble: 14 × 8
# Groups:   grp, grp1 [8]
   group date                indicator value diff_ho…¹   grp  grp1 slope
   <chr> <dttm>              <lgl>     <dbl>     <dbl> <int> <dbl> <dbl>
 1 A     2022-10-31 17:00:00 FALSE         2         4     0     1  1   
 2 A     2022-10-31 18:00:00 FALSE         1         3     0     1  1   
 3 A     2022-10-31 19:00:00 FALSE         4         2     0     2  3   
 4 A     2022-10-31 20:00:00 FALSE         1         1     0     2  3   
 5 A     2022-10-31 21:00:00 TRUE          3         0     1     1 NA   
 6 A     2022-10-31 22:00:00 FALSE         1         1     2     1  2   
 7 A     2022-10-31 23:00:00 FALSE         3         2     2     1  2   
 8 B     2022-10-31 17:00:00 FALSE         1         4     2     2 -1.00
 9 B     2022-10-31 18:00:00 FALSE         2         3     2     2 -1.00
10 B     2022-10-31 19:00:00 FALSE         3         2     2     3  2   
11 B     2022-10-31 20:00:00 FALSE         1         1     2     3  2   
12 B     2022-10-31 21:00:00 TRUE          4         0     3     1 NA   
13 B     2022-10-31 22:00:00 FALSE         1         1     4     1  4   
14 B     2022-10-31 23:00:00 FALSE         5         2     4     1  4   
# … with abbreviated variable name ¹diff_hours