How to create a column that sum up the rows by group-CodePudding

the data example is like this below:

Above A and B is the original data, and C is the column that I want to create. C is the group sum of B based on the same and adjacent A value.Even though there are some A=0, if they are not adjacent, then it should be not sum together.

CodePudding user response：

tidyverse

library(tidyverse)
df <- data.frame(
           A = c(0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L),
           B = c(1L, 1L, 10L, 5L, 2L, 3L, 20L, 30L)
       )

df %>% 
  group_by(grp = data.table::rleid(A)) %>% 
  mutate(res = sum(B)) %>% 
  ungroup() %>% 
  select(-grp)
#> # A tibble: 8 × 3
#>       A     B   res
#>   <int> <int> <int>
#> 1     0     1     2
#> 2     0     1     2
#> 3     1    10    15
#> 4     1     5    15
#> 5     0     2     5
#> 6     0     3     5
#> 7     1    20    50
#> 8     1    30    50

^{Created on 2022-04-27 by the reprex package (v2.0.1)}

data.table

library(data.table)

setDT(df)[, res := sum(B), by = rleid(A)][]
#>    A  B res
#> 1: 0  1   2
#> 2: 0  1   2
#> 3: 1 10  15
#> 4: 1  5  15
#> 5: 0  2   5
#> 6: 0  3   5
#> 7: 1 20  50
#> 8: 1 30  50

^{Created on 2022-04-27 by the reprex package (v2.0.1)}

base

df$res <- with(df, ave(B, data.table::rleid(A), FUN = sum))

df
#>   A  B res
#> 1 0  1   2
#> 2 0  1   2
#> 3 1 10  15
#> 4 1  5  15
#> 5 0  2   5
#> 6 0  3   5
#> 7 1 20  50
#> 8 1 30  50

^{Created on 2022-04-27 by the reprex package (v2.0.1)}

CodePudding user response：

library(tidyverse)

data <- tribble(
  ~A, ~B, ~C,
  0, 1, 2,
  0, 1, 2,
  1, 10, 15,
  1, 5, 15,
  0, 2, 5,
  0, 3, 5,
  1, 20, 50,
  1, 30, 50
)
data <- data %>% select(-C)

rle_group <- function(x) {
  rle <- rle(x)
  rle$values <- rle$values %>%
    length() %>%
    seq()
  inverse.rle(rle)
}

data %>%
  mutate(
    group = rle_group(A)
  ) %>%
  group_by(group) %>%
  mutate(
    C = sum(B)
  )
#> # A tibble: 8 × 4
#> # Groups:   group [4]
#>       A     B group     C
#>   <dbl> <dbl> <int> <dbl>
#> 1     0     1     1     2
#> 2     0     1     1     2
#> 3     1    10     2    15
#> 4     1     5     2    15
#> 5     0     2     3     5
#> 6     0     3     3     5
#> 7     1    20     4    50
#> 8     1    30     4    50

^{Created on 2022-04-27 by the reprex package (v2.0.0)}

CodePudding user response：

A dplyr solution without any rle functions:

library(dplyr)

df %>%
  group_by(grp = cumsum(c(TRUE, diff(A) != 0))) %>%
  mutate(C = sum(B)) %>%
  ungroup() %>%
  select(-grp)

# A tibble: 8 x 3
      A     B     C
  <int> <int> <int>
1     0     1     2
2     0     1     2
3     1    10    15
4     1     5    15
5     0     2     5
6     0     3     5
7     1    20    50
8     1    30    50

Its base equivalent:

within(df, C <- ave(B, cumsum(c(TRUE, diff(A) != 0)), FUN = sum))

  A  B  C
1 0  1  2
2 0  1  2
3 1 10 15
4 1  5 15
5 0  2  5
6 0  3  5
7 1 20 50
8 1 30 50