Add a sums column to dataset subset and conditiionated to other two columns values-CodePudding

I am working with a huge dataset, but just to simplify what I'd like to do, I will use the following one.

testDF <- data.frame(v1 = rep(c('a', 'b', 'c', 'd', 'e', 'f'), 2),
                     v2 = rep(c(1,0),6))

Let's assume you could subset it like this.

When the first value of v1 assumes the same value (for example in the I would like to add a third column reporting sum of the second column values. The output will be like this:

 testDF 
   v1 v2  tc
1   a  1   2
2   b  0   0
3   c  1   2
4   d  0   0 
5   e  1   2
6   f  0   0
7   a  1   2
8   b  0   0
9   c  1   2
10  d  0   0
11  e  1   2

Which operation I could by perpetuating the dplyr code?

Thanks

CodePudding user response：

A similar approach to @jpsmith, except using map

testDF %>%
  group_split(v1) %>%
  map(~if(nrow(.x) > 1) mutate(.x, v3 = sum(v2)) else .x)
#> [[1]]
#> # A tibble: 1 x 2
#>      v1    v2
#>   <dbl> <dbl>
#> 1     1     1
#> 
#> [[2]]
#> # A tibble: 2 x 3
#>      v1    v2    v3
#>   <dbl> <dbl> <dbl>
#> 1     3     0     0
#> 2     3     0     0
#> 
#> [[3]]
#> # A tibble: 1 x 2
#>      v1    v2
#>   <dbl> <dbl>
#> 1     4     1
#> 
#> [[4]]
#> # A tibble: 3 x 3
#>      v1    v2    v3
#>   <dbl> <dbl> <dbl>
#> 1     5     1     2
#> 2     5     1     2
#> 3     5     0     2
#> 
#> [[5]]
#> # A tibble: 2 x 3
#>      v1    v2    v3
#>   <dbl> <dbl> <dbl>
#> 1     7     0     1
#> 2     7     1     1
#> 
#> [[6]]
#> # A tibble: 1 x 2
#>      v1    v2
#>   <dbl> <dbl>
#> 1     8     1
#> 
#> [[7]]
#> # A tibble: 1 x 2
#>      v1    v2
#>   <dbl> <dbl>
#> 1     9     0
#> 
#> [[8]]
#> # A tibble: 1 x 2
#>      v1    v2
#>   <dbl> <dbl>
#> 1    NA     0
Ā
#> Error in eval(expr, envir, enclos): object 'A' not found

^{Created on 2022-11-06 with reprex v2.0.2}

CodePudding user response：

If I am understanding you correctly, you could try this brute-force for loop, though I am sure there are more elegant solutions.

library(dplyr)

test_list <- testDF %>% 
  group_split(v1)

# Convert list elements to df
test_list_df <- lapply(test_list, as.data.frame)

# If more than one row, create new variable ("third_column") and sum v2

for(xx in seq_along(test_list_df)){
  if(nrow(test_list[[xx]]) > 1){
    test_list_df[[xx]]$third_column <- sum(test_list_df[[xx]]["v2"])
}}

(I am a bit unclear as to what "For each subset, when the first value of v1 is the same..." - I took that to mean if there was more than one row in the subset.)

Output

# [[1]]
# v1 v2
# 1  1  1
# 
# [[2]]
# v1 v2 third_column
# 1  3  0            0
# 2  3  0            0
# 
# [[3]]
# v1 v2
# 1  4  1
# 
# [[4]]
# v1 v2 third_column
# 1  5  1            2
# 2  5  1            2
# 3  5  0            2
# 
# [[5]]
# v1 v2 third_column
# 1  7  0            1
# 2  7  1            1
# 
# [[6]]
# v1 v2
# 1  8  1
# 
# [[7]]
# v1 v2
# 1  9  0
# 
# [[8]]
# v1 v2
# 1 NA  0

CodePudding user response：

Is this what you need?

testDF %>% 
  group_by(v1) %>%
  mutate(third_col = sum(v2, na.rm = TRUE)) %>%
  arrange(v1)
# A tibble: 12 × 3
# Groups:   v1 [8]
      v1    v2 third_col
   <dbl> <dbl>     <dbl>
 1     1     1         1
 2     3     0         0
 3     3     0         0
 4     4     1         1
 5     5     1         2
 6     5     1         2
 7     5     0         2
 8     7     0         1
 9     7     1         1
10     8     1         1
11     9     0         0
12    NA     0         0

If you need to split by group, then just replace arrange with group_split:

testDF %>% 
  group_by(v1) %>%
  mutate(third_col = sum(v2, na.rm = TRUE)) %>%
  group_split(v1)
<list_of<
  tbl_df<
    v1       : double
    v2       : double
    third_col: double
  >
>[8]>
[[1]]
# A tibble: 1 × 3
     v1    v2 third_col
  <dbl> <dbl>     <dbl>
1     1     1         1

[[2]]
# A tibble: 2 × 3
     v1    v2 third_col
  <dbl> <dbl>     <dbl>
1     3     0         0
2     3     0         0

[[3]]
# A tibble: 1 × 3
     v1    v2 third_col
  <dbl> <dbl>     <dbl>
1     4     1         1

[[4]]
# A tibble: 3 × 3
     v1    v2 third_col
  <dbl> <dbl>     <dbl>
1     5     1         2
2     5     1         2
3     5     0         2

[[5]]
# A tibble: 2 × 3
     v1    v2 third_col
  <dbl> <dbl>     <dbl>
1     7     0         1
2     7     1         1

[[6]]
# A tibble: 1 × 3
     v1    v2 third_col
  <dbl> <dbl>     <dbl>
1     8     1         1

[[7]]
# A tibble: 1 × 3
     v1    v2 third_col
  <dbl> <dbl>     <dbl>
1     9     0         0

[[8]]
# A tibble: 1 × 3
     v1    v2 third_col
  <dbl> <dbl>     <dbl>
1    NA     0         0