Converting Multiple DPLYR Commands into a Single DPLYR Command-CodePudding

I am working with the R programming language.

I have the following dataset ("my_data"):

structure(list(idd = 1:50, group_1 = c("B", "B", "A", "B", "B", 
"A", "A", "A", "B", "A", "A", "B", "B", "B", "A", "A", "A", "A", 
"B", "B", "A", "B", "A", "B", "A", "B", "B", "A", "B", "B", "B", 
"A", "B", "A", "B", "B", "A", "A", "A", "A", "A", "B", "B", "B", 
"A", "B", "B", "B", "B", "B"), v1 = c(15.7296737049317, -4.33377704672207, 
-0.551850185265, 2.66888122578048, 12.109072642513, 0.0107927293899017, 
20.7785032320562, -1.98974382507874, 12.1663703518471, 11.4308702978893, 
-0.657500910529805, 5.71376589298221, 3.43820523228653, 19.5939432685761, 
25.5605263610222, -0.407964337882465, 19.3057240854025, 9.24554068987809, 
-9.6719534905096, 2.44096357354807, 14.6114916050676, 11.4510663104787, 
-14.4231132108142, 15.8031868545157, 16.5505199848675, 6.95491162740581, 
2.92431767382703, 29.7157201447823, 9.10001319352251, 9.85982748068076, 
-1.23456937110154, -3.44130123376206, -5.23155771062088, 5.78031789617826, 
23.6092446408098, 27.5379484533487, 25.6836473435279, 22.9675556994775, 
7.62403748556388, -2.24150135680706, 6.72187319859928, -14.1245027627225, 
6.8620712655661, 26.5987870464572, 11.3095310060752, 20.9588868268958, 
14.8934095694391, 2.21089704551347, 27.4355935292935, 9.21612714668934
), group_2 = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L)), row.names = c(NA, -50L), class = "data.frame")

head(my_data)

   idd group_1          v1 group_2
1    1       B 15.72967370       1
2    2       B -4.33377705       2
3    3       A -0.55185019       3
4    4       B  2.66888123       4
5    5       B 12.10907264       5
6    6       A  0.01079273       6
7    7       A 20.77850323       7
8    8       A -1.98974383       8
9    9       B 12.16637035       9
10  10       A 11.43087030      10
11  11       A -0.65750091       1
12  12       B  5.71376589       2

For this dataset, I want to perform the following steps in "dplyr":

For each grouping of 10 rows, find the sum of "v1" for group_1 = "A" and group_2 = "B"
For each of these groupings, create a new variable ("v2") that is : "A" if sum(group_1 = A) > sum(group_1 = B), "B" if sum(group_1 = A) < sum(group_1 = B) or "0" if sum(group_1 = A) = sum(group_1 = B)

I know how to do this manually in R:

#STEP 1: since my_data has 50 rows, break my_data into 5 groups of 10 rows

rows_1 = my_data[1:10,]
rows_2 = my_data[11:20,]
rows_3 = my_data[21:30,]
rows_4 = my_data[31:40,]
rows_5 = my_data[41:50,]

# STEP 2: find out values of "v2"

library(dplyr)

dplyr_row_1 = data.frame(rows_1 %>% group_by(group_1) %>% summarize(sum = sum(v1)))

dplyr_row_1$v2 = ifelse(dplyr_row_1[1,2] > dplyr_row_1[2,2], "A", ifelse(dplyr_row_1[1,2] < dplyr_row_1[2,2], "B", 0))

dplyr_row_2 = data.frame(rows_2 %>% group_by(group_1) %>% summarize(sum = sum(v1)))

dplyr_row_2$v2 = ifelse(dplyr_row_2[1,2] > dplyr_row_2[2,2], "A", ifelse(dplyr_row_2[1,2] < dplyr_row_2[2,2], "B", 0))

dplyr_row_3 = data.frame(rows_3 %>% group_by(group_1) %>% summarize(sum = sum(v1)))

dplyr_row_3$v2 = ifelse(dplyr_row_3[1,2] > dplyr_row_3[2,2], "A", ifelse(dplyr_row_3[1,2] < dplyr_row_3[2,2], "B", 0))

dplyr_row_4 = data.frame(rows_4 %>% group_by(group_1) %>% summarize(sum = sum(v1)))

dplyr_row_4$v2 = ifelse(dplyr_row_4[1,2] > dplyr_row_4[2,2], "A", ifelse(dplyr_row_4[1,2] < dplyr_row_4[2,2], "B", 0))

dplyr_row_5 = data.frame(rows_5 %>% group_by(group_1) %>% summarize(sum = sum(v1)))

dplyr_row_5$v2 = ifelse(dplyr_row_5[1,2] > dplyr_row_5[2,2], "A", ifelse(dplyr_row_5[1,2] < dplyr_row_5[2,2], "B", 0))

# STEP 3: append "v2" to first 5 files:

rows_1$v2 = dplyr_row_1$v2
rows_2$v2 = dplyr_row_2$v2
rows_3$v2 = dplyr_row_3$v2
rows_4$v2 = dplyr_row_4$v2
rows_5$v2 = dplyr_row_5$v2

# STEP 4: create final file:

final_file = rbind(rows_1,rows_2, rows_3, rows_4, rows_5)

As a result, the final file looks something like this:

  idd group_1          v1 group_2 v2
1    1       B 15.72967370       1  B
2    2       B -4.33377705       2  B
3    3       A -0.55185019       3  B
4    4       B  2.66888123       4  B
5    5       B 12.10907264       5  B
6    6       A  0.01079273       6  B
7    7       A 20.77850323       7  B
8    8       A -1.98974383       8  B
9    9       B 12.16637035       9  B
10  10       A 11.43087030      10  B
11  11       A -0.65750091       1  A

My Question: Can someone please show me how to perform Steps 1 to Step 4 in a single "dplyr" command?

Thanks!

CodePudding user response：

First I'll create a group_index to group every 10 rows together.
Then group_by the relevant columns and calculate sum.
Remove the grouping layer of group_1, since we need to compare the values in A and B.
If the unique length of sum is equal to "1", that means they are the same, then input "0" in column v2. If they are not the same, output the maximum category stored in group_1.
Finally remove the sum column and sort by idd.

This method is able to solve problem with more than two groups in group_1.

The first 20 rows are shown here for example.

library(tidyverse)

df %>% 
  mutate(group_index = rep(1:(nrow(df)/10), each = 10)) %>% 
  group_by(group_index, group_1) %>% 
  mutate(sum = sum(v1)) %>% 
  group_by(group_index) %>%
  mutate(v2 = ifelse(length(unique(sum)) == 1, 0, group_1[which.max(sum)])) %>%
  ungroup() %>% 
  select(-c(sum, group_index))

# A tibble: 20 x 5
     idd group_1      v1 group_2 v2   
   <int> <chr>     <dbl>   <int> <chr>
 1     1 B       15.7          1 B    
 2     2 B       -4.33         2 B    
 3     3 A       -0.552        3 B    
 4     4 B        2.67         4 B    
 5     5 B       12.1          5 B    
 6     6 A        0.0108       6 B    
 7     7 A       20.8          7 B    
 8     8 A       -1.99         8 B    
 9     9 B       12.2          9 B    
10    10 A       11.4         10 B    
11    11 A       -0.658        1 A    
12    12 B        5.71         2 A    
13    13 B        3.44         3 A    
14    14 B       19.6          4 A    
15    15 A       25.6          5 A    
16    16 A       -0.408        6 A    
17    17 A       19.3          7 A    
18    18 A        9.25         8 A    
19    19 B       -9.67         9 A    
20    20 B        2.44        10 A

CodePudding user response：

Here is alternative method.

library(tidyverse)

df %>% 
  mutate(group_index = rep(1:(nrow(df)/10), each = 10)) %>% 
  group_by(group_index) %>%
  mutate(
   v2 = 
    if_else(sum(v1[group_1 == 'A']) > sum(v1[group_1 == 'B']), 'A', 
     if_else(sum(v1[group_1 == 'A']) < sum(v1[group_1 == 'B']), 'B', '0'))
)