I am working with the R programming language.
I have the following dataset ("my_data"):
structure(list(idd = 1:50, group_1 = c("B", "B", "A", "B", "B",
"A", "A", "A", "B", "A", "A", "B", "B", "B", "A", "A", "A", "A",
"B", "B", "A", "B", "A", "B", "A", "B", "B", "A", "B", "B", "B",
"A", "B", "A", "B", "B", "A", "A", "A", "A", "A", "B", "B", "B",
"A", "B", "B", "B", "B", "B"), v1 = c(15.7296737049317, -4.33377704672207,
-0.551850185265, 2.66888122578048, 12.109072642513, 0.0107927293899017,
20.7785032320562, -1.98974382507874, 12.1663703518471, 11.4308702978893,
-0.657500910529805, 5.71376589298221, 3.43820523228653, 19.5939432685761,
25.5605263610222, -0.407964337882465, 19.3057240854025, 9.24554068987809,
-9.6719534905096, 2.44096357354807, 14.6114916050676, 11.4510663104787,
-14.4231132108142, 15.8031868545157, 16.5505199848675, 6.95491162740581,
2.92431767382703, 29.7157201447823, 9.10001319352251, 9.85982748068076,
-1.23456937110154, -3.44130123376206, -5.23155771062088, 5.78031789617826,
23.6092446408098, 27.5379484533487, 25.6836473435279, 22.9675556994775,
7.62403748556388, -2.24150135680706, 6.72187319859928, -14.1245027627225,
6.8620712655661, 26.5987870464572, 11.3095310060752, 20.9588868268958,
14.8934095694391, 2.21089704551347, 27.4355935292935, 9.21612714668934
), group_2 = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L)), row.names = c(NA, -50L), class = "data.frame")
head(my_data)
idd group_1 v1 group_2
1 1 B 15.72967370 1
2 2 B -4.33377705 2
3 3 A -0.55185019 3
4 4 B 2.66888123 4
5 5 B 12.10907264 5
6 6 A 0.01079273 6
7 7 A 20.77850323 7
8 8 A -1.98974383 8
9 9 B 12.16637035 9
10 10 A 11.43087030 10
11 11 A -0.65750091 1
12 12 B 5.71376589 2
For this dataset, I want to perform the following steps in "dplyr":
- For each grouping of 10 rows, find the sum of "v1" for group_1 = "A" and group_2 = "B"
- For each of these groupings, create a new variable ("v2") that is : "A" if sum(group_1 = A) > sum(group_1 = B), "B" if sum(group_1 = A) < sum(group_1 = B) or "0" if sum(group_1 = A) = sum(group_1 = B)
I know how to do this manually in R:
#STEP 1: since my_data has 50 rows, break my_data into 5 groups of 10 rows
rows_1 = my_data[1:10,]
rows_2 = my_data[11:20,]
rows_3 = my_data[21:30,]
rows_4 = my_data[31:40,]
rows_5 = my_data[41:50,]
# STEP 2: find out values of "v2"
library(dplyr)
dplyr_row_1 = data.frame(rows_1 %>% group_by(group_1) %>% summarize(sum = sum(v1)))
dplyr_row_1$v2 = ifelse(dplyr_row_1[1,2] > dplyr_row_1[2,2], "A", ifelse(dplyr_row_1[1,2] < dplyr_row_1[2,2], "B", 0))
dplyr_row_2 = data.frame(rows_2 %>% group_by(group_1) %>% summarize(sum = sum(v1)))
dplyr_row_2$v2 = ifelse(dplyr_row_2[1,2] > dplyr_row_2[2,2], "A", ifelse(dplyr_row_2[1,2] < dplyr_row_2[2,2], "B", 0))
dplyr_row_3 = data.frame(rows_3 %>% group_by(group_1) %>% summarize(sum = sum(v1)))
dplyr_row_3$v2 = ifelse(dplyr_row_3[1,2] > dplyr_row_3[2,2], "A", ifelse(dplyr_row_3[1,2] < dplyr_row_3[2,2], "B", 0))
dplyr_row_4 = data.frame(rows_4 %>% group_by(group_1) %>% summarize(sum = sum(v1)))
dplyr_row_4$v2 = ifelse(dplyr_row_4[1,2] > dplyr_row_4[2,2], "A", ifelse(dplyr_row_4[1,2] < dplyr_row_4[2,2], "B", 0))
dplyr_row_5 = data.frame(rows_5 %>% group_by(group_1) %>% summarize(sum = sum(v1)))
dplyr_row_5$v2 = ifelse(dplyr_row_5[1,2] > dplyr_row_5[2,2], "A", ifelse(dplyr_row_5[1,2] < dplyr_row_5[2,2], "B", 0))
# STEP 3: append "v2" to first 5 files:
rows_1$v2 = dplyr_row_1$v2
rows_2$v2 = dplyr_row_2$v2
rows_3$v2 = dplyr_row_3$v2
rows_4$v2 = dplyr_row_4$v2
rows_5$v2 = dplyr_row_5$v2
# STEP 4: create final file:
final_file = rbind(rows_1,rows_2, rows_3, rows_4, rows_5)
As a result, the final file looks something like this:
idd group_1 v1 group_2 v2
1 1 B 15.72967370 1 B
2 2 B -4.33377705 2 B
3 3 A -0.55185019 3 B
4 4 B 2.66888123 4 B
5 5 B 12.10907264 5 B
6 6 A 0.01079273 6 B
7 7 A 20.77850323 7 B
8 8 A -1.98974383 8 B
9 9 B 12.16637035 9 B
10 10 A 11.43087030 10 B
11 11 A -0.65750091 1 A
My Question: Can someone please show me how to perform Steps 1 to Step 4 in a single "dplyr" command?
Thanks!
CodePudding user response:
- First I'll create a
group_index
to group every 10 rows together. - Then
group_by
the relevant columns and calculate sum. - Remove the grouping layer of
group_1
, since we need to compare the values inA
andB
. - If the unique length of sum is equal to "1", that means they are the same, then input "0" in column
v2
. If they are not the same, output the maximum category stored ingroup_1
. - Finally remove the
sum
column and sort byidd
.
This method is able to solve problem with more than two groups in group_1
.
The first 20 rows are shown here for example.
library(tidyverse)
df %>%
mutate(group_index = rep(1:(nrow(df)/10), each = 10)) %>%
group_by(group_index, group_1) %>%
mutate(sum = sum(v1)) %>%
group_by(group_index) %>%
mutate(v2 = ifelse(length(unique(sum)) == 1, 0, group_1[which.max(sum)])) %>%
ungroup() %>%
select(-c(sum, group_index))
# A tibble: 20 x 5
idd group_1 v1 group_2 v2
<int> <chr> <dbl> <int> <chr>
1 1 B 15.7 1 B
2 2 B -4.33 2 B
3 3 A -0.552 3 B
4 4 B 2.67 4 B
5 5 B 12.1 5 B
6 6 A 0.0108 6 B
7 7 A 20.8 7 B
8 8 A -1.99 8 B
9 9 B 12.2 9 B
10 10 A 11.4 10 B
11 11 A -0.658 1 A
12 12 B 5.71 2 A
13 13 B 3.44 3 A
14 14 B 19.6 4 A
15 15 A 25.6 5 A
16 16 A -0.408 6 A
17 17 A 19.3 7 A
18 18 A 9.25 8 A
19 19 B -9.67 9 A
20 20 B 2.44 10 A
CodePudding user response:
Here is alternative method.
library(tidyverse)
df %>%
mutate(group_index = rep(1:(nrow(df)/10), each = 10)) %>%
group_by(group_index) %>%
mutate(
v2 =
if_else(sum(v1[group_1 == 'A']) > sum(v1[group_1 == 'B']), 'A',
if_else(sum(v1[group_1 == 'A']) < sum(v1[group_1 == 'B']), 'B', '0'))
)