assume this is my dataset
library(gtools)
library(dplyr)
df <- data.frame(grp=c(0.5,0.6,1,2,2,2,4.5,10,22,"kids","Parents","Teachers"),
f1= c(1,0,3,2,4,0,3,0,1,6,8,4),
f2= c(1,0,3,1,4,0,1,0,1,5,8,4),
f3= c(1,0,3,2,4,6,1,2,1,6,8,4))
df
grp f1 f2 f3
1 0.5 1 1 1
2 0.6 0 0 0
3 1 3 3 3
4 2 2 1 2
5 2 4 4 4
6 2 0 0 6
7 4.5 3 1 1
8 10 0 0 2
9 22 1 1 1
10 kids 6 5 6
11 Parents 8 8 8
12 Teachers 4 4 4
and this is my desired output
df_final
grp f1 f2 f3
1 <=1 4 4 4
2 2-9 9 6 13
3 10-19 0 0 2
4 >20 1 1 1
5 kids 6 5 6
6 Parents 8 8 8
7 Teachers 4 4 4
This is what I did commenting my questions:
############ how NOT to splot set into two subsets of data
df_1 <- df %>%
filter(grepl('kids|Parents|Teachers', grp))
df_1
grp f1 f2 f3
1 kids 6 5 6
2 Parents 8 8 8
3 Teachers 4 4 4
df_2 <- df %>%
filter(!grepl('kids|Parents|Teachers', grp)) %>%
mutate(across(.cols = grp, .fns = as.numeric)) %>%
mutate(grp= cut(grp, breaks=c(-999,2,10,21,999) , labels=c("<=1", "2-9","10-19",">20"), right=F))
df_2
grp f1 f2 f3
1 <=1 1 1 1
2 <=1 0 0 0
3 <=1 3 3 3
4 2-9 2 1 2
5 2-9 4 4 4
6 2-9 0 0 6
7 2-9 3 1 1
8 10-19 0 0 2
9 >20 1 1 1
### how to pipe both aggregate and mixedorder/sort instead of separate lined of codes
df_2 <- aggregate(.~grp, data = df_2, FUN=sum)
df2[mixedorder(df2$grp, decreasing = T),]
df_2
grp f1 f2 f3
1 <=1 4 4 4
2 2-9 9 6 13
3 10-19 0 0 2
4 >20 1 1 1
### how to make sure 10-19 does not come before 2-9 in case of actual dataset
grp a b d
1 <=1 53 48 53
2 10-15 65 63 65
3 2-9 30 40 30
df_final <- rbind(df_2, df_1)
df_final
grp f1 f2 f3
1 <=1 4 4 4
2 2-9 9 6 13
3 10-19 0 0 2
4 >20 1 1 1
5 kids 6 5 6
6 Parents 8 8 8
7 Teachers 4 4 4
Is there any neat way to get from original df to df_final all in dplyr by just piping commands?
how NOT to splot set into two subsets of data?
how to pipe both aggregate and mixedorder/sort instead of separate lined of codes?
how to make sure 10-19 does not come before 2-9 in case of actual dataset?
CodePudding user response:
Here is one option - create a second column ('grp2') with the cut
values on the numeric
elements only, then coalesce
with the original column, while appending the levels
, and then do a group_by
summarise
with across
. In this way, we don't have to use mixedsort
, as the cut
already had the grouping sorted
library(dplyr)
library(stringr)
df %>%
mutate(grp2 = case_when(str_detect(grp, '^[0-9.] $')
~ cut(as.numeric(grp), breaks=c(-999,2,10,21,999) ,
labels=c("<=1", "2-9","10-19",">20"), right=FALSE))) %>%
mutate(grp =factor(coalesce(grp2, grp),
levels = c(levels(grp2), unique(grp[is.na(grp2)]))), .keep = "unused") %>%
group_by(grp) %>%
summarise(across(everything(), sum, na.rm = TRUE), .groups = 'drop')
-output
# A tibble: 7 × 4
grp f1 f2 f3
<fct> <dbl> <dbl> <dbl>
1 <=1 4 4 4
2 2-9 9 6 13
3 10-19 0 0 2
4 >20 1 1 1
5 kids 6 5 6
6 Parents 8 8 8
7 Teachers 4 4 4