DATA = data.frame("TRIMESTER" = c(1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3),
"STUDENT" = c(1,2,3,4,5,6,7,1,2,3,5,9,10,11,3,7,10,6,12,15,17,16,21))
WANT = data.frame("TRIMESTER" = c(1,2,3),
"NEW_ENROLL" = c(7,3,5),
"TOTAL_ENROLL" = c(7,10,15))
I Have 'DATA' and want to make 'WANT' which has three columns and for every 'TRIMESTER' you count the number of NEW 'STUDENT' and then for 'TOTAL_ENROLL' you just count the total number of unique 'STUDENT' every trimester.
My attempt only counts the number for each TRIMESTER.
library(dplyr)
DATA %>%
group_by(TRIMESTER) %>%
count()
CodePudding user response:
- For variety we can use
Base R
aggregate
withtransform
transform(aggregate(. ~ TRIMESTER , DATA[!duplicated(DATA$STUDENT),] , length),
TOTAL_ENROLL = cumsum(STUDENT))
- Output
TRIMESTER STUDENT TOTAL_ENROLL
1 1 7 7
2 2 3 10
3 3 5 15
CodePudding user response:
Here is a way.
suppressPackageStartupMessages(library(dplyr))
DATA <- data.frame("TRIMESTER" = c(1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3),
"STUDENT" = c(1,2,3,4,5,6,7,1,2,3,5,9,10,11,3,7,10,6,12,15,17,16,21))
DATA %>%
mutate(NEW_ENROLL = !duplicated(STUDENT)) %>%
group_by(TRIMESTER) %>%
summarise(NEW_ENROLL = sum(NEW_ENROLL)) %>%
ungroup() %>%
mutate(TOTAL_ENROLL = cumsum(NEW_ENROLL))
#> # A tibble: 3 × 3
#> TRIMESTER NEW_ENROLL TOTAL_ENROLL
#> <dbl> <int> <int>
#> 1 1 7 7
#> 2 2 3 10
#> 3 3 5 15
Created on 2022-08-14 by the reprex package (v2.0.1)
CodePudding user response:
We replace
the duplicated
elements in 'STUDENT' to NA
, grouped by TRIMESTER
, get the sum
of nonNA elements and finally do the cumulative sum (cumsum
)
library(dplyr)
DATA %>%
mutate(STUDENT = replace(STUDENT, duplicated(STUDENT), NA)) %>%
group_by(TRIMESTER) %>%
summarise(NEW_ENROLL = sum(!is.na(STUDENT)), .groups= 'drop') %>%
mutate(TOTAL_ENROLL = cumsum(NEW_ENROLL))
-output
# A tibble: 3 × 3
TRIMESTER NEW_ENROLL TOTAL_ENROLL
<dbl> <int> <int>
1 1 7 7
2 2 3 10
3 3 5 15
Or with distinct
distinct(DATA, STUDENT, .keep_all = TRUE) %>%
group_by(TRIMESTER) %>%
summarise(NEW_ENROLL = n(), .groups = 'drop') %>%
mutate(TOTAL_ENROLL = cumsum(NEW_ENROLL))
# A tibble: 3 × 3
TRIMESTER NEW_ENROLL TOTAL_ENROLL
<dbl> <int> <int>
1 1 7 7
2 2 3 10
3 3 5 15