R Count New And Existing Groups-CodePudding

HAVE=data.frame("TEACHER"=c(1,1,1,1,1,2,2,2,2),
"STUDENT"=c(1,1,1,2,2,3,3,3,4),
"TRIMESTER"=c(1,2,3,2,3,3,4,5,4))


WANT=data.frame("TRIMESTER"=c(1,2,3,4,5),
"NEWSTUDENTS"=c(1,1,1,1,0),
"TOTALSTUDENTS"=c(1,2,3,4,4),
"NEWTEACHER"=c(1,0,1,0,0),
"TOTALTEACHER"=c(1,1,2,2,2))

I wish to convert HAVE to WANT and I wish to do it by taking the count of NEWSTUDENTS and TOTALSTUDENTS, NEWSTUDENTS is when a STUDENT appears first in the TRIMESTER and do the same for TEACHER

CodePudding user response：

We may loop across the columns, create 'NEW' columns using duplicated and then do a group by sum

library(dplyr)
library(stringr)
HAVE %>%
    # order by TRIMESTER
    arrange(TRIMESTER) %>%
    # loop across TEACHER, STUDENT, create logical columns
    # by modifying the .names
    mutate(across(c(TEACHER, STUDENT), ~ !duplicated(.x), 
      .names = "NEW{.col}")) %>% 
    # grouped by TRIMESTER
    group_by(TRIMESTER) %>% 
    # get the sum of 'NEW' columns
   summarise(across(starts_with("NEW"), sum), .groups = 'drop') %>% 
   # loop over the 'NEW' columns, get the cumulative sum
   # create new columns by modifying the .names
   mutate(across(starts_with('NEW'), cumsum,
      .names = "{str_replace(.col, 'NEW', 'TOTAL')}"))

-output

# A tibble: 5 × 5
  TRIMESTER NEWTEACHER NEWSTUDENT TOTALTEACHER TOTALSTUDENT
      <dbl>      <int>      <int>        <int>        <int>
1         1          1          1            1            1
2         2          0          1            1            2
3         3          1          1            2            3
4         4          0          1            2            4
5         5          0          0            2            4

CodePudding user response：

This is an extension of this other answer of mine.

suppressPackageStartupMessages(library(dplyr))

HAVE <- data.frame("TEACHER"=c(1,1,1,1,1,2,2,2,2),
                "STUDENT"=c(1,1,1,2,2,3,3,3,4),
                "TRIMESTER"=c(1,2,3,2,3,3,4,5,4))

WANT <- data.frame("TRIMESTER"=c(1,2,3,4,5),
                "NEWSTUDENTS"=c(1,1,1,1,0),
                "TOTALSTUDENTS"=c(1,2,3,4,4),
                "NEWTEACHER"=c(1,0,1,0,0),
                "TOTALTEACHER"=c(1,1,2,2,2))

HAVE %>% 
  mutate(NEWSTUDENTS = !duplicated(STUDENT),
         NEWTEACHER = !duplicated(TEACHER)) %>%
  group_by(TRIMESTER) %>%
  summarise(NEWSTUDENTS = sum(NEWSTUDENTS),
            NEWTEACHER = sum(NEWTEACHER)) %>%
  ungroup() %>%
  mutate(TOTALSTUDENTS = cumsum(NEWSTUDENTS),
         TOTALTEACHER = cumsum(NEWTEACHER)) %>%
  relocate(TOTALSTUDENTS, .before = NEWTEACHER)
#> # A tibble: 5 × 5
#>   TRIMESTER NEWSTUDENTS TOTALSTUDENTS NEWTEACHER TOTALTEACHER
#>       <dbl>       <int>         <int>      <int>        <int>
#> 1         1           1             1          1            1
#> 2         2           1             2          0            1
#> 3         3           1             3          1            2
#> 4         4           1             4          0            2
#> 5         5           0             4          0            2

^{Created on 2022-08-18 by the reprex package (v2.0.1)}