How to use stringr or grepl to Make new character variable?-CodePudding

I have data on subject codes and grades for students. Each has to take 4 subjects out of which English is mandatory. It is represented by code 301.

Columns: SUB1, SUB2, and so on represent subject codes for other modules and the next column represents the marks.

Based on these codes I am trying to do two things:

First thing:

I am trying to create a course column consisting of PCB if the student has subject codes 42, 43, and 44. PCM if the student has subject codes 41, 42, and 43. Commerce if the codes are 55, 54, and 30.

The issue that I am facing is that the codes are spread across columns and I facing difficulty to standardize them.

Second thing:

Based on the identified course, I am trying to sum the grades obtained by each student in these subjects. However, I am want to add the English grade to it as well.

Example of the data:

structure(list(Roll.No. = c(12653771L, 12653813L, 12653787L, 
12653850L, 12653660L, 12653553L, 12653902L, 12653888L, 12653440L, 
12653487L, 12653469L, 12653832L, 12653382L, 12653814L, 12653587L, 
12653508L, 12653449L, 12653445L, 12653776L, 12653806L), SUB = c(301L, 
301L, 301L, 301L, 301L, 301L, 301L, 301L, 301L, 301L, 301L, 301L, 
301L, 301L, 301L, 301L, 301L, 301L, 301L, 301L), MRK = c(93L, 
82L, 85L, 74L, 85L, 80L, 67L, 77L, 78L, 94L, 89L, 65L, 88L, 89L, 
82L, 85L, 91L, 77L, 97L, 76L), SUB.1 = c(30L, 30L, 30L, 30L, 
42L, 41L, 30L, 30L, 41L, 41L, 41L, 30L, 41L, 30L, 42L, 41L, 41L, 
41L, 30L, 30L), MRK.1 = c(74L, 97L, 75L, 73L, 72L, 81L, 62L, 
71L, 63L, 75L, 93L, 74L, 89L, 91L, 67L, 87L, 81L, 94L, 86L, 69L
), SUB.2 = c(48L, 41L, 48L, 54L, 43L, 42L, 48L, 48L, 42L, 42L, 
42L, 41L, 42L, 41L, 43L, 42L, 42L, 42L, 48L, 48L), MRK.2 = c(76L, 
95L, 79L, 74L, 72L, 75L, 67L, 74L, 60L, 72L, 93L, 56L, 79L, 68L, 
68L, 91L, 62L, 75L, 95L, 67L), SUB.3 = c(54L, 54L, 54L, 55L, 
44L, 43L, 54L, 54L, 43L, 43L, 43L, 54L, 43L, 54L, 44L, 43L, 43L, 
43L, 54L, 54L), MRK.3 = c(80L, 96L, 77L, 44L, 94L, 69L, 63L, 
74L, 57L, 67L, 80L, 67L, 72L, 89L, 95L, 87L, 68L, 82L, 94L, 69L
), SUB.4 = c(55L, 55L, 55L, 64L, 265L, 48L, 55L, 55L, 48L, 48L, 
283L, 55L, 48L, 55L, 64L, 283L, 283L, 48L, 55L, 55L), MRK.4 = c(45L, 
95L, 46L, 76L, 91L, 74L, 44L, 52L, 82L, 92L, 92L, 60L, 81L, 49L, 
83L, 89L, 90L, 83L, 93L, 61L), SUB.5 = c(NA, 64L, NA, 41L, 48L, 
NA, NA, NA, NA, NA, NA, 64L, NA, 49L, NA, 48L, 49L, NA, NA, NA
), MRK.5 = c(NA, "97", NA, "AB", "87", NA, NA, NA, NA, NA, NA, 
"71", NA, "97", NA, "83", "98", NA, NA, NA)), row.names = c(NA, 
20L), class = "data.frame")

CodePudding user response：

This should also answer: your first question:

df <- df %>% 
# convert to numeric and induce NAs in MRK.5 (losing AB)
mutate(MRK.5 = as.numeric(MRK.5)) %>% 
# convert NAs to 0
  mutate(across(where(anyNA), ~ replace_na(., 0))) %>% 
# create a new string of subjects 
  mutate(subjects = str_c(SUB.1, SUB.2, SUB.3, SUB.4, SUB.5, sep = "_")) %>%
# use case_when to specify course 
 mutate(course = case_when(str_detect(subjects, "42") & str_detect(subjects, "43") & str_detect(subjects, "44") ~ "PCB",
                            str_detect(subjects, "41") & str_detect(subjects, "42") & str_detect(subjects, "43") ~ "PCM",
                            str_detect(subjects, "30") & str_detect(subjects, "54") & str_detect(subjects, "55") ~ "Commerce",
                            TRUE ~ "Other"))

I am not sure whether you want the summary scores for just the three subjects plus English in each course, or for all subjects, but with a bit of wrangling you can the following dataframe, from where you should be able to get what you need.

df %>%  select(-subjects) %>% 
# create consistent naming for the pivot separation
  rename(SUB.E = SUB, MRK.E = MRK) %>% 
# pivot into tidy form
  pivot_longer(-c(Roll.No.,course),
               names_to = c("Var", ".value"),
               names_sep = "\\.") %>% 
  pivot_longer(-c(Roll.No.,course,Var), names_to = "sub") %>% 
  pivot_wider(names_from = Var, values_from = value)

Giving:

A tibble: 120 × 5
   Roll.No. course   sub     SUB   MRK
      <int> <chr>    <chr> <dbl> <dbl>
 1 12653771 Commerce E       301    93
 2 12653771 Commerce 1        30    74
 3 12653771 Commerce 2        48    76
 4 12653771 Commerce 3        54    80
 5 12653771 Commerce 4        55    45
 6 12653771 Commerce 5         0     0
 7 12653813 Commerce E       301    82
 8 12653813 Commerce 1        30    97
 9 12653813 Commerce 2        41    95
10 12653813 Commerce 3        54    96

CodePudding user response：

Answering your first question assuming that you are interested in students participating in ALL of these courses:

I am trying to create a course column consisting of PCB if the student has subject codes 42, 43, and 44. PCM if the student has subject codes 41, 42, and 43. Commerce if the codes are 55, 54, and 30.

library(tidyverse)
df %>%
  mutate(across(-Roll.No., ~ as.numeric(.))) %>%
  rowwise() %>%
  mutate(subject_summary = case_when(sum(c_across(starts_with("SUB")) %in% c(42, 43, 44)) == 3 ~"PCB",
                                     sum(c_across(starts_with("SUB")) %in% c(41, 42, 43)) == 3 ~ "PCM",
                                     sum(c_across(starts_with("SUB")) %in% c(55, 54, 30)) == 3 ~ "Commerce",
                                     TRUE ~ NA_character_)) %>%
  ungroup()

gives:

# A tibble: 20 x 14
   Roll.No.   SUB   MRK SUB.1 MRK.1 SUB.2 MRK.2 SUB.3 MRK.3 SUB.4 MRK.4 SUB.5 MRK.5 subject_summary
      <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>          
 1 12653771   301    93    30    74    48    76    54    80    55    45    NA    NA Commerce       
 2 12653813   301    82    30    97    41    95    54    96    55    95    64    97 Commerce       
 3 12653787   301    85    30    75    48    79    54    77    55    46    NA    NA Commerce       
 4 12653850   301    74    30    73    54    74    55    44    64    76    41    NA Commerce       
 5 12653660   301    85    42    72    43    72    44    94   265    91    48    87 PCB            
 6 12653553   301    80    41    81    42    75    43    69    48    74    NA    NA PCM            
 7 12653902   301    67    30    62    48    67    54    63    55    44    NA    NA Commerce       
 8 12653888   301    77    30    71    48    74    54    74    55    52    NA    NA Commerce       
 9 12653440   301    78    41    63    42    60    43    57    48    82    NA    NA PCM            
10 12653487   301    94    41    75    42    72    43    67    48    92    NA    NA PCM            
11 12653469   301    89    41    93    42    93    43    80   283    92    NA    NA PCM            
12 12653832   301    65    30    74    41    56    54    67    55    60    64    71 Commerce       
13 12653382   301    88    41    89    42    79    43    72    48    81    NA    NA PCM            
14 12653814   301    89    30    91    41    68    54    89    55    49    49    97 Commerce       
15 12653587   301    82    42    67    43    68    44    95    64    83    NA    NA PCB            
16 12653508   301    85    41    87    42    91    43    87   283    89    48    83 PCM            
17 12653449   301    91    41    81    42    62    43    68   283    90    49    98 PCM            
18 12653445   301    77    41    94    42    75    43    82    48    83    NA    NA PCM            
19 12653776   301    97    30    86    48    95    54    94    55    93    NA    NA Commerce       
20 12653806   301    76    30    69    48    67    54    69    55    61    NA    NA Commerce

You second question can be answered by continuing above code (but showing the full code here for convenience):

df %>%
  mutate(across(-Roll.No., ~ as.numeric(.))) %>%
  rowwise() %>%
  mutate(subject_summary = case_when(sum(c_across(starts_with("SUB")) %in% c(42, 43, 44)) == 3 ~"PCB",
                                     sum(c_across(starts_with("SUB")) %in% c(41, 42, 43)) == 3 ~ "PCM",
                                     sum(c_across(starts_with("SUB")) %in% c(55, 54, 30)) == 3 ~ "Commerce",
                                     TRUE ~ NA_character_)) %>%
  ungroup() %>%
  pivot_longer(cols = -c(Roll.No., subject_summary, SUB, MRK),
               names_pattern = "(.*)(\\d{1})",
               names_to = c(".value", "course")) %>%
  group_by(Roll.No.) %>%
  summarize(subject_summary = first(subject_summary),
            grade = case_when(all(subject_summary == "PCB") ~ sum(MRK.[SUB. %in% c(42, 43, 44)])   first(MRK),
                              all(subject_summary == "PCM") ~ sum(MRK.[SUB. %in% c(41, 42, 43)])   first(MRK),
                              all(subject_summary == "Commerce") ~ sum(MRK.[SUB. %in% c(55, 54, 30)])   first(MRK)))

gives:

# A tibble: 20 x 3
   Roll.No. subject_summary grade
      <int> <chr>           <dbl>
 1 12653382 PCM               328
 2 12653440 PCM               258
 3 12653445 PCM               328
 4 12653449 PCM               302
 5 12653469 PCM               355
 6 12653487 PCM               308
 7 12653508 PCM               350
 8 12653553 PCM               305
 9 12653587 PCB               312
10 12653660 PCB               323
11 12653771 Commerce          292
12 12653776 Commerce          370
13 12653787 Commerce          283
14 12653806 Commerce          275
15 12653813 Commerce          370
16 12653814 Commerce          318
17 12653832 Commerce          266
18 12653850 Commerce          265
19 12653888 Commerce          274
20 12653902 Commerce          236