I have data on subject codes and grades for students. Each has to take 4 subjects out of which English is mandatory. It is represented by code 301.
Columns: SUB1, SUB2, and so on represent subject codes for other modules and the next column represents the marks.
Based on these codes I am trying to do two things:
First thing:
I am trying to create a course column consisting of PCB if the student has subject codes 42, 43, and 44. PCM if the student has subject codes 41, 42, and 43. Commerce if the codes are 55, 54, and 30.
The issue that I am facing is that the codes are spread across columns and I facing difficulty to standardize them.
Second thing:
Based on the identified course, I am trying to sum the grades obtained by each student in these subjects. However, I am want to add the English grade to it as well.
Example of the data:
structure(list(Roll.No. = c(12653771L, 12653813L, 12653787L,
12653850L, 12653660L, 12653553L, 12653902L, 12653888L, 12653440L,
12653487L, 12653469L, 12653832L, 12653382L, 12653814L, 12653587L,
12653508L, 12653449L, 12653445L, 12653776L, 12653806L), SUB = c(301L,
301L, 301L, 301L, 301L, 301L, 301L, 301L, 301L, 301L, 301L, 301L,
301L, 301L, 301L, 301L, 301L, 301L, 301L, 301L), MRK = c(93L,
82L, 85L, 74L, 85L, 80L, 67L, 77L, 78L, 94L, 89L, 65L, 88L, 89L,
82L, 85L, 91L, 77L, 97L, 76L), SUB.1 = c(30L, 30L, 30L, 30L,
42L, 41L, 30L, 30L, 41L, 41L, 41L, 30L, 41L, 30L, 42L, 41L, 41L,
41L, 30L, 30L), MRK.1 = c(74L, 97L, 75L, 73L, 72L, 81L, 62L,
71L, 63L, 75L, 93L, 74L, 89L, 91L, 67L, 87L, 81L, 94L, 86L, 69L
), SUB.2 = c(48L, 41L, 48L, 54L, 43L, 42L, 48L, 48L, 42L, 42L,
42L, 41L, 42L, 41L, 43L, 42L, 42L, 42L, 48L, 48L), MRK.2 = c(76L,
95L, 79L, 74L, 72L, 75L, 67L, 74L, 60L, 72L, 93L, 56L, 79L, 68L,
68L, 91L, 62L, 75L, 95L, 67L), SUB.3 = c(54L, 54L, 54L, 55L,
44L, 43L, 54L, 54L, 43L, 43L, 43L, 54L, 43L, 54L, 44L, 43L, 43L,
43L, 54L, 54L), MRK.3 = c(80L, 96L, 77L, 44L, 94L, 69L, 63L,
74L, 57L, 67L, 80L, 67L, 72L, 89L, 95L, 87L, 68L, 82L, 94L, 69L
), SUB.4 = c(55L, 55L, 55L, 64L, 265L, 48L, 55L, 55L, 48L, 48L,
283L, 55L, 48L, 55L, 64L, 283L, 283L, 48L, 55L, 55L), MRK.4 = c(45L,
95L, 46L, 76L, 91L, 74L, 44L, 52L, 82L, 92L, 92L, 60L, 81L, 49L,
83L, 89L, 90L, 83L, 93L, 61L), SUB.5 = c(NA, 64L, NA, 41L, 48L,
NA, NA, NA, NA, NA, NA, 64L, NA, 49L, NA, 48L, 49L, NA, NA, NA
), MRK.5 = c(NA, "97", NA, "AB", "87", NA, NA, NA, NA, NA, NA,
"71", NA, "97", NA, "83", "98", NA, NA, NA)), row.names = c(NA,
20L), class = "data.frame")
CodePudding user response:
This should also answer: your first question:
df <- df %>%
# convert to numeric and induce NAs in MRK.5 (losing AB)
mutate(MRK.5 = as.numeric(MRK.5)) %>%
# convert NAs to 0
mutate(across(where(anyNA), ~ replace_na(., 0))) %>%
# create a new string of subjects
mutate(subjects = str_c(SUB.1, SUB.2, SUB.3, SUB.4, SUB.5, sep = "_")) %>%
# use case_when to specify course
mutate(course = case_when(str_detect(subjects, "42") & str_detect(subjects, "43") & str_detect(subjects, "44") ~ "PCB",
str_detect(subjects, "41") & str_detect(subjects, "42") & str_detect(subjects, "43") ~ "PCM",
str_detect(subjects, "30") & str_detect(subjects, "54") & str_detect(subjects, "55") ~ "Commerce",
TRUE ~ "Other"))
I am not sure whether you want the summary scores for just the three subjects plus English in each course, or for all subjects, but with a bit of wrangling you can the following dataframe, from where you should be able to get what you need.
df %>% select(-subjects) %>%
# create consistent naming for the pivot separation
rename(SUB.E = SUB, MRK.E = MRK) %>%
# pivot into tidy form
pivot_longer(-c(Roll.No.,course),
names_to = c("Var", ".value"),
names_sep = "\\.") %>%
pivot_longer(-c(Roll.No.,course,Var), names_to = "sub") %>%
pivot_wider(names_from = Var, values_from = value)
Giving:
A tibble: 120 × 5
Roll.No. course sub SUB MRK
<int> <chr> <chr> <dbl> <dbl>
1 12653771 Commerce E 301 93
2 12653771 Commerce 1 30 74
3 12653771 Commerce 2 48 76
4 12653771 Commerce 3 54 80
5 12653771 Commerce 4 55 45
6 12653771 Commerce 5 0 0
7 12653813 Commerce E 301 82
8 12653813 Commerce 1 30 97
9 12653813 Commerce 2 41 95
10 12653813 Commerce 3 54 96
CodePudding user response:
Answering your first question assuming that you are interested in students participating in ALL of these courses:
I am trying to create a course column consisting of PCB if the student has subject codes 42, 43, and 44. PCM if the student has subject codes 41, 42, and 43. Commerce if the codes are 55, 54, and 30.
library(tidyverse)
df %>%
mutate(across(-Roll.No., ~ as.numeric(.))) %>%
rowwise() %>%
mutate(subject_summary = case_when(sum(c_across(starts_with("SUB")) %in% c(42, 43, 44)) == 3 ~"PCB",
sum(c_across(starts_with("SUB")) %in% c(41, 42, 43)) == 3 ~ "PCM",
sum(c_across(starts_with("SUB")) %in% c(55, 54, 30)) == 3 ~ "Commerce",
TRUE ~ NA_character_)) %>%
ungroup()
gives:
# A tibble: 20 x 14
Roll.No. SUB MRK SUB.1 MRK.1 SUB.2 MRK.2 SUB.3 MRK.3 SUB.4 MRK.4 SUB.5 MRK.5 subject_summary
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 12653771 301 93 30 74 48 76 54 80 55 45 NA NA Commerce
2 12653813 301 82 30 97 41 95 54 96 55 95 64 97 Commerce
3 12653787 301 85 30 75 48 79 54 77 55 46 NA NA Commerce
4 12653850 301 74 30 73 54 74 55 44 64 76 41 NA Commerce
5 12653660 301 85 42 72 43 72 44 94 265 91 48 87 PCB
6 12653553 301 80 41 81 42 75 43 69 48 74 NA NA PCM
7 12653902 301 67 30 62 48 67 54 63 55 44 NA NA Commerce
8 12653888 301 77 30 71 48 74 54 74 55 52 NA NA Commerce
9 12653440 301 78 41 63 42 60 43 57 48 82 NA NA PCM
10 12653487 301 94 41 75 42 72 43 67 48 92 NA NA PCM
11 12653469 301 89 41 93 42 93 43 80 283 92 NA NA PCM
12 12653832 301 65 30 74 41 56 54 67 55 60 64 71 Commerce
13 12653382 301 88 41 89 42 79 43 72 48 81 NA NA PCM
14 12653814 301 89 30 91 41 68 54 89 55 49 49 97 Commerce
15 12653587 301 82 42 67 43 68 44 95 64 83 NA NA PCB
16 12653508 301 85 41 87 42 91 43 87 283 89 48 83 PCM
17 12653449 301 91 41 81 42 62 43 68 283 90 49 98 PCM
18 12653445 301 77 41 94 42 75 43 82 48 83 NA NA PCM
19 12653776 301 97 30 86 48 95 54 94 55 93 NA NA Commerce
20 12653806 301 76 30 69 48 67 54 69 55 61 NA NA Commerce
You second question can be answered by continuing above code (but showing the full code here for convenience):
df %>%
mutate(across(-Roll.No., ~ as.numeric(.))) %>%
rowwise() %>%
mutate(subject_summary = case_when(sum(c_across(starts_with("SUB")) %in% c(42, 43, 44)) == 3 ~"PCB",
sum(c_across(starts_with("SUB")) %in% c(41, 42, 43)) == 3 ~ "PCM",
sum(c_across(starts_with("SUB")) %in% c(55, 54, 30)) == 3 ~ "Commerce",
TRUE ~ NA_character_)) %>%
ungroup() %>%
pivot_longer(cols = -c(Roll.No., subject_summary, SUB, MRK),
names_pattern = "(.*)(\\d{1})",
names_to = c(".value", "course")) %>%
group_by(Roll.No.) %>%
summarize(subject_summary = first(subject_summary),
grade = case_when(all(subject_summary == "PCB") ~ sum(MRK.[SUB. %in% c(42, 43, 44)]) first(MRK),
all(subject_summary == "PCM") ~ sum(MRK.[SUB. %in% c(41, 42, 43)]) first(MRK),
all(subject_summary == "Commerce") ~ sum(MRK.[SUB. %in% c(55, 54, 30)]) first(MRK)))
gives:
# A tibble: 20 x 3
Roll.No. subject_summary grade
<int> <chr> <dbl>
1 12653382 PCM 328
2 12653440 PCM 258
3 12653445 PCM 328
4 12653449 PCM 302
5 12653469 PCM 355
6 12653487 PCM 308
7 12653508 PCM 350
8 12653553 PCM 305
9 12653587 PCB 312
10 12653660 PCB 323
11 12653771 Commerce 292
12 12653776 Commerce 370
13 12653787 Commerce 283
14 12653806 Commerce 275
15 12653813 Commerce 370
16 12653814 Commerce 318
17 12653832 Commerce 266
18 12653850 Commerce 265
19 12653888 Commerce 274
20 12653902 Commerce 236