I need to calculate new columns based on data that look like this:
structure(list(english_score = c(3L, 4L, 3L, 3L, 4L, 3L, 4L,
2L, 4L, 2L, 3L, 3L, 2L, 2L, 3L, 4L, 3L, 3L, 4L, 3L, 4L, 3L, 2L
), math_score = c(4L, 4L, 3L, 4L, 4L, 4L, 3L, 2L, 3L, 3L, 4L,
2L, 4L, 2L, 4L, 2L, 3L, 3L, 2L, 2L, 2L, 4L, 2L), science_score = c(3L,
4L, 4L, 4L, 3L, 4L, 4L, 3L, 3L, 2L, 3L, 4L, 4L, 4L, 4L, 4L, 4L,
2L, 3L, 2L, 3L, 3L, 4L)), row.names = c(NA, -23L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x000002478ee34d50>)
I want to produce something like this:
structure(list(english_score = c(3L, 4L, 3L, 3L, 4L, 3L, 4L,
2L, 4L, 2L, 3L, 3L, 2L, 2L, 3L, 4L, 3L, 3L, 4L, 3L, 4L, 3L, 2L
), math_score = c(4L, 4L, 3L, 4L, 4L, 4L, 3L, 2L, 3L, 3L, 4L,
2L, 4L, 2L, 4L, 2L, 3L, 3L, 2L, 2L, 2L, 4L, 2L), science_score = c(3L,
4L, 4L, 4L, 3L, 4L, 4L, 3L, 3L, 2L, 3L, 4L, 4L, 4L, 4L, 4L, 4L,
2L, 3L, 2L, 3L, 3L, 4L), english_level = c("Level C", "Level D",
"Level C", "Level C", "Level D", "Level C", "Level D", "Level B",
"Level D", "Level B", "Level C", "Level C", "Level B", "Level B",
"Level C", "Level D", "Level C", "Level C", "Level D", "Level C",
"Level D", "Level C", "Level B"), math_level = c("Level D", "Level D",
"Level C", "Level D", "Level D", "Level D", "Level C", "Level B",
"Level C", "Level C", "Level D", "Level B", "Level D", "Level B",
"Level D", "Level B", "Level C", "Level C", "Level B", "Level B",
"Level B", "Level D", "Level B"), science_level = c("Level C",
"Level D", "Level D", "Level D", "Level C", "Level D", "Level D",
"Level C", "Level C", "Level B", "Level C", "Level D", "Level D",
"Level D", "Level D", "Level D", "Level D", "Level B", "Level C",
"Level B", "Level C", "Level C", "Level D")), row.names = c(NA,
-23L), class = c("data.table", "data.frame"), .internal.selfref = <pointer:
0x000002478ee34d50>)
So far, my approach has been to use a function to calculate the levels of the new variables...
myfunction<-function(x){case_when(x<2~"Level A",
x>1 & x<3~"Level B",
x>2 & x<4~"Level C",
x>3~"Level D")}
....and then, create the new variables and assign their names one by one.
DT[, english_level:=lapply(.SD, myfunction), .SDcols='english_score']
DT[, math_level:=lapply(.SD, myfunction), .SDcols='math_score']
DT[, science_level:=lapply(.SD, myfunction), .SDcols='science_score']
How can I simplify this process, preferably using data.table?
CodePudding user response:
I would do this (I've called your data DT
as utils::data()
is a base R function):
score_cols <- grep("_score$", names(DT), value = TRUE)
level_cols <- sub("_score", "_level", score_cols)
DT[,
(level_cols) := lapply(.SD, myfunction),
.SDcols = score_cols
]
Also your myfunction()
uses dplyr::case_when()
. This will work but some dplyr
functions clash with data.table
(between()
, first()
and last()
with the versions I currently have). You can replace this with data.table::fcase()
.
myfunction <- function(x) {
fcase(
x < 2, "Level A",
x > 1 & x < 3, "Level B",
x > 2 & x < 4, "Level C",
x > 3, "Level D"
)
}
This should be faster than the dplyr
version as well.
Additionally, with this particular function, you could in fact replace the case when type logic with assigning the n
th letter in the alphabet as a grade:
assign_letter_grade <- function(x) {
paste("Level", LETTERS[x])
}
CodePudding user response:
Here's one option where you avoid the need to create your own function. Instead you could create a mapping table and then map each score to the grade level.
grade_map = data.table(
score = c(1:5),
lvl = paste(rep("Level", 5), c("A","B","C","D","F"))
)
lvl_cols = gsub("score","level", names(orig))
score_cols = names(orig)
# Group by Row
orig[,
(lvl_cols) := lapply(.SD, function(x) {grade_map[score %in% x]$lvl}),
by = 1:nrow(orig),
.SDcols = score_cols
]
# Using merge
orig[,
(lvl_cols) := lapply(.SD, function(x) {
merge(data.table(score = x), grade_map, by = "score", sort = F)$lvl
}),
.SDcols = score_cols
]
english_score math_score science_score english_level math_level science_level
1: 3 4 3 Level C Level D Level C
2: 4 4 4 Level D Level D Level D
3: 3 3 4 Level C Level C Level D
4: 3 4 4 Level C Level D Level D
5: 4 4 3 Level D Level D Level C
6: 3 4 4 Level C Level D Level D
7: 4 3 4 Level D Level C Level D
8: 2 2 3 Level B Level B Level C
9: 4 3 3 Level D Level C Level C
10: 2 3 2 Level B Level C Level B
11: 3 4 3 Level C Level D Level C
12: 3 2 4 Level C Level B Level D
13: 2 4 4 Level B Level D Level D
14: 2 2 4 Level B Level B Level D
15: 3 4 4 Level C Level D Level D
16: 4 2 4 Level D Level B Level D
17: 3 3 4 Level C Level C Level D
18: 3 3 2 Level C Level C Level B
19: 4 2 3 Level D Level B Level C
20: 3 2 2 Level C Level B Level B
21: 4 2 3 Level D Level B Level C
22: 3 4 3 Level C Level D Level C
23: 2 2 4 Level B Level B Level D