Below are the train and test sets, train_dat and test_dat respectively.
I want to iterate through each categorical column in the data frames and map each level to an integer determined by the alphabetical order of the levels of that categorical variable in the training set.
The sorted training levels, whose positions give the integer values to be assigned, are stored in uniqueValuesTrain.
Without using a for loop, how can I achieve a faster, vectorized solution?
# Data
# Training set: two categorical columns, one numeric column, a binary target.
cat_var_1 <- c("blue", "green", "green", "blue", "red", "brown")
cat_var_2 <- c("rock", "blues", "jazz", "jazz", "rock", "indie")
reg_var_1 <- c(23, 22, 21, 24, 56, 28)
target <- c(1, 0, 1, 0, 0, 1)
train_dat <- data.frame(
  cat_var_1, cat_var_2, reg_var_1, target,
  stringsAsFactors = TRUE
)

# Test set: same columns; contains levels that never occur in the training set
# ("purple", "magenta", "pop", "house").
cat_var_1 <- c("green", "green", "blue", "blue", "green", "purple", "magenta")
cat_var_2 <- c("rock", "rock", "jazz", "jazz", "jazz", "pop", "house")
reg_var_1 <- c(12, 23, 25, 27, 34, 12, 32)
target <- c(1, 1, 1, 0, 1, 0, 0)
test_dat <- data.frame(
  cat_var_1, cat_var_2, reg_var_1, target,
  stringsAsFactors = TRUE
)

targetVariable <- "target"
catVariables <- names(Filter(is.factor, train_dat))
# Remove target variable from catVariables
catVariables <- catVariables[!(catVariables %in% targetVariable)]

# Integer Encoding
# Plain assignment is enough to get independent copies of a data.frame: R
# copies on modify, so train_dat / test_dat are left untouched below. (The
# previous copy() call is not base R and fails unless a package providing it
# has been attached.)
trainInt <- train_dat
testInt <- test_dat
for (col in catVariables) {
  # Sorted distinct training values; the position of a value in this vector is
  # its integer code.
  uniqueValuesTrain <- sort(unique(trainInt[[col]]))
  trainInt[[col]] <- match(trainInt[[col]], uniqueValuesTrain)
  # The test set is encoded against the TRAINING levels, so values that do not
  # occur in the training set become NA.
  testInt[[col]] <- match(testInt[[col]], uniqueValuesTrain)
}
CodePudding user response:
Here is one option with tidyverse
library(dplyr)
# Encode every categorical training column by the position of each value among
# that column's sorted distinct values. Results go into new "<column>_new"
# columns, so the original columns are kept.
train_dat <- mutate(
  train_dat,
  across(
    all_of(catVariables),
    function(values) match(values, sort(unique(values))),
    .names = "{.col}_new"
  )
)

# Encode the test columns against the sorted distinct values of the
# same-named training column (looked up via cur_column()); values absent
# from the training column come out as NA.
test_dat <- mutate(
  test_dat,
  across(
    all_of(catVariables),
    function(values) {
      train_values <- sort(unique(train_dat[[cur_column()]]))
      match(values, train_values)
    },
    .names = "{.col}_new"
  )
)
-output
> train_dat
cat_var_1 cat_var_2 reg_var_1 target cat_var_1_new cat_var_2_new
1 blue rock 23 1 1 4
2 green blues 22 0 3 1
3 green jazz 21 1 3 3
4 blue jazz 24 0 1 3
5 red rock 56 0 4 4
6 brown indie 28 1 2 2
> test_dat
cat_var_1 cat_var_2 reg_var_1 target cat_var_1_new cat_var_2_new
1 green rock 12 1 3 4
2 green rock 23 1 3 4
3 blue jazz 25 1 1 3
4 blue jazz 27 0 1 3
5 green jazz 34 1 3 3
6 purple pop 12 0 NA NA
7 magenta house 32 0 NA NA
CodePudding user response:
A bit simpler:
library(data.table)
# Integer-encode `x` by the position of each value in `lvls`; values of `x`
# that are not in `lvls` become NA. By default `lvls` is the sorted distinct
# values of `x` itself, i.e. the vector is encoded against its own values.
f <- function(x, lvls = sort(unique(x))) match(x, lvls)

setDT(train_dat)
setDT(test_dat)

# Capture the sorted training levels BEFORE the training columns are
# overwritten by reference. Both tables must be encoded against these same
# levels; encoding the test set against its own levels would give the same
# category different integers in train and test.
train_levels <- lapply(
  train_dat[, catVariables, with = FALSE],
  function(x) sort(unique(x))
)

train_dat[
  , (catVariables) := Map(f, .SD, train_levels),
  .SDcols = catVariables
][]
test_dat[
  , (catVariables) := Map(f, .SD, train_levels),
  .SDcols = catVariables
][]

# Output:
#>    cat_var_1 cat_var_2 reg_var_1 target
#>        <int>     <int>     <num>  <num>
#> 1:         1         4        23      1
#> 2:         3         1        22      0
#> 3:         3         3        21      1
#> 4:         1         3        24      0
#> 5:         4         4        56      0
#> 6:         2         2        28      1
#>    cat_var_1 cat_var_2 reg_var_1 target
#>        <int>     <int>     <num>  <num>
#> 1:         3         4        12      1
#> 2:         3         4        23      1
#> 3:         1         3        25      1
#> 4:         1         3        27      0
#> 5:         3         3        34      1
#> 6:        NA        NA        12      0
#> 7:        NA        NA        32      0