Home > front end >  Iterating over the columns of a data frame without using a for loop in R
Iterating over the columns of a data frame without using a for loop in R

Time:04-24

Below are the train and test sets, train_dat and test_dat respectively. I want to iterate through each categorical column in the data frames and replace each level with an integer determined by the alphabetical order of that variable's levels in the training set. The sorted training levels are stored in uniqueValuesTrain, and the integer assigned to a level is its position in that vector; the test set must be encoded with the same training mapping. Without using a for loop, how can I achieve a faster, vectorized solution?

# Data 

cat_var_1 <- c("blue", "green", "green", "blue", "red", "brown")
cat_var_2 <- c("rock", "blues", "jazz", "jazz", "rock", "indie")
reg_var_1 <- c(23, 22, 21, 24, 56, 28)
target <- c(1, 0, 1, 0, 0, 1)

# stringsAsFactors = TRUE turns the character columns into factors, which is
# what the is.factor() filter further down relies on.
train_dat <- data.frame(cat_var_1, cat_var_2, reg_var_1, target,
                        stringsAsFactors = TRUE)


cat_var_1 <- c("green", "green", "blue", "blue", "green", "purple", "magenta")
cat_var_2 <- c("rock", "rock", "jazz", "jazz", "jazz", "pop", "house")
reg_var_1 <- c(12, 23, 25, 27, 34, 12, 32)
target <- c(1, 1, 1, 0, 1, 0, 0)

test_dat <- data.frame(cat_var_1, cat_var_2, reg_var_1, target,
                       stringsAsFactors = TRUE)

targetVariable <- "target"
# Names of all factor columns of the training set.
catVariables <- names(Filter(is.factor, train_dat))

# Remove target variable from catVariables
catVariables <- catVariables[!(catVariables %in% targetVariable)]

# Integer Encoding

# Plain assignment is enough to work on separate copies: modifying trainInt /
# testInt below does not alter train_dat / test_dat (copy-on-modify).
# copy() is not a base R function -- it comes from data.table, which this
# script never loads -- so calling it here would stop with an error.
trainInt <- train_dat
testInt <- test_dat

for (col in catVariables) {
  # Distinct training values of this column; sort() on a factor orders by its
  # levels, which factor() creates in sorted order by default.
  uniqueValuesTrain <- sort(unique(trainInt[[col]]))
  # Integer code = position of the value in the sorted training values.
  trainInt[[col]] <- match(trainInt[[col]], uniqueValuesTrain)
  # Test values are coded with the training mapping; values that never occur
  # in the training data become NA.
  testInt[[col]] <- match(testInt[[col]], uniqueValuesTrain)
}



CodePudding user response:

Here is one option with tidyverse

library(dplyr)

# Training set: for every categorical column add a "<col>_new" column holding
# the position of each value among the sorted distinct values of that column.
train_dat <- mutate(
  train_dat,
  across(
    all_of(catVariables),
    function(values) match(values, sort(unique(values))),
    .names = "{.col}_new"
  )
)

# Test set: same encoding, but looked up in the sorted distinct values of the
# training column with the same name, so values unseen in training become NA.
# The original training columns are still factors here because the step above
# wrote its results to new "<col>_new" columns.
test_dat <- mutate(
  test_dat,
  across(
    all_of(catVariables),
    function(values) {
      match(values, sort(unique(train_dat[[cur_column()]])))
    },
    .names = "{.col}_new"
  )
)

-output

> train_dat
  cat_var_1 cat_var_2 reg_var_1 target cat_var_1_new cat_var_2_new
1      blue      rock        23      1             1             4
2     green     blues        22      0             3             1
3     green      jazz        21      1             3             3
4      blue      jazz        24      0             1             3
5       red      rock        56      0             4             4
6     brown     indie        28      1             2             2
> test_dat
  cat_var_1 cat_var_2 reg_var_1 target cat_var_1_new cat_var_2_new
1     green      rock        12      1             3             4
2     green      rock        23      1             3             4
3      blue      jazz        25      1             1             3
4      blue      jazz        27      0             1             3
5     green      jazz        34      1             3             3
6    purple       pop        12      0            NA            NA
7   magenta     house        32      0            NA            NA

CodePudding user response:

A bit simpler:

library(data.table)

# Sorted distinct training values of every categorical column, in the order
# of catVariables. They are captured first because the := update below
# replaces the training columns with integers in place.
train_levels <- lapply(
  setNames(catVariables, catVariables),
  function(col) sort(unique(train_dat[[col]]))
)

# Integer code of each value of x = its position in lvls (NA if absent).
# lvls defaults to the sorted distinct values of x itself.
f <- function(x, lvls = sort(unique(x))) match(x, lvls)

# Both tables are encoded against the training levels, so a given level gets
# the same integer in train and test, and levels that only occur in the test
# set become NA. .SD holds the columns in .SDcols order, which matches the
# order of train_levels.
setDT(train_dat)[, (catVariables) := Map(f, .SD, train_levels), .SDcols = catVariables][]
setDT(test_dat)[, (catVariables) := Map(f, .SD, train_levels), .SDcols = catVariables][]

Output:

   cat_var_1 cat_var_2 reg_var_1 target
       <int>     <int>     <num>  <num>
1:         1         4        23      1
2:         3         1        22      0
3:         3         3        21      1
4:         1         3        24      0
5:         4         4        56      0
6:         2         2        28      1

   cat_var_1 cat_var_2 reg_var_1 target
       <int>     <int>     <num>  <num>
1:         3         4        12      1
2:         3         4        23      1
3:         1         3        25      1
4:         1         3        27      0
5:         3         3        34      1
6:        NA        NA        12      0
7:        NA        NA        32      0
  • Related