How to add a step to remove a column with constant value?
I am facing a related problem so referencing the previous article above. I used step_zv() in my recipe but I still get the following error- Error in bake(), Only one factor in Column 'X33': "TRUE"
library(tidymodels)
library(readr)
library(broom.mixed)
library(dotwhisker)
library(skimr)
library(rpart.plot)
library(vip)
library(glmnet)
library(naniar)
library(tidyr)
library(dplyr)
library(textrecipes)
# Data cleaning
skool <-
read_csv("/Users/riddhimaagupta/Desktop/log1.csv")
skool_v1 <-
select (skool, -c(...1, id, npsn, public, cert_est, cert_ops, name_clean, name, muh1, muh2, muh, chr1, chr2, chr3, chr, hindu, nu1, nu2, nu_klaten, nu_sby, nu, it1, it, other_swas_international))
skool_v2 <-
filter(skool_v1, afiliasi != 99)
skool_v2.1 <- replace_with_na(skool_v2,
replace = list(village = c("-")))
skool_v2.2 <- replace_with_na(skool_v2.1,
replace = list(area = c("0")))
skool_v2.3 <- replace_with_na(skool_v2.2,
replace = list(date_est = c("-")))
skool_v2.3$date_est <- as.Date(skool_v2.3$date_est, format = '%Y-%m-%d')
skool_v2.3$date_ops <- as.Date(skool_v2.3$date_ops, format = '%Y-%m-%d')
skool_v2.3$latlon <- gsub(".*\\[", "", skool_v2.3$latlon)
skool_v2.3$latlon <- gsub("\\].*", "", skool_v2.3$latlon)
skool_v2.4 <- skool_v2.3 %>%
separate(latlon, c("latitude", "longitude"), ",")
skool_v2.4$latitude <- as.numeric(skool_v2.4$latitude)
skool_v2.4$longitude <- as.numeric(skool_v2.4$longitude)
skool_v3 <- skool_v2.4 %>%
mutate_if(is.character, tolower) %>%
mutate_if(is.character, as.factor)
skool_v4 <- skool_v3 %>%
mutate_if(is.logical, as.factor)
skool_v4$afiliasi <- as.factor(skool_v4$afiliasi)
glimpse(skool_v4)
# Data splitting
set.seed(123)
splits <- initial_split(skool_v4 , strata = afiliasi)
school_train <- training(splits)
school_test <- testing(splits)
set.seed(234)
val_set <- validation_split(skool_v4,
strata = afiliasi,
prop = 0.80)
# Penalised multinomial regression
lr_mod <-
logistic_reg(penalty = tune(), mixture = 0.5) %>%
set_engine("glmnet")
lr_recipe <-
recipe(afiliasi ~ ., data = school_train) %>%
step_date(date_est, date_ops) %>%
step_rm(date_est, date_ops) %>%
textrecipes::step_clean_levels(village) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors())
lr_workflow <-
workflow() %>%
add_model(lr_mod) %>%
add_recipe(lr_recipe)
lr_reg_grid <- tibble(penalty = 10^seq(-4, -1, length.out = 30))
lr_reg_grid %>% top_n(-5)
lr_reg_grid %>% top_n(5)
lr_res <-
lr_workflow %>%
tune_grid(val_set,
grid = lr_reg_grid,
control = control_grid(save_pred = TRUE, verbose = TRUE),
metrics = metric_set(roc_auc))
The console says
x validation: preprocessor 1/1: Error in `bake()`:
! Only one factor...
Warning message:
All models failed. See the `.notes` column.
CodePudding user response:
Here's an example with mtcars
:
# Add a column with only one value
mtcars$constant_col <- 1
# Remove any columns with only one value
mtcars[sapply(mtcars, function(x) length(unique(x)) == 1)] <- NULL
CodePudding user response:
This error comes from step_dummy()
because the variable X33
only has one factor "TRUE"
. The easiest way to deal with this in your problem is to use step_zv()
on the nominal predictors before step_dummy()
.
This would make your recipe look like
lr_recipe <-
recipe(afiliasi ~ ., data = school_train) %>%
step_date(date_est, date_ops) %>%
step_rm(date_est, date_ops) %>%
textrecipes::step_clean_levels(village) %>%
step_zv(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors())
Reprex showing what is happening:
library(recipes)
mtcars$fac1 <- "h"
mtcars$fac2 <- rep(c("a", "b"), length.out = nrow(mtcars))
recipe(mpg ~ ., data = mtcars) %>%
step_dummy(all_nominal_predictors()) %>%
prep()
#> Error in `bake()`:
#> ! Only one factor level in fac1: h
recipe(mpg ~ ., data = mtcars) %>%
step_zv(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
prep()
#> Recipe
#>
#> Inputs:
#>
#> role #variables
#> outcome 1
#> predictor 12
#>
#> Training data contained 32 data points and no missing data.
#>
#> Operations:
#>
#> Zero variance filter removed fac1 [trained]
#> Dummy variables from fac2 [trained]