I am working on a very basic version of simulating a random forest model using tidymodels and ranger. I am just trying to make it run. I will deal with the validity of the model later. When I run the model, I get the following error:
"unique notes:
─────────────────────────────────────────────────────────────
Error in `check_outcome()`:
! For a classification model, the outcome should be a factor."
I am at a loss because the simulated data clearly has the outcome as a factor. What am I missing? Is this a situation where `vfold_cv()` is selecting portions of the outcome that do not contain every factor level? How would I check this?
Here is the code I am using:
library(tidyverse)
library(tidymodels)
library(themis)
library(dplyr)
library(vip)
library(forcats)
# Simulate a binary outcome that depends on v1 and v2; v3-v6 enter the
# linear predictor with a coefficient of zero.
set.seed(987)
n <- 1500

# Predictors: five 0/1 indicators and one whole-number variable in [18, 80].
v1 <- sample(c(0, 1), size = n, replace = TRUE)
v2 <- round(runif(n, 18, 80))
v3 <- sample(c(0, 1), size = n, replace = TRUE)
v4 <- sample(c(0, 1), size = n, replace = TRUE)
v5 <- sample(c(0, 1), size = n, replace = TRUE)
v6 <- rbinom(n = n, size = 1, prob = .50)

# Linear predictor; the zero coefficients are written out to make explicit
# that v3-v6 have no effect on the outcome.
xb <- -9 + 3.5 * v1 + 0.2 * v2 + 0 * v3 + 0 * v4 + 0 * v5 + 0 * v6

# Inverse logit: turn the linear predictor into a success probability.
p <- 1 / (1 + exp(-xb))

# Draw the 0/1 outcome, then relabel it as a factor.
y1 <- rbinom(n = n, size = 1, prob = p)
y1 <- dplyr::recode_factor(y1, `1` = "yes", `2` = "no")

# NOTE(review): cbind() returns a matrix, so every column of dat_set shares
# one type -- confirm with str(dat_set) that y1 is still a factor afterwards.
dat_set <- as.data.frame(cbind(v1, v2, v3, v4, v5, v6, y1))

# Tabulate the outcome column of the data frame.
dat_set |>
  count(y1)

# This inspects the standalone y1 vector, not the dat_set$y1 column.
class(y1)
# Train/test split ----
# Stratify on the outcome so both partitions keep a similar class balance.
set.seed(123)
rf_split <- initial_split(dat_set, strata = y1)
rf_train <- training(rf_split)
rf_test <- testing(rf_split)

# Resamples ----
# Cross-validation folds drawn from the training partition only.
set.seed(234)
rf_folds <- vfold_cv(rf_train, strata = y1)

# Model definition ----
# Recipe without extra steps: y1 is the outcome, all other columns predict it.
ranger_recipe <- recipe(formula = y1 ~ ., data = rf_train)

# Random forest with 1000 trees; mtry and min_n are left for tuning.
ranger_spec <-
  rand_forest(mtry = tune(), min_n = tune(), trees = 1000) |>
  set_mode("classification") |>
  set_engine("ranger", importance = "impurity")

# Bundle preprocessing and model so they are fitted together per resample.
ranger_workflow <-
  workflow() |>
  add_recipe(ranger_recipe) |>
  add_model(ranger_spec)

# Tuning ----
# Evaluate candidate (mtry, min_n) values on the folds, keeping predictions.
set.seed(93186)
ranger_tune <- tune_grid(
  ranger_workflow,
  resamples = rf_folds,
  control = control_resamples(save_pred = TRUE)
)
I am relatively new to R and tidymodels. Please point out my mistakes; I am open to all suggestions.
CodePudding user response:
The problem is that when you use `cbind()`, the result is converted to a matrix, which can hold only one class. This means that your `y1` column ends up as a numeric variable again. You checked it with `class(y1)`, but that is still the standalone `y1` from before the `cbind()`, not the column in `dat_set`, so you should convert it back to a factor like this:
library(tidyverse)
library(tidymodels)
library(themis)
library(dplyr)
library(vip)
library(forcats)
# Simulate a binary outcome that depends on v1 and v2; v3-v6 enter the
# linear predictor with a coefficient of zero.
set.seed(987)
n <- 1500

# Predictors: five 0/1 indicators and one whole-number variable in [18, 80].
v1 <- sample(c(0, 1), size = n, replace = TRUE)
v2 <- round(runif(n, 18, 80))
v3 <- sample(c(0, 1), size = n, replace = TRUE)
v4 <- sample(c(0, 1), size = n, replace = TRUE)
v5 <- sample(c(0, 1), size = n, replace = TRUE)
v6 <- rbinom(n = n, size = 1, prob = .50)

# Linear predictor; the zero coefficients are written out to make explicit
# that v3-v6 have no effect on the outcome.
xb <- -9 + 3.5 * v1 + 0.2 * v2 + 0 * v3 + 0 * v4 + 0 * v5 + 0 * v6

# Inverse logit: turn the linear predictor into a success probability.
p <- 1 / (1 + exp(-xb))

# Draw the 0/1 outcome, then relabel it as a factor.
y1 <- rbinom(n = n, size = 1, prob = p)
# NOTE(review): y1 only contains 0 and 1 at this point, so the `2` key never
# matches and the zeros are left unreplaced -- that is what the warning below
# reports and where the NA values in the str() output come from. Mapping
# `0` = "no" instead should avoid the missing outcomes.
y1 <- dplyr::recode_factor(y1, `1` = "yes", `2` = "no")
#> Warning: Unreplaced values treated as NA as `.x` is not compatible.
#> Please specify replacements exhaustively or supply `.default`.

# cbind() builds a matrix, so y1 is stored as a number in dat_set (see the
# str() output below).
dat_set <- as.data.frame(cbind(v1, v2, v3, v4, v5, v6, y1))

# check type
str(dat_set)
#> 'data.frame': 1500 obs. of 7 variables:
#> $ v1: num 0 0 0 0 1 0 1 0 0 0 ...
#> $ v2: num 53 23 25 70 68 78 62 48 46 56 ...
#> $ v3: num 1 0 1 0 1 1 1 1 1 0 ...
#> $ v4: num 1 1 0 1 1 0 1 0 0 1 ...
#> $ v5: num 1 0 0 0 1 1 0 0 1 1 ...
#> $ v6: num 0 1 0 0 0 1 0 0 1 1 ...
#> $ y1: num 1 NA NA 1 1 1 1 NA NA 1 ...

# Convert to factor
dat_set$y1 <- as.factor(dat_set$y1)

# Tabulate the outcome column of the data frame.
dat_set |>
  count(y1)
#> y1 n
#> 1 1 1015
#> 2 <NA> 485

# This inspects the standalone y1 vector, not the dat_set$y1 column.
class(y1)
#> [1] "factor"
# Train/test split ----
# Stratify on the outcome so both partitions keep a similar class balance.
set.seed(123)
rf_split <- initial_split(dat_set, strata = y1)
rf_train <- training(rf_split)
rf_test <- testing(rf_split)

# Resamples ----
# Cross-validation folds drawn from the training partition only.
set.seed(234)
rf_folds <- vfold_cv(rf_train, strata = y1)

# Model definition ----
# Recipe without extra steps: y1 is the outcome, all other columns predict it.
ranger_recipe <- recipe(formula = y1 ~ ., data = rf_train)

# Random forest with 1000 trees; mtry and min_n are left for tuning.
ranger_spec <-
  rand_forest(mtry = tune(), min_n = tune(), trees = 1000) |>
  set_mode("classification") |>
  set_engine("ranger", importance = "impurity")

# Bundle preprocessing and model so they are fitted together per resample.
ranger_workflow <-
  workflow() |>
  add_recipe(ranger_recipe) |>
  add_model(ranger_spec)

# Tuning ----
# Evaluate candidate (mtry, min_n) values on the folds, keeping predictions.
set.seed(93186)
ranger_tune <- tune_grid(
  ranger_workflow,
  resamples = rf_folds,
  control = control_resamples(save_pred = TRUE)
)
#> i Creating pre-processing data to finalize unknown parameter: mtry
#> x Fold01: preprocessor 1/1, model 1/10: Error: Missing data in dependent variable.
#> x Fold01: preprocessor 1/1, model 2/10: Error: Missing data in dependent variable.
#> x Fold01: preprocessor 1/1, model 3/10: Error: Missing data in dependent variable.
#> x Fold01: preprocessor 1/1, model 4/10: Error: Missing data in dependent variable.
#> x Fold01: preprocessor 1/1, model 5/10: Error: Missing data in dependent variable.
#> x Fold01: preprocessor 1/1, model 6/10: Error: Missing data in dependent variable.
#> x Fold01: preprocessor 1/1, model 7/10: Error: Missing data in dependent variable.
#> x Fold01: preprocessor 1/1, model 8/10: Error: Missing data in dependent variable.
#> x Fold01: preprocessor 1/1, model 9/10: Error: Missing data in dependent variable.
#> x Fold01: preprocessor 1/1, model 10/10: Error: Missing data in dependent variable.
#> x Fold02: preprocessor 1/1, model 1/10: Error: Missing data in dependent variable.
#> x Fold02: preprocessor 1/1, model 2/10: Error: Missing data in dependent variable.
#> x Fold02: preprocessor 1/1, model 3/10: Error: Missing data in dependent variable.
#> x Fold02: preprocessor 1/1, model 4/10: Error: Missing data in dependent variable.
#> x Fold02: preprocessor 1/1, model 5/10: Error: Missing data in dependent variable.
#> x Fold02: preprocessor 1/1, model 6/10: Error: Missing data in dependent variable.
#> x Fold02: preprocessor 1/1, model 7/10: Error: Missing data in dependent variable.
#> x Fold02: preprocessor 1/1, model 8/10: Error: Missing data in dependent variable.
#> x Fold02: preprocessor 1/1, model 9/10: Error: Missing data in dependent variable.
#> x Fold02: preprocessor 1/1, model 10/10: Error: Missing data in dependent variable.
#> x Fold03: preprocessor 1/1, model 1/10: Error: Missing data in dependent variable.
#> x Fold03: preprocessor 1/1, model 2/10: Error: Missing data in dependent variable.
#> x Fold03: preprocessor 1/1, model 3/10: Error: Missing data in dependent variable.
#> x Fold03: preprocessor 1/1, model 4/10: Error: Missing data in dependent variable.
#> x Fold03: preprocessor 1/1, model 5/10: Error: Missing data in dependent variable.
#> x Fold03: preprocessor 1/1, model 6/10: Error: Missing data in dependent variable.
#> x Fold03: preprocessor 1/1, model 7/10: Error: Missing data in dependent variable.
#> x Fold03: preprocessor 1/1, model 8/10: Error: Missing data in dependent variable.
#> x Fold03: preprocessor 1/1, model 9/10: Error: Missing data in dependent variable.
#> x Fold03: preprocessor 1/1, model 10/10: Error: Missing data in dependent variable.
#> x Fold04: preprocessor 1/1, model 1/10: Error: Missing data in dependent variable.
#> x Fold04: preprocessor 1/1, model 2/10: Error: Missing data in dependent variable.
#> x Fold04: preprocessor 1/1, model 3/10: Error: Missing data in dependent variable.
#> x Fold04: preprocessor 1/1, model 4/10: Error: Missing data in dependent variable.
#> x Fold04: preprocessor 1/1, model 5/10: Error: Missing data in dependent variable.
#> x Fold04: preprocessor 1/1, model 6/10: Error: Missing data in dependent variable.
#> x Fold04: preprocessor 1/1, model 7/10: Error: Missing data in dependent variable.
#> x Fold04: preprocessor 1/1, model 8/10: Error: Missing data in dependent variable.
#> x Fold04: preprocessor 1/1, model 9/10: Error: Missing data in dependent variable.
#> x Fold04: preprocessor 1/1, model 10/10: Error: Missing data in dependent variable.
#> x Fold05: preprocessor 1/1, model 1/10: Error: Missing data in dependent variable.
#> x Fold05: preprocessor 1/1, model 2/10: Error: Missing data in dependent variable.
#> x Fold05: preprocessor 1/1, model 3/10: Error: Missing data in dependent variable.
#> x Fold05: preprocessor 1/1, model 4/10: Error: Missing data in dependent variable.
#> x Fold05: preprocessor 1/1, model 5/10: Error: Missing data in dependent variable.
#> x Fold05: preprocessor 1/1, model 6/10: Error: Missing data in dependent variable.
#> x Fold05: preprocessor 1/1, model 7/10: Error: Missing data in dependent variable.
#> x Fold05: preprocessor 1/1, model 8/10: Error: Missing data in dependent variable.
#> x Fold05: preprocessor 1/1, model 9/10: Error: Missing data in dependent variable.
#> x Fold05: preprocessor 1/1, model 10/10: Error: Missing data in dependent variable.
#> x Fold06: preprocessor 1/1, model 1/10: Error: Missing data in dependent variable.
#> x Fold06: preprocessor 1/1, model 2/10: Error: Missing data in dependent variable.
#> x Fold06: preprocessor 1/1, model 3/10: Error: Missing data in dependent variable.
#> x Fold06: preprocessor 1/1, model 4/10: Error: Missing data in dependent variable.
#> x Fold06: preprocessor 1/1, model 5/10: Error: Missing data in dependent variable.
#> x Fold06: preprocessor 1/1, model 6/10: Error: Missing data in dependent variable.
#> x Fold06: preprocessor 1/1, model 7/10: Error: Missing data in dependent variable.
#> x Fold06: preprocessor 1/1, model 8/10: Error: Missing data in dependent variable.
#> x Fold06: preprocessor 1/1, model 9/10: Error: Missing data in dependent variable.
#> x Fold06: preprocessor 1/1, model 10/10: Error: Missing data in dependent variable.
#> x Fold07: preprocessor 1/1, model 1/10: Error: Missing data in dependent variable.
#> x Fold07: preprocessor 1/1, model 2/10: Error: Missing data in dependent variable.
#> x Fold07: preprocessor 1/1, model 3/10: Error: Missing data in dependent variable.
#> x Fold07: preprocessor 1/1, model 4/10: Error: Missing data in dependent variable.
#> x Fold07: preprocessor 1/1, model 5/10: Error: Missing data in dependent variable.
#> x Fold07: preprocessor 1/1, model 6/10: Error: Missing data in dependent variable.
#> x Fold07: preprocessor 1/1, model 7/10: Error: Missing data in dependent variable.
#> x Fold07: preprocessor 1/1, model 8/10: Error: Missing data in dependent variable.
#> x Fold07: preprocessor 1/1, model 9/10: Error: Missing data in dependent variable.
#> x Fold07: preprocessor 1/1, model 10/10: Error: Missing data in dependent variable.
#> x Fold08: preprocessor 1/1, model 1/10: Error: Missing data in dependent variable.
#> x Fold08: preprocessor 1/1, model 2/10: Error: Missing data in dependent variable.
#> x Fold08: preprocessor 1/1, model 3/10: Error: Missing data in dependent variable.
#> x Fold08: preprocessor 1/1, model 4/10: Error: Missing data in dependent variable.
#> x Fold08: preprocessor 1/1, model 5/10: Error: Missing data in dependent variable.
#> x Fold08: preprocessor 1/1, model 6/10: Error: Missing data in dependent variable.
#> x Fold08: preprocessor 1/1, model 7/10: Error: Missing data in dependent variable.
#> x Fold08: preprocessor 1/1, model 8/10: Error: Missing data in dependent variable.
#> x Fold08: preprocessor 1/1, model 9/10: Error: Missing data in dependent variable.
#> x Fold08: preprocessor 1/1, model 10/10: Error: Missing data in dependent variable.
#> x Fold09: preprocessor 1/1, model 1/10: Error: Missing data in dependent variable.
#> x Fold09: preprocessor 1/1, model 2/10: Error: Missing data in dependent variable.
#> x Fold09: preprocessor 1/1, model 3/10: Error: Missing data in dependent variable.
#> x Fold09: preprocessor 1/1, model 4/10: Error: Missing data in dependent variable.
#> x Fold09: preprocessor 1/1, model 5/10: Error: Missing data in dependent variable.
#> x Fold09: preprocessor 1/1, model 6/10: Error: Missing data in dependent variable.
#> x Fold09: preprocessor 1/1, model 7/10: Error: Missing data in dependent variable.
#> x Fold09: preprocessor 1/1, model 8/10: Error: Missing data in dependent variable.
#> x Fold09: preprocessor 1/1, model 9/10: Error: Missing data in dependent variable.
#> x Fold09: preprocessor 1/1, model 10/10: Error: Missing data in dependent variable.
#> x Fold10: preprocessor 1/1, model 1/10: Error: Missing data in dependent variable.
#> x Fold10: preprocessor 1/1, model 2/10: Error: Missing data in dependent variable.
#> x Fold10: preprocessor 1/1, model 3/10: Error: Missing data in dependent variable.
#> x Fold10: preprocessor 1/1, model 4/10: Error: Missing data in dependent variable.
#> x Fold10: preprocessor 1/1, model 5/10: Error: Missing data in dependent variable.
#> x Fold10: preprocessor 1/1, model 6/10: Error: Missing data in dependent variable.
#> x Fold10: preprocessor 1/1, model 7/10: Error: Missing data in dependent variable.
#> x Fold10: preprocessor 1/1, model 8/10: Error: Missing data in dependent variable.
#> x Fold10: preprocessor 1/1, model 9/10: Error: Missing data in dependent variable.
#> x Fold10: preprocessor 1/1, model 10/10: Error: Missing data in dependent variable.
#> Warning: All models failed. Run `show_notes(.Last.tune.result)` for more
#> information.
Created on 2022-12-08 with reprex v2.0.2