Fehler in eval(predvars, data, env) : Objekt 'class_hexp' nicht gefunden

Everything works fine as long as I don't use factor columns in the data. Here is a sample (my original data contains 8500 rows and more columns):

# Sample data: 60 observations with the outcome `p2p`, the numeric predictors
# `Age` and `ank_hour`, and the two-level factor predictor `class`.
# Assigned to `ml_fall` because the modelling code refers to it by that name.
ml_fall <- data.frame(
         p2p = c(40,69,65,99,27,34,22,24,25,54,54,
                 58,21,17,28,55,43,65,24,49,18,28,37,23,35,12,24,
                 67,47,50,52,100,61,52,43,46,30,41,43,105,128,54,
                 26,29,38,57,33,42,35,20,27,30,35,24,12,42,25,
                 34,28,67),
         Age = c(75,27,27,49,56,14,59,53,57,27,31,
                 52,60,66,73,55,84,77,32,46,43,44,39,68,16,53,54,
                 81,31,41,65,25,19,51,51,56,67,63,70,22,40,58,
                 51,68,40,70,53,68,49,79,58,24,38,56,22,56,50,16,
                 71,38),
    ank_hour = c(6L,6L,6L,6L,8L,8L,6L,6L,6L,7L,7L,
                 6L,6L,6L,6L,7L,6L,6L,8L,6L,7L,7L,8L,9L,9L,9L,8L,
                 6L,10L,9L,6L,6L,6L,6L,9L,10L,9L,10L,6L,6L,6L,6L,
                 6L,6L,6L,7L,8L,8L,6L,6L,7L,7L,8L,9L,9L,8L,9L,
                 9L,6L,6L),
       class = as.factor(c("hexp","hexp","hexp",
                           "hexp","mid","mid","mid","mid","hexp","mid",
                           "mid","hexp","hexp","hexp","hexp","hexp","hexp",
                           "hexp","hexp","hexp","hexp","hexp","hexp",
                           "hexp","hexp","hexp","hexp","hexp","hexp","mid",
                           "hexp","hexp","mid","hexp","mid","mid","mid",
                           "mid","hexp","hexp","hexp","hexp","mid","mid",
                           "mid","mid","mid","mid","hexp","hexp","hexp",
                           "hexp","hexp","hexp","hexp","hexp","hexp","hexp",
                           "hexp","hexp"))
)

# Seeded split of ml_fall into training and test sets, stratified on p2p.
set.seed(1234)
fall_split <- initial_split(ml_fall, strata = p2p)

hc_test <- testing(fall_split)
hc_train <- training(fall_split)

# Linear regression specification using the "lm" engine.
lm_spec <- set_engine(linear_reg(), engine = "lm")
lm_spec

# Recipe: dummy-encode every nominal predictor (the step is flagged
# skip = TRUE), then estimate the recipe on the training data.
fall_rec <- prep(
  step_dummy(
    recipe(p2p ~ ., data = hc_train),
    all_nominal(), -all_outcomes(),
    skip = TRUE
  )
)

# Fit the model on the processed training set held by the prepped recipe.
lm_fit <- fit(lm_spec, p2p ~ ., data = juice(fall_rec))

If I then use:

# Predict on the raw training data (not passed through the recipe).
results_train <- predict(lm_fit, new_data = hc_train)

I get the error: Fehler in eval(predvars, data, env) : Objekt 'class_hexp' nicht gefunden (in English: Error in eval(predvars, data, env) : object 'class_hexp' not found)

I can't see my mistake. Unused factor levels have been dropped, and the column names don't contain '-' ...

CodePudding user response：

The model is fitted on the dummy-encoded data, in which the "class" column has been replaced by a numeric column named "class_mid". So before predicting, you should rename the "class" column in your training data to "class_mid" and convert it to numeric, like this:

# Example data set used throughout the script: 60 rows with the columns
# p2p and Age (double), ank_hour (integer) and class (factor with the
# levels "hexp" and "mid").
ml_fall <- data.frame(
  p2p = c(40,69,65,99,27,34,22,24,25,54,54,
          58,21,17,28,55,43,65,24,49,18,28,37,23,35,12,24,
          67,47,50,52,100,61,52,43,46,30,41,43,105,128,54,
          26,29,38,57,33,42,35,20,27,30,35,24,12,42,25,
          34,28,67),
  Age = c(75,27,27,49,56,14,59,53,57,27,31,
          52,60,66,73,55,84,77,32,46,43,44,39,68,16,53,54,
          81,31,41,65,25,19,51,51,56,67,63,70,22,40,58,
          51,68,40,70,53,68,49,79,58,24,38,56,22,56,50,16,
          71,38),
  ank_hour = c(6L,6L,6L,6L,8L,8L,6L,6L,6L,7L,7L,
               6L,6L,6L,6L,7L,6L,6L,8L,6L,7L,7L,8L,9L,9L,9L,8L,
               6L,10L,9L,6L,6L,6L,6L,9L,10L,9L,10L,6L,6L,6L,6L,
               6L,6L,6L,7L,8L,8L,6L,6L,7L,7L,8L,9L,9L,8L,9L,
               9L,6L,6L),
  class = as.factor(c("hexp","hexp","hexp",
                      "hexp","mid","mid","mid","mid","hexp","mid",
                      "mid","hexp","hexp","hexp","hexp","hexp","hexp",
                      "hexp","hexp","hexp","hexp","hexp","hexp",
                      "hexp","hexp","hexp","hexp","hexp","hexp","mid",
                      "hexp","hexp","mid","hexp","mid","mid","mid",
                      "mid","hexp","hexp","hexp","hexp","mid","mid",
                      "mid","mid","mid","mid","hexp","hexp","hexp",
                      "hexp","hexp","hexp","hexp","hexp","hexp","hexp",
                      "hexp","hexp"))
)

library(tidymodels)
# Seeded split of ml_fall into training and test sets, stratified on p2p.
set.seed(1234)
fall_split <- initial_split(ml_fall, strata = p2p)
#> Warning: The number of observations in each quantile is below the recommended threshold of 20.
#> • Stratification will use 3 breaks instead.

hc_test <- testing(fall_split)
hc_train <- training(fall_split)

# Linear regression specification: "lm" engine, regression mode.
lm_spec <- set_mode(
  set_engine(linear_reg(), engine = "lm"),
  "regression"
)
lm_spec
#> Linear Regression Model Specification (regression)
#> 
#> Computational engine: lm

# Recipe: dummy-encode every nominal predictor (the step is flagged
# skip = TRUE), then estimate the recipe on the training data.
fall_rec <- prep(
  step_dummy(
    recipe(p2p ~ ., data = hc_train),
    all_nominal(), -all_outcomes(),
    skip = TRUE
  )
)

# Fit on the processed training set; new_data = NULL returns the data the
# recipe was prepped on.
lm_fit <- fit(
  lm_spec,
  p2p ~ .,
  data = bake(fall_rec, new_data = NULL)
)

# The model was fitted on the baked training data, in which the factor
# `class` has been replaced by the numeric dummy column `class_mid`. The
# dummy step is skipped for new data, so that column is rebuilt by hand here.
# Rename by name rather than by position so that additional or reordered
# columns are not mislabelled.
names(hc_train)[names(hc_train) == "class"] <- "class_mid"
# Encode as a 0/1 indicator (1 = "mid") to match the dummy coding used for
# fitting. Plain as.numeric() on a factor returns the internal level codes
# (1/2), which would shift every prediction by the class_mid coefficient.
hc_train$class_mid <- as.numeric(hc_train$class_mid == "mid")

results_train <- lm_fit %>%
  predict(new_data = hc_train)

results_train
# NOTE: the reprex output below was captured when the factor was still coerced
# with as.numeric() (codes 1/2), so each value is expected to be offset by the
# class_mid coefficient; re-run the reprex to refresh it.
#> # A tibble: 45 × 1
#>    .pred
#>    <dbl>
#>  1  51.0
#>  2  49.3
#>  3  48.2
#>  4  46.0
#>  5  43.5
#>  6  48.1
#>  7  47.7
#>  8  26.3
#>  9  31.8
#> 10  37.7
#> # … with 35 more rows

Created on 2022-07-16 by the reprex package (v2.0.1)

CodePudding user response：

Finally I used workflows and removed skip = TRUE from the recipe.

library(workflows)
# Seeded split of ml_fall into training and test sets, stratified on p2p.
set.seed(1234)
fall_split <- ml_fall %>%
  initial_split(strata = p2p)

hc_train <- training(fall_split)
hc_test <- testing(fall_split)

# Linear regression specification: "lm" engine, regression mode.
lm_spec <- linear_reg() %>%
  set_engine(engine = "lm") %>%
  set_mode("regression")
lm_spec


#### Recipe ----

# Dummy-encode all nominal predictors. The recipe is deliberately left
# untrained (no prep()): a workflow expects an unprepped recipe and estimates
# it itself when the workflow is fitted. The step is not flagged as skipped,
# so it is also applied to new data at prediction time.
fall_rec <- recipe(p2p ~ ., data = hc_train) %>%
  step_dummy(all_nominal(), -all_outcomes())
fall_rec


### Workflow ----

# Bundle the model specification and the preprocessing recipe.
lm_wflow <- workflow() %>%
  add_model(lm_spec) %>%
  add_recipe(fall_rec)
lm_wflow

# Fitting the workflow trains the recipe on hc_train and then fits the model.
lm_fit <- fit(lm_wflow, data = hc_train)
lm_fit

# Note: despite its name, results_train holds predictions for the test set,
# with the observed outcome attached as `truth`.
results_train <- predict(lm_fit, new_data = hc_test) %>%
  mutate(truth = hc_test$p2p)