Everything works fine as long as I don't use factor data (my original data contains 8,500 rows and more columns):
# Example data: 60 observations with a numeric outcome (p2p), two numeric
# predictors (Age, ank_hour) and one two-level factor predictor (class).
# The result is assigned to `ml_fall` because the split further down reads the
# data under that name; without the assignment the object is never created.
ml_fall <- data.frame(
  p2p = c(40,69,65,99,27,34,22,24,25,54,54,
    58,21,17,28,55,43,65,24,49,18,28,37,23,35,12,24,
    67,47,50,52,100,61,52,43,46,30,41,43,105,128,54,
    26,29,38,57,33,42,35,20,27,30,35,24,12,42,25,
    34,28,67),
  Age = c(75,27,27,49,56,14,59,53,57,27,31,
    52,60,66,73,55,84,77,32,46,43,44,39,68,16,53,54,
    81,31,41,65,25,19,51,51,56,67,63,70,22,40,58,
    51,68,40,70,53,68,49,79,58,24,38,56,22,56,50,16,
    71,38),
  ank_hour = c(6L,6L,6L,6L,8L,8L,6L,6L,6L,7L,7L,
    6L,6L,6L,6L,7L,6L,6L,8L,6L,7L,7L,8L,9L,9L,9L,8L,
    6L,10L,9L,6L,6L,6L,6L,9L,10L,9L,10L,6L,6L,6L,6L,
    6L,6L,6L,7L,8L,8L,6L,6L,7L,7L,8L,9L,9L,8L,9L,
    9L,6L,6L),
  class = as.factor(c("hexp","hexp","hexp",
    "hexp","mid","mid","mid","mid","hexp","mid",
    "mid","hexp","hexp","hexp","hexp","hexp","hexp",
    "hexp","hexp","hexp","hexp","hexp","hexp",
    "hexp","hexp","hexp","hexp","hexp","hexp","mid",
    "hexp","hexp","mid","hexp","mid","mid","mid",
    "mid","hexp","hexp","hexp","hexp","mid","mid",
    "mid","mid","mid","mid","hexp","hexp","hexp",
    "hexp","hexp","hexp","hexp","hexp","hexp","hexp",
    "hexp","hexp"))
)
# Reproducible train/test split, stratified on the outcome p2p.
set.seed(1234)
fall_split <- initial_split(ml_fall, strata = p2p)
hc_train <- training(fall_split)
hc_test <- testing(fall_split)

# Linear regression specification using the "lm" engine.
lm_spec <- set_engine(linear_reg(), engine = "lm")
lm_spec

# Build the recipe step by step: dummy-encode every nominal predictor
# (the outcome is excluded), flag the step with skip = TRUE, then train it.
fall_rec <- recipe(p2p ~ ., data = hc_train)
fall_rec <- step_dummy(fall_rec, all_nominal(), -all_outcomes(), skip = TRUE)
fall_rec <- prep(fall_rec)

# Fit the model on the processed training data stored in the prepped recipe.
lm_fit <- fit(lm_spec, p2p ~ ., data = juice(fall_rec))
If I then use:
# Predict on the raw training split (hc_train has not been through the recipe).
results_train <- predict(lm_fit, new_data = hc_train)
I get the error: Fehler in eval(predvars, data, env) : Objekt 'class_hexp' nicht gefunden (in English: "Error in eval(predvars, data, env): object 'class_hexp' not found").
I can't see my mistake. Unused levels are dropped, and the names don't contain '-' ...
CodePudding user response:
You should convert your "class" column to numeric. The dummy step also renames that column to "class_mid" in the data the model is fitted on, so you need to rename the column in your training data to "class_mid" as well, like this:
# Example data: 60 rows with the outcome p2p, the numeric predictors Age and
# ank_hour, and the two-level factor class ("hexp" / "mid").
ml_fall <- data.frame(
  p2p = c(
    40,69,65,99,27,34,22,24,25,54,54,
    58,21,17,28,55,43,65,24,49,18,28,37,23,35,12,24,
    67,47,50,52,100,61,52,43,46,30,41,43,105,128,54,
    26,29,38,57,33,42,35,20,27,30,35,24,12,42,25,
    34,28,67
  ),
  Age = c(
    75,27,27,49,56,14,59,53,57,27,31,
    52,60,66,73,55,84,77,32,46,43,44,39,68,16,53,54,
    81,31,41,65,25,19,51,51,56,67,63,70,22,40,58,
    51,68,40,70,53,68,49,79,58,24,38,56,22,56,50,16,
    71,38
  ),
  ank_hour = c(
    6L,6L,6L,6L,8L,8L,6L,6L,6L,7L,7L,
    6L,6L,6L,6L,7L,6L,6L,8L,6L,7L,7L,8L,9L,9L,9L,8L,
    6L,10L,9L,6L,6L,6L,6L,9L,10L,9L,10L,6L,6L,6L,6L,
    6L,6L,6L,7L,8L,8L,6L,6L,7L,7L,8L,9L,9L,8L,9L,
    9L,6L,6L
  ),
  class = factor(c(
    "hexp","hexp","hexp",
    "hexp","mid","mid","mid","mid","hexp","mid",
    "mid","hexp","hexp","hexp","hexp","hexp","hexp",
    "hexp","hexp","hexp","hexp","hexp","hexp",
    "hexp","hexp","hexp","hexp","hexp","hexp","mid",
    "hexp","hexp","mid","hexp","mid","mid","mid",
    "mid","hexp","hexp","hexp","hexp","mid","mid",
    "mid","mid","mid","mid","hexp","hexp","hexp",
    "hexp","hexp","hexp","hexp","hexp","hexp","hexp",
    "hexp","hexp"
  ))
)
library(tidymodels)
# Reproducible train/test split, stratified on the outcome p2p.
set.seed(1234)
fall_split <- initial_split(ml_fall, strata = p2p)
#> Warning: The number of observations in each quantile is below the recommended threshold of 20.
#> • Stratification will use 3 breaks instead.
hc_train <- training(fall_split)
hc_test <- testing(fall_split)

# Linear regression with the "lm" engine, explicitly set to regression mode.
lm_spec <- set_mode(set_engine(linear_reg(), engine = "lm"), "regression")
lm_spec
#> Linear Regression Model Specification (regression)
#>
#> Computational engine: lm

# Dummy-encode all nominal predictors (outcome excluded) and train the recipe.
# The step carries skip = TRUE, as in the question.
fall_rec <- prep(
  step_dummy(
    recipe(p2p ~ ., data = hc_train),
    all_nominal(), -all_outcomes(),
    skip = TRUE
  )
)

# new_data = NULL returns the processed training data kept in the recipe.
lm_fit <- fit(lm_spec, p2p ~ ., data = bake(fall_rec, new_data = NULL))
# The model was fit on the baked data, in which step_dummy() replaced the
# factor `class` with a 0/1 indicator column named `class_mid`
# (1 = "mid", 0 = the reference level "hexp"). Rebuild that column by hand so
# predict() finds the predictor it expects.
# Calling as.numeric() on the factor itself would return the level codes 1/2
# instead of 0/1 and shift every prediction by the class_mid coefficient.
hc_train$class <- as.numeric(hc_train$class == "mid")
# Rename by name rather than by position so the column order does not matter.
names(hc_train)[names(hc_train) == "class"] <- "class_mid"

results_train <- lm_fit %>%
  predict(new_data = hc_train)
# NOTE: the predictions printed below were produced with the earlier 1/2
# coding of class_mid, so re-running this code gives different values.
results_train
#> # A tibble: 45 × 1
#> .pred
#> <dbl>
#> 1 51.0
#> 2 49.3
#> 3 48.2
#> 4 46.0
#> 5 43.5
#> 6 48.1
#> 7 47.7
#> 8 26.3
#> 9 31.8
#> 10 37.7
#> # … with 35 more rows
Created on 2022-07-16 by the reprex package (v2.0.1)
CodePudding user response:
Finally I used workflows and removed skip = TRUE from the recipe.
library(workflows)
# Reproducible train/test split, stratified on the outcome p2p.
set.seed(1234)
fall_split <- initial_split(ml_fall, strata = p2p)
hc_train <- training(fall_split)
hc_test <- testing(fall_split)

# Build the model specification one setting at a time:
# linear regression -> "lm" engine -> regression mode.
lm_spec <- linear_reg()
lm_spec <- set_engine(lm_spec, engine = "lm")
lm_spec <- set_mode(lm_spec, "regression")
lm_spec
#### Recipe
# Dummy-encode all nominal predictors (the outcome is excluded).
# The recipe is deliberately left untrained: the workflow preps it itself when
# fit() is called, and add_recipe() expects a recipe that has not already been
# through prep().
fall_rec <- recipe(p2p ~ ., data = hc_train) %>%
  step_dummy(all_nominal(), -all_outcomes())
fall_rec
### Workflow
# Bundle the model specification and the recipe into one workflow object.
lm_wflow <- add_recipe(add_model(workflow(), lm_spec), fall_rec)
lm_wflow

# Fit the whole workflow on the raw training split.
lm_fit <- fit(lm_wflow, data = hc_train)
lm_fit

# Predict on the held-out split and attach the observed outcome.
# NOTE(review): despite its name, results_train holds predictions for the
# test set (hc_test).
results_train <- mutate(
  predict(lm_fit, new_data = hc_test),
  truth = hc_test$p2p
)