I come from a classical time series background and am still fairly new to parameter tuning. Tuning every local model (a couple of hundred time series for product demand in my case) turns out to be nowhere near scalable, so I first want to analyze the effect of tuning only the series with poor accuracy, to evaluate the trade-off between scalability and accuracy and decide whether tuning is justified for a particular series.
When I run the code below, it looks like I didn't specify the ranges for the regular grid properly: it seems odd to get only three values for trees when the range goes from 50 to 2000. Is this the default behavior? Does changing the levels argument help here? It didn't change anything in my case. Also, is there a way to determine a good number of folds for the resampling plan rather than guessing it? I'd appreciate any advice or useful examples.
Thanks in advance!
# data and libs
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(lubridate))
suppressPackageStartupMessages(library(timetk))
suppressPackageStartupMessages(library(tidymodels))
suppressPackageStartupMessages(library(modeltime))
suppressPackageStartupMessages(library(tictoc))
suppressPackageStartupMessages(library(readxl))
dates <- ymd("2016-01-01") + months(0:59)
fake_values <- c(
  296, 325, 339, 812, 723, 310, 842, 500, 555, 260, 243, 306,
  204, 330, 467, 713, 1054, 827, 75, 437, 558, 222, 350, 139,
  306, 395, 472, 741, 1020, 903, 837, 738, 676, 506, 199, 219,
  342, 406, 417, 977, 1503, 117, 942, 843, 716, 378, 267, 392,
  329, 369, 536, 1168, 1260, 1066, 949, 906, 1744, 2495, 418, 447
)
# bind_cols() names the unnamed inputs ...1 and ...2
df <- bind_cols(fake_values, dates) %>%
  rename(y = `...1`, ds = `...2`)
# training- and test set
data_splits <- initial_time_split(df, prop = 0.8)
data_train <- training(data_splits)
data_test <- testing(data_splits)
# plot cv
split_obj<- time_series_split(df, assess = "1 year", cumulative = TRUE)
split_obj %>%
tk_time_series_cv_plan() %>%
plot_time_series_cv_plan(ds, y)
# Resample - CV plan
resampling_strategy <-
data_train %>%
time_series_cv(
initial = "36 months",
assess = "12 months",
skip = "12 months",
cumulative = TRUE
)
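As a quick sanity check on the plan (this only inspects the object with the same timetk helpers used above, nothing beyond what is already loaded):
# how many folds/slices did time_series_cv() actually produce?
nrow(resampling_strategy)
# visualize the train/assess window of each fold
resampling_strategy %>%
  tk_time_series_cv_plan() %>%
  plot_time_series_cv_plan(ds, y)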
# model spec
prophet_boost_model <- prophet_boost(
mode = "regression",
growth = "linear",
changepoint_num =tune(),
changepoint_range = tune(),
trees = tune()
) %>%
set_engine("prophet_xgboost")
# regular grid
prophet_grid <- grid_regular(
changepoint_num(range = c(1L, 45L)),
changepoint_range(range = c(0.5, 0.9)),
trees(range = c(50,2000))
#, levels = 10
#, size = 100
)
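For what it's worth, this is how I see the issue - regardless of the range, trees only ever takes three distinct values (just inspecting the grid with nrow() and distinct()):
nrow(prophet_grid)                    # only 27 candidate combinations
dplyr::distinct(prophet_grid, trees)  # 3 distinct values: 50, 1025, 2000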
# recipe
basic_rec <- recipe(y ~ ds, data = data_train)
# wf
wflw_spec_tune_prophet <- workflow() %>%
add_model(prophet_boost_model) %>%
add_recipe(basic_rec)
# parallel proc
#cores <- parallel::detectCores(logical = FALSE)
library(doParallel)
cl <- makePSOCKcluster(4)
registerDoParallel(cl)
# automated tuning
tic()
tune_results <-
wflw_spec_tune_prophet %>%
tune_grid(
resamples = resampling_strategy,
grid = prophet_grid,
metrics = metric_set(rmse, mae))
toc()
stopCluster(cl)
# save the best model
best_results <- tune_results %>%
show_best(metric = 'rmse',n = 20)
best_results
best_results$mean
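For completeness, here is a minimal sketch of what I plan to do with the winning parameters afterwards (select_best(), finalize_workflow() and last_fit() from tune/tidymodels; the metric is just an example):
# pick the best combination and refit on the initial time split
best_params <- select_best(tune_results, metric = "rmse")
final_fit <- wflw_spec_tune_prophet %>%
  finalize_workflow(best_params) %>%
  last_fit(data_splits)
collect_metrics(final_fit)  # hold-out rmse / rsq of the finalized workflow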
CodePudding user response:
You're right on the money! grid_regular() defaults to 3 levels per tuned parameter, which is why you only ever see three values for trees no matter how wide the range is. Raising the levels argument in grid_regular() is how you increase the number of values tried per parameter within your ranges. Here's a few examples - hope this helps!
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
# levels will default to 3 for each tuned parameter
grid_regular(
trees(range = c(50, 2000)),
mtry(range = c(1, 10))
)
#> # A tibble: 9 x 2
#> trees mtry
#> <int> <int>
#> 1 50 1
#> 2 1025 1
#> 3 2000 1
#> 4 50 5
#> 5 1025 5
#> 6 2000 5
#> 7 50 10
#> 8 1025 10
#> 9 2000 10
# you can also specify the number of levels!
grid_regular(
trees(range = c(50, 2000)),
mtry(range = c(1, 10)),
levels = 5
)
#> # A tibble: 25 x 2
#> trees mtry
#> <int> <int>
#> 1 50 1
#> 2 537 1
#> 3 1025 1
#> 4 1512 1
#> 5 2000 1
#> 6 50 3
#> 7 537 3
#> 8 1025 3
#> 9 1512 3
#> 10 2000 3
#> # ... with 15 more rows
# or, if you want to, you can specify different number
# of tuning parameters to try for each by creating a vector
grid_regular(
trees(range = c(50, 2000)),
mtry(range = c(1, 10)),
levels = c(3, 2)
)
#> # A tibble: 6 x 2
#> trees mtry
#> <int> <int>
#> 1 50 1
#> 2 1025 1
#> 3 2000 1
#> 4 50 10
#> 5 1025 10
#> 6 2000 10
Created on 2021-10-18 by the reprex package (v2.0.1)
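Applied to the grid in your question (assuming modeltime is loaded so changepoint_num() and changepoint_range() are available), that would look something like the sketch below. Just keep in mind that a regular grid grows multiplicatively - levels = 5 over three parameters already means 5^3 = 125 candidates per series, which matters for your scalability concern.
prophet_grid <- grid_regular(
  changepoint_num(range = c(1L, 45L)),
  changepoint_range(range = c(0.5, 0.9)),
  trees(range = c(50, 2000)),
  levels = 5
)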