I have a data Gemini_ETHUSD_d.csv which you can download from this link
I try to re-run the code below from this link:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import seaborn as sns
import sklearn
from sklearn.linear_model import ElasticNet
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
np.random.seed(1338)
cmap_data = plt.cm.Paired
cmap_cv = plt.cm.coolwarm
n_splits = 5
import pandas as pd
# from pandas_datareader import data as web
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('./Gemini_ETHUSD_d.csv', skiprows=1, parse_dates=True, index_col='Date',dtype=str)
df = df.sort_index().drop('Symbol', axis=1)
df.head()
def plot_cv_indices(cv, X, y, ax, n_splits, lw=10):
"""Create a sample plot for indices of a cross-validation object."""
# Generate the training/testing visualizations for each CV split
for ii, (tr, tt) in enumerate(cv.split(X=X, y=y, groups=None)):
# Fill in indices with the training/test groups
indices = np.array([np.nan] * len(X))
indices[tt] = 1
indices[tr] = 0
# Visualize the results
ax.scatter(range(len(indices)), [ii .5] * len(indices),
c=indices, marker='_', lw=lw, cmap=cmap_cv,
vmin=-.2, vmax=1.2)
# Plot the data classes and groups at the end
ax.scatter(range(len(X)), [ii 1.5] * len(X),
c=y, marker='_', lw=lw, cmap=cmap_data)
# Formatting
yticklabels = list(range(n_splits)) ['class']
ax.set(yticks=np.arange(n_splits 2) .5, yticklabels=yticklabels,
xlabel='Sample index', ylabel="CV iteration",
ylim=[n_splits 1.2, -.1], xlim=[0, 100])
ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
return ax
class BlockingTimeSeriesSplit():
def __init__(self, n_splits):
self.n_splits = n_splits
def get_n_splits(self, X, y, groups):
return self.n_splits
def split(self, X, y=None, groups=None):
n_samples = len(X)
k_fold_size = n_samples // self.n_splits
indices = np.arange(n_samples)
margin = 0
for i in range(self.n_splits):
start = i * k_fold_size
stop = start k_fold_size
mid = int(0.5 * (stop - start)) start
yield indices[start: mid], indices[mid margin: stop]
STEPS = 9
for i in np.arange(1, STEPS):
col_name = '{}d_Fwd_Close'.format(i)
df[col_name] = df['Close'].shift(-i)
df = df.dropna()
Features = 6
X = df.iloc[:, :Features]
y = df.iloc[:, Features:]
split = int(len(df) * 0.7)
X_train = X[:split]
y_train = y[:split]
X_test = X[split:]
y_test = y[split:]
X.head()
def build_model(_alpha, _l1_ratio):
estimator = ElasticNet(
alpha=_alpha,
l1_ratio=_l1_ratio,
fit_intercept=True,
normalize=False,
precompute=False,
max_iter=16,
copy_X=True,
tol=0.1,
warm_start=False,
positive=False,
random_state=None,
selection='random'
)
return MultiOutputRegressor(estimator, n_jobs=4)
sklearn.metrics.SCORERS.keys()
model = build_model(_alpha=1.0, _l1_ratio=0.3)
tscv = TimeSeriesSplit(n_splits=5)
rmse = np.sqrt(-cross_val_score(model, X_train, y_train, cv=tscv, scoring='neg_mean_squared_error'))
R2 = cross_val_score(model, X_train, y_train, cv=tscv, scoring='r2')
print(f"RMSE: {rmse.mean()} ( /- {rmse.std()}")
print(f"\nR2: {R2.mean()} ( /- {R2.std()}")
# Blocking time series splitter
btscv = BlockingTimeSeriesSplit(n_splits=5)
rmse = np.sqrt(-cross_val_score(model, X_train, y_train, cv=btscv, scoring='neg_mean_squared_error'))
R2 = cross_val_score(model, X_train, y_train, cv=btscv, scoring='r2')
print(f"RMSE: {rmse.mean()} ( /- {rmse.std()}")
print(f"\nR2: {R2.mean()} ( /- {R2.std()}")
def plot_grid_search(cv_results, grid_param_1, grid_param_2, name_param_1, name_param_2, best_params):
# Get Test Scores Mean and std for each grid search
scores_mean = cv_results['mean_test_score']
scores_mean = np.array(scores_mean).reshape(len(grid_param_2),len(grid_param_1))
scores_sd = cv_results['std_test_score']
scores_sd = np.array(scores_sd).reshape(len(grid_param_2),len(grid_param_1))
# Plot Grid search scores
_, ax = plt.subplots(1,1)
# Param1 is the X-axis, Param 2 is represented as a different curve (color line)
for idx, val in enumerate(grid_param_2):
ax.plot(grid_param_1, scores_mean[idx,:], '-o', label= name_param_2 ': ' str(val))
ax.set_title(f"Grid Search Best Params: {best_params}", fontsize=12, fontweight='medium')
ax.set_xlabel(name_param_1, fontsize=12)
ax.set_ylabel('CV Average Score', fontsize=12)
ax.legend(loc="best", fontsize=15)
ax.grid('on')
ax.legend(bbox_to_anchor=(1.02, 1.02))
# Time series splitter
model.get_params().keys()
params = {
'estimator__alpha':(0.1, 0.3, 0.5, 0.7, 0.9),
'estimator__l1_ratio':(0.1, 0.3, 0.5, 0.7, 0.9)
}
scores = []
for i in range(30):
model = build_model(_alpha=1.0, _l1_ratio=0.3)
finder = GridSearchCV(
estimator=model,
param_grid=params,
scoring='r2',
n_jobs=4,
iid=False,
refit=True,
cv=tscv, # change this to the splitter subject to test
verbose=1,
pre_dispatch=8,
error_score=-999,
return_train_score=True
)
finder.fit(X_train, y_train)
best_params = finder.best_params_
best_score = round(finder.best_score_,4)
scores.append(best_score)
But it raises a type error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-43-df2fef72cef2> in <module>
168 model = build_model(_alpha=1.0, _l1_ratio=0.3)
169
--> 170 finder = GridSearchCV(
171 estimator=model,
172 param_grid=params,
~\AppData\Roaming\Python\Python38\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
TypeError: __init__() got an unexpected keyword argument 'iid'
Based on solution from here, I use command pip install --upgrade scikit-optimize==0.23.3
but it raises another issue:
ERROR: Could not find a version that satisfies the requirement scikit-optimize==0.23.3
ERROR: No matching distribution found for scikit-optimize==0.23.3
Someone could help me to fix this issue? Thanks a lot.
CodePudding user response:
1. Reason
scikit-optimize 0.8.1 has parameter iid which is not accepted by scikit-learn 0.24.2
2. Solution
Downgrade scikit-learn version to 0.22.2 and scikit-optimize to 0.8.1 by:
pip install scikit-optimize==0.8.1
pip install scikit-learn==0.22.2