I am trying to improve a regression model using GridSearchCV with a Pipeline, but I ran into an error. If I am not wrong, it points to invalid parameters.
I've cross-checked the parameters carefully, but I still can't debug the code.
## importing libraries
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
## importing the model
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
## Seed NumPy's global random number generator
import numpy as np

np.random.seed(42)

## Load the car-sales data, keeping only the rows that have a "Price" label
data = pd.read_csv("Data/car-sales-extended-missing-data.csv").dropna(subset=["Price"])
## Categorical columns: fill gaps with the literal "missing", then one-hot encode
categorical_features = ["Make", "Colour"]
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

## Door column: every missing value becomes 4
door_feature = ["Doors"]
door_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value=4)),
])

## Numeric column: missing values are replaced by the column mean
numeric_features = ["Odometer (KM)"]
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
])

## Route each group of columns through its own transformer in a single step.
## The names given here ('categorical', 'door', 'numerical') are the ones that
## nested parameter paths must use.
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_transformer, categorical_features),
        ("door", door_transformer, door_feature),
        ("numerical", numeric_transformer, numeric_features),
    ]
)

## Full model pipeline:
##   'preprocessor' fills the missing data and makes every column numeric,
##   'regressor' is the estimator that models the data.
model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor()),
])
# Separate the feature columns from the "Price" target
x = data.drop("Price", axis=1)
y = data["Price"]

# Split data into train and test sets (20% of the rows are held out for testing)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Fit the model on the training data
# (note: when fit() is called with a Pipeline(), fit_transform() is used for transformers)
# The split above produces lowercase names, so x_train / x_test must be used
# here; X_train / X_test are never defined and would raise NameError.
model.fit(x_train, y_train)

# Score the model on the held-out data
# (note: when score() or predict() is called with a Pipeline(), transform() is used for transformers)
model.score(x_test, y_test)
The GridSearch Tuning
Tuning the model above with GridSearchCV using Pipeline
## from sklearn.model_selection import GridSearchCV
## Already Imported above.
# Hyperparameter grid for GridSearchCV. Each key is a "<step>__<param>" path
# that is applied to `model` through set_params().
# NOTE(review): this is the grid that produces the "Invalid parameter 'model'"
# error shown below. The keys do not match the names used when `model` was built:
#   - the final pipeline step is named 'regressor', so there is no 'model' step;
#   - the numeric ColumnTransformer entry is named 'numerical', not 'num';
#   - 'e_estimators' looks like a typo for n_estimators.
pipe_grid = {
"preprocessor__num__imputer__strategy": ["mean", "median"],
"model__e_estimators": [100, 1000],
"model__max_depth": [None],
"model__max_features": ["auto"],
"model__min_samples_split": [2, 4]
}
# Search over every combination in pipe_grid using 5-fold cross-validation
gs_model = GridSearchCV(model,pipe_grid,cv=5,verbose=2)
gs_model.fit(x_train,y_train)
Here's the error I got after passing some hyperparameters to improve the model.
Fitting 5 folds for each of 8 candidates, totalling 40 fits
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [34], in <cell line: 12>()
3 pipe_grid = {
4 "preprocessor__num__imputer__strategy": ["mean", "median"],
5 "model__e_estimators": [100, 1000],
(...)
8 "model__min_samples_split": [2, 4]
9 }
11 gs_model = GridSearchCV(model,pipe_grid,cv=5,verbose=2)
---> 12 gs_model.fit(x_train,y_train)
File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\sklearn\model_selection\_search.py:875, in BaseSearchCV.fit(self, X, y, groups, **fit_params)
869 results = self._format_results(
870 all_candidate_params, n_splits, all_out, all_more_results
871 )
873 return results
--> 875 self._run_search(evaluate_candidates)
877 # multimetric is determined here because in the case of a callable
878 # self.scoring the return type is only known after calling
879 first_test_score = all_out[0]["test_scores"]
File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\sklearn\model_selection\_search.py:1375, in GridSearchCV._run_search(self, evaluate_candidates)
1373 def _run_search(self, evaluate_candidates):
1374 """Search all candidates in param_grid"""
-> 1375 evaluate_candidates(ParameterGrid(self.param_grid))
File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\sklearn\model_selection\_search.py:822, in BaseSearchCV.fit.<locals>.evaluate_candidates(candidate_params, cv, more_results)
814 if self.verbose > 0:
815 print(
816 "Fitting {0} folds for each of {1} candidates,"
817 " totalling {2} fits".format(
818 n_splits, n_candidates, n_candidates * n_splits
819 )
820 )
--> 822 out = parallel(
823 delayed(_fit_and_score)(
824 clone(base_estimator),
825 X,
826 y,
827 train=train,
828 test=test,
829 parameters=parameters,
830 split_progress=(split_idx, n_splits),
831 candidate_progress=(cand_idx, n_candidates),
832 **fit_and_score_kwargs,
833 )
834 for (cand_idx, parameters), (split_idx, (train, test)) in product(
835 enumerate(candidate_params), enumerate(cv.split(X, y, groups))
836 )
837 )
839 if len(out) < 1:
840 raise ValueError(
841 "No fits were performed. "
842 "Was the CV iterator empty? "
843 "Were there no candidates?"
844 )
File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\joblib\parallel.py:1043, in Parallel.__call__(self, iterable)
1034 try:
1035 # Only set self._iterating to True if at least a batch
1036 # was dispatched. In particular this covers the edge
(...)
1040 # was very quick and its callback already dispatched all the
1041 # remaining jobs.
1042 self._iterating = False
-> 1043 if self.dispatch_one_batch(iterator):
1044 self._iterating = self._original_iterator is not None
1046 while self.dispatch_one_batch(iterator):
File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\joblib\parallel.py:861, in Parallel.dispatch_one_batch(self, iterator)
859 return False
860 else:
--> 861 self._dispatch(tasks)
862 return True
File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\joblib\parallel.py:779, in Parallel._dispatch(self, batch)
777 with self._lock:
778 job_idx = len(self._jobs)
--> 779 job = self._backend.apply_async(batch, callback=cb)
780 # A job can complete so quickly than its callback is
781 # called before we get here, causing self._jobs to
782 # grow. To ensure correct results ordering, .insert is
783 # used (rather than .append) in the following line
784 self._jobs.insert(job_idx, job)
File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\joblib\_parallel_backends.py:208, in SequentialBackend.apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\joblib\_parallel_backends.py:572, in ImmediateResult.__init__(self, batch)
569 def __init__(self, batch):
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\joblib\parallel.py:262, in BatchedCalls.__call__(self)
258 def __call__(self):
259 # Set the default nested backend to self._backend but do not set the
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\joblib\parallel.py:262, in <listcomp>(.0)
258 def __call__(self):
259 # Set the default nested backend to self._backend but do not set the
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\sklearn\utils\fixes.py:117, in _FuncWrapper.__call__(self, *args, **kwargs)
115 def __call__(self, *args, **kwargs):
116 with config_context(**self.config):
--> 117 return self.function(*args, **kwargs)
File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\sklearn\model_selection\_validation.py:674, in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
671 for k, v in parameters.items():
672 cloned_parameters[k] = clone(v, safe=False)
--> 674 estimator = estimator.set_params(**cloned_parameters)
676 start_time = time.time()
678 X_train, y_train = _safe_split(estimator, X, y, train)
File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\sklearn\pipeline.py:188, in Pipeline.set_params(self, **kwargs)
169 def set_params(self, **kwargs):
170 """Set the parameters of this estimator.
171
172 Valid parameter keys can be listed with ``get_params()``. Note that
(...)
186 Pipeline class instance.
187 """
--> 188 self._set_params("steps", **kwargs)
189 return self
File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\sklearn\utils\metaestimators.py:72, in _BaseComposition._set_params(self, attr, **params)
69 self._replace_estimator(attr, name, params.pop(name))
71 # 3. Step parameters and other initialisation arguments
---> 72 super().set_params(**params)
73 return self
File ~\Desktop\ML-course\sample_project_1\env\lib\site-packages\sklearn\base.py:246, in BaseEstimator.set_params(self, **params)
244 if key not in valid_params:
245 local_valid_params = self._get_param_names()
--> 246 raise ValueError(
247 f"Invalid parameter {key!r} for estimator {self}. "
248 f"Valid parameters are: {local_valid_params!r}."
249 )
251 if delim:
252 nested_params[key][sub_key] = value
ValueError: Invalid parameter 'model' for estimator Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('categorical',
Pipeline(steps=[('imputer',
SimpleImputer(fill_value='missing',
strategy='constant')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
['Make', 'Colour']),
('door',
Pipeline(steps=[('imputer',
SimpleImputer(fill_value=4,
strategy='constant'))]),
['Doors']),
('numerical',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['Odometer (KM)'])])),
('regressor', RandomForestRegressor())]). Valid parameters are: ['memory', 'steps', 'verbose'].
CodePudding user response:
The prefix should be `regressor__`, not `model__`, according to your pipeline step naming.
There also seems to be a typo in `n_estimators`:
# Every key must follow the names given when the pipeline was built: the final
# step is 'regressor' and the numeric ColumnTransformer entry is 'numerical',
# so "model__..." and "preprocessor__num__..." are both rejected by set_params().
pipe_grid = {
    "preprocessor__numerical__imputer__strategy": ["mean", "median"],
    "regressor__n_estimators": [100, 1000],
    "regressor__max_depth": [None],
    # 1.0 means "consider every feature at each split", which is what "auto"
    # meant for RandomForestRegressor; "auto" was deprecated in scikit-learn 1.1
    # and removed in 1.3.
    "regressor__max_features": [1.0],
    "regressor__min_samples_split": [2, 4],
}
CodePudding user response:
Change your `param_grid` to this:
# Keys are "<step>__<param>" paths and must use the names the pipeline was
# built with: 'regressor' for the final step and 'numerical' for the numeric
# ColumnTransformer entry ('model' and 'num' do not exist and are rejected).
pipe_grid = {
    "preprocessor__numerical__imputer__strategy": ["mean", "median"],
    "regressor__n_estimators": [100, 1000],
    "regressor__max_depth": [None],
    # "auto" (all features, for a regressor) was deprecated in scikit-learn 1.1
    # and removed in 1.3; 1.0 selects the same behaviour.
    "regressor__max_features": [1.0],
    "regressor__min_samples_split": [2, 4],
}