How can I implement a method for feature engineering in a Pipeline? Why am I getting the following error?
I want to apply my add_features method after my data has been preprocessed in my Pipeline, once there are no more missing values, because the method raises an error if I apply it to a dataset with NA values. So I thought the right approach was to add the method as the next step in my Pipeline.
How can I achieve this? My train/test split variables (X_train, y_train, test) are defined earlier and not shown here. In case it helps, I am using the 'Titanic Disaster' dataset.
My code:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (FunctionTransformer, OneHotEncoder,
                                   OrdinalEncoder, StandardScaler)
from sklearn.neighbors import KNeighborsClassifier

nominal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first'))
])

ordinal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder())
])

numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# nominal, ordinal and numerical are lists of column names defined earlier
preprocessing_pipeline = ColumnTransformer([
    ('nominal_preprocessor', nominal_pipeline, nominal),
    ('ordinal_preprocessor', ordinal_pipeline, ordinal),
    ('numeric_preprocessor', numeric_pipeline, numerical)
])
def add_features(df):
    # Family size = parents/children + siblings/spouses + the passenger
    df['Family_size'] = df['Parch'] + df['SibSp'] + 1
    df['Alone'] = 0
    df.loc[df.Family_size == 1, 'Alone'] = 1
    # Bin Fare into four ordinal categories
    df.loc[df['Fare'] <= 130, 'Fare'] = 0
    df.loc[(df['Fare'] > 130) & (df['Fare'] <= 256), 'Fare'] = 1
    df.loc[(df['Fare'] > 256) & (df['Fare'] <= 384), 'Fare'] = 2
    df.loc[df['Fare'] > 384, 'Fare'] = 3
    df['Fare'] = df['Fare'].astype(int)
    # Extract the title from the passenger name and group rare titles
    df['Title'] = df.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(['Lady', 'Countess', 'Capt', 'Col',
                                       'Don', 'Dr', 'Major', 'Rev', 'Sir',
                                       'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    df['Title'] = df['Title'].map(title_mapping)
    df['Title'] = df['Title'].fillna(0)
    df.drop(columns=["Name"], inplace=True)
    return df
get_features = FunctionTransformer(add_features, validate=False)
knn = KNeighborsClassifier(n_neighbors=3)

complete_pipeline_knn = Pipeline([
    ('inputter', preprocessing_pipeline),
    ('feat_eng', get_features),
    ('estimator', knn)
], verbose=True)

complete_pipeline_knn.fit(X_train, y_train)
y_pred = complete_pipeline_knn.predict(X_train)
y_pred_test = complete_pipeline_knn.predict(test)
OUT[1]: [Pipeline] .......... (step 1 of 3) Processing inputter, total= 0.0s
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
/tmp/ipykernel_17/1113124560.py in <module>
4 ('estimator', knn)
5 ], verbose=True)
----> 6 complete_pipeline_knn.fit(X_train,y_train)
7
8 y_pred = complete_pipeline_knn.predict(X_train)
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
388 """
389 fit_params_steps = self._check_fit_params(**fit_params)
--> 390 Xt = self._fit(X, y, **fit_params_steps)
391 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
392 if self._final_estimator != "passthrough":
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
353 message_clsname="Pipeline",
354 message=self._log_message(step_idx),
--> 355 **fit_params_steps[name],
356 )
357 # Replace the transformer of the step with the fitted
/opt/conda/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
350
351 def __call__(self, *args, **kwargs):
--> 352 return self.func(*args, **kwargs)
353
354 def call_and_shelve(self, *args, **kwargs):
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
891 with _print_elapsed_time(message_clsname, message):
892 if hasattr(transformer, "fit_transform"):
--> 893 res = transformer.fit_transform(X, y, **fit_params)
894 else:
895 res = transformer.fit(X, y, **fit_params).transform(X)
/opt/conda/lib/python3.7/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
853 else:
854 # fit method of arity 2 (supervised transformation)
--> 855 return self.fit(X, y, **fit_params).transform(X)
856
857
/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/_function_transformer.py in transform(self, X)
180 """
181 X = self._check_input(X, reset=False)
--> 182 return self._transform(X, func=self.func, kw_args=self.kw_args)
183
184 def inverse_transform(self, X):
/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/_function_transformer.py in _transform(self, X, func, kw_args)
203 func = _identity
204
--> 205 return func(X, **(kw_args if kw_args else {}))
206
207 def __sklearn_is_fitted__(self):
/tmp/ipykernel_17/3427601580.py in add_features(df)
1 def add_features(df):
----> 2 df['Family_size'] = df['Parch'] + df['SibSp'] + 1
3 df['Alone'] = 0
4 df.loc[df.Family_size == 1, 'Alone'] = 1
5
/opt/conda/lib/python3.7/site-packages/scipy/sparse/_index.py in __getitem__(self, key)
31 """
32 def __getitem__(self, key):
---> 33 row, col = self._validate_indices(key)
34 # Dispatch to specialized methods.
35 if isinstance(row, INT_TYPES):
/opt/conda/lib/python3.7/site-packages/scipy/sparse/_index.py in _validate_indices(self, key)
136 row = M
137 elif not isinstance(row, slice):
--> 138 row = self._asindices(row, M)
139
140 if isintlike(col):
/opt/conda/lib/python3.7/site-packages/scipy/sparse/_index.py in _asindices(self, idx, length)
160
161 if x.ndim not in (1, 2):
--> 162 raise IndexError('Index dimension must be <= 2')
163
164 if x.size == 0:
IndexError: Index dimension must be <= 2
Answer:
Your first pipeline step, preprocessing_pipeline, transforms the data into a SciPy sparse matrix (OneHotEncoder produces sparse output by default), so accessing columns by name inside add_features fails: the function receives a sparse matrix, not a DataFrame.
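You can reproduce the failure in isolation. This is a minimal illustration (not your data, just a two-by-two stand-in for the ColumnTransformer output); indexing a sparse matrix with a string column name raises the same IndexError, though the exact message may vary across SciPy versions:

from scipy.sparse import csr_matrix

X = csr_matrix([[1.0, 0.0], [0.0, 1.0]])  # stand-in for the preprocessed output
X['Parch']  # IndexError: Index dimension must be <= 2 -- no column names exist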
Even if it ran, I don't think it would do what you want: your function bins raw values of Fare, but after the preprocessing pipeline those values have already been imputed and standardized by StandardScaler, so thresholds like 130 and 384 would no longer mean anything. Why do you want to apply it after preprocessing?
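If the only reason is the NA error, a simpler order is the reverse: run add_features first, on the raw DataFrame, and let the ColumnTransformer impute, encode and scale afterwards. Here is a minimal sketch; add_features_safe is a hypothetical wrapper I'm adding to handle the one missing Fare value in the test set, and you would also need to add Family_size, Alone and Title to the appropriate column lists (and remove Name, which add_features drops) so the ColumnTransformer picks them up:

def add_features_safe(df):
    df = df.copy()  # avoid mutating the caller's DataFrame
    # Fare has one missing value in the Titanic test set; Parch, SibSp and
    # Name are complete, so this is the only NA add_features needs to survive.
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    return add_features(df)

complete_pipeline_knn = Pipeline([
    ('feat_eng', FunctionTransformer(add_features_safe, validate=False)),
    ('preprocessing', preprocessing_pipeline),  # impute/encode/scale afterwards
    ('estimator', KNeighborsClassifier(n_neighbors=3))
], verbose=True)

complete_pipeline_knn.fit(X_train, y_train)

Alternatively, on scikit-learn >= 1.2 you could keep your original step order and call preprocessing_pipeline.set_output(transform='pandas') so the ColumnTransformer returns a DataFrame (this also requires dense encoder output, e.g. OneHotEncoder(sparse_output=False)), but your Fare thresholds would then be comparing against standardized values, which is almost certainly not what you want.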