How can I implement a method for feature engineering in a Pipeline? Why am I getting the following error?
I want to apply my add_features method after my data has been preprocessed in my Pipeline, once there are no more missing values, because the method raises an error if I apply it to a dataset with NA values. So I thought the right approach was to add the method as the next step in my Pipeline.
How can I achieve this? My train/test split variables (X_train, y_train, test) are defined earlier and not shown here. In case it helps, I am using the 'Titanic Disaster' dataset.
My code:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (FunctionTransformer, OneHotEncoder,
                                   OrdinalEncoder, StandardScaler)
from sklearn.neighbors import KNeighborsClassifier

nominal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first'))
])

ordinal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder())
])

numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# nominal, ordinal and numerical are lists of column names defined earlier
preprocessing_pipeline = ColumnTransformer([
    ('nominal_preprocessor', nominal_pipeline, nominal),
    ('ordinal_preprocessor', ordinal_pipeline, ordinal),
    ('numeric_preprocessor', numeric_pipeline, numerical)
])
def add_features(df):
    # Family size = parents/children + siblings/spouses + the passenger
    df['Family_size'] = df['Parch'] + df['SibSp'] + 1
    df['Alone'] = 0
    df.loc[df.Family_size == 1, 'Alone'] = 1
    # Bin Fare into four ordinal categories
    df.loc[df['Fare'] <= 130, 'Fare'] = 0
    df.loc[(df['Fare'] > 130) & (df['Fare'] <= 256), 'Fare'] = 1
    df.loc[(df['Fare'] > 256) & (df['Fare'] <= 384), 'Fare'] = 2
    df.loc[df['Fare'] > 384, 'Fare'] = 3
    df['Fare'] = df['Fare'].astype(int)
    # Extract the title from the passenger name and group rare titles
    df['Title'] = df.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(['Lady', 'Countess', 'Capt', 'Col',
                                       'Don', 'Dr', 'Major', 'Rev', 'Sir',
                                       'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    df['Title'] = df['Title'].map(title_mapping)
    df['Title'] = df['Title'].fillna(0)
    df.drop(columns=["Name"], inplace=True)
    return df
get_features = FunctionTransformer(add_features, validate=False)
knn = KNeighborsClassifier(n_neighbors=3)

complete_pipeline_knn = Pipeline([
    ('inputter', preprocessing_pipeline),
    ('feat_eng', get_features),
    ('estimator', knn)
], verbose=True)

complete_pipeline_knn.fit(X_train, y_train)
y_pred = complete_pipeline_knn.predict(X_train)
y_pred_test = complete_pipeline_knn.predict(test)
OUT[1]: [Pipeline] .......... (step 1 of 3) Processing inputter, total= 0.0s
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
/tmp/ipykernel_17/1113124560.py in <module>
4 ('estimator', knn)
5 ], verbose=True)
----> 6 complete_pipeline_knn.fit(X_train,y_train)
7
8 y_pred = complete_pipeline_knn.predict(X_train)
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
388 """
389 fit_params_steps = self._check_fit_params(**fit_params)
--> 390 Xt = self._fit(X, y, **fit_params_steps)
391 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
392 if self._final_estimator != "passthrough":
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
353 message_clsname="Pipeline",
354 message=self._log_message(step_idx),
--> 355 **fit_params_steps[name],
356 )
357 # Replace the transformer of the step with the fitted
/opt/conda/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
350
351 def __call__(self, *args, **kwargs):
--> 352 return self.func(*args, **kwargs)
353
354 def call_and_shelve(self, *args, **kwargs):
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
891 with _print_elapsed_time(message_clsname, message):
892 if hasattr(transformer, "fit_transform"):
--> 893 res = transformer.fit_transform(X, y, **fit_params)
894 else:
895 res = transformer.fit(X, y, **fit_params).transform(X)
/opt/conda/lib/python3.7/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
853 else:
854 # fit method of arity 2 (supervised transformation)
--> 855 return self.fit(X, y, **fit_params).transform(X)
856
857
/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/_function_transformer.py in transform(self, X)
180 """
181 X = self._check_input(X, reset=False)
--> 182 return self._transform(X, func=self.func, kw_args=self.kw_args)
183
184 def inverse_transform(self, X):
/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/_function_transformer.py in _transform(self, X, func, kw_args)
203 func = _identity
204
--> 205 return func(X, **(kw_args if kw_args else {}))
206
207 def __sklearn_is_fitted__(self):
/tmp/ipykernel_17/3427601580.py in add_features(df)
1 def add_features(df):
----> 2 df['Family_size'] = df['Parch'] + df['SibSp'] + 1
3 df['Alone'] = 0
4 df.loc[df.Family_size == 1, 'Alone'] = 1
5
/opt/conda/lib/python3.7/site-packages/scipy/sparse/_index.py in __getitem__(self, key)
31 """
32 def __getitem__(self, key):
---> 33 row, col = self._validate_indices(key)
34 # Dispatch to specialized methods.
35 if isinstance(row, INT_TYPES):
/opt/conda/lib/python3.7/site-packages/scipy/sparse/_index.py in _validate_indices(self, key)
136 row = M
137 elif not isinstance(row, slice):
--> 138 row = self._asindices(row, M)
139
140 if isintlike(col):
/opt/conda/lib/python3.7/site-packages/scipy/sparse/_index.py in _asindices(self, idx, length)
160
161 if x.ndim not in (1, 2):
--> 162 raise IndexError('Index dimension must be <= 2')
163
164 if x.size == 0:
IndexError: Index dimension must be <= 2
Answer:
Your first pipeline step, preprocessing_pipeline, transforms the data into a SciPy sparse matrix (OneHotEncoder produces sparse output by default), so accessing columns by name inside add_features fails: the function receives a sparse matrix, not a DataFrame.
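You can reproduce the failure in isolation. This is a minimal illustration (not your data, just a two-by-two stand-in for the ColumnTransformer output); indexing a sparse matrix with a string column name raises the same IndexError, though the exact message may vary across SciPy versions:

from scipy.sparse import csr_matrix

X = csr_matrix([[1.0, 0.0], [0.0, 1.0]])  # stand-in for the preprocessed output
X['Parch']  # IndexError: Index dimension must be <= 2 -- no column names exist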
Even if it ran, I don't think it would do what you want: your function bins raw values of Fare, but after the preprocessing pipeline those values have already been imputed and standardized by StandardScaler, so thresholds like 130 and 384 would no longer mean anything. Why do you want to apply it after preprocessing?
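If the only reason is the NA error, a simpler order is the reverse: run add_features first, on the raw DataFrame, and let the ColumnTransformer impute, encode and scale afterwards. Here is a minimal sketch; add_features_safe is a hypothetical wrapper I'm adding to handle the one missing Fare value in the test set, and you would also need to add Family_size, Alone and Title to the appropriate column lists (and remove Name, which add_features drops) so the ColumnTransformer picks them up:

def add_features_safe(df):
    df = df.copy()  # avoid mutating the caller's DataFrame
    # Fare has one missing value in the Titanic test set; Parch, SibSp and
    # Name are complete, so this is the only NA add_features needs to survive.
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    return add_features(df)

complete_pipeline_knn = Pipeline([
    ('feat_eng', FunctionTransformer(add_features_safe, validate=False)),
    ('preprocessing', preprocessing_pipeline),  # impute/encode/scale afterwards
    ('estimator', KNeighborsClassifier(n_neighbors=3))
], verbose=True)

complete_pipeline_knn.fit(X_train, y_train)

Alternatively, on scikit-learn >= 1.2 you could keep your original step order and call preprocessing_pipeline.set_output(transform='pandas') so the ColumnTransformer returns a DataFrame (this also requires dense encoder output, e.g. OneHotEncoder(sparse_output=False)), but your Fare thresholds would then be comparing against standardized values, which is almost certainly not what you want.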