I need to apply a KBinsDiscretizer as a step within a scikit-learn Pipeline, but only to specific columns, and get the result back as a pandas DataFrame. This is what I have so far:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.pipeline import Pipeline
class PandasColumnTransformer(ColumnTransformer):
    """ColumnTransformer whose transform output is a pandas DataFrame.

    ColumnTransformer concatenates the outputs of its transformers in the
    order they are listed, with any remainder columns last.  That order is
    generally different from the input column order, so the output labels
    are rebuilt from the fitted ``transformers_`` instead of reusing
    ``X.columns`` as-is.

    Assumes every transformer emits exactly one output column per selected
    input column (true for ordinal binning and for passthrough), and that
    column selectors are labels or integer positions, given as a scalar or
    a list.
    """

    def _pandas_output_columns(self, X: pd.DataFrame) -> list:
        """Return input column labels in the order the output emits them."""
        labels = []
        for _, transformer, selected in self.transformers_:
            # Dropped selections contribute no output columns.
            if isinstance(transformer, str) and transformer == "drop":
                continue
            if isinstance(selected, (str, int)):
                selected = [selected]
            for col in selected:
                # Integer selectors are positional; map them back to labels.
                labels.append(X.columns[col] if isinstance(col, int) else col)
        return labels

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Transform ``X`` and return a DataFrame that keeps ``X``'s index."""
        values = super().transform(X)
        return pd.DataFrame(
            values, columns=self._pandas_output_columns(X), index=X.index
        )

    def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """Fit on ``X`` (and ``y``), then return the result as a DataFrame."""
        # Forward y so that supervised inner transformers receive the target.
        values = super().fit_transform(X, y)
        return pd.DataFrame(
            values, columns=self._pandas_output_columns(X), index=X.index
        )
class PandasKBinsDiscretizer(KBinsDiscretizer):
    """KBinsDiscretizer with ordinal encoding that returns a DataFrame.

    Parameters
    ----------
    n_bins
        Number of bins, forwarded unchanged to ``KBinsDiscretizer``.
    """

    def __init__(self, n_bins):
        # Ordinal encoding yields one output column per input column, which
        # is what allows the input column names to be reused in transform().
        super().__init__(n_bins, encode='ordinal')

    def transform(self, X):
        """Bin ``X`` and return the bin indices as a DataFrame.

        For DataFrame input the result carries the same column names and the
        same index as ``X``, so it stays aligned with the original rows.
        Other array-like input gets pandas' default labels.
        """
        is_frame = isinstance(X, pd.DataFrame)
        # Kept as an attribute because the original implementation exposed it.
        self.col_names = list(X.columns.values) if is_frame else None
        row_index = X.index if is_frame else None
        binned = super().transform(X)
        return pd.DataFrame(binned, columns=self.col_names, index=row_index)
# Toy input: one column to leave alone and one column to bin.
d = {
    'numeric_col_not_to_change': [1, 2, 1, 2, 1, 2],
    'numeric_col_to_change': [1, 2, 3, 4, 5, 6],
}
df = pd.DataFrame(data=d)

# The column selector here is a scalar string, not a list.
binner_on_numeric = PandasColumnTransformer(
    transformers=[
        ("binner", PandasKBinsDiscretizer(2), 'numeric_col_to_change'),
    ],
)
pp = Pipeline(steps=[('binner_just_numeric', binner_on_numeric)])

# This is the call that raises the ValueError reported after this snippet.
res = pp.fit_transform(df)
assert isinstance(res, pd.DataFrame)
I'm getting the following error:
ValueError: 1D data passed to a transformer that expects 2D data. Try to specify the column selection as a list of one item instead of a scalar.
Any help on that would be awesome!
CodePudding user response:
This error occurs because the column selector in your ColumnTransformer is a scalar string, which selects a single column as 1D data. Pass a one-element list instead, ['numeric_col_to_change'], so that the transformer receives 2D data.
You can also specify how to treat the columns that are not handled by the ColumnTransformer with the remainder parameter. remainder='passthrough' returns them as-is instead of dropping them.
This should work:
# A one-element list selects 2D data; remainder='passthrough' keeps the
# columns that no transformer touches instead of dropping them.
binner_on_numeric = PandasColumnTransformer(
    transformers=[
        ("binner", PandasKBinsDiscretizer(2), ['numeric_col_to_change']),
    ],
    remainder='passthrough',
)
# Rebuild the pipeline: a previously built `pp` still wraps the old
# transformer, so reassigning `binner_on_numeric` alone has no effect on it.
pp = Pipeline([('binner_just_numeric', binner_on_numeric)])
res = pp.fit_transform(df)
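With a transform that labels the output using X.columns in input order, this returns the dataframe below. Note that ColumnTransformer emits the transformed columns first and the passthrough columns last, so the labels below are swapped: the binned values (0.0/1.0) appear under numeric_col_not_to_change, and the untouched values appear under numeric_col_to_change.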
numeric_col_not_to_change numeric_col_to_change
0 0.0 1.0
1 0.0 2.0
2 0.0 1.0
3 1.0 2.0
4 1.0 1.0
5 1.0 2.0