I wanted to make all of the custom transformations I make to my data in a pipe. I thought that I could use it as pipe.fit_transform(X)
to transform my X before using it in a model, but I also thought that I'll be able to append to the pipeline model itself and simply use it as one using pipe.steps.append(('model', self.model))
.
Unfortunately, after everything was built I've noticed that I'm getting different results when transforming the data and using it directly in a model vs doing everything in one pipeline. Have anyone experienced anything like this?
Adding code:
# Base pipeline to be used
BASE_PIPE = Pipeline([
('dim_increase_num', data_num_mix()),
('dim_increase_cat', data_cat_mix()),
('start', data_get_dummies()),
('dm_correlation', data_x_corr_()),
('scaler', DFStandardScaler()),
('column_ectraction', ColumnExtractor(columns_catboost)),
])
class base_model_class:
def fit_predict(self, X_train:pd.DataFrame=X_train, y_train:pd.Series=y_train, X_test:pd.DataFrame=X_test):
return self.fit(X_train, y_train).predict(X_test)
def evaluate(self, X:pd.DataFrame=X, y:pd.Series=y):
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)
y_pred = self.fit(X_train, y_train).predict(X_test)
result= r2_score(y_test, y_pred)
return result
class model_linear_regression(base_model_class):
def __init__(self, pipe=None, inverse=False):
self.name = 'Linear_Regression'
self.model = LinearRegression()
if pipe==None:
self.pipe = Pipeline([('model', self.model)])
else:
self.pipe = deepcopy(pipe)
self.pipe.steps.append(('model', self.model))
if inverse:
self.pipe = TransformedTargetRegressor( regressor=self.pipe,
func=np.log1p,
inverse_func=np.expm1)
def fit(self, X:pd.DataFrame=X_train, y:pd.Series=y_train):
self.pipe.fit(X, y)
return self
def predict(self, X:pd.DataFrame=X_test):
y_pred = self.pipe.predict(X)
return y_pred
And then, when using everything gives different R2 scores:
Xx=BASE_PIPE.fit_transform(X)
model_linear_regression(inverse=False).evaluate(Xx,y)
>>> 0.7415005607713974
model_linear_regression(BASE_PIPE, inverse=False).evaluate(X,y)
>>> -6.306970505602111e 22
EDIT: providing all steps in pipeline used:
class data_num_mix(BaseEstimator, TransformerMixin):
def __init__(self, columns:list=NUMERIC_FEATURES):
self.columns = columns
def fit(self, X, y = None):
return self
def transform(self, X, y = None):
X_ = X.copy()
self.frames = [X_]
for col in self.columns:
A = pd.DataFrame(X_[col].map(lambda x: np.sqrt(x) if x>0 else -np.sqrt(-x)))
A = A.rename(columns={col:col '^s'})
self.frames = [A]
B = pd.DataFrame(X_[col] * X_[col])
B = B.rename(columns={col:col '^2'})
self.frames = [B]
return pd.concat(self.frames, axis=1)
class data_cat_mix(BaseEstimator, TransformerMixin):
def __init__(self, columns:list=CATEGORICAL_FEATURES):
self.columns = columns
def fit(self, X, y = None):
return self
def transform(self, X, y = None) -> pd.DataFrame:
X_ = X.copy()
for col in self.columns:
df_col_count = X_[col].value_counts().to_frame().reset_index()
df_col_count.columns = ["var_name", "var_count"]
df_col_count["var_freq"] = df_col_count["var_count"] / df_col_count["var_count"].sum()
X_['C_' col] = X_[col].replace(df_col_count.set_index('var_name')['var_count'])
X_['F_' col] = X_[col].replace(df_col_count.set_index('var_name')['var_freq'])
return X_
class data_get_dummies(BaseEstimator, TransformerMixin):
def __init__(self, columns:list = CATEGORICAL_FEATURES):
self.columns = columns
self.encoder = make_column_transformer((OneHotEncoder(handle_unknown="ignore", sparse=False), self.columns),remainder='passthrough')
def fit(self, X, y = None):
self.encoder.fit(X)
return self
def transform(self, X, y = None) -> pd.DataFrame:
X_ = X.copy()
encoder_columns = self.encoder.get_feature_names_out()
fixed_columns = [x.replace('onehotencoder__','').replace('remainder__','') for x in encoder_columns ]
df_temp=pd.DataFrame(self.encoder.transform(X_), columns=fixed_columns)
return df_temp
class data_x_corr(BaseEstimator, TransformerMixin):
def __init__(self, columns:list=NUMERIC_FEATURES_, corr_val:float=0.95):
self.columns = columns
self.corr_val = corr_val
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
# prepare numeric df
X_ = X.copy()
x = X_[self.columns]
corr_matrix = x.corr(method='spearman')
iters = range(len(corr_matrix.columns) - 1)
drop_cols = []
# Iterates through Correlation Matrix Table to find correlated columns
for i in iters:
for j in range(i):
item = corr_matrix.iloc[j:(j 1), (i 1):(i 2)]
col = item.columns
row = item.index
val = item.values
if val >= self.corr_val:
drop_cols.append(i)
drops = sorted(set(drop_cols))[::-1]
# Drops the correlated columns
for i in drops:
col = x.iloc[:, (i 1):(i 2)].columns.values
X_ = X_.drop(col, axis=1)
return X_
class DFStandardScaler(TransformerMixin):
# StandardScaler but for pandas DataFrames
def __init__(self):
self.ss = None
self.mean_ = None
self.scale_ = None
def fit(self, X, y=None):
self.ss = StandardScaler()
self.ss.fit(X)
self.mean_ = pd.Series(self.ss.mean_, index=X.columns)
self.scale_ = pd.Series(self.ss.scale_, index=X.columns)
return self
def transform(self, X) -> pd.DataFrame:
# assumes X is a DataFrame
Xss = self.ss.transform(X)
Xscaled = pd.DataFrame(Xss, index=X.index, columns=X.columns)
return Xscaled
def __str__(self):
return "DF_StandardScaler"
def __repr__(self):
return "DF_StandardScaler"
class ColumnExtractor(TransformerMixin, BaseEstimator):
def __init__(self, cols):
self.cols = cols
def fit(self, X, y=None):
# stateless transformer
return self
def transform(self, X):
# assumes X is a DataFrame
Xcols = X[self.cols]
return Xcols
CodePudding user response:
The one transformer that stands out to me is data_cat_mix
, specifically the count-of-level columns. When applied to train test, these are consistent (but leaks test information); when applied separately, the values in train will generally be much higher (just from its size being three times larger), so the model doesn't really understand how to treat them in the test set.