Transforming data first vs doing everything in a pipeline gives different results when using a model


I wanted to put all of the custom transformations I apply to my data into a pipe. The idea was that I could call pipe.fit_transform(X) to transform my X before using it in a model, but also append the model to the pipeline itself with pipe.steps.append(('model', self.model)) and use the whole thing as a single estimator.

Unfortunately, after everything was built, I noticed that I get different results when I transform the data first and use it in a model directly vs doing everything in one pipeline. Has anyone experienced anything like this?

Adding code:

# imports used by the pipeline and the model classes below
import numpy as np
import pandas as pd
from copy import deepcopy
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer, TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Base pipeline to be used
BASE_PIPE = Pipeline([
                ('dim_increase_num', data_num_mix()),
                ('dim_increase_cat', data_cat_mix()),
                ('start', data_get_dummies()),
                ('dm_correlation', data_x_corr()),
                ('scaler', DFStandardScaler()),
                ('column_extraction', ColumnExtractor(columns_catboost)),
                ])

class base_model_class:
    def fit_predict(self, X_train:pd.DataFrame=X_train, y_train:pd.Series=y_train, X_test:pd.DataFrame=X_test):
        return self.fit(X_train, y_train).predict(X_test)
    def evaluate(self, X:pd.DataFrame=X, y:pd.Series=y):
        X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)
        y_pred = self.fit(X_train, y_train).predict(X_test)
        result = r2_score(y_test, y_pred)
        return result

class model_linear_regression(base_model_class):
    def __init__(self, pipe=None, inverse=False):
        self.name = 'Linear_Regression'
        self.model = LinearRegression()
        
        if pipe is None:
            self.pipe = Pipeline([('model', self.model)])
        else:
            self.pipe = deepcopy(pipe)
            self.pipe.steps.append(('model', self.model))

        if inverse:
            self.pipe = TransformedTargetRegressor(regressor=self.pipe,
                                                   func=np.log1p,
                                                   inverse_func=np.expm1)
    def fit(self, X:pd.DataFrame=X_train, y:pd.Series=y_train):
        self.pipe.fit(X, y)
        return self
    def predict(self, X:pd.DataFrame=X_test):
        y_pred = self.pipe.predict(X)
        return y_pred

And then, when used, the two approaches give different R2 scores:

Xx = BASE_PIPE.fit_transform(X)

model_linear_regression(inverse=False).evaluate(Xx,y)
>>> 0.7415005607713974

model_linear_regression(BASE_PIPE, inverse=False).evaluate(X,y)
>>> -6.306970505602111e+22
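To make the comparison explicit, here is roughly what each call expands to, condensed from evaluate() and model_linear_regression above (a schematic, not the exact code):

# Call 1: BASE_PIPE is fit on ALL of X up front; evaluate() then
# splits the already-transformed Xx into train/test folds.
Xx = BASE_PIPE.fit_transform(X)          # transformers see every row of X
X_tr, X_te, y_tr, y_te = train_test_split(Xx, y, test_size=0.25, random_state=0)
LinearRegression().fit(X_tr, y_tr).predict(X_te)

# Call 2: evaluate() splits the raw X first; the combined pipeline
# (transformers + model) is then fit on the training fold only.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=0)
full_pipe = deepcopy(BASE_PIPE)
full_pipe.steps.append(('model', LinearRegression()))
full_pipe.fit(X_tr, y_tr)                # transformers never see X_te
full_pipe.predict(X_te)                  # X_te transformed with train-only state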

EDIT: providing all of the steps used in the pipeline:

class data_num_mix(BaseEstimator, TransformerMixin):
    def __init__(self, columns:list=NUMERIC_FEATURES):
        self.columns = columns
    def fit(self, X, y = None):
        return self
    def transform(self, X, y = None):
        X_ = X.copy()
        self.frames = [X_]
        for col in self.columns:
            A = pd.DataFrame(X_[col].map(lambda x: np.sqrt(x) if x>0 else -np.sqrt(-x)))
            A = A.rename(columns={col: col + '^s'})
            self.frames += [A]

            B = pd.DataFrame(X_[col] * X_[col])
            B = B.rename(columns={col: col + '^2'})
            self.frames += [B]
        return pd.concat(self.frames, axis=1)


class data_cat_mix(BaseEstimator, TransformerMixin):
    def __init__(self, columns:list=CATEGORICAL_FEATURES):
        self.columns = columns
    def fit(self, X, y = None):
        return self
    def transform(self, X, y = None) -> pd.DataFrame:
        X_ = X.copy()
        for col in self.columns:
            df_col_count = X_[col].value_counts().to_frame().reset_index()
            df_col_count.columns = ["var_name", "var_count"]
            df_col_count["var_freq"] = df_col_count["var_count"] / df_col_count["var_count"].sum()

            X_['C_' + col] = X_[col].replace(df_col_count.set_index('var_name')['var_count'])
            X_['F_' + col] = X_[col].replace(df_col_count.set_index('var_name')['var_freq'])
        return X_


class data_get_dummies(BaseEstimator, TransformerMixin):
    def __init__(self, columns:list = CATEGORICAL_FEATURES):
        self.columns = columns
        self.encoder = make_column_transformer((OneHotEncoder(handle_unknown="ignore", sparse=False), self.columns),remainder='passthrough')
    def fit(self, X, y = None):
        self.encoder.fit(X)
        return self
    def transform(self, X, y = None) -> pd.DataFrame:
        X_ = X.copy()
        encoder_columns = self.encoder.get_feature_names_out()
        fixed_columns = [x.replace('onehotencoder__','').replace('remainder__','') for x in encoder_columns ]
        df_temp = pd.DataFrame(self.encoder.transform(X_), columns=fixed_columns)
        return df_temp


class data_x_corr(BaseEstimator, TransformerMixin):
    def __init__(self, columns:list=NUMERIC_FEATURES_, corr_val:float=0.95):
        self.columns = columns
        self.corr_val = corr_val
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        # prepare numeric df
        X_ = X.copy()
        x = X_[self.columns]

        corr_matrix = x.corr(method='spearman')
        iters = range(len(corr_matrix.columns) - 1)
        drop_cols = []

        # Iterates through Correlation Matrix Table to find correlated columns
        for i in iters:
            for j in range(i):
                item = corr_matrix.iloc[j:(j+1), (i+1):(i+2)]
                col = item.columns
                row = item.index
                val = item.values
                if val >= self.corr_val:
                    drop_cols.append(i)

        drops = sorted(set(drop_cols))[::-1]
        # Drops the correlated columns
        for i in drops:
            col = x.iloc[:, (i+1):(i+2)].columns.values
            X_ = X_.drop(col, axis=1)
        return X_


class DFStandardScaler(TransformerMixin):
    # StandardScaler but for pandas DataFrames
    def __init__(self):
        self.ss = None
        self.mean_ = None
        self.scale_ = None
    def fit(self, X, y=None):
        self.ss = StandardScaler()
        self.ss.fit(X)
        self.mean_ = pd.Series(self.ss.mean_, index=X.columns)
        self.scale_ = pd.Series(self.ss.scale_, index=X.columns)
        return self
    def transform(self, X) -> pd.DataFrame:
        # assumes X is a DataFrame
        Xss = self.ss.transform(X)
        Xscaled = pd.DataFrame(Xss, index=X.index, columns=X.columns)
        return Xscaled
    def __str__(self):
        return "DF_StandardScaler"
    def __repr__(self):
        return "DF_StandardScaler"


class ColumnExtractor(TransformerMixin, BaseEstimator):
    def __init__(self, cols):
        self.cols = cols
    def fit(self, X, y=None):
        # stateless transformer
        return self
    def transform(self, X):
        # assumes X is a DataFrame
        Xcols = X[self.cols]
        return Xcols

CodePudding user response:

The one transformer that stands out to me is data_cat_mix, specifically the count-of-level columns. When applied to train and test together, these are consistent (but leak test information); when applied separately, the values in train will generally be much higher (simply because the training set is three times the size of the test set), so the model doesn't really learn how to treat them on the test set.
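One way to make that transformer consistent is to learn the counts in fit() and only apply them in transform(), so train and test rows get identical values. A minimal sketch (the class name and the fillna(0) fallback for unseen levels are my own choices):

class data_cat_mix_fitted(BaseEstimator, TransformerMixin):
    # Variant of data_cat_mix that learns its statistics at fit time.
    def __init__(self, columns: list = CATEGORICAL_FEATURES):
        self.columns = columns
    def fit(self, X, y=None):
        # Learn count/frequency maps from the fitted (training) data only,
        # so transform() applies the same values to train and test rows.
        self.count_maps_ = {col: X[col].value_counts() for col in self.columns}
        self.freq_maps_ = {col: c / c.sum() for col, c in self.count_maps_.items()}
        return self
    def transform(self, X, y=None) -> pd.DataFrame:
        X_ = X.copy()
        for col in self.columns:
            # Levels unseen during fit get 0 instead of NaN.
            X_['C_' + col] = X_[col].map(self.count_maps_[col]).fillna(0)
            X_['F_' + col] = X_[col].map(self.freq_maps_[col]).fillna(0)
        return X_

Note that this still won't make your two routes numerically identical (route 1 computes the counts from all of X, route 2 from the training fold only), but it removes the leakage and encodes train and test on the same scale.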
