I try to do a prediction on a dataset with a head of:
ID Reason Month Day ... Season Drinker Age Group TimeOff
0 28 23 10 2 ... 4.0 Yes Middle Aged Low
1 17 18 1 3 ... 2.0 Yes NaN High
2 25 1 7 3 ... 1.0 Yes Adult High
3 11 28 11 2 ... 4.0 Yes Adult Low
4 10 23 3 2 ... 2.0 No Middle Aged Low
.. .. ... ... ... ... ... ... ... ...
587 28 28 3 2 ... 3.0 NaN Young Adult Low
588 20 28 10 5 ... 4.0 NaN Middle Aged Low
589 14 8 3 5 ... 2.0 No Middle Aged High
590 28 0 5 4 ... NaN No Adult Low
591 34 25 5 6 ... NaN No Middle Aged High
Then while preprocessing I try to drop the column 'Season' but get an error detailed later, this is the code:
import numpy as np
from sklearn.compose import ColumnTransformer
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
def load_dataset(train_csv_path):
data = pd.read_csv(train_csv_path, sep=',')
return data
class DataPreprocessor(object):
def __init__(self):
self.transformer: Pipeline = None
def fit(self, dataset_df):
numerical_columns = ['ID', 'Transportation expense', 'Residence Distance', 'Service time', 'Weight', 'Height', 'Season', 'Pet', 'Son', 'Day', 'Month', 'Reason']
categorical_columns = list(set(dataset_df.columns) - set(numerical_columns))
num_pipeline = Pipeline([
('imputer', SimpleImputer(strategy="median"))
])
categorical_transformer = OneHotEncoder(drop=None, sparse=False, handle_unknown='ignore')
cat_pipeline = Pipeline([
('1hot', categorical_transformer)
])
preprocessor = ColumnTransformer(
transformers=[
("dropId", 'drop', 'ID'),
("num", num_pipeline, numerical_columns),
("cat", cat_pipeline, categorical_columns),
]
)
self.transformer = Pipeline(steps=[
("preprocessor", preprocessor)
])
### DROPPING HERE
dataset = dataset_df.drop("Season", axis=1)
self.transformer.fit(dataset)
def transform(self, df):
return self.transformer.transform(df)
def train_model(processed_X, y):
model = GaussianNB()
model.fit(processed_X, y)
return model
if __name__ == '__main__':
preprocessor = DataPreprocessor()
train_csv_path = 'time_off_data_train.csv'
train_dataset_df = load_dataset(train_csv_path)
print(train_dataset_df.head)
X_train = train_dataset_df.iloc[:, :-1]
y_train = train_dataset_df['TimeOff']
preprocessor.fit(X_train)
model = train_model(preprocessor.transform(X_train), y_train)
I get this error:
Traceback (most recent call last):
File "/Users/.../PycharmProjects/final_proj/venv/lib/python3.8/site-packages/pandas/core/indexes/base.py", line 3621, in get_loc
return self._engine.get_loc(casted_key)
File "pandas/_libs/index.pyx", line 136, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 163, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 5198, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 5206, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Season'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/Users/.../PycharmProjects/final_proj/venv/lib/python3.8/site-packages/sklearn/utils/__init__.py", line 416, in _get_column_indices
col_idx = all_columns.get_loc(col)
File "/Users/.../PycharmProjects/final_proj/venv/lib/python3.8/site-packages/pandas/core/indexes/base.py", line 3623, in get_loc
raise KeyError(key) from err
KeyError: 'Season'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/Users/.../PycharmProjects/final_proj/main.py", line 200, in <module>
preprocessor.fit(X_train)
File "/Users/.../PycharmProjects/final_proj/main.py", line 149, in fit
self.transformer.fit(dataset)
File "/Users/.../PycharmProjects/final_proj/venv/lib/python3.8/site-packages/sklearn/pipeline.py", line 382, in fit
self._final_estimator.fit(Xt, y, **fit_params_last_step)
File "/Users/.../PycharmProjects/final_proj/venv/lib/python3.8/site-packages/sklearn/compose/_column_transformer.py", line 640, in fit
self.fit_transform(X, y=y)
File "/Users/.../PycharmProjects/final_proj/venv/lib/python3.8/site-packages/sklearn/compose/_column_transformer.py", line 670, in fit_transform
self._validate_column_callables(X)
File "/Users/.../PycharmProjects/final_proj/venv/lib/python3.8/site-packages/sklearn/compose/_column_transformer.py", line 357, in _validate_column_callables
transformer_to_input_indices[name] = _get_column_indices(X, columns)
File "/Users/.../PycharmProjects/final_proj/venv/lib/python3.8/site-packages/sklearn/utils/__init__.py", line 424, in _get_column_indices
raise ValueError("A given column is not a column of the dataframe") from e
ValueError: A given column is not a column of the dataframe
Why do I get this error for fitting after dropping the said column and what is the difference between this and dropping it within the ColumnTransformer
function?
CodePudding user response:
There is a huge difference between dropping a column inside the fit
method of your DataProcessor
and within a the ColumnTransformer
.
When dropping inside the fit
method, the column will only be dropped when you wish to fit you preprocessor (when you call preprocessor.fit(X_train)
) and not when you want to actually transform your training data (preprocessor.transform(X_train)
). You can note that the fit
method doesn't return a dataframe which means that dropping the column the way you did is useless (howerver the transform
method does return the preprocessed dataframe).
Your script failed because when calling self.transformer.fit(dataset)
inside the fit
method of your preprocessor
, the transformer expects your dataframe to have a "Season" column since you specified it when you declare your numerical_columns
.
If you want to correctly drop the "Season" column you can (and must) use the ColumnTransformer
, the same way you did with "ID" column (by declaring a drop
transformer for the "Season" column). Then your preprocessor will expect your data to have a "Season" column when fitting, and will know that it should drop it when transforming.