OneHotEncoder ValueError: Input contains NaN


I have downloaded this data, and this is my code:

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.multiclass import unique_labels
import plotly.figure_factory as ff
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer
random_state = 27912

df_train = pd.read_csv("...")
df_test  = pd.read_csv("...")


X_train, X_test, y_train, y_test = train_test_split(df_train.drop(["Survived", "Ticket", "Cabin", "Name", "PassengerId"], 
                                                                  axis = 1), 
                                                    df_train["Survived"], test_size=0.2, 
                                                    random_state=42)

numeric_col_names = ["Age", "SibSp", "Parch", "Fare"]
ordinal_col_names = ["Pclass"]
one_hot_col_names = ["Embarked", "Sex"]

ct = make_column_transformer(
    (SimpleImputer(strategy="median"), numeric_col_names),
    (SimpleImputer(strategy="most_frequent"), ordinal_col_names   one_hot_col_names),
    (OrdinalEncoder(), ordinal_col_names),
    (OneHotEncoder(), one_hot_col_names),
    (StandardScaler(), ordinal_col_names + one_hot_col_names + numeric_col_names))

preprocessing_pipeline = Pipeline([("transformers", ct)])

preprocessing_pipeline.fit_transform(X_train)

I'm trying to make a column_transformer for the preprocessing step; however, the OneHotEncoder step gives me an error: ValueError: Input contains NaN. I don't really know why this is happening, because I'm imputing the values beforehand. Any clues as to why this happens?

Trying something like this doesn't help either:

preprocessing_pipeline = Pipeline([("transformers", ct_first)])
ct_second = make_column_transformer((OneHotEncoder(), one_hot_col_names), (StandardScaler(), ordinal_col_names + one_hot_col_names + numeric_col_names))
pipeline = Pipeline([("transformer1", preprocessing_pipeline), ("transformer2", ct_second)])
pipeline.fit_transform(X_train)

I would like to know why this is happening and why the code above, both the first and second attempts, is not correct. Thanks.

CodePudding user response:

A ColumnTransformer applies its transformers in parallel to the original input columns and concatenates their outputs, so in your code the OneHotEncoder still receives the raw Embarked and Sex columns, NaNs included, rather than the imputer's output. You need to create a pipeline for each column type to make sure that the different steps are applied sequentially (i.e. to make sure that the missing values are imputed prior to encoding and scaling); see also this example in the scikit-learn documentation.
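For context, here is a minimal sketch (on a hypothetical toy frame, not the Titanic data) of why the original layout fails: every transformer in the ColumnTransformer is handed the original input columns, so the OneHotEncoder is fed the raw column, NaNs and all, no matter which imputer is listed before it.

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

# Hypothetical toy frame with one missing value
toy = pd.DataFrame({'Embarked': ['S', np.nan, 'C']})

ct = make_column_transformer(
    (SimpleImputer(strategy='most_frequent'), ['Embarked']),
    (OneHotEncoder(), ['Embarked']),
)

# Both transformers receive the original 'Embarked' column in parallel,
# so the OneHotEncoder never sees the imputed values; depending on the
# scikit-learn version this either raises "ValueError: Input contains NaN"
# or encodes NaN as its own category. Either way, the imputation is ignored.
ct.fit_transform(toy)

The corrected preprocessing below instead nests one Pipeline per column type inside the ColumnTransformer.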

import pandas as pd
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer

# Load the data (from https://www.kaggle.com/c/titanic/data)
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Extract the features
X_train = df_train.drop(labels=['Survived', 'Ticket', 'Cabin', 'Name', 'PassengerId'], axis=1)
X_test = df_test.drop(labels=['Ticket', 'Cabin', 'Name', 'PassengerId'], axis=1)

# Map the feature names to the corresponding 
# types (numerical, ordinal or categorical)
numeric_col_names = ['Age', 'SibSp', 'Parch', 'Fare']
ordinal_col_names = ['Pclass']
one_hot_col_names = ['Embarked', 'Sex']

# Define the numerical features pipeline
numeric_col_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Define the ordinal features pipeline
ordinal_col_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder()),
    ('scaler', StandardScaler())
])

# Define the categorical features pipeline
one_hot_col_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(sparse=False)),
    ('scaler', StandardScaler())
])

# Create the overall preprocessing pipeline
preprocessing_pipeline = make_column_transformer(
    (numeric_col_transformer, numeric_col_names),
    (ordinal_col_transformer, ordinal_col_names),
    (one_hot_col_transformer, one_hot_col_names),
)

# Fit the pipeline to the training data
preprocessing_pipeline.fit(X_train)

# Apply the pipeline to the training and test data
X_train_ = preprocessing_pipeline.transform(X_train)
X_test_ = preprocessing_pipeline.transform(X_test)
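
From there, a possible next step (a sketch, not part of the original answer) is to wire this preprocessing into a full model pipeline; the classifier choice below (SVC, one of the imports in the question) is just an assumption for illustration.

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Extract the training labels
y_train = df_train['Survived']

# Chain preprocessing and classifier so both are (re)fitted together on the raw features
model = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('classifier', SVC(random_state=27912)),
])
model.fit(X_train, y_train)

# Predictions on the (unlabelled) test features
predictions = model.predict(X_test)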