Error finding attribute `feature_names_in_` that exists in docs-CodePudding

I'm getting the error `AttributeError: 'LogisticRegression' object has no attribute 'feature_names_in_'`, even though that attribute is listed in the docs.

I'm on scikit-learn version 1.0.2.

I created a `LogisticRegression` object and I am trying to use its documented `feature_names_in_` attribute, but accessing it raises an error.


#imports
import numpy as np
import pandas as pd
import statistics
import scipy.sparse

from scipy.stats import chi2_contingency

from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

# Hold out a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    random_state=42,
)


#create functions for preprocessing

def replace_NAN_median(X_df):
    """Return a copy of ``X_df`` with NaNs in the ordinal/interval columns
    replaced by that column's median.

    The median is computed from ``X_df`` itself on every call. Columns other
    than the five listed below are returned unchanged.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Must contain every column named in ``opinions``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's ``X_df`` is never modified.
    """
    opinions = ['opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'household_adults',
                'household_children']
    # Work on a copy: in-place edits through a column selection
    # (``X_df[col].replace(..., inplace=True)``) are chained assignment, which
    # pandas warns about and which has no effect under Copy-on-Write.
    filled = X_df.copy()
    for column in opinions:
        filled[column] = filled[column].fillna(filled[column].median())
    return filled

def replace_NAN_mode(X_df):
    """Return a copy of ``X_df`` with NaNs in the categorical columns
    replaced by that column's most frequent observed value.

    The mode is computed from ``X_df`` itself on every call, over non-missing
    values only. A column with no observed values at all is left as is.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Must contain every column named in ``miss_cat_features``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's ``X_df`` is never modified.
    """
    miss_cat_features = ['education', 'income_poverty', 'marital_status', 'rent_or_own', 'employment_status']
    # Work on a copy: in-place edits through a column selection are chained
    # assignment, which pandas warns about and which has no effect under
    # Copy-on-Write.
    filled = X_df.copy()
    for column in miss_cat_features:
        # Exclude NaN before taking the mode; otherwise NaN itself can win
        # when it is the most common entry and nothing would be imputed.
        observed = filled[column].dropna()
        if observed.empty:
            continue
        filled[column] = filled[column].fillna(statistics.mode(observed))
    return filled


# Wrap the DataFrame-level NaN helpers so they can be used as pipeline steps.
NAN_median = FunctionTransformer(replace_NAN_median)
NAN_mode = FunctionTransformer(replace_NAN_mode)

# Column groups, one per preprocessing strategy.
binary_columns = [
    'behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask',
    'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home',
    'behavioral_touch_face', 'doctor_recc_seasonal', 'chronic_med_condition',
    'child_under_6_months', 'health_worker', 'health_insurance',
]
numeric_columns = [
    'opinion_seas_vacc_effective', 'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
    'household_adults', 'household_children',
]
categorical_columns = [
    'age_group', 'education', 'race', 'sex',
    'income_poverty', 'marital_status', 'rent_or_own',
    'employment_status', 'census_msa',
]

col_transformer = ColumnTransformer(
    transformers=[
        # Binary data: a missing value is filled with the constant 0.
        (
            "NAN_0",
            SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0),
            binary_columns,
        ),
        # Numeric ordinal and interval data: min-max scaling.
        ("scaler", MinMaxScaler(), numeric_columns),
        # Categorical string data: one-hot encoding.
        ("ohe", OneHotEncoder(sparse=False), categorical_columns),
    ],
    remainder="passthrough",
)


# Preprocessing: the two NaN-filling helpers run first, then the column-wise transformer.
preprocessing_pipe = Pipeline(
    steps=[
        ("NAN_median", NAN_median),
        ("NAN_mode", NAN_mode),
        ("col_transformer", col_transformer),
    ]
)

# Full model: preprocessing followed by an L1-penalised logistic regression.
log_reg = LogisticRegression(
    solver='liblinear',
    random_state=42,
    C=10,
    penalty='l1',
)
logreg_optimized_pipe = Pipeline(
    steps=[
        ("preprocessing_pipe", preprocessing_pipe),
        ("log_reg", log_reg),
    ]
)

# Fit the whole pipeline on the training data.
logreg_optimized_pipe.fit(X_train, y_train)

# Attempt to read the feature names from the fitted classifier.
logreg_optimized_pipe.named_steps["log_reg"].feature_names_in_


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-38-512bfaf5962d> in <module>
----> 1 logreg_optimized_pipe.named_steps["log_reg"].feature_names_in_
      

AttributeError: 'LogisticRegression' object has no attribute 'feature_names_in_'

I'm open to alternative suggestions on how to get the feature names as well.

CodePudding user response：

The docs say the following:

`feature_names_in_` : ndarray of shape `(n_features_in_,)` — Names of features seen during fit. Defined only when X has feature names that are all strings.

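You should make sure that the data that reaches the model still has column names, i.e. that it is a pandas DataFrame with string column names rather than a NumPy array. In your pipeline the `ColumnTransformer` outputs a NumPy array, so `LogisticRegression` never sees any names. Also, the attribute is only defined after `fit` has been called.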

Link to the docs for your version 1.0.2 LogisticRegression

CodePudding user response：

So it turns out that `SimpleImputer` returns an array, thereby removing the column names. I replaced `SimpleImputer` with a function to fix this. I wasn't able to figure out how to use `.feature_names_in_` on the `LogisticRegression()` model (the `ColumnTransformer` in front of it outputs a plain NumPy array, so the model is never fitted on named columns), but it did work when I called it on the `ColumnTransformer` in the preprocessing pipeline. Most importantly, I was able to use `.get_feature_names_out()` on that `ColumnTransformer` to get the names of the features that were fed into the model.

Code:


  

    #imports
    import numpy as np
    import pandas as pd
    import statistics
    import scipy.sparse
    
    from scipy.stats import chi2_contingency
    
    from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.model_selection import train_test_split, GridSearchCV
    from sklearn.linear_model import LogisticRegression
    from sklearn.impute import SimpleImputer
    
    # Hold out a test set; the fixed seed keeps the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        random_state=42,
    )
    
    
    #create functions for preprocessing
    
    def replace_NAN_median(X_df):
        """Return a copy of ``X_df`` with NaNs in the ordinal/interval columns
        replaced by that column's median.

        The median is computed from ``X_df`` itself on every call. Columns
        other than the five listed below are returned unchanged.

        Parameters
        ----------
        X_df : pandas.DataFrame
            Must contain every column named in ``opinions``.

        Returns
        -------
        pandas.DataFrame
            A new frame; the caller's ``X_df`` is never modified.
        """
        opinions = ['opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'household_adults',
                    'household_children']
        # Work on a copy: in-place edits through a column selection
        # (``X_df[col].replace(..., inplace=True)``) are chained assignment,
        # which pandas warns about and which has no effect under Copy-on-Write.
        filled = X_df.copy()
        for column in opinions:
            filled[column] = filled[column].fillna(filled[column].median())
        return filled
    
    def replace_NAN_mode(X_df):
        """Return a copy of ``X_df`` with NaNs in the categorical columns
        replaced by that column's most frequent observed value.

        The mode is computed from ``X_df`` itself on every call, over
        non-missing values only. A column with no observed values at all is
        left as is.

        Parameters
        ----------
        X_df : pandas.DataFrame
            Must contain every column named in ``miss_cat_features``.

        Returns
        -------
        pandas.DataFrame
            A new frame; the caller's ``X_df`` is never modified.
        """
        miss_cat_features = ['education', 'income_poverty', 'marital_status', 'rent_or_own', 'employment_status']
        # Work on a copy: in-place edits through a column selection are
        # chained assignment, which pandas warns about and which has no
        # effect under Copy-on-Write.
        filled = X_df.copy()
        for column in miss_cat_features:
            # Exclude NaN before taking the mode; otherwise NaN itself can win
            # when it is the most common entry and nothing would be imputed.
            observed = filled[column].dropna()
            if observed.empty:
                continue
            filled[column] = filled[column].fillna(statistics.mode(observed))
        return filled
    def replace_NAN_0(X_df):
        """Return a copy of ``X_df`` with NaNs in the binary columns replaced by 0.

        Columns other than the twelve listed below are returned unchanged.

        Parameters
        ----------
        X_df : pandas.DataFrame
            Must contain every column named in ``miss_binary``.

        Returns
        -------
        pandas.DataFrame
            A new frame; the caller's ``X_df`` is never modified.
        """
        miss_binary = ['behavioral_antiviral_meds', 'behavioral_avoidance','behavioral_face_mask' ,
        'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home',
        'behavioral_touch_face', 'doctor_recc_seasonal', 'chronic_med_condition', 
        'child_under_6_months', 'health_worker', 'health_insurance']
        # Work on a copy: in-place edits through a column selection are
        # chained assignment, which pandas warns about and which has no
        # effect under Copy-on-Write.
        filled = X_df.copy()
        filled[miss_binary] = filled[miss_binary].fillna(0)
        return filled
    
    
    # Wrap the DataFrame-level NaN helpers so they can be used as pipeline steps.
    NAN_median = FunctionTransformer(replace_NAN_median)
    NAN_mode = FunctionTransformer(replace_NAN_mode)
    NAN_0 = FunctionTransformer(replace_NAN_0)

    # Column groups, one per preprocessing strategy.
    numeric_columns = [
        'opinion_seas_vacc_effective', 'opinion_seas_risk',
        'opinion_seas_sick_from_vacc',
        'household_adults', 'household_children',
    ]
    categorical_columns = [
        'age_group', 'education', 'race', 'sex',
        'income_poverty', 'marital_status', 'rent_or_own',
        'employment_status', 'census_msa',
    ]

    col_transformer = ColumnTransformer(
        transformers=[
            # Numeric ordinal and interval data: min-max scaling.
            ("scaler", MinMaxScaler(), numeric_columns),
            # Categorical string data: one-hot encoding.
            ("ohe", OneHotEncoder(sparse=False), categorical_columns),
        ],
        remainder="passthrough",
    )
    
    
    # Preprocessing: the three NaN-filling helpers run first, then the column-wise transformer.
    preprocessing_pipe = Pipeline(
        steps=[
            ("NAN_median", NAN_median),
            ("NAN_mode", NAN_mode),
            ("NAN_0", NAN_0),
            ("col_transformer", col_transformer),
        ]
    )

    # Full model: preprocessing followed by an L1-penalised logistic regression.
    log_reg = LogisticRegression(
        solver='liblinear',
        random_state=42,
        C=10,
        penalty='l1',
    )
    logreg_optimized_pipe = Pipeline(
        steps=[
            ("preprocessing_pipe", preprocessing_pipe),
            ("log_reg", log_reg),
        ]
    )

    # Fit the whole pipeline on the training data.
    logreg_optimized_pipe.fit(X_train, y_train)

    # Feature names seen by the ColumnTransformer, looked up by step name
    # (it is the fourth step, index 3, of the preprocessing pipeline).
    logreg_optimized_pipe.named_steps["preprocessing_pipe"].named_steps["col_transformer"].feature_names_in_
    
    #output - feature names put into `ColumnTransformer`
    array(['respondent_id', 'behavioral_antiviral_meds',
       'behavioral_avoidance', 'behavioral_face_mask',
       'behavioral_wash_hands', 'behavioral_large_gatherings',
       'behavioral_outside_home', 'behavioral_touch_face',
       'doctor_recc_seasonal', 'chronic_med_condition',
       'child_under_6_months', 'health_worker', 'health_insurance',
       'opinion_seas_vacc_effective', 'opinion_seas_risk',
       'opinion_seas_sick_from_vacc', 'age_group', 'education', 'race',
       'sex', 'income_poverty', 'marital_status', 'rent_or_own',
       'employment_status', 'census_msa', 'household_adults',
       'household_children'], dtype=object)

    logreg_optimized_pipe.named_steps["preprocessing_pipe"].named_steps["col_transformer"].get_feature_names_out()

    #output - feature names after `ColumnTransformer`
    array(['scaler__opinion_seas_vacc_effective', 'scaler__opinion_seas_risk',
       'scaler__opinion_seas_sick_from_vacc', 'scaler__household_adults',
       'scaler__household_children', 'ohe__age_group_18 - 34 Years',
       'ohe__age_group_35 - 44 Years', 'ohe__age_group_45 - 54 Years',
       'ohe__age_group_55 - 64 Years', 'ohe__age_group_65  Years',
       'ohe__education_12 Years', 'ohe__education_< 12 Years',
       'ohe__education_College Graduate', 'ohe__education_Some College',
       'ohe__race_Black', 'ohe__race_Hispanic',
       'ohe__race_Other or Multiple', 'ohe__race_White',
       'ohe__sex_Female', 'ohe__sex_Male',
       'ohe__income_poverty_<= $75,000, Above Poverty',
       'ohe__income_poverty_> $75,000',
       'ohe__income_poverty_Below Poverty', 'ohe__marital_status_Married',
       'ohe__marital_status_Not Married', 'ohe__rent_or_own_Own',
       'ohe__rent_or_own_Rent', 'ohe__employment_status_Employed',
       'ohe__employment_status_Not in Labor Force',
       'ohe__employment_status_Unemployed',
       'ohe__census_msa_MSA, Not Principle  City',
       'ohe__census_msa_MSA, Principle City', 'ohe__census_msa_Non-MSA',
       'remainder__respondent_id', 'remainder__behavioral_antiviral_meds',
       'remainder__behavioral_avoidance',
       'remainder__behavioral_face_mask',
       'remainder__behavioral_wash_hands',
       'remainder__behavioral_large_gatherings',
       'remainder__behavioral_outside_home',
       'remainder__behavioral_touch_face',
       'remainder__doctor_recc_seasonal',
       'remainder__chronic_med_condition',
       'remainder__child_under_6_months', 'remainder__health_worker',
       'remainder__health_insurance'], dtype=object)