How to integrate keras model with sequential backward selection code?-CodePudding

I am trying to integrate a Keras deep neural network as a classifier within code for sequential backward feature selection in Python. (Originally, I tried to wrap the Keras deep neural network with SciKeras so that I could use it with scikit-learn's built-in sequential feature selection models, but I kept getting error messages.)

I found this code from scratch for sequential backward feature selection (taken from https://vitalflux.com/sequential-backward-feature-selection-python-example/), and have been trying to integrate a Keras model in the code to replace the "estimator" within the function but I keep getting this error: ValueError: Input 0 of layer "sequential_410" is incompatible with the layer: expected shape=(None, 45), found shape=(None, 44)

Here is the code that I have so far for the sequential backward feature selection and the deep neural network:

import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier, KerasRegressor

# SBS (sequential backward feature selection) from scratch
#=====================================================
from sklearn.metrics import accuracy_score
from itertools import combinations
from sklearn.base import clone

 
class SequentialBackwardSearch():
    '''
    Instantiate with Estimator and given number of features
    '''
    def __init__(self, estimator, k_features):
        self.estimator = clone(estimator)
        self.k_features = k_features
         
    '''
    X_train - Training data Pandas dataframe
    X_test - Test data Pandas dataframe
    y_train - Training label Pandas dataframe
    y_test - Test data Pandas dataframe
    '''
    def fit(self, X_train, X_test, y_train, y_test):
        dim = X_train.shape[1]
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        score = self._calc_score(X_train.values, X_test.values,
                                 y_train.values, y_test.values, self.indices_)
        self.scores_ = [score]
        '''
        Iterate through all the dimensions until k_features is reached
        At the end of loop, dimension count is reduced by 1
        '''
        while dim > k_features:
            scores = []
            subsets = []
            '''
            Iterate through different combinations of features, train the model,
            record the score
            '''
            for p in combinations(self.indices_, r=dim - 1):
                score = self._calc_score(X_train.values, X_test.values, y_train.values, y_test.values, p)
                scores.append(score)
                subsets.append(p)
            #
            # Get the index of best score
            #
            best_score_index = np.argmax(scores)
            #
            # Record the best score
            #
            self.scores_.append(scores[best_score_index])
            #
            # Get the indices of features which gave best score
            #
            self.indices_ = subsets[best_score_index]
            #
            # Record the indices of features for best score
            #
            self.subsets_.append(self.indices_)
            dim -= 1 # Dimension is reduced by 1
     
    '''
    Transform training, test data set to the data set
    havng features which gave best score
    '''
    def transform(self, X):
        return X.values[:, self.indices_]
     
    '''
    Train models with specific set of features
    indices - indices of features
    '''
    def _calc_score(self, X_train, X_test, y_train, y_test, indices):
        self.estimator.fit(X_train[:, indices], y_train.ravel())
        y_pred = self.estimator.predict(X_test[:, indices])
        score = accuracy_score(y_test, y_pred)
        return score


# ===============================================
# Keras deep neural network

def dnn(meta=None):
    """Build and compile the binary-classification network.

    This is a model-building function for SciKeras' ``KerasClassifier``:
    it only constructs and compiles the model. Training is left to the
    wrapper's ``fit``; training here on the full global data set would
    clash with the reduced feature subsets that the backward search
    passes to the wrapper.

    meta - dict that SciKeras supplies when the function accepts it; its
           ``"n_features_in_"`` entry is the number of columns of the data
           currently being fitted. When omitted (direct call), the width
           of the global ``X_train`` is used, as before.

    Returns the compiled ``keras.Sequential`` model.
    """
    # Size the input layer from the data actually being fitted. A width
    # fixed to the full data set caused
    # "expected shape=(None, 45), found shape=(None, 44)" as soon as one
    # feature had been removed.
    if meta is not None:
        n_features = meta["n_features_in_"]
    else:
        n_features = X_train.shape[1]

    model = keras.Sequential([
        # ``input_shape`` must be a tuple, hence the trailing comma.
        layers.Dense(20, activation='relu', input_shape=(n_features,)),
        layers.Dropout(0.3),
        layers.Dense(20, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid'),
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['binary_accuracy'],
    )

    # NOTE(review): early stopping / validation data previously configured
    # here belong on the wrapper (e.g. its ``callbacks`` argument) - confirm
    # against the SciKeras version in use.
    return model

# Wrap the `dnn` model-building function in SciKeras' scikit-learn style
# classifier; `epochs` and `verbose` are handed to the wrapper.
keras_clf = KerasClassifier(dnn,
                            epochs=5,
                            verbose=False)

# NOTE(review): sets a private scikit-learn attribute by hand, presumably so
# that scikit-learn utilities treat the wrapper as a classifier - confirm it
# is still needed.
keras_clf._estimator_type = "classifier"

And this is the code I have for trying to integrate them together:

# Number of features to keep.
k_features = 5
#
# Instantiate SequentialBackwardSearch
#
sbs = SequentialBackwardSearch(keras_clf, k_features)
#
# Run the search to determine the k_features columns which give the
# best test accuracy
#
sbs.fit(X_train, X_test, y_train, y_test)
#
# Reduce the training data set to the selected k_features columns
#
X_train_kfeatures = sbs.transform(X_train)
#
# Reduce the test data set to the same columns
#
X_test_kfeatures = sbs.transform(X_test)

# Positional indices of the selected columns
sbs.indices_
# Names of the selected columns. The index tuple is converted to a flat
# list: wrapping the tuple in another list ([[...]]) produces a
# two-dimensional index, which a pandas Index does not accept.
X_train.columns[list(sbs.indices_)]

I am wondering whether this is even possible (integrating a neural network to the existing code for sequential backward feature selection) or if there's anything I can do to get it to run and output the top 5 features from the training dataset. I have tried to address the error message by altering the input shape of the neural network, but I believe it is correct (45 features). Any help or advice would be welcome!

CodePudding user response：

This should work with SciKeras!

I had to clean up your code / fix some bugs. I first did a "sanity check" using Scikit-Learn's MLPClassifier, then I ran it against an MLPClassifier created using Keras. Details may differ for more complex model architectures, but this shows that it does work.

import numpy as np

# SBS (sequential backward feature selection) from scratch
#=====================================================
from sklearn.metrics import accuracy_score
from itertools import combinations
from sklearn.base import clone

 
class SequentialBackwardSearch:
    '''
    Instantiate with Estimator and given number of features
    '''
    def __init__(self, estimator, k_features):
        self.estimator = clone(estimator)
        self.k_features = k_features
         
    '''
    X_train - Training data Pandas dataframe
    X_test - Test data Pandas dataframe
    y_train - Training label Pandas dataframe
    y_test - Test data Pandas dataframe
    '''
    def fit(self, X_train, X_test, y_train, y_test):
        dim = X_train.shape[1]
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        score = self._calc_score(X_train, X_test,
                                 y_train, y_test, self.indices_)
        self.scores_ = [score]
        '''
        Iterate through all the dimensions until k_features is reached
        At the end of loop, dimension count is reduced by 1
        '''
        while dim > self.k_features:
            scores = []
            subsets = []
            '''
            Iterate through different combinations of features, train the model,
            record the score
            '''
            for p in combinations(self.indices_, r=dim - 1):
                score = self._calc_score(X_train, X_test, y_train, y_test, p)
                scores.append(score)
                subsets.append(p)
            #
            # Get the index of best score
            #
            best_score_index = np.argmax(scores)
            #
            # Record the best score
            #
            self.scores_.append(scores[best_score_index])
            #
            # Get the indices of features which gave best score
            #
            self.indices_ = subsets[best_score_index]
            #
            # Record the indices of features for best score
            #
            self.subsets_.append(self.indices_)
            dim -= 1 # Dimension is reduced by 1
     
    '''
    Transform training, test data set to the data set
    havng features which gave best score
    '''
    def transform(self, X):
        return X.values[:, self.indices_]
     
    '''
    Train models with specific set of features
    indices - indices of features
    '''
    def _calc_score(self, X_train, X_test, y_train, y_test, indices):
        self.estimator.fit(X_train[:, indices], y_train.ravel())
        y_pred = self.estimator.predict(X_test[:, indices])
        score = accuracy_score(y_test, y_pred)
        return score

# Sklearn MLPClassifier

from sklearn.neural_network import MLPClassifier

# Fixed seeds keep this sanity check reproducible from run to run.
estimator = MLPClassifier(random_state=0)

search = SequentialBackwardSearch(estimator, 1)

# Toy problem: the label is an exact copy of the last feature column, so
# that column (index 4) is the only informative feature.
rng = np.random.RandomState(0)
X = rng.randint(0, 2, size=(100, 5))
y = X[:, -1]

# The same data is passed as both the training and the test split.
search.fit(X, X, y, y)

assert list(search.indices_) == [4]

# SciKeras MLPClassifier
# see https://www.adriangb.com/scikeras/stable/notebooks/MLPClassifier_MLPRegressor.html

import tensorflow.keras as keras
from scikeras.wrappers import KerasClassifier

class KerasMLPClassifier(KerasClassifier):
    """Multi-layer perceptron classifier built on SciKeras' ``KerasClassifier``.

    The network is a stack of ReLU ``Dense`` layers followed by an output
    layer chosen from the target type detected by the wrapper: a single
    sigmoid unit for binary targets, a softmax layer with one unit per
    class for multiclass targets.
    """

    def __init__(
        self,
        hidden_layer_sizes=(100, ),
        optimizer="adam",
        optimizer__learning_rate=0.001,
        epochs=200,
        verbose=0,
        **kwargs,
    ):
        """
        hidden_layer_sizes       - units of each hidden layer, in order
        optimizer                - optimizer used to compile the model
        optimizer__learning_rate - learning rate for that optimizer
        epochs                   - training epochs
        verbose                  - verbosity level
        kwargs                   - forwarded to ``KerasClassifier``
        """
        super().__init__(**kwargs)
        self.hidden_layer_sizes = hidden_layer_sizes
        self.optimizer = optimizer
        # Keep the learning rate on the instance; it used to be accepted
        # and then dropped, so a caller-supplied value had no effect.
        self.optimizer__learning_rate = optimizer__learning_rate
        self.epochs = epochs
        self.verbose = verbose

    def _keras_build_fn(self, compile_kwargs):
        """Build and compile the model for the data currently being fitted.

        compile_kwargs - dict whose ``"optimizer"`` entry is used to
                         compile the model

        Raises NotImplementedError for target types other than binary
        and multiclass.
        """
        model = keras.Sequential()
        # The shape must be a tuple, hence the trailing comma.
        model.add(keras.layers.Input(shape=(self.n_features_in_,)))
        for hidden_layer_size in self.hidden_layer_sizes:
            model.add(keras.layers.Dense(hidden_layer_size, activation="relu"))

        if self.target_type_ == "binary":
            n_output_units = 1
            output_activation = "sigmoid"
            loss = "binary_crossentropy"
        elif self.target_type_ == "multiclass":
            n_output_units = self.n_classes_
            output_activation = "softmax"
            loss = "sparse_categorical_crossentropy"
        else:
            raise NotImplementedError(f"Unsupported task type: {self.target_type_}")

        model.add(keras.layers.Dense(n_output_units, activation=output_activation))
        model.compile(loss=loss, optimizer=compile_kwargs["optimizer"])
        return model

# Same sanity check as above, now with the Keras-backed classifier.
estimator2 = KerasMLPClassifier()

search2 = SequentialBackwardSearch(estimator=estimator2, k_features=1)

# Train and test split are the same toy data set.
search2.fit(X_train=X, X_test=X, y_train=y, y_test=y)

# Only the last column (index 4) should be left.
assert list(search2.indices_) == [4]

Notebook version (can't promise this will be around forever): https://colab.research.google.com/drive/1EWxT3GWZsqhftz4f7W5GsXNe_SPtva4H#scrollTo=chU7wLn1BTU1