I'm developing a CNN for a binary image classification problem (Cats/Dogs). My goal is to use K-Fold CV (in this case I'd apply 5 folds) to find the best parameters (batch size, epochs).
so far my code is this
# Imports (assuming TensorFlow 2.x; on newer versions the scikit-learn wrapper lives in scikeras: from scikeras.wrappers import KerasClassifier)
import os
import numpy as np
import tensorflow as tf
from sklearn.model_selection import GridSearchCV, KFold
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
# Defining the loss (as a string so Keras resolves it at compile time)
loss = 'binary_crossentropy'
# Creating the grid of parameters
batches = [32, 64, 128, 256]
epochs = [20, 30, 40, 50]
params_grid = dict(batch_size = batches, epochs = epochs)
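With four batch sizes and four epoch values this grid has 4 × 4 = 16 combinations, so the 5-fold search below trains 16 × 5 = 80 models (plus one final refit on the best parameters, since GridSearchCV refits by default).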
# Creating the model
def model_cnn_three_layer(optimizer='adam'):
    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, (3, 3), padding="same", use_bias=False, input_shape=(64, 64, 1), activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Conv2D(32, (3, 3), padding="same", use_bias=False, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Dropout(0.25),
        tf.keras.layers.Conv2D(64, (3, 3), padding="same", use_bias=False, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Dropout(0.25),
        tf.keras.layers.Conv2D(64, (3, 3), padding="same", use_bias=False, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Dropout(0.25),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, use_bias=False, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.5),
        # note: a 2-unit softmax expects one-hot labels; with binary_crossentropy the usual head is Dense(1, activation='sigmoid')
        tf.keras.layers.Dense(2, activation='softmax')
    ])
    # Compiling the model
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
    model.summary()
    return model
# Create the sklearn CV model for the network
model_cnn_three_layer_CV = KerasClassifier(build_fn=model_cnn_three_layer, verbose=1)
grid = GridSearchCV(estimator=model_cnn_three_layer_CV,
                    param_grid=params_grid,
                    cv=5)
grid_result = grid.fit(X_train, y_train)
# Print results
print(f'Best Accuracy for {grid_result.best_score_:.4} using {grid_result.best_params_}')
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f'mean={mean:.4}, std={stdev:.4} using {param}')
Is this approach correct?
If I wanted to compute the CV 'manually' (not using sklearn), how would I change the code? I found an answer to a similar question that does something like this:
# parameters
epochs = 20
batch_size = 64
# Defining callback(s)
early_callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
# Defining plots
legend_size = 14
# Define the K-fold Cross Validator
num_folds = 5
kfold = KFold(n_splits=num_folds, shuffle=True)
loss_cnn_three_layer = []
acc_cnn_three_layer = []
fold_no = 1
for train, test in kfold.split(X, y):
    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, (3, 3), padding="same", use_bias=False, input_shape=(64, 64, 1), activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Conv2D(32, (3, 3), padding="same", use_bias=False, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Dropout(0.25),
        tf.keras.layers.Conv2D(64, (3, 3), padding="same", use_bias=False, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Dropout(0.25),
        tf.keras.layers.Conv2D(64, (3, 3), padding="same", use_bias=False, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Dropout(0.25),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, use_bias=False, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(2, activation='softmax')
    ])
    # compiling the model
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])
    net_name = "CNN_three_layers_batch_and_dropout"
    model.summary()
    # log dir for saving TensorBoard logs
    logdir = os.path.join("CNN_nets", net_name)
    # callback to run TensorBoard
    tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
    callbacks = [tensorboard_callback, early_callback]
    # note: this fits on the full training set every time; the fold indices (train, test) are never used
    history = model.fit(X_train, y_train, epochs=epochs, validation_data=(X_test, y_test),
                        batch_size=batch_size, callbacks=callbacks, verbose=1)
    scores = model.evaluate(X_test, y_test)
    print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1] * 100}%')
    acc_cnn_three_layer.append(scores[1] * 100)
    loss_cnn_three_layer.append(scores[0])
    # Increase fold number
    fold_no = fold_no + 1
# == Provide average scores ==
print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(loss_cnn_three_layer)):
    print('------------------------------------------------------------------------')
    print(f'> Fold {i + 1} - Loss: {loss_cnn_three_layer[i]} - Accuracy: {acc_cnn_three_layer[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_cnn_three_layer)} (+/- {np.std(acc_cnn_three_layer)})')
print(f'> Loss: {np.mean(loss_cnn_three_layer)}')
print('------------------------------------------------------------------------')
But I'm not convinced by this approach, because it simply runs the model 5 times on the same data rather than on different splits of the training data. How would this be changed to actually run the CV on split portions of the training data and then evaluate on the test data? Moreover, how would I loop this last network over the values of the parameters of the grid?
CodePudding user response:
from sklearn.model_selection import StratifiedKFold as kfold
x = ...  # features
y = ...  # labels
batches = [32, 64, 128, 256]
epochs = [20, 30, 40, 50]
splits = 5
kf = kfold(splits, shuffle=True)
indices = kf.split(x, y)
loss_cnn_three_layer = []
acc_cnn_three_layer = []
preds = []
for train, test in indices:
    x_train, x_test, y_train, y_test = x[train], x[test], y[train], y[test]
    # do model stuff
    history = model.fit(x_train, y_train, shuffle=True, epochs=10, verbose=1)
    prediction = model.predict(x_test)
    loss_cnn_three_layer.append(history.history["loss"])
    acc_cnn_three_layer.append(history.history["accuracy"])
    preds.append(prediction)
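Note that history.history["loss"] and history.history["accuracy"] record the training metrics per epoch, not the score on the held-out fold. A minimal sketch of scoring each fold on its test split instead (same loop as above, swapping what gets appended):
    test_loss, test_acc = model.evaluate(x_test, y_test, verbose=0)
    loss_cnn_three_layer.append(test_loss)
    acc_cnn_three_layer.append(test_acc)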
Edit to include iterable of parameters:
from sklearn.model_selection import StratifiedKFold as kfold
x = # features
y = # labels
splits = 5
kf = kfold(splits, shuffle=True)
indices = kf.split(x, y)
loss_cnn_three_layer = []
acc_cnn_three_layer = []
preds = []
for batch, epoch in zip(batches, epochs):  # loop variable renamed so it doesn't shadow the epochs list
    for train, test in indices:
        x_train, x_test, y_train, y_test = x[train], x[test], y[train], y[test]
        # do model stuff
        history = model.fit(x_train, y_train, shuffle=True, batch_size=batch, epochs=epoch, verbose=1)
        prediction = model.predict(x_test)
        loss_cnn_three_layer.append(history.history["loss"])
        acc_cnn_three_layer.append(history.history["accuracy"])
        preds.append(prediction)
If you want to iterate over the batch and epoch values inside each fold instead, just swap the order of the two for loops but leave everything else within them, as sketched below.
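A minimal sketch of the swapped nesting (assuming the same x, y, kf, batches and epochs as above; kf.split returns a one-shot generator, so it is materialized into a list to be safe):
splits_list = list(kf.split(x, y))
for train, test in splits_list:
    for batch, epoch in zip(batches, epochs):
        x_train, x_test, y_train, y_test = x[train], x[test], y[train], y[test]
        # build, compile and fit a fresh model here with batch_size=batch, epochs=epoch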
If you want to use the params_grid dictionary instead, do something like this:
for i, j in zip(*params_grid.values()):  # assuming batch and epoch lists have the same length
    # where i is batch, and j is epochs
    # do stuff
If you wanted to train the model based on the number of epochs per batch size (or vice versa*), do something like this:
for k, l in [(i, j) for j in epochs for i in batches]:  # swap batches and epochs for vice versa*
    # where k is batch, and l is epochs
    # do stuff
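The list comprehension builds the full Cartesian product of the two lists; a standard-library alternative (not in the original answer) that does the same:
from itertools import product
for k, l in product(batches, epochs):
    # where k is batch, and l is epochs
    # do stuff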
CodePudding user response:
I tried the following solution
loss_cnn_three_layer = []
acc_cnn_three_layer = []
# create the first loop for batches and epochs
for batch, epoch in zip(batches, epochs):
    # second loop for training the model on each split
    for train, test in indices:
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        # model = tf.keras.Sequential([ ... ])
        # compiling the model
        model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
        net_name = "CNN_three_layers_batch_and_dropout"
        model.summary()
        # log dir for saving TensorBoard logs
        logdir = os.path.join("CNN_nets", net_name)
        # callback to run TensorBoard
        tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
        callbacks = [tensorboard_callback, early_callback]
        # fitting the network
        history = model.fit(X_train, y_train, epochs=epoch,
                            batch_size=batch, callbacks=callbacks, verbose=1)
        # evaluating the performance
        scores = model.evaluate(X_test, y_test)
        # printing accuracy and loss
        print(f'Score per batch {batch} and epochs {epoch}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1] * 100}%')
        acc_cnn_three_layer.append(scores[1] * 100)
        loss_cnn_three_layer.append(scores[0])
However, by doing this it runs the model and the cross-validation only on the first combination of batch size and epochs (32, 20) and then stops.
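This is most likely because kf.split(X, y) returns a one-shot generator: the inner for train, test in indices: loop exhausts it during the first (batch, epoch) pair, so every later pair iterates over an empty sequence. A minimal fix (assuming the same kf, X and y as above) is to materialize the splits once, or call kf.split inside the outer loop:
indices = list(kf.split(X, y))  # a list can be iterated repeatedly, unlike the generator
for batch, epoch in zip(batches, epochs):
    for train, test in indices:
        # ... rest of the loop unchanged
Note also that zip(batches, epochs) only pairs the lists elementwise (four combinations), while the GridSearchCV version above tries all sixteen; the Cartesian-product loop from the answer gives the full grid.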