Training loss after last epoch differs from training loss (same data!) during evaluation


I am building a deep convolutional model with a custom loss function. As a first step, I am trying to bring training loss down as far as possible to see if my model can overfit.

When training on only one batch, the model can reduce the training loss to almost zero. But when I evaluate it on the same data it was trained on, the loss is orders of magnitude larger than even the test loss. It is even considerably larger than the loss for randomly generated predictions.

For training and evaluation I use the standard Keras model.fit and model.evaluate functions:

# Train on the batch generator; the callbacks list is passed through to Keras
# unchanged.
history = model.fit(
    x=training_batch_generator,
    callbacks=[stop_early, tensorboard, checkpoints],
)

# Re-score the model after training.
# NOTE(review): this evaluates `yolo_training_batch_generator`, a different
# name from the generator passed to `fit` above -- confirm both refer to the
# same data.
training_loss = model.evaluate(x=yolo_training_batch_generator)

I use a subclass of keras.utils.Sequence for the data:

class YoloSequence(Sequence):
    """Serve (image, label) batches, preprocessing each sample on demand.

    The two input collections are sliced with identical indices, so element
    ``i`` of ``x_set`` must correspond to element ``i`` of ``y_set``.
    """

    def __init__(self, x_set, y_set, batch_size, grid_len):
        """Store the sample collections and batching parameters.

        Args:
            x_set: Sliceable collection whose items are handed to
                ``preprocess_image`` (presumably image file paths).
            y_set: Sliceable collection whose items are handed to
                ``preprocess_label`` (presumably label file paths).
            batch_size: Maximum number of samples per batch.
            grid_len: Forwarded to ``preprocess_label`` for every label.
        """
        # Let the Keras base class set up its own state.
        super().__init__()
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size
        self.grid_len = grid_len

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        # `int(...)` yields a plain Python int; the `np.int` alias used here
        # before was removed in NumPy 1.24.
        return int(np.ceil(len(self.x) / self.batch_size))

    def __getitem__(self, idx):
        """Load and preprocess batch number ``idx``.

        Returns:
            A tuple ``(images, labels)`` of NumPy arrays built from the
            preprocessed samples of this batch.
        """
        start = idx * self.batch_size
        stop = (idx + 1) * self.batch_size
        batch_x = self.x[start:stop]
        batch_y = self.y[start:stop]

        image_batch = [preprocess_image(path) for path in batch_x]
        label_batch = [preprocess_label(path, self.grid_len) for path in batch_y]

        return np.array(image_batch), np.array(label_batch)

And a custom loss function:

class YoloLoss(keras.losses.Loss):
    """YOLO-style detection loss over a grid with two candidate boxes per cell.

    NOTE: ``call`` relies on ``self.compute_iou`` and on the module-level
    constant ``GRID_LEN``; neither is defined in this block.
    """

    def __init__(self, name="yolo_loss", **kwargs):
        """Forward ``name`` and any extra keyword arguments to the Keras base."""
        super().__init__(name=name, **kwargs)

    def call(self, y_true, y_pred):
        """Return the weighted sum of box, object, no-object and class losses.

        Args:
            y_true: Target tensor; per the layout below, channel 0 is the
                object indicator, 1:5 the box and 10: the class scores.
            y_pred: Raw model output; reshaped to
                ``(-1, GRID_LEN, GRID_LEN, 19)`` before use. Channels 0:5 and
                5:10 hold the two candidate boxes (confidence first).

        Returns:
            A scalar tensor (every partial loss uses SUM reduction).
        """
        # shape of y_true: batch-size, GRID_SIZE, GRID_SIZE, 19
        # 19: [conf, x, y, w, h, 0, 0, 0, 0, 0, p[0], p[1], ..., p[8]]
        # get y_pred into same format as y_true:
        y_pred = tf.cast(K.reshape(y_pred, (-1, GRID_LEN, GRID_LEN, 19)), dtype=tf.float32)
        y_true = tf.cast(y_true, dtype=tf.float32)

        # compute ious (each iou of shape [1, batchsize, gridsize, gridsize, 1], one iou for each cell):
        # NOTE(review): these shapes depend on what compute_iou returns -- confirm.
        iou_bb1 = K.expand_dims(self.compute_iou(y_pred[..., 1:5], y_true[..., 1:5]), axis=0)
        iou_bb2 = K.expand_dims(self.compute_iou(y_pred[..., 6:10], y_true[..., 1:5]), axis=0)

        ious = K.concatenate([iou_bb1, iou_bb2], axis=0)  # shape: [2, batchsize, gridsize, gridsize, 1]

        # bestbox: index (0 or 1) of the box with the higher IoU in each cell,
        # i.e. the box that is responsible for that cell [batchsize, gridsize, gridsize, 1]:
        bestbox = K.cast(K.argmax(ious, axis=0), dtype=tf.float32)

        # exists_box: for each cell in every batch, does there exist a box? shape: [batchsize, gridsize, gridsize, 1]
        exists_box = K.expand_dims(y_true[..., 0], axis=3)

        ### box loss ###
        # if a box exists, use predictions of best box:
        xy_pred = (bestbox * y_pred[..., 6:8]) + ((1 - bestbox) * y_pred[..., 1:3])
        box_predictions_xy = exists_box * xy_pred
        box_targets_xy = exists_box * y_true[..., 1:3]

        # square-root of width and height (same change is less important in larger box):
        wh_pred = (bestbox * y_pred[..., 8:10]) + ((1 - bestbox) * y_pred[..., 3:5])
        # The derivative of the square root goes to infinity at zero, so add
        # 1e-6 for numerical stability; the sign is re-applied because the raw
        # predictions may be negative.
        box_predictions_wh = K.sign(exists_box * wh_pred) * K.sqrt(K.abs(exists_box * wh_pred) + 1e-6)
        box_targets_wh = K.sqrt(exists_box * y_true[..., 3:5])

        mse = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.SUM)
        box_loss = mse(box_predictions_xy, box_targets_xy) + mse(box_predictions_wh, box_targets_wh)

        ### object loss ###
        confidence = (bestbox * y_pred[..., 5:6]) + ((1 - bestbox) * y_pred[..., 0:1])
        best_ious = tf.where(
            K.cast(bestbox, tf.bool),
            K.reshape(iou_bb2, (-1, GRID_LEN, GRID_LEN, 1)),
            K.reshape(iou_bb1, (-1, GRID_LEN, GRID_LEN, 1)),
        )
        object_loss = mse((exists_box * confidence), (best_ious * y_true[..., 0:1]))

        ### no object loss ###
        no_object_loss = mse(((1 - exists_box) * confidence), ((1 - exists_box) * y_true[..., 0:1]))  # second term is all zeros

        ### class loss ###
        # Compare predicted class scores against the targets; comparing y_pred
        # with itself (as before) is identically zero.
        class_loss = mse((exists_box * y_pred[..., 10:]), (exists_box * y_true[..., 10:]))

        ### total loss ###
        # object_loss and class_loss were previously computed but never added
        # to the total; all four terms are summed here.
        lambda_coord = 5
        lambda_noobj = 0.5
        loss = (
            lambda_coord * box_loss
            + object_loss
            + lambda_noobj * no_object_loss
            + class_loss
        )
        return loss

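This has been answered here!

The problem was that a model with batch normalization behaves differently during training and evaluation: during training the layer normalizes with the statistics of the current batch, whereas during evaluation it uses the moving averages accumulated over training, so the same data can yield very different losses.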

