Why does my ResNet56 implementation have lower accuracy than in the original paper?


I was trying to implement ResNet56 in TensorFlow to classify the CIFAR-10 images, but I ended up with a lower accuracy than the original authors.
I did everything exactly as described in the paper: same architecture, same data augmentation, same learning rate schedule, same batch size...
Still, my implementation reached an accuracy of only 91.84%, while the original paper reports 93.03% for the 56-layer ResNet.
Here is the link to the ResNet paper: https://arxiv.org/pdf/1512.03385.pdf
I found what my problem was (see the answer below if interested), and here is my corrected implementation, which now reaches the same accuracy:

import argparse
import datetime
import os
import re

import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_addons as tfa
import tensorflow_datasets as tfds

os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2")  # Report only TF errors and warnings by default

parser = argparse.ArgumentParser()
parser.add_argument("--resnet_n", default=9, type=int, help="n from Resnet paper.")
parser.add_argument("--seed", default=42, type=int, help="Random seed.")
parser.add_argument("--threads", default=1, type=int, help="Maximum number of threads to use.")


class ResNet(keras.Model):
    class ResidualBlock(tf.Module):
        def __init__(self, filters: int, down_sample: bool):
            super().__init__()
            self.filters = filters
            self.down_sample = down_sample

        def __call__(self, x):
            out = x

            out = keras.layers.Conv2D(filters=self.filters,
                                      kernel_size=(3, 3),
                                      strides=(1, 1) if not self.down_sample else (2, 2),
                                      padding="same",
                                      use_bias=False,
                                      kernel_initializer=tf.keras.initializers.HeNormal)(out)
            out = keras.layers.BatchNormalization()(out)
            out = keras.layers.ReLU()(out)

            out = keras.layers.Conv2D(filters=self.filters,
                                      kernel_size=(3, 3),
                                      strides=(1, 1),
                                      padding="same",
                                      use_bias=False,
                                      kernel_initializer=tf.keras.initializers.HeNormal)(out)
            out = keras.layers.BatchNormalization()(out)

            if self.down_sample:
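                # Projection shortcut: a strided 1x1 convolution (plus batch norm) brings
                # the identity branch to the halved resolution and doubled filter count.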
                residual = keras.layers.Conv2D(filters=self.filters, kernel_size=(1, 1), strides=(2, 2),
                                               padding="same",
                                               use_bias=False,
                                               kernel_initializer=tf.keras.initializers.HeNormal)(x)
                residual = keras.layers.BatchNormalization()(residual)
            else:
                residual = x

            out = out + residual
            out = keras.layers.ReLU()(out)
            return out

    def __init__(self, args):
        inputs = keras.layers.Input(shape=(32, 32, 3), dtype=tf.float32)
        outputs = keras.layers.Conv2D(filters=16, kernel_size=(3, 3), strides=(1, 1), padding="same", use_bias=False,
                                      kernel_initializer=tf.keras.initializers.HeNormal)(
            inputs)
        outputs = keras.layers.BatchNormalization()(outputs)
        outputs = keras.layers.ReLU()(outputs)

        for _ in range(0, args.resnet_n):
            outputs = self.ResidualBlock(16, False)(outputs)

        outputs = self.ResidualBlock(32, True)(outputs)
        for _ in range(1, args.resnet_n):
            outputs = self.ResidualBlock(32, False)(outputs)

        outputs = self.ResidualBlock(64, True)(outputs)
        for _ in range(1, args.resnet_n):
            outputs = self.ResidualBlock(64, False)(outputs)

        outputs = keras.layers.GlobalAveragePooling2D()(outputs)
        outputs = keras.layers.Dense(10, activation=tf.nn.softmax)(outputs)
        super().__init__(inputs, outputs)


def main(args, tb_callback):
    ds_train, ds_test = tfds.load("cifar10", split=["train", "test"], as_supervised=True)

    img_augmentation = keras.Sequential(
        [
            keras.layers.RandomFlip("horizontal"),
            keras.layers.RandomTranslation(height_factor=0.125, width_factor=0.125, fill_mode="constant",
                                           fill_value=0.5)
        ]
    )
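    # RandomTranslation by up to 0.125 * 32 = 4 pixels with constant fill stands in for
    # the paper's 4-pixel zero padding followed by a random 32x32 crop; RandomFlip adds
    # the horizontal flipping.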
    ds_train = ds_train.map(lambda img, label: (tf.cast(img, tf.float32) / 255.0, label))
    ds_test = ds_test.map(lambda img, label: (tf.cast(img, tf.float32) / 255.0, label))

    total_count, per_pixel_sum = ds_train.reduce((np.float32(0), tf.zeros((32, 32, 3))),
                                                 lambda prev, curr: (prev[0] + 1.0, prev[1] + curr[0]))
    per_pixel_mean = per_pixel_sum / total_count

    ds_train = ds_train.map(lambda img, label: (img_augmentation(img, training=True), tf.one_hot(label, 10)))
    ds_test = ds_test.map(lambda img, label: (img, tf.one_hot(label, 10)))

    ds_train = ds_train.map(lambda img, label: (img - per_pixel_mean, label))
    ds_test = ds_test.map(lambda img, label: (img - per_pixel_mean, label))

    ds_train = ds_train.shuffle(5000).batch(128, drop_remainder=True).prefetch(tf.data.AUTOTUNE)
    ds_test = ds_test.shuffle(5000).batch(128, drop_remainder=True).prefetch(tf.data.AUTOTUNE)

    model = ResNet(args)

    learning_rate = keras.optimizers.schedules.PiecewiseConstantDecay(
        [32000, 48000], [0.1, 0.01, 0.001]
    )
    weight_decay = keras.optimizers.schedules.PiecewiseConstantDecay(
        [32000, 48000], [1e-4, 1e-5, 1e-6]
    )
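    # The boundaries are in optimizer steps: at batch size 128 (390 steps per epoch with
    # drop_remainder), 32,000 and 48,000 steps land around epochs 82 and 123, matching
    # the paper's schedule of dividing the learning rate by 10 at 32k and 48k iterations.
    # SGDW applies decoupled weight decay, so the decay is scheduled to drop together
    # with the learning rate.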

    model.compile(
        optimizer=tfa.optimizers.SGDW(weight_decay=weight_decay, learning_rate=learning_rate, momentum=0.9,
                                      nesterov=False),
        loss=tf.losses.CategoricalCrossentropy(),
        metrics=[tf.metrics.CategoricalAccuracy("accuracy")],
    )

    model.fit(x=ds_train, epochs=200, validation_data=ds_test, callbacks=[tb_callback], use_multiprocessing=True,
              workers=args.threads)

    model.save(args.logdir + '/model')
    print('OK')


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Fix random seeds and threads
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    # Create logdir name
    args.logdir = os.path.join("{}/{}".format("logs", os.path.basename(globals().get("__file__", "notebook"))),
                               "{}-{}".format(
                                   datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"),
                                   ",".join(("{}={}".format(re.sub("(.)[^_]*_?", r"\1", key), value) for key, value in
                                             sorted(vars(args).items())))
                               ))

    tb_callback = tf.keras.callbacks.TensorBoard(args.logdir, histogram_freq=1, update_freq=100, profile_batch=0)

    main(args, tb_callback)

Answer:

I found what my problems were:

  • I didn't apply the data augmentation correctly: I changed img_augmentation(img) to img_augmentation(img, training=True), since the random preprocessing layers otherwise stay inactive (see the sketch below).
  • I changed the kernel initializer to HeNormal, which is what the paper uses.
  • I added per-pixel mean subtraction as normalization (also shown in the sketch below).
  • Disabling Nesterov momentum also helped (I am not sure why).
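
For reference, here is a minimal, stand-alone sketch of the augmentation and normalization fixes, assuming the same CIFAR-10 pipeline (TensorFlow 2.x with tensorflow_datasets) as in the full script above:

import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras

ds_train = tfds.load("cifar10", split="train", as_supervised=True)
ds_train = ds_train.map(lambda img, label: (tf.cast(img, tf.float32) / 255.0, label))

# Per-pixel mean over the whole training set: a 32x32x3 tensor, not a single scalar.
count, pixel_sum = ds_train.reduce(
    (tf.constant(0.0), tf.zeros((32, 32, 3))),
    lambda prev, curr: (prev[0] + 1.0, prev[1] + curr[0]))
per_pixel_mean = pixel_sum / count

augment = keras.Sequential([
    keras.layers.RandomFlip("horizontal"),
    keras.layers.RandomTranslation(height_factor=0.125, width_factor=0.125,
                                   fill_mode="constant", fill_value=0.5),
])

# training=True is the crucial part: called inside dataset.map rather than inside
# model.fit, these random layers otherwise behave as identity (the original bug).
ds_train = ds_train.map(
    lambda img, label: (augment(img, training=True) - per_pixel_mean, label))

The remaining changes (HeNormal initialization and plain momentum instead of Nesterov) are visible directly in the full script.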