How to train image classification model using tf.data pipeline?-CodePudding

I'm training large classes image classification model. Since dataset is huge it wont fit inside ram memory so I'm using tf.data pipeline to store it in cache and read as training goes on. Overall code which I've built is as below

file_location = r'C:\imageclassification\data'
data_dir = pathlib.Path(file_location)
class_labels_cnt = len(os.listdir(file_location))
print('total class :',class_labels_cnt)
image_count = len(list(data_dir.glob('*/*.png')))

batch_size = 64
img_height = 224
img_width = 224
epochs = 500
list_ds = tf.data.Dataset.list_files(str(data_dir/'*/*'), shuffle=False)
list_ds = list_ds.shuffle(image_count, reshuffle_each_iteration=False)


class_names = np.array(sorted([item.name for item in data_dir.glob('*[!.csv]') if item.name != "LICENSE.txt"]))
print('class_names:',class_names)
val_size = int(image_count * 0.2)
train_ds = list_ds.skip(val_size)
val_ds = list_ds.take(val_size)


print(tf.data.experimental.cardinality(train_ds).numpy())
print(tf.data.experimental.cardinality(val_ds).numpy())

def get_label(file_path):
  # Convert the path to a list of path components
  parts = tf.strings.split(file_path, os.path.sep)
  # The second to last is the class-directory
  one_hot = parts[-2] == class_names
  # Integer encode the label
  return tf.argmax(one_hot)


def decode_img(img):
  # Convert the compressed string to a 3D uint8 tensor
  img = tf.io.decode_jpeg(img, channels=3)
  # Resize the image to the desired size
  return tf.image.resize(img, [img_height, img_width])
  return img
def process_path(file_path):
  label = get_label(file_path)
  # Load the raw data from the file as a string
  img = tf.io.read_file(file_path)
  img = decode_img(img)
  return img, label
AUTOTUNE = tf.data.AUTOTUNE
# Set `num_parallel_calls` so multiple images are loaded/processed in parallel.
train_ds = train_ds.map(process_path, num_parallel_calls=AUTOTUNE)
val_ds = val_ds.map(process_path, num_parallel_calls=AUTOTUNE)

for image, label in train_ds.take(1):
  print("Image shape: ", image.numpy().shape)
  print("Label: ", label.numpy())

def configure_for_performance(ds):
  ds = ds.cache()
  ds = ds.shuffle(buffer_size=1000)
  ds = ds.batch(batch_size)
  ds = ds.prefetch(buffer_size=AUTOTUNE)
  return ds
model = build_resnet_pretrained(input_shape=(224, 224, 3), no_classes=class_labels_cnt)

train_ds = configure_for_performance(train_ds)
val_ds = configure_for_performance(val_ds)



model.compile( loss='categorical_crossentropy',
               optimizer=Adam(amsgrad=True, decay=0.001 / epochs),
               metrics=['accuracy']
             )

history = model.fit(
    x=train_ds,
    epochs=epochs,
    steps_per_epoch=int(tf.data.experimental.cardinality(train_ds).numpy()/ batch_size),
    callbacks=callbacks,
    validation_data=val_ds,
    validation_steps=int(tf.data.experimental.cardinality(val_ds).numpy() / batch_size)
)

when I try to run this I endup getting error as below

> WARNING:tensorflow:`period` argument is deprecated. Please use
> `save_freq` to specify the frequency in number of batches seen. Epoch
> 1/500 Traceback (most recent call last):   File
> "C:\Users\john\Desktop\John_data\Project\All_Scripts\Scripts\Deep_Learning\ResNet\tfdata_clsfn.py",
> line 113, in <module>
>     history = model.fit(   File "C:\Users\john\Anaconda3\envs\tf2_john\lib\site-packages\keras\engine\training.py",
> line 1184, in fit
>     tmp_logs = self.train_function(iterator)   File "C:\Users\john\Anaconda3\envs\tf2_john\lib\site-packages\tensorflow\python\eager\def_function.py",
> line 885, in __call__
>     result = self._call(*args, **kwds)   File "C:\Users\john\Anaconda3\envs\tf2_john\lib\site-packages\tensorflow\python\eager\def_function.py",
> line 933, in _call
>     self._initialize(args, kwds, add_initializers_to=initializers)   File
> "C:\Users\john\Anaconda3\envs\tf2_john\lib\site-packages\tensorflow\python\eager\def_function.py",
> line 759, in _initialize
>     self._stateful_fn._get_concrete_function_internal_garbage_collected( 
> # pylint: disable=protected-access   File "C:\Users\john\Anaconda3\envs\tf2_john\lib\site-packages\tensorflow\python\eager\function.py",
> line 3066, in _get_concrete_function_internal_garbage_collected
>     graph_function, _ = self._maybe_define_function(args, kwargs)   File
> "C:\Users\john\Anaconda3\envs\tf2_john\lib\site-packages\tensorflow\python\eager\function.py",
> line 3463, in _maybe_define_function
>     graph_function = self._create_graph_function(args, kwargs)   File "C:\Users\john\Anaconda3\envs\tf2_john\lib\site-packages\tensorflow\python\eager\function.py",
> line 3298, in _create_graph_function
>     func_graph_module.func_graph_from_py_func(   File "C:\Users\john\Anaconda3\envs\tf2_john\lib\site-packages\tensorflow\python\framework\func_graph.py",
> line 1007, in func_graph_from_py_func
>     func_outputs = python_func(*func_args, **func_kwargs)   File "C:\Users\john\Anaconda3\envs\tf2_john\lib\site-packages\tensorflow\python\eager\def_function.py",
> line 668, in wrapped_fn
>     out = weak_wrapped_fn().__wrapped__(*args, **kwds)   File "C:\Users\john\Anaconda3\envs\tf2_john\lib\site-packages\tensorflow\python\framework\func_graph.py",
> line 994, in wrapper
>     raise e.ag_error_metadata.to_exception(e) ValueError: in user code:
> 
>     C:\Users\john\Anaconda3\envs\tf2_john\lib\site-packages\keras\engine\training.py:853
> train_function  *
>         return step_function(self, iterator)
>     C:\Users\john\Anaconda3\envs\tf2_john\lib\site-packages\keras\engine\training.py:842
> step_function  **
>         outputs = model.distribute_strategy.run(run_step, args=(data,))
>     C:\Users\john\Anaconda3\envs\tf2_john\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1286
> run
>         return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
>     C:\Users\john\Anaconda3\envs\tf2_john\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2849
> call_for_each_replica
>         return self._call_for_each_replica(fn, args, kwargs)
>     C:\Users\john\Anaconda3\envs\tf2_john\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:3632
> _call_for_each_replica
>         return fn(*args, **kwargs)
>     C:\Users\john\Anaconda3\envs\tf2_john\lib\site-packages\keras\engine\training.py:835
> run_step  **
>         outputs = model.train_step(data)
>     C:\Users\john\Anaconda3\envs\tf2_john\lib\site-packages\keras\engine\training.py:788
> train_step
>         loss = self.compiled_loss(
>     C:\Users\john\Anaconda3\envs\tf2_john\lib\site-packages\keras\engine\compile_utils.py:201
> __call__
>         loss_value = loss_obj(y_t, y_p, sample_weight=sw)
>     C:\Users\john\Anaconda3\envs\tf2_john\lib\site-packages\keras\losses.py:141
> __call__
>         losses = call_fn(y_true, y_pred)
>     C:\Users\john\Anaconda3\envs\tf2_john\lib\site-packages\keras\losses.py:245
> call  **
>         return ag_fn(y_true, y_pred, **self._fn_kwargs)
>     C:\Users\john\Anaconda3\envs\tf2_john\lib\site-packages\tensorflow\python\util\dispatch.py:206
> wrapper
>         return target(*args, **kwargs)
>     C:\Users\john\Anaconda3\envs\tf2_john\lib\site-packages\keras\losses.py:1665
> categorical_crossentropy
>         return backend.categorical_crossentropy(
>     C:\Users\john\Anaconda3\envs\tf2_john\lib\site-packages\tensorflow\python\util\dispatch.py:206
> wrapper
>         return target(*args, **kwargs)
>     C:\Users\john\Anaconda3\envs\tf2_john\lib\site-packages\keras\backend.py:4839
> categorical_crossentropy
>         target.shape.assert_is_compatible_with(output.shape)
>     C:\Users\john\Anaconda3\envs\tf2_john\lib\site-packages\tensorflow\python\framework\tensor_shape.py:1161
> assert_is_compatible_with
>         raise ValueError("Shapes %s and %s are incompatible" % (self, other))
> 
>     ValueError: Shapes (None, 1) and (None, 1922) are incompatible
> 
> 
> Process finished with exit code 1

I'm not able to figureout the cause of the error , even though number of classes are same as output softmax layers Any help or suggestion to rectify this error will be appreciated very much

CodePudding user response：

I fixed this by changing the loss function to sparse categorical cross entropy