I have a very large dataset (raw files ~750GB) and I created a cached dataset pipeline using the TensorFlow data API like this:
dataset = tf.data.Dataset.from_generator(MSUMFSD(pathlib.Path(dataset_locations["mfsd"]), True), output_types=(tf.string, tf.float32))
This dataset consists of all file paths I want to use for processing. After that I use this interleave
transformation, to generate the actual input data for my model:
class DatasetTransformer:
    """Maps one (file, label) element to a tf.data.Dataset of rPPG samples.

    An instance is used as the map function of ``Dataset.interleave``:
    calling it with a file path and a label returns a generator-backed
    dataset yielding ``((signal, frame), [label])`` tuples extracted
    from that file.
    """

    def __init__(self, haar_cascade_path, window, img_shape):
        # Extractor that turns one video file into (signal, frame) pairs.
        self.rppg_extractor = RPPGExtractionChrom(haar_cascade_path, window, img_shape)
        self.window = window
        self.img_shape = img_shape

    def generator(self, file, label):
        # ``file`` arrives as bytes from tf.data, hence the decode().
        for signal, frame in self.rppg_extractor.process_file_iter(file.decode()):
            yield (signal, frame), [label]

    def __call__(self, file, label):
        output_signature = (
            (
                # FIX: trailing commas added — ``shape=(self.window)`` passed a
                # bare int where a 1-tuple was intended; same for ``shape=(1)``.
                tensorflow.TensorSpec(shape=(self.window,), dtype=tensorflow.float32),
                tensorflow.TensorSpec(shape=(self.img_shape[0], self.img_shape[1], 3),
                                      dtype=tensorflow.float32),
            ),
            tensorflow.TensorSpec(shape=(1,), dtype=tensorflow.float32),
        )
        return tensorflow.data.Dataset.from_generator(
            self.generator, args=(file, label), output_signature=output_signature
        )
# Expand each (file, label) pair into per-frame training examples; the
# callable DatasetTransformer instance returns one sub-dataset per element.
dataset = dataset.interleave(
    DatasetTransformer("rppg/haarcascade_frontalface_default.xml", window_size, img_shape),
    num_parallel_calls=tf.data.AUTOTUNE
)
# FIX: reorder the tail of the pipeline. Cache the expensive preprocessing
# first, then shuffle so every epoch sees a fresh order, and prefetch last so
# the pipeline overlaps with training. The original order
# (prefetch -> shuffle -> cache) froze a single shuffle order into the cache
# file and placed the prefetch buffer where it could not help.
dataset = dataset.cache(cache_filename).shuffle(320).prefetch(tf.data.AUTOTUNE)
Now I want to iterate through the dataset once to create the cached dataset (consisting of the real input for the model) and to obtain the dataset size. Is there a way to show the progress of iteration? My attempt was to obtain the number of files before the interleave transformation like this:
dataset_file_amount = dataset.reduce(0, lambda x, _: x + 1).numpy()
and then show a progress bar using tqdm while iterating through the "real" dataset like this:
def dataset_reducer(x, pbar):
    """Reduce step that counts dataset elements while advancing a progress bar.

    Args:
        x: Running element count accumulated by ``dataset.reduce``.
        pbar: A tqdm-like progress bar; ``update()`` is called once per element.

    Returns:
        The incremented count ``x + 1``.
    """
    pbar.update()
    return x + 1  # FIX: the original ``return x 1`` was missing the '+'
# Progress bar sized by the file count computed from the pre-interleave dataset.
pbar = tqdm(total=dataset_file_amount, desc="Preprocessing files...")
# NOTE(review): dataset.reduce traces this lambda into a TensorFlow graph once,
# so the Python-side pbar.update() only runs at trace time — which is why the
# bar appears stuck while the reduction itself still completes.
size = dataset.reduce(0, lambda x,_: dataset_reducer(x, pbar)).numpy()
When running this code I get a progress bar with the correct total (the number of files), but the progress bar isn't updated. It stays stuck at 0%, and once the processing has finished, execution simply continues. Do you have an idea how to show the progress of preprocessing (at least in terms of processed files)? Thanks already!
Edit
Actually, the progress bar is stuck at 1/X
instead of 0%.
CodePudding user response:
I fixed the issue by not updating the progress bar inside the reduce function. Instead, I pass the pbar object to the DatasetTransformer class and update the progress after the for loop in the generator method. This updates the progress based on processed files (I extract several hundred frames per file, and now I get progress on how many files have been processed already):
class DatasetTransformer:
    """Maps one (file, label) element to a tf.data.Dataset of rPPG samples,
    advancing a shared progress bar each time a whole file has been consumed.
    """

    def __init__(self, haar_cascade_path, window, img_shape, progress):
        # FIX: keep the cascade path — generator() builds a fresh extractor
        # per file from self.haar_cascade_path, but the original __init__
        # never stored it, so every call raised AttributeError.
        self.haar_cascade_path = haar_cascade_path
        self.rppg_extractor = RPPGExtractionChrom(haar_cascade_path, window, img_shape)
        self.window = window
        self.img_shape = img_shape
        self.progress = progress

    def generator(self, file, label):
        # A per-call extractor keeps parallel interleave workers independent.
        rppg_extractor = RPPGExtractionChrom(self.haar_cascade_path, self.window, self.img_shape)
        for signal, frame in rppg_extractor.process_file_iter(file.decode()):
            yield (signal, frame), [label]
        self.progress.update(1)  # one file fully processed -> advance the bar

    def __call__(self, file, label):
        output_signature = (
            (
                # FIX: trailing commas added — the originals passed bare ints
                # where 1-tuples were intended.
                tensorflow.TensorSpec(shape=(self.window,), dtype=tensorflow.float32),
                tensorflow.TensorSpec(shape=(self.img_shape[0], self.img_shape[1], 3),
                                      dtype=tensorflow.float32),
            ),
            tensorflow.TensorSpec(shape=(1,), dtype=tensorflow.float32),
        )
        return tensorflow.data.Dataset.from_generator(
            self.generator, args=(file, label), output_signature=output_signature
        )
# FIX: the snippet previously contained a leftover interleave call that built
# DatasetTransformer WITHOUT the new ``progress`` argument — a TypeError
# against the 4-argument __init__ above. Only the corrected call is kept.
pbar = tqdm(total=dataset_file_amount, desc="Preprocessing files...")
dataset = dataset.interleave(
    DatasetTransformer("rppg/haarcascade_frontalface_default.xml", window_size, img_shape, pbar),
    num_parallel_calls=tf.data.AUTOTUNE
)