I have the following code:
import numpy as np

def generator_train(x_train_df, y_train_df, batch_size):
    for i in range(int(len(x_train_df) / batch_size)):
        x_train = x_train_df[i * batch_size:(i + 1) * batch_size]
        y_train = y_train_df[i * batch_size:(i + 1) * batch_size]
        yield np.array(x_train), np.array(y_train)
train_generator = generator_train(x_train_df, y_train_df, batch_size)

history = model.fit(train_generator,
                    epochs=epochs_no,
                    steps_per_epoch=number_of_rows_input / batch_size,
                    verbose=1,
                    max_queue_size=100,
                    validation_data=None,
                    workers=8,
                    use_multiprocessing=True
                    )
Both x_train_df and y_train_df are pandas DataFrames. I keep getting the error below, which refers to pickling, even though fitting from a generator should have nothing to do with dumping/loading pickled data.
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python39\lib\multiprocessing\spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python39\lib\multiprocessing\spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
EOFError: Ran out of input

Exception in thread Thread-2:
Traceback (most recent call last):
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python39\lib\threading.py", line 954, in _bootstrap_inner
    self.run()
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python39\lib\threading.py", line 892, in run
    self._target(*self._args, **self._kwargs)
  File "E:\Tut\pythonProject5_MachineLearning\venv\lib\site-packages\keras\utils\data_utils.py", line 868, in _run
    with closing(self.executor_fn(_SHARED_SEQUENCES)) as executor:
  File "E:\Tut\pythonProject5_MachineLearning\venv\lib\site-packages\keras\utils\data_utils.py", line 858, in pool_fn
    pool = get_pool_class(True)(
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python39\lib\multiprocessing\context.py", line 119, in Pool
    return Pool(processes, initializer, initargs, maxtasksperchild,
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python39\lib\multiprocessing\pool.py", line 212, in __init__
    self._repopulate_pool()
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python39\lib\multiprocessing\pool.py", line 303, in _repopulate_pool
    return self._repopulate_pool_static(self._ctx, self.Process,
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python39\lib\multiprocessing\pool.py", line 326, in _repopulate_pool_static
    w.start()
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python39\lib\multiprocessing\process.py", line 121, in start
    self._popen = self._Popen(self)
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python39\lib\multiprocessing\context.py", line 327, in _Popen
    return Popen(process_obj)
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python39\lib\multiprocessing\popen_spawn_win32.py", line 93, in __init__
    reduction.dump(process_obj, to_child)
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python39\lib\multiprocessing\reduction.py", line 60, in dump
    ForkingPickler(file, protocol).dump(obj)
TypeError: cannot pickle 'generator' object
What am I missing?
CodePudding user response:
One solution is to use MirroredStrategy() for the neural network and to preprocess the data with tf.data.Dataset:
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    model = Sequential()
    model.add(Dense.....
    .....
    model.compile(loss='mae', optimizer='sgd')

def dataset_fn(dummy_argument):
    x = np.array(x_train_df).astype(np.float32)
    y = np.array(y_train_df).astype(np.float32)
    dataset = tf.data.Dataset.from_tensor_slices((x, y))
    return dataset.repeat().batch(batch_size=batch_size, drop_remainder=True)

dist_dataset = strategy.experimental_distribute_datasets_from_function(dataset_fn)

history = model.fit(
    dist_dataset,
    epochs=epochs,
    steps_per_epoch=number_of_batches_in_the_x_set,
    verbose=1,
    max_queue_size=max_queue_size,
    validation_data=None,
    workers=number_of_workers,
    use_multiprocessing=True
)
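For reference, number_of_batches_in_the_x_set above is meant to be the number of full batches in the training frame. Since the dataset is built with drop_remainder=True, a plausible definition (this exact assignment is my assumption, not part of the answer's code) is:

# Assumed definition of the steps_per_epoch value used above;
# with drop_remainder=True only full batches count.
number_of_batches_in_the_x_set = len(x_train_df) // batch_size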
CodePudding user response:
You are pickling because you're using multiprocessing: multiprocessing has to pickle whatever it hands to the new Python processes it spawns. Since your train_generator is needed in each worker process, Keras tries to send it there, i.e. to pickle it, and generator objects cannot be pickled.
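You can reproduce the same failure in isolation. This minimal sketch (not from your code, just a demonstration) pickles a generator object directly:

import pickle

def gen():
    yield 1

# Generator objects carry live frame state, so pickle refuses them.
pickle.dumps(gen())  # TypeError: cannot pickle 'generator' object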
As the linked question notes, you avoid this by not using a generator: trivially, cast the batches to a list and evaluate them before sending; more sensibly, rewrite your generator so it returns that list for you, as in the sketch below.
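A minimal sketch of that rewrite, assuming the whole training frame fits in memory (build_batches is a hypothetical helper name; the other names reuse the question's variables):

import numpy as np

def build_batches(x_train_df, y_train_df, batch_size):
    # Materialise every batch up front and return a plain list,
    # so there is no generator object left for multiprocessing to pickle.
    batches = []
    for i in range(len(x_train_df) // batch_size):
        x_batch = np.array(x_train_df[i * batch_size:(i + 1) * batch_size])
        y_batch = np.array(y_train_df[i * batch_size:(i + 1) * batch_size])
        batches.append((x_batch, y_batch))
    return batches

batches = build_batches(x_train_df, y_train_df, batch_size)

# With the data fully in memory, stack the batches back into arrays and
# let fit() do its own batching; no workers/use_multiprocessing needed.
x_all = np.concatenate([x for x, _ in batches])
y_all = np.concatenate([y for _, y in batches])
history = model.fit(x_all, y_all, batch_size=batch_size, epochs=epochs_no, verbose=1)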