I am doing multiclass image classification, and the code below works fine when I set base_model.trainable = False:
import tensorflow as tf
from tensorflow.keras import layers, optimizers, losses, callbacks
from tensorflow.keras.applications import EfficientNetV2L

# Build tf.data pipelines from file paths and labels
file_paths = train['image'].values   # train is a pd.DataFrame
labels = train['label'].values
valfile_paths = val['image'].values
vallabels = val['label'].values
ds_train = tf.data.Dataset.from_tensor_slices((file_paths, labels))
ds_val = tf.data.Dataset.from_tensor_slices((valfile_paths, vallabels))

def read_image(image_file, label):
    # Decode JPEG and resize to the model's input size
    image = tf.io.read_file(image_file)
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, (300, 500))
    return image, label

def augment(image, label):
    # Simple augmentations, applied to the training set only
    image = tf.image.random_flip_left_right(image)
    image = tf.image.random_brightness(image, max_delta=0.5)
    image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
    return image, label

ds_train = ds_train.map(read_image).map(augment).batch(28)
ds_val = ds_val.map(read_image).batch(28)

base_model = EfficientNetV2L(input_shape=(300, 500, 3),
                             include_top=False,
                             weights='imagenet',
                             include_preprocessing=True)
base_model.trainable = False

# Classification head on top of the frozen base
x = base_model.output
x = layers.GlobalAveragePooling2D()(x)
x = layers.Dropout(0.2)(x)
x = layers.Dense(128, activation='relu')(x)
x = layers.Dropout(0.2)(x)
x = layers.Dense(6, activation='softmax')(x)
model = tf.keras.Model(inputs=base_model.input, outputs=x)

model.compile(optimizer=optimizers.Adam(learning_rate=0.001),
              loss=losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])

callback = [callbacks.EarlyStopping(monitor='val_loss', patience=2)]
history = model.fit(ds_train, batch_size=28, validation_data=ds_val, epochs=10, verbose=1, callbacks=callback)
After training the model for 8 epochs (early stopping), I want to fine-tune it by unfreezing the base model. But when I set base_model.trainable = True, it gives me a ResourceExhaustedError:
base_model.trainable = True
model.compile(optimizer=optimizers.Adam(learning_rate=0.0001),
              loss=losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])
callback = [callbacks.EarlyStopping(monitor='val_loss', patience=2)]
history = model.fit(ds_train, batch_size=16, validation_data=ds_val, epochs=10, verbose=1, callbacks=callback)
The error:
Epoch 1/10
---------------------------------------------------------------------------
ResourceExhaustedError Traceback (most recent call last)
<ipython-input-56-6bda2975dd16> in <module>
1 callback = [callbacks.EarlyStopping(monitor = 'val_loss', patience = 2)]
2
----> 3 history = model.fit(ds_train, batch_size = 16, validation_data = ds_val, epochs = 10, verbose = 1, callbacks = callback)
1 frames
/usr/local/lib/python3.8/dist-packages/tensorflow/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
52 try:
53 ctx.ensure_initialized()
---> 54 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
55 inputs, attrs, num_outputs)
56 except core._NotOkStatusException as e:
ResourceExhaustedError: Graph execution error:
Detected at node 'model/block3b_project_conv/Conv2D' defined at (most recent call last):
File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/usr/local/lib/python3.8/dist-packages/ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "/usr/local/lib/python3.8/dist-packages/traitlets/config/application.py", line 992, in launch_instance
app.start()
File "/usr/local/lib/python3.8/dist-packages/ipykernel/kernelapp.py", line 612, in start
self.io_loop.start()
File "/usr/local/lib/python3.8/dist-packages/tornado/platform/asyncio.py", line 149, in start
self.asyncio_loop.run_forever()
File "/usr/lib/python3.8/asyncio/base_events.py", line 570, in run_forever
self._run_once()
File "/usr/lib/python3.8/asyncio/base_events.py", line 1859, in _run_once
handle._run()
File "/usr/lib/python3.8/asyncio/events.py", line 81, in _run
self._context.run(self._callback, *self._args)
File "/usr/local/lib/python3.8/dist-packages/tornado/ioloop.py", line 690, in <lambda>
lambda f: self._run_callback(functools.partial(callback, future))
File "/usr/local/lib/python3.8/dist-packages/tornado/ioloop.py", line 743, in _run_callback
ret = callback()
File "/usr/local/lib/python3.8/dist-packages/tornado/gen.py", line 787, in inner
self.run()
File "/usr/local/lib/python3.8/dist-packages/tornado/gen.py", line 748, in run
yielded = self.gen.send(value)
File "/usr/local/lib/python3.8/dist-packages/ipykernel/kernelbase.py", line 365, in process_one
yield gen.maybe_future(dispatch(*args))
File "/usr/local/lib/python3.8/dist-packages/tornado/gen.py", line 209, in wrapper
yielded = next(result)
File "/usr/local/lib/python3.8/dist-packages/ipykernel/kernelbase.py", line 268, in dispatch_shell
yield gen.maybe_future(handler(stream, idents, msg))
File "/usr/local/lib/python3.8/dist-packages/tornado/gen.py", line 209, in wrapper
yielded = next(result)
File "/usr/local/lib/python3.8/dist-packages/ipykernel/kernelbase.py", line 543, in execute_request
self.do_execute(
File "/usr/local/lib/python3.8/dist-packages/tornado/gen.py", line 209, in wrapper
yielded = next(result)
File "/usr/local/lib/python3.8/dist-packages/ipykernel/ipkernel.py", line 306, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/usr/local/lib/python3.8/dist-packages/ipykernel/zmqshell.py", line 536, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 2854, in run_cell
result = self._run_cell(
File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 2881, in _run_cell
return runner(coro)
File "/usr/local/lib/python3.8/dist-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
coro.send(None)
File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 3057, in run_cell_async
has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 3249, in run_ast_nodes
if (await self.run_code(code, result, async_=asy)):
File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 3326, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-56-6bda2975dd16>", line 3, in <module>
history = model.fit(ds_train, batch_size = 16, validation_data = ds_val, epochs = 10, verbose = 1, callbacks = callback)
File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 1409, in fit
tmp_logs = self.train_function(iterator)
File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 1051, in train_function
return step_function(self, iterator)
File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 1040, in step_function
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 1030, in run_step
outputs = model.train_step(data)
File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 889, in train_step
y_pred = self(x, training=True)
File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 490, in __call__
return super().__call__(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/engine/base_layer.py", line 1014, in __call__
outputs = call_fn(inputs, *args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 92, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/engine/functional.py", line 458, in call
return self._run_internal_graph(
File "/usr/local/lib/python3.8/dist-packages/keras/engine/functional.py", line 596, in _run_internal_graph
outputs = node.layer(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/engine/base_layer.py", line 1014, in __call__
outputs = call_fn(inputs, *args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 92, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/layers/convolutional/base_conv.py", line 250, in call
outputs = self.convolution_op(inputs, self.kernel)
File "/usr/local/lib/python3.8/dist-packages/keras/layers/convolutional/base_conv.py", line 225, in convolution_op
return tf.nn.convolution(
Node: 'model/block3b_project_conv/Conv2D'
OOM when allocating tensor with shape[96,384,1,1] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[{{node model/block3b_project_conv/Conv2D}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
[Op:__inference_train_function_351566]
I tried setting batch_size = 1, but it still fails with the same error. Any solution?
CodePudding user response:
EfficientNetV2L is a large model (479 MB of weights), so it's normal to face a ResourceExhaustedError once you unfreeze it; whether it fits depends on your GPU. The simple answer is to use a better accelerator. That said, here are some common approaches you can try, though none of them is guaranteed to work (code for several of them follows the list):
- Use a smaller input size, and/or unfreeze only a few of the top layers rather than the whole base model (see the sketch below).
- Enable mixed_precision.
- Enable JIT (XLA) compilation.
- Set memory growth for the physical GPU device.
- If possible, use a TPU accelerator (freely available on Kaggle and Colab).
import tensorflow as tf
from tensorflow import keras

keras.mixed_precision.set_global_policy("mixed_float16")  # mixed precision
tf.config.optimizer.set_jit(True)                         # JIT (XLA) compilation

# Memory growth must be configured before the GPU is first used.
physical_devices = tf.config.list_physical_devices('GPU')
for pd in physical_devices:
    tf.config.experimental.set_memory_growth(pd, True)
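Note that with mixed_float16 the Keras mixed-precision guide recommends keeping the model's final layer in float32, e.g. layers.Dense(6, activation='softmax', dtype='float32').

For the first suggestion, here is a minimal sketch of unfreezing only the top of the base model instead of all of it. The cutoff of 30 layers is an arbitrary assumption (tune it to your GPU), and keeping the BatchNormalization layers frozen follows the usual Keras fine-tuning advice:

base_model.trainable = True
# Freeze everything except the last 30 layers (30 is an arbitrary choice).
for layer in base_model.layers[:-30]:
    layer.trainable = False
# Keep BatchNormalization layers frozen, as usually recommended when fine-tuning.
for layer in base_model.layers[-30:]:
    if isinstance(layer, layers.BatchNormalization):
        layer.trainable = False

# Re-compile after changing the trainable flags.
model.compile(optimizer=optimizers.Adam(learning_rate=0.0001),
              loss=losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])

A smaller input (e.g. rebuilding the model with input_shape=(224, 224, 3) and resizing the images to match) reduces activation memory even further.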
To set up a TPU, check this code example (the device section); it might be helpful. Also check this ticket, in particular the [Feature Request 1] section, where you can find a gist that helps you find the optimal batch size for training.
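To give an idea of the TPU option, here is a minimal sketch of the usual TPU initialization on Colab/Kaggle; build_model() is a hypothetical helper standing in for the model-building code above:

import tensorflow as tf

# Standard TPU initialization boilerplate on Colab/Kaggle.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.TPUStrategy(resolver)

# The model and its optimizer must be created inside the strategy scope.
with strategy.scope():
    model = build_model()  # hypothetical helper wrapping the model code above
    model.compile(optimizer=optimizers.Adam(learning_rate=0.0001),
                  loss=losses.SparseCategoricalCrossentropy(),
                  metrics=['accuracy'])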