I wish to experiment with noisy initial GRU states instead of resetting them to zero for each batch; my attempted implementation is below. My original code reset the initial states to zero with states = None. I changed the train_step to use
noisy_states = tf.convert_to_tensor(np.random.random([BATCH_SIZE, RNN_UNITS]).astype(np.float32))
predictions, states = self(inputs, states=noisy_states, return_state=True, training=True)
The model class, which inherits from tf.keras.Model, now looks like this:
class MyModel(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, rnn_units):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(rnn_units,
                                       stateful=True,
                                       return_sequences=True,
                                       return_state=True,
                                       activation='tanh',
                                       recurrent_activation='sigmoid',
                                       recurrent_dropout=0.2,
                                       dropout=0.2,
                                       reset_after=True)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        x = inputs
        x = self.embedding(x, training=training)
        if states is None:
            # Default behaviour: start from the layer's zero initial state.
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)
        if return_state:
            return x, states
        else:
            return x

    @tf.function
    def train_step(self, inputs):
        inputs, labels = inputs
        with tf.GradientTape() as tape:
            # Noisy initial state instead of the default zeros.
            noisy_states = tf.convert_to_tensor(
                np.random.random([BATCH_SIZE, RNN_UNITS]).astype(np.float32))
            predictions, states = self(inputs, states=noisy_states,
                                       return_state=True, training=True)
            loss = self.compiled_loss(labels, predictions, regularization_losses=self.losses)
        grads = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        self.compiled_metrics.update_state(labels, predictions)
        return {m.name: m.result() for m in self.metrics}
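Since train_step is wrapped in @tf.function, I am not sure whether the NumPy call above is re-executed on every step once the function has been traced; a pure-TensorFlow alternative that would draw fresh noise inside the graph on each step is:

# Assumed alternative (not what I trained with): draw the noise with a
# TensorFlow op so it is regenerated on every traced call.
noisy_states = tf.random.uniform([BATCH_SIZE, RNN_UNITS], minval=0.0, maxval=1.0)
predictions, states = self(inputs, states=noisy_states, return_state=True, training=True)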
Training runs with no error, but inference fails with
ValueError: in user code:
train.py:239 generate_one_step *
predicted_logits, states = self.model(inputs=input_ids, states=states,
train-v4.py:133 call *
x, states = self.gru(x, initial_state=states, training=training)
/usr/local/lib/python3.6/dist-packages/keras/layers/recurrent.py:716 __call__ **
return super(RNN, self).__call__(inputs, **kwargs)
[...]
ValueError: Input 0 is incompatible with layer gru: expected shape=(64, None, 256), found shape=(1, None, 256)
The generator looks like this
class OneStep(tf.keras.Model):
    def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
        super().__init__()
        self.temperature = temperature
        self.model = model
        [initialize stuff]
        [...]

    @tf.function
    def generate_one_step(self, inputs, states=None):
        input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
        input_ids = self.ids_from_chars(input_chars).to_tensor()
        predicted_logits, states = self.model(inputs=input_ids, states=states,
                                              return_state=True)
        predicted_logits = predicted_logits[:, -1, :]
        predicted_logits = predicted_logits / self.temperature
        predicted_logits = predicted_logits + self.prediction_mask
        predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
        predicted_ids = tf.squeeze(predicted_ids, axis=-1)
        predicted_chars = self.chars_from_ids(predicted_ids)
        return predicted_chars, states
and the code throwing the error is
for n in range(10000):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)
In my understanding, we initialize the states with some noise instead of zeros to reduce overfitting. The model seems to train better than before, and the weights are saved for inference. Should the inference model be changed as well? Should the states behavior also be updated in the generator?
CodePudding user response:
IIUC, I think you might be running into this problem, where Keras requires that the same batch size you used for your GRU during training is also used during inference. This requirement pops up when you set the stateful parameter to True. Take a look at the post I linked for more details and possible workarounds.
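For example, one possible workaround (a rough, untested sketch; VOCAB_SIZE and EMBEDDING_DIM stand in for whatever values you trained with, and chars_from_ids / ids_from_chars are the same lookups you already pass to OneStep) is to build a second copy of the model whose GRU gets built with batch size 1 and copy the trained weights into it:

# Rough sketch: rebuild the model so the stateful GRU is built with batch
# size 1, transfer the trained weights, and use that copy for generation.
# VOCAB_SIZE and EMBEDDING_DIM are placeholders for your training values.
inference_model = MyModel(vocab_size=VOCAB_SIZE,
                          embedding_dim=EMBEDDING_DIM,
                          rnn_units=RNN_UNITS)
inference_model(tf.constant([[0]]))               # build variables with batch size 1
inference_model.set_weights(model.get_weights())  # copy the trained weights
one_step_model = OneStep(inference_model, chars_from_ids, ids_from_chars)

Alternatively, since you already pass the initial state explicitly through states, you could probably drop stateful=True from the GRU so the layer accepts any batch size.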