I'm trying to incorporate a RelativePositionEmbedding
layer into a transformer example. The embedding layer can be found in the build_model
method below:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from official.nlp.modeling.layers import position_embedding
def readucr(filename):
    data = np.loadtxt(filename, delimiter="\t")
    y = data[:, 0]
    x = data[:, 1:]
    return x, y.astype(int)
root_url = "https://raw.githubusercontent.com/hfawaz/cd-diagram/master/FordA/"
x_train, y_train = readucr(root_url + "FordA_TRAIN.tsv")
x_test, y_test = readucr(root_url + "FordA_TEST.tsv")
x_train = x_train.reshape((x_train.shape[0], x_train.shape[1], 1))
x_test = x_test.reshape((x_test.shape[0], x_test.shape[1], 1))
n_classes = len(np.unique(y_train))
idx = np.random.permutation(len(x_train))
x_train = x_train[idx]
y_train = y_train[idx]
y_train[y_train == -1] = 0
y_test[y_test == -1] = 0
# Build model
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    # Attention and Normalization
    x = layers.MultiHeadAttention(
        key_dim=head_size, num_heads=num_heads, dropout=dropout
    )(inputs, inputs)
    x = layers.Dropout(dropout)(x)
    x = layers.LayerNormalization(epsilon=1e-6)(x)
    res = x + inputs  # residual connection around the attention block
    # Feed Forward Part
    x = layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")(res)
    x = layers.Dropout(dropout)(x)
    x = layers.Conv1D(filters=inputs.shape[-1], kernel_size=1)(x)
    x = layers.LayerNormalization(epsilon=1e-6)(x)
    return x + res  # residual connection around the feed-forward block
def build_model(
    input_shape,
    head_size,
    num_heads,
    ff_dim,
    num_transformer_blocks,
    mlp_units,
    dropout=0,
    mlp_dropout=0
):
    inputs = keras.Input(shape=input_shape)
    x = inputs  # => shape is (None, 500, 1)
    x = position_embedding.RelativePositionEmbedding(hidden_size=500)(x)  # Now (500, 500)
    # Add batch dimension back. But how to accept batch size greater than 1?
    x = layers.Lambda(lambda x: tf.expand_dims(x, axis=0))(x)  # Now (1, 500, 500)
    for _ in range(num_transformer_blocks):
        x = transformer_encoder(x, head_size, num_heads, ff_dim, dropout)
    x = layers.GlobalAveragePooling1D(data_format="channels_first")(x)
    for dim in mlp_units:
        x = layers.Dense(dim, activation="relu")(x)
        x = layers.Dropout(mlp_dropout)(x)
    outputs = layers.Dense(n_classes, activation="softmax")(x)
    return keras.Model(inputs, outputs)
input_shape = x_train.shape[1:]
model = build_model(
    input_shape,
    head_size=256,
    num_heads=4,
    ff_dim=4,
    num_transformer_blocks=4,
    mlp_units=[128],
    mlp_dropout=0.4,
    dropout=0.25
)
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=keras.optimizers.Adam(learning_rate=1e-4),
    metrics=["sparse_categorical_accuracy"]
)
callbacks = [
    keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True),
    keras.callbacks.TensorBoard(log_dir="./logs")
]
model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    epochs=5,
    batch_size=64,
    callbacks=callbacks
)
model.evaluate(x_test, y_test, verbose=1)
The code above blows up because I've specified a batch_size of 64. Everything works fine when setting batch_size to 1, however, because the expand_dims operation only adds a batch dimension of size 1, as opposed to an Input layer, which adds None for arbitrary batch sizes.
So how can I add back in a batch dimension greater than 1? Is there another way I should be using the RelativePositionEmbedding layer so that it doesn't interfere with batch sizes?
I've also looked into Reshape without success. I thought this question would solve my issue, but it only adds a leading dimension of size 1, like the Lambda layer I incorporated, rather than None, which I think would resolve the issue.
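For reference, here is a minimal check of the shapes involved (a sketch, assuming the TF Model Garden position_embedding module is importable as in my code above), showing where the batch dimension gets lost and why expand_dims pins it to 1:
import tensorflow as tf
from official.nlp.modeling.layers import position_embedding

x = tf.zeros((64, 500, 1))  # (batch, length, channels), like the FordA input
pos = position_embedding.RelativePositionEmbedding(hidden_size=500)(x)
print(pos.shape)                          # (500, 500) - the batch dimension is gone
print(tf.expand_dims(pos, axis=0).shape)  # (1, 500, 500) - batch dimension hard-coded to 1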
CodePudding user response:
I do not think you can pass the output of the RelativePositionEmbedding layer directly to another layer. If you take a look here, the authors are adding the output of this layer to the original input. Since that output has no batch dimension, the addition broadcasts it across whatever batch size comes in, so no expand_dims trick is needed. Your code will work if you change your model like this:
# ....
# Your code
def build_model(
    input_shape,
    head_size,
    num_heads,
    ff_dim,
    num_transformer_blocks,
    mlp_units,
    dropout=0,
    mlp_dropout=0
):
    inputs = keras.Input(shape=input_shape)
    x = inputs  # => shape is (None, 500, 1)
    pos_encoding = position_embedding.RelativePositionEmbedding(hidden_size=500)(x)  # Now (500, 500)
    x = inputs + pos_encoding  # broadcasts over the batch dimension => (None, 500, 500)
    for _ in range(num_transformer_blocks):
        x = transformer_encoder(x, head_size, num_heads, ff_dim, dropout)
    x = layers.GlobalAveragePooling1D(data_format="channels_first")(x)
    for dim in mlp_units:
        x = layers.Dense(dim, activation="relu")(x)
        x = layers.Dropout(mlp_dropout)(x)
    outputs = layers.Dense(n_classes, activation="softmax")(x)
    return keras.Model(inputs, outputs)
# ....
# Your code
45/45 [==============================] - 54s 1s/step - loss: 1.0281 - sparse_categorical_accuracy: 0.5111 - val_loss: 0.7387 - val_sparse_categorical_accuracy: 0.5645
42/42 [==============================] - 8s 187ms/step - loss: 0.7440 - sparse_categorical_accuracy: 0.5424
[0.7440475225448608, 0.5424242615699768]
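Why this works: RelativePositionEmbedding returns a tensor of shape (length, hidden_size) with no batch dimension, so adding it to the (batch, length, channels) input lets TensorFlow broadcast it over the batch (and over the size-1 channel axis). That is why any batch_size is accepted. A minimal sketch of the broadcast, independent of the model (the shapes mirror the FordA setup above):
import tensorflow as tf

batch = tf.zeros((64, 500, 1))  # (batch, length, channels)
pos = tf.zeros((500, 500))      # (length, hidden_size), no batch dimension
print((batch + pos).shape)      # TensorShape([64, 500, 500]) - broadcast over the batch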