I'm trying to incorporate a RelativePositionEmbedding
layer into a transformer example. The embedding layer can be found in the build_model
method below:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from official.nlp.modeling.layers import position_embedding
def readucr(filename):
    data = np.loadtxt(filename, delimiter="\t")
    y = data[:, 0]
    x = data[:, 1:]
    return x, y.astype(int)
root_url = "https://raw.githubusercontent.com/hfawaz/cd-diagram/master/FordA/"
x_train, y_train = readucr(root_url + "FordA_TRAIN.tsv")
x_test, y_test = readucr(root_url + "FordA_TEST.tsv")
x_train = x_train.reshape((x_train.shape[0], x_train.shape[1], 1))
x_test = x_test.reshape((x_test.shape[0], x_test.shape[1], 1))
n_classes = len(np.unique(y_train))
idx = np.random.permutation(len(x_train))
x_train = x_train[idx]
y_train = y_train[idx]
y_train[y_train == -1] = 0
y_test[y_test == -1] = 0
# Build model
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    # Attention and Normalization
    x = layers.MultiHeadAttention(
        key_dim=head_size, num_heads=num_heads, dropout=dropout
    )(inputs, inputs)
    x = layers.Dropout(dropout)(x)
    x = layers.LayerNormalization(epsilon=1e-6)(x)
    res = x + inputs  # residual connection around the attention block
    # Feed Forward Part
    x = layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")(res)
    x = layers.Dropout(dropout)(x)
    x = layers.Conv1D(filters=inputs.shape[-1], kernel_size=1)(x)
    x = layers.LayerNormalization(epsilon=1e-6)(x)
    return x + res  # residual connection around the feed-forward block
def build_model(
    input_shape,
    head_size,
    num_heads,
    ff_dim,
    num_transformer_blocks,
    mlp_units,
    dropout=0,
    mlp_dropout=0
):
    inputs = keras.Input(shape=input_shape)
    x = inputs  # => shape is (None, 500, 1)
    x = position_embedding.RelativePositionEmbedding(hidden_size=500)(x)  # Now (500, 500)
    # Add batch dimension back. But how to accept batch size greater than 1?
    x = layers.Lambda(lambda x: tf.expand_dims(x, axis=0))(x)  # Now (1, 500, 500)
    for _ in range(num_transformer_blocks):
        x = transformer_encoder(x, head_size, num_heads, ff_dim, dropout)
    x = layers.GlobalAveragePooling1D(data_format="channels_first")(x)
    for dim in mlp_units:
        x = layers.Dense(dim, activation="relu")(x)
        x = layers.Dropout(mlp_dropout)(x)
    outputs = layers.Dense(n_classes, activation="softmax")(x)
    return keras.Model(inputs, outputs)
input_shape = x_train.shape[1:]
model = build_model(
    input_shape,
    head_size=256,
    num_heads=4,
    ff_dim=4,
    num_transformer_blocks=4,
    mlp_units=[128],
    mlp_dropout=0.4,
    dropout=0.25
)
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=keras.optimizers.Adam(learning_rate=1e-4),
    metrics=["sparse_categorical_accuracy"]
)
callbacks = [
    keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True),
    keras.callbacks.TensorBoard(log_dir="./logs")
]
model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    epochs=5,
    batch_size=64,
    callbacks=callbacks
)
model.evaluate(x_test, y_test, verbose=1)
The code above blows up because I've specified a batch_size of 64. Everything works fine when setting batch_size to 1, however, because the expand_dims operation only adds a batch dimension of size 1, as opposed to an Input layer, which adds None for arbitrary batch sizes.
So how can I add back in a batch dimension greater than 1? Is there another way I should be using the RelativePositionEmbedding layer so that it doesn't interfere with batch sizes?
I've also looked into Reshape without success. I thought this question would solve my issue, but it only adds a leading dimension of size 1, like the Lambda layer I incorporated, rather than None, which I think would resolve the issue.
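For reference, here is a minimal check of the shapes involved (a sketch, assuming the TF Model Garden position_embedding module is importable as in my code above), showing where the batch dimension gets lost and why expand_dims pins it to 1:
import tensorflow as tf
from official.nlp.modeling.layers import position_embedding

x = tf.zeros((64, 500, 1))  # (batch, length, channels), like the FordA input
pos = position_embedding.RelativePositionEmbedding(hidden_size=500)(x)
print(pos.shape)                          # (500, 500) - the batch dimension is gone
print(tf.expand_dims(pos, axis=0).shape)  # (1, 500, 500) - batch dimension hard-coded to 1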
CodePudding user response:
I do not think you can pass the output of the RelativePositionEmbedding layer directly to another layer. If you take a look here, the authors are adding the output of this layer to the original input. Since that output has no batch dimension, the addition broadcasts it across whatever batch size comes in, so no expand_dims trick is needed. Your code will work if you change your model like this:
# ....
# Your code
def build_model(
    input_shape,
    head_size,
    num_heads,
    ff_dim,
    num_transformer_blocks,
    mlp_units,
    dropout=0,
    mlp_dropout=0
):
    inputs = keras.Input(shape=input_shape)
    x = inputs  # => shape is (None, 500, 1)
    pos_encoding = position_embedding.RelativePositionEmbedding(hidden_size=500)(x)  # Now (500, 500)
    x = inputs + pos_encoding  # broadcasts over the batch dimension => (None, 500, 500)
    for _ in range(num_transformer_blocks):
        x = transformer_encoder(x, head_size, num_heads, ff_dim, dropout)
    x = layers.GlobalAveragePooling1D(data_format="channels_first")(x)
    for dim in mlp_units:
        x = layers.Dense(dim, activation="relu")(x)
        x = layers.Dropout(mlp_dropout)(x)
    outputs = layers.Dense(n_classes, activation="softmax")(x)
    return keras.Model(inputs, outputs)
# ....
# Your code
45/45 [==============================] - 54s 1s/step - loss: 1.0281 - sparse_categorical_accuracy: 0.5111 - val_loss: 0.7387 - val_sparse_categorical_accuracy: 0.5645
42/42 [==============================] - 8s 187ms/step - loss: 0.7440 - sparse_categorical_accuracy: 0.5424
[0.7440475225448608, 0.5424242615699768]
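Why this works: RelativePositionEmbedding returns a tensor of shape (length, hidden_size) with no batch dimension, so adding it to the (batch, length, channels) input lets TensorFlow broadcast it over the batch (and over the size-1 channel axis). That is why any batch_size is accepted. A minimal sketch of the broadcast, independent of the model (the shapes mirror the FordA setup above):
import tensorflow as tf

batch = tf.zeros((64, 500, 1))  # (batch, length, channels)
pos = tf.zeros((500, 500))      # (length, hidden_size), no batch dimension
print((batch + pos).shape)      # TensorShape([64, 500, 500]) - broadcast over the batch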