I have an encoder-decoder model that makes great predictions but I am struggling to save the layers' hidden states so that the model can be reused.

The text below describes every step I took to train, test, save and load my model.

Imports

import tensorflow as tf
from tensorflow.keras.layers import LSTM, Input, TimeDistributed, Dense, Embedding
from tensorflow.keras.models import Model

Training

After preprocessing the data, I trained the encoder-decoder model as shown below.

Training Model Code

embedding_size = 175
vocab_size = len(tokenizer.word_index)

encoder_inputs = Input(shape=(None,))
en_x =  Embedding(vocab_size, embedding_size, mask_zero=True)(encoder_inputs)

# Encoder lstm
encoder = LSTM(512, return_state=True)
encoder_outputs, state_h, state_c = encoder(en_x)

# discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))

# target word embeddings
dex =  Embedding(vocab_size, embedding_size, mask_zero=True)
final_dex = dex(decoder_inputs)

# decoder lstm
decoder_lstm = LSTM(512, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(final_dex,
                                     initial_state=encoder_states)

decoder_dense = TimeDistributed(Dense(vocab_size, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)

# While training, model takes eng and french words and outputs #translated french word
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# rmsprop is preferred for nlp tasks
model.compile(optimizer='rmsprop', loss=tf.keras.losses.SparseCategoricalCrossentropy(), 
                  metrics=['accuracy'])

model.fit([X_train, X_decoder], y_train,
          batch_size=32,
          epochs=50,
          validation_split=0.1)

Training Model Summary

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 175)    499800      input_2[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 175)    499800      input_3[0][0]                    
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 512), (None, 1409024     embedding[0][0]                  
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, None, 512),  1409024     embedding_1[0][0]                
                                                                 lstm[0][1]                       
                                                                 lstm[0][2]                       
__________________________________________________________________________________________________
time_distributed (TimeDistribut (None, None, 2856)   1465128     lstm_1[0][0]                     
==================================================================================================
Total params: 5,282,776
Trainable params: 5,282,776
Non-trainable params: 0
__________________________________________________________________________________________________

Inference

After training I created the following inference model (as the training model uses teacher reinforcing and cannot be used to make predictions).

Inference Model

encoder_model = Model(encoder_inputs, encoder_states)

# Redefine the decoder model with decoder will be getting below inputs from encoder while in prediction
decoder_state_input_h = Input(shape=(512,))
decoder_state_input_c = Input(shape=(512,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
final_dex2 = dex(decoder_inputs)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(final_dex2, initial_state=decoder_states_inputs)

decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

# sampling model will take encoder states and decoder_input (seed initially) and output the predictions. We don't care about decoder_states2
decoder_model = Model(
    [decoder_inputs]   decoder_states_inputs,
    [decoder_outputs2]   decoder_states2)

Now all I needed was a function that makes predictions (see below), and after some testing found that my model had a 97.2% accuracy on the testing set.

def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1))
    
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = tokenizer.word_index['<sos>']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = []
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq]   states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = tokenizer.index_word[sampled_token_index]
        decoded_sentence.append(sampled_char)

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '<eos>' or
           len(decoded_sentence) > 6):
            stop_condition = True

       # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]
        
    return decoded_sentence

Saving the Model

I then saved the training model and the two inference models. I also saved the tokeniser I used to preprocess the data.

model.save('training_model.h5')
encoder_model.save('encoder_model.h5')
decoder_model.save('decoder_model.h5')

with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

Loading the Model

This is where I am getting stuck! In order to make predictions I need to load the layers and states: encoder_inputs, encoder_states, dex, decoder_inputs, decoder_lstm and decoder_dense

Attempt 1

At first I tried simply loading encoder_model and decoder_model then simply calling decode_sequence() but the loaded model had an accuracy of 0% - clearly the hidden states were not being saved as I expected them.

Attempt 2

I then attempted to load the layers of the initial training model and then recreating the inference model. Here is what I tried...

encoder_inputs = model.layers[0]
_, state_h, state_c = model.layers[4].output
encoder_states = [state_h, state_c]
decoder_inputs = model.layers[1]
decoder_lstm = model.layers[5]

Then re-ran the code in the Inference section.

This cause the following error...

ValueError: Input tensors to a Functional must come from `tf.keras.Input`. Received: <keras.engine.input_layer.InputLayer object at 0x16b7010a0> (missing previous layer metadata).

I am not really sure what to do at this point. Can anyone help?

CodePudding user response：

I figured out a solution! It is a little hacky, but it works! Here are the steps I took to save and load the trained model.

Step 1 - Save the tokenizer and the weights each individual layer

# Save the tokenizer
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# save the weights individually
for layer in model.layers:
    weights = layer.get_weights()
    if weights != []:
        np.savez(f'{layer.name}.npz', weights)

Step 2 - Load the tokenizer and the layers

# load the tokenizer
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# load the weights
w_encoder_embeddings = np.load('encoder_embeddings.npz', allow_pickle=True)
w_decoder_embeddings = np.load('decoder_embeddings.npz', allow_pickle=True)
w_encoder_lstm = np.load('encoder_lstm.npz', allow_pickle=True)
w_decoder_lstm = np.load('decoder_lstm.npz', allow_pickle=True)
w_dense = np.load('dense.npz', allow_pickle=True)

Step 3 - Recreate the training model

This is my model (see question for more details):

embedding_size = 175
vocab_size = len(tokenizer.word_index)   1

encoder_inputs = Input(shape=(None,), name="encoder_inputs")
encoder_embeddings = Embedding(vocab_size, embedding_size, mask_zero=True, name="encoder_embeddings")(encoder_inputs)

# Encoder lstm
encoder_lstm = LSTM(512, return_state=True, name="encoder_lstm")
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embeddings)

# discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,), name="decoder_inputs")

# target word embeddings
decoder_embeddings = Embedding(vocab_size, embedding_size, mask_zero=True, name="decoder_embeddings")
training_decoder_embeddings = decoder_embeddings(decoder_inputs)

# decoder lstm
decoder_lstm = LSTM(512, return_sequences=True, return_state=True, name="decoder_lstm")
decoder_outputs, _, _ = decoder_lstm(training_decoder_embeddings,
                                     initial_state=encoder_states)

decoder_dense = TimeDistributed(Dense(vocab_size, activation='softmax'), name="dense")
decoder_outputs = decoder_dense(decoder_outputs)

# While training, model takes input and traget words and outputs target strings
loaded_model = Model([encoder_inputs, decoder_inputs], decoder_outputs, name="training_model")

Now we want to apply our saved weights to the layers in this model

# set the weights of the model

loaded_model.layers[2].set_weights(w_encoder_embeddings['arr_0'])
loaded_model.layers[3].set_weights(w_decoder_embeddings['arr_0'])
loaded_model.layers[4].set_weights(w_encoder_lstm['arr_0'])
loaded_model.layers[5].set_weights(w_decoder_lstm['arr_0'])
loaded_model.layers[6].set_weights(w_dense['arr_0'])

Step 4 - Create the inference model

encoder_model = Model(encoder_inputs, encoder_states)

# Redefine the decoder model with decoder will be getting below inputs from encoder while in prediction
decoder_state_input_h = Input(shape=(512,))
decoder_state_input_c = Input(shape=(512,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
inference_decoder_embeddings = decoder_embeddings(decoder_inputs)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(inference_decoder_embeddings, initial_state=decoder_states_inputs)

decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

# sampling model will take encoder states and decoder_input(seed initially) and output the predictions(french word index) We dont care about decoder_states2
decoder_model = Model(
    [decoder_inputs]   decoder_states_inputs,
    [decoder_outputs2]   decoder_states2)

And voilà! I can now make inferences using the previously trained model!