Write generator function for LSTM text generation model


I have an LSTM model for text generation, but when I try to increase the amount of input data I run into RAM issues, so I found out that I can use the fit_generator function to feed the data in step by step.

The current problem is that keras.utils.to_categorical takes up too much space when the number of unique words increases.

So I want to convert this code block into a generator function:

x_values, labels = input_seqs[:, :-1], input_seqs[:, -1]
y_values = tf.keras.utils.to_categorical(labels, num_classes=total_unique_words)

#Shape of x_values: (152250, 261)
#Shape of y_values: (152250, 4399)
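For scale, a quick back-of-the-envelope check (assuming to_categorical's default float32 output) shows why the full one-hot matrix alone strains RAM:

rows, classes = 152250, 4399           # shape of y_values above
bytes_per_float32 = 4                  # assumption: float32 one-hot output
print(rows * classes * bytes_per_float32 / 1024**3)   # roughly 2.5 GB just for the labels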

I came up with something like this, but I'm not sure how to assign the right values to batch_x and batch_y:

def generator(input_seq, batch_size):

    index = 0 
    while True:
      batch_x = np.zeros((batch_size, max_seq_length-1))
      batch_y = np.zeros((batch_size, total_unique_words))
      for i in range(batch_size):
        batch_x[i] = input_seqs[:, :-1][i]
        batch_y[i] = tf.keras.utils.to_categorical(input_seqs[:, -1][i], num_classes=total_unique_words)
        index = index + 1
        if index == len(input_seq):
          index = 0

      yield batch_x, batch_y


Full code for better overview:

# Imports inferred from the calls used below
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K

tokenizer = Tokenizer()
tokenizer.fit_on_texts(review_list)
word_index = tokenizer.word_index
total_unique_words = len(tokenizer.word_index) + 1

input_sequences = []
for line in review_list:
  token_list = tokenizer.texts_to_sequences([line])[0]
  for i in range(1, len(token_list)):
    n_gram_seqs = token_list[:i+1]
    input_sequences.append(n_gram_seqs)

max_seq_length = max([len(x) for x in input_sequences])
input_seqs = np.array(pad_sequences(input_sequences, maxlen=max_seq_length, padding='pre'))

x_values, labels = input_seqs[:, :-1], input_seqs[:, -1]
y_values = tf.keras.utils.to_categorical(labels, num_classes=total_unique_words)

callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
K.clear_session()
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=total_unique_words, output_dim=100, input_length=max_seq_length-1),
    tf.keras.layers.LSTM(256, return_sequences=True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(256),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(total_unique_words, activation='softmax')])
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

CodePudding user response:

You could try something like this:

def generator(input_seq, batch_size, dataset_size):

    no_batches = int(dataset_size / batch_size)

    for i in range(no_batches):
        # Take one contiguous slice of rows per batch and one-hot encode only that slice's labels
        batch_x = input_seq[:, :-1][(i*batch_size):((i+1)*batch_size)]
        batch_y = tf.keras.utils.to_categorical(input_seq[:, -1][(i*batch_size):((i+1)*batch_size)], num_classes=total_unique_words)

        yield batch_x, batch_y

    return

I added the dataset_size argument (152250 in your case) so that the number of batches can be calculated.
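Note that for training over more than one pass, Keras expects the generator to keep yielding batches, so in practice you would either re-create it each epoch or wrap the loop in while True. A minimal sketch of how it could be wired up (the while True wrapper, the looping_generator name, batch_size = 128 and epochs = 50 are assumptions for illustration, not values from the original code):

batch_size = 128
steps = len(input_seqs) // batch_size

def looping_generator(input_seq, batch_size):
    while True:  # start over so every epoch sees the full dataset
        for i in range(len(input_seq) // batch_size):
            batch = input_seq[i*batch_size:(i+1)*batch_size]
            # One-hot encode only this batch's labels, never all rows at once
            yield batch[:, :-1], tf.keras.utils.to_categorical(batch[:, -1], num_classes=total_unique_words)

model.fit(looping_generator(input_seqs, batch_size),
          steps_per_epoch=steps,
          epochs=50,
          callbacks=[callback])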
