I have an LSTM model for text generation, but when I try to increase the amount of input data I run into RAM issues, so I found out that I can use the fit_generator function to load the data in step by step.
The problem is that keras.utils.to_categorical takes up too much space as the number of unique words increases.
So I want to convert this code block into a generator function:
x_values, labels = input_seqs[:, :-1], input_seqs[:, -1]
y_values = tf.keras.utils.to_categorical(labels, num_classes=total_unique_words)
#Shape of x_values: (152250, 261)
#Shape of y_values: (152250, 4399)
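For scale, the one-hot targets alone are what exhaust RAM here: to_categorical produces float32 by default, so the dense (152250, 4399) matrix costs roughly

152250 * 4399 * 4 / 1e9  # ≈ 2.7 GB for y_values alone, at 4 bytes per float32 entry

which is why encoding everything up front stops scaling as total_unique_words grows.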
And I got something like this, but I'm not sure how to assign the right values to batch_x and batch_y:
def generator(input_seq, batch_size):
    index = 0
    while True:
        batch_x = np.zeros((batch_size, max_seq_length-1))
        batch_y = np.zeros((batch_size, total_unique_words))
        for i in range(batch_size):
            # Use the running index (not i) so successive batches advance through the data
            batch_x[i] = input_seq[index, :-1]
            batch_y[i] = tf.keras.utils.to_categorical(input_seq[index, -1], num_classes=total_unique_words)
            index = index + 1
            if index == len(input_seq):
                index = 0
        yield batch_x, batch_y
Full code for better overview:
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(review_list)
word_index = tokenizer.word_index
total_unique_words = len(tokenizer.word_index) + 1

input_sequences = []
for line in review_list:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_seqs = token_list[:i+1]
        input_sequences.append(n_gram_seqs)

max_seq_length = max([len(x) for x in input_sequences])
input_seqs = np.array(pad_sequences(input_sequences, maxlen=max_seq_length, padding='pre'))
x_values, labels = input_seqs[:, :-1], input_seqs[:, -1]
y_values = tf.keras.utils.to_categorical(labels, num_classes=total_unique_words)
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
K.clear_session()
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=total_unique_words, output_dim=100, input_length=max_seq_length-1),
    tf.keras.layers.LSTM(256, return_sequences=True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(256),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(total_unique_words, activation='softmax')])
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
CodePudding user response:
You could try something like this:
def generator(input_seq, batch_size, dataset_size):
    no_batches = int(dataset_size / batch_size)
    for i in range(no_batches):
        # Slice one batch and one-hot encode only that batch's labels,
        # so only batch_size rows of the one-hot matrix exist at a time
        batch_x = input_seq[i*batch_size : (i+1)*batch_size, :-1]
        batch_y = tf.keras.utils.to_categorical(
            input_seq[i*batch_size : (i+1)*batch_size, -1],
            num_classes=total_unique_words)
        yield batch_x, batch_y
I added the dataset_size argument (152250 in your case) so that the number of batches can be calculated.
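As a minimal sketch of how this could be wired into training, assuming TF 2.x, where model.fit accepts Python generators directly (fit_generator is deprecated there); the batch_size of 128 is an illustrative choice, not something from the question:

batch_size = 128  # illustrative value, not taken from the question
steps = len(input_seqs) // batch_size
# This generator is finite, so it covers exactly one pass over the data;
# for epochs > 1 you would loop it forever (as in the question's while True
# version) or recreate it each epoch.
model.fit(generator(input_seqs, batch_size, len(input_seqs)),
          steps_per_epoch=steps,
          epochs=1,
          callbacks=[callback])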