I have been working on a comparison of CNN and RNN deep learning models for sentiment analysis.
I built the CNN following this guide: https://machinelearningmastery.com/develop-word-embedding-model-predicting-movie-review-sentiment/ , and I got an accuracy of about 90% with the CNN.
However, when I tried to recreate the model as an LSTM, the accuracy hovers around 0.5 and doesn't seem to improve over time. I wonder what is wrong with my code; the only thing I have done is replace the CNN layers with an LSTM layer in the model.add section. I have tried changing the loss from "binary" to "categorical" crossentropy and using different activation functions, but it still doesn't resolve the issue.
This is my CNN model, which works fine:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Dropout, Flatten, Dense
import matplotlib.pyplot as plt

# load_doc() and process_docs() are the helper functions from the linked tutorial
# load the vocabulary
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)
# load all training reviews
positive_docs = process_docs('C:/Users/zenhu/OneDrive/Desktop/Final Year Project/archive/review_polarity/txt_sentoken/pos', vocab, True)
negative_docs = process_docs('C:/Users/zenhu/OneDrive/Desktop/Final Year Project/archive/review_polarity/txt_sentoken/neg', vocab, True)
train_docs = negative_docs + positive_docs
# create the tokenizer
tokenizer = Tokenizer()
# fit the tokenizer on the documents
tokenizer.fit_on_texts(train_docs)
# sequence encode
encoded_docs = tokenizer.texts_to_sequences(train_docs)
# pad sequences
max_length = max([len(s.split()) for s in train_docs])
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define training labels, TRAINING DATASET
ytrain = array([0 for _ in range(900)] + [1 for _ in range(900)])
# load all test reviews
positive_docs = process_docs('C:/Users/zenhu/OneDrive/Desktop/Final Year Project/archive/review_polarity/txt_sentoken/pos', vocab, False)
negative_docs = process_docs('C:/Users/zenhu/OneDrive/Desktop/Final Year Project/archive/review_polarity/txt_sentoken/neg', vocab, False)
test_docs = negative_docs + positive_docs
# sequence encode
encoded_docs = tokenizer.texts_to_sequences(test_docs)
# pad sequences
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define test labels, TESTING DATASET
ytest = array([0 for _ in range(100)] + [1 for _ in range(100)])
# define vocabulary size (largest integer value)
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)
# define model
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_length))
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())
# compile network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network (validation data added so the val_accuracy/val_loss curves below can be plotted)
history = model.fit(Xtrain, ytrain, validation_data=(Xtest, ytest), epochs=10, verbose=2)
# evaluate
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))
print(history.history.keys())
#summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
This is my current LSTM model:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
import matplotlib.pyplot as plt

# load_doc() and process_docs() are the same helper functions as above
# load the vocabulary
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)
# load all training reviews
positive_docs = process_docs('C:/Users/zenhu/OneDrive/Desktop/Final Year Project/archive/review_polarity/txt_sentoken/pos', vocab, True)
negative_docs = process_docs('C:/Users/zenhu/OneDrive/Desktop/Final Year Project/archive/review_polarity/txt_sentoken/neg', vocab, True)
train_docs = negative_docs + positive_docs
# create the tokenizer
tokenizer = Tokenizer()
# fit the tokenizer on the documents
tokenizer.fit_on_texts(train_docs)
# sequence encode
encoded_docs = tokenizer.texts_to_sequences(train_docs)
# pad sequences
max_length = max([len(s.split()) for s in train_docs])
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define training labels, TRAINING DATASET
ytrain = array([0 for _ in range(900)] + [1 for _ in range(900)])
# load all test reviews
positive_docs = process_docs('C:/Users/zenhu/OneDrive/Desktop/Final Year Project/archive/review_polarity/txt_sentoken/pos', vocab, False)
negative_docs = process_docs('C:/Users/zenhu/OneDrive/Desktop/Final Year Project/archive/review_polarity/txt_sentoken/neg', vocab, False)
test_docs = negative_docs + positive_docs
# sequence encode
encoded_docs = tokenizer.texts_to_sequences(test_docs)
# pad sequences
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define test labels, TESTING DATASET
ytest = array([0 for _ in range(100)] + [1 for _ in range(100)])
# define vocabulary size (largest integer value)
vocab_size = len(tokenizer.word_index) + 1
# define model
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_length))
model.add(LSTM(units=100))
model.add(Dense(10))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())
# compile network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network (validation data added so the val_accuracy/val_loss curves below can be plotted)
history = model.fit(Xtrain, ytrain, validation_data=(Xtest, ytest), epochs=10, verbose=2)
# evaluate
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))
print(history.history.keys())
#summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
CodePudding user response:
The problem is in your LSTM layer: it is not returning the full output sequence. You must set return_sequences=True
when stacking LSTM layers so that the next layer receives a three-dimensional sequence input. After adding the return_sequences=True
parameter to your LSTM layer, it should give you around 90% accuracy.
model.add(LSTM(units=100, return_sequences=True))
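For reference, a minimal sketch of the stacked arrangement this implies, reusing vocab_size and max_length from the question's preprocessing; the second LSTM and its unit count are illustrative assumptions, not values taken from the question:

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# Sketch only: two stacked LSTM layers. vocab_size and max_length come from the
# preprocessing in the question; the layer sizes here are arbitrary illustrations.
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_length))
model.add(LSTM(units=100, return_sequences=True))  # returns hidden states for all timesteps: (batch, max_length, 100)
model.add(LSTM(units=50))                          # receives that 3D sequence, returns only the final state: (batch, 50)
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])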