How to replace the Keras embedding layer with a pre-trained word embedding in a CNN


I am currently studying how CNNs can be used for text classification, and I found some code on Stack Overflow that works with a Keras embedding layer.

I ran the code with the Keras embedding, but now I want to test what would happen with a pre-trained embedding. I have downloaded the pre-trained word2vec model via gensim's API, but I don't know how to adapt the code from there.

My question is: how can I replace the Keras embedding layer with a pre-trained embedding such as a word2vec model or GloVe?

Here is the code:

from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Convolution1D, Flatten, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.callbacks import TensorBoard

# Using keras to load the dataset with the top_words
top_words = 10000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

# Pad the sequence to the same length
max_review_length = 1600
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

# Using embedding from Keras
embedding_vector_length = 300
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=max_review_length))

# Convolutional model (3x conv, flatten, 2x dense)
model.add(Convolution1D(64, 3, padding='same'))
model.add(Convolution1D(32, 3, padding='same'))
model.add(Convolution1D(16, 3, padding='same'))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(180,activation='sigmoid'))
model.add(Dropout(0.2))
model.add(Dense(1,activation='sigmoid'))

# Log to tensorboard
tensorBoardCallback = TensorBoard(log_dir='./logs', write_graph=True)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=3, callbacks=[tensorBoardCallback], batch_size=64)

# Evaluation on the test set
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

CodePudding user response:

This reads the text file containing the pre-trained GloVe vectors, stores each word and its vector in a dictionary, and then maps them into a new embedding matrix using the vocabulary of the IMDB word index. One subtlety: imdb.load_data offsets every word index by 3 (indices 0-2 are reserved for padding, the start token, and unknown words), so the matrix rows have to be shifted by the same offset for the vectors to line up with the sequences.

from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Convolution1D, Flatten, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.callbacks import TensorBoard
from tensorflow import keras
import itertools
import numpy as np


# Using keras to load the dataset with the top_words
top_words = 10000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

word_index = keras.datasets.imdb.get_word_index()

embedding_vector_length = 300  # must match the dimensionality of the vectors loaded below
embeddings_dictionary = dict()
glove_file = open('./embeds/glove.6B.300d.txt', encoding='utf-8')

for line in glove_file:
    records = line.split()  # separates each line on whitespace
    word = records[0]  # the first element is the word
    vector_dimensions = np.asarray(
        records[1:], dtype='float32')  # the rest are the weights
    # storing in dictionary
    embeddings_dictionary[word] = vector_dimensions

glove_file.close()

embeddings_matrix = np.zeros((top_words, embedding_vector_length))
# imdb.load_data offsets every word index by 3: indices 0-2 are reserved
# for padding, start-of-sequence and unknown tokens (index_from=3)
offset = 3
# mapping to a new matrix, using only the words within top_words
for word, index in word_index.items():
    if index + offset >= top_words:
        continue
    # the weight vector of the individual word, if GloVe has it
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embeddings_matrix[index + offset] = embedding_vector

# Pad the sequence to the same length
max_review_length = 1600
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)


# Using embedding from Keras
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length,
          input_length=max_review_length, name="embeddinglayer",
          weights=[embeddings_matrix], trainable=True))
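# set trainable=False instead to keep the pre-trained vectors frozen during training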


# Convolutional model (3x conv, flatten, 2x dense)
model.add(Convolution1D(64, 3, padding='same'))
model.add(Convolution1D(32, 3, padding='same'))
model.add(Convolution1D(16, 3, padding='same'))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(180, activation='sigmoid'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

# Log to tensorboard
tensorBoardCallback = TensorBoard(log_dir='./logs', write_graph=True)
model.compile(loss='binary_crossentropy',
              optimizer='adam', metrics=['accuracy'])


model.fit(X_train, y_train, epochs=3, callbacks=[
          tensorBoardCallback], batch_size=64)


# Evaluation on the test set
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))
