Is there a way to use pre-trained Embedding with Tf-Idf in tensorflow?


I am using the default, basic implementation of text classification, as follows:

import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=vocab_size, filters=filters)
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)
train_X = pad_sequences(train_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)

# For loading the pre-trained embedding file
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))
all_embs = np.stack(embeddings_index.values())
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_dim = all_embs.shape[1]

word_index = tokenizer.word_index
vocab_size = min(vocab_size, len(word_index))

# Rows for words without a pre-trained vector are drawn from the embedding distribution
embedding_matrix = np.random.normal(emb_mean, emb_std, (vocab_size, embed_dim))
for word, i in word_index.items():
    if i >= vocab_size: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector
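
For context, a matrix built this way is normally used to initialize a (typically frozen) Embedding layer. A minimal sketch, assuming the variables above:

import tensorflow as tf

# Sketch: seed an Embedding layer with the pre-trained matrix and freeze it
embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embed_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False)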

It works fine, but is there a way to use texts_to_matrix, which offers modes like binary, tfidf, count, etc.? Is it possible to combine those with the existing embeddings?

One possible way could be to use a multiple-input model and concatenate the two inputs at some point. Apart from that, is there any other way?
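
For illustration, a minimal sketch of that multi-input idea (it assumes the raw training texts are still available as a hypothetical train_texts list; the layer sizes are arbitrary):

import tensorflow as tf

# Document-level TF-IDF features from the same tokenizer (train_texts is assumed)
tfidf_features = tokenizer.texts_to_matrix(train_texts, mode='tfidf')

# Branch 1: padded word sequences through the pre-trained, frozen embeddings
seq_in = tf.keras.Input(shape=(maxlen,))
x = tf.keras.layers.Embedding(
    vocab_size, embed_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False)(seq_in)
x = tf.keras.layers.GlobalAveragePooling1D()(x)

# Branch 2: the TF-IDF document vectors
tfidf_in = tf.keras.Input(shape=(tfidf_features.shape[1],))
y = tf.keras.layers.Dense(64, activation='relu')(tfidf_in)

# Concatenate both representations before the classifier head
merged = tf.keras.layers.Concatenate()([x, y])
out = tf.keras.layers.Dense(1, activation='sigmoid')(merged)
model = tf.keras.Model(inputs=[seq_in, tfidf_in], outputs=out)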

CodePudding user response:

The most common approach is to multiply each word vector by its corresponding TF-IDF score; you often see this approach in academic papers. You could do something like this:

Create the TF-IDF scores:

import tensorflow as tf
import numpy as np
import gensim.downloader as api
from sklearn.feature_extraction.text import TfidfVectorizer
import collections

def tf_idf_word2weight(text):
    print("Creating TfidfVectorizer...")
    tfidf = TfidfVectorizer(preprocessor=' '.join)
    tfidf.fit(text)

    # If a word was never seen, treat it as at least as infrequent
    # as any known word, i.e. assign it the maximum IDF.
    max_idf = max(tfidf.idf_)
    return collections.defaultdict(
        lambda: max_idf,
        [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

text = [['she let the balloon float up into the air with her hopes and dreams'],
        ['the old rusted farm equipment surrounded the house predicting its demise'],
        ['he was so preoccupied with whether or not he could that he failed to stop to consider if he should']]

tf_idf = tf_idf_word2weight(text)

text = np.concatenate(text)
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(text)
text_sequences = tokenizer.texts_to_sequences(text)
text_sequences = tf.keras.preprocessing.sequence.pad_sequences(text_sequences, padding='post')
vocab_size = len(tokenizer.word_index) + 1
print(tf_idf.items())
print(vocab_size)
Creating TfidfVectorizer...
dict_items([('she', 1.6931471805599454), ('let', 1.6931471805599454), ('the', 1.2876820724517808), ('balloon', 1.6931471805599454), ('float', 1.6931471805599454), ('up', 1.6931471805599454), ('into', 1.6931471805599454), ('air', 1.6931471805599454), ('with', 1.2876820724517808), ('her', 1.6931471805599454), ('hopes', 1.6931471805599454), ('and', 1.6931471805599454), ('dreams', 1.6931471805599454), ('old', 1.6931471805599454), ('rusted', 1.6931471805599454), ('farm', 1.6931471805599454), ('equipment', 1.6931471805599454), ('surrounded', 1.6931471805599454), ('house', 1.6931471805599454), ('predicting', 1.6931471805599454), ('its', 1.6931471805599454), ('demise', 1.6931471805599454), ('he', 1.6931471805599454), ('was', 1.6931471805599454), ('so', 1.6931471805599454), ('preoccupied', 1.6931471805599454), ('whether', 1.6931471805599454), ('or', 1.6931471805599454), ('not', 1.6931471805599454), ('could', 1.6931471805599454), ('that', 1.6931471805599454), ('failed', 1.6931471805599454), ('to', 1.6931471805599454), ('stop', 1.6931471805599454), ('consider', 1.6931471805599454), ('if', 1.6931471805599454), ('should', 1.6931471805599454)])
38

Create the TF-IDF-weighted embedding matrix:

model = api.load("glove-twitter-25")
embedding_dim = 25
weight_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
  try:
    # Scale each pre-trained GloVe vector by the word's IDF weight
    embedding_vector = model[word] * tf_idf[word]
    weight_matrix[i] = embedding_vector
  except KeyError:
    # Word missing from the GloVe vocabulary: fall back to a random vector
    weight_matrix[i] = np.random.uniform(-5, 5, embedding_dim)
print(weight_matrix.shape)
(38, 25)
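
From here, the weighted matrix can be dropped into a Keras Embedding layer in the usual way. A minimal sketch (this final step is illustrative, not part of the output above):

# Illustrative: use the TF-IDF-weighted vectors as frozen embedding weights
embedding_layer = tf.keras.layers.Embedding(
    vocab_size,
    embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(weight_matrix),
    trainable=False)
embedded = embedding_layer(text_sequences)
print(embedded.shape)  # (num_docs, padded_len, 25)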