Trying to understand listwise documentation
while trying to replicate by mixing deep model to listwise I am stuck at point where I am not able to set the pool size inside the sequential layer in an dynamic manner. For example consider below code
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets
!pip install -q tensorflow-ranking
import pprint
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_ranking as tfr
import tensorflow_recommenders as tfrs
from typing import Dict, Text
import os
import tempfile
import datetime
ratings = tfds.load("movielens/100k-ratings", split="train")
movies = tfds.load("movielens/100k-movies", split="train")
ratings = ratings.map(lambda x: {
"movie_title": x["movie_title"],
"user_id": x["user_id"],
"user_rating": x["user_rating"],
# "timestamp": x["timestamp"],
})
movies = movies.map(lambda x: x["movie_title"])
unique_movie_titles = np.unique(np.concatenate(list(movies.batch(1000))))
unique_user_ids = np.unique(np.concatenate(list(ratings.batch(1_000).map(
lambda x: x["user_id"]))))
class MovieModel(tf.keras.Model):
def __init__(self):
super().__init__()
max_tokens = 10_000_00
self.title_vectorizer = tf.keras.layers.TextVectorization(
max_tokens=max_tokens)
self.title_text_embedding = tf.keras.Sequential([
# tf.keras.layers.Flatten(),
self.title_vectorizer,
tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
tf.keras.layers.AveragePooling2D(pool_size=(1,4),strides=1, padding='valid',),
])
self.title_vectorizer.adapt(movies)
def call(self, titles):
return self.title_text_embedding(titles)
After we create movie model lets try to test it before we can use it on proper movie data
below is the test code
test_movie_titles = [["M*A*S*H (1970)", "Dances with Wolves (1990)", "Speed (1994)","Dances with Wolves (1990)", "Speed (1994)"]]
md = MovieModel()
test_ratings = md(tf.constant(tf.reshape(test_movie_titles,[1,5,1])) )
test_ratings
This now works perfect and I will get an output as below
<tf.Tensor: shape=(1, 5, 1, 32), dtype=float32, numpy=
array([[[[ 0.00778975, -0.00899004, 0.02926993, -0.00527342,
0.00706512, 0.02012717, 0.03438753, 0.01971687,
-0.00543808, -0.00754605, -0.02241766, 0.00045748,
-0.00785657, -0.00291913, 0.00670988, 0.01176082,
-0.02052191, -0.00751739, -0.01433057, 0.008
-----
----
Now if you notice in the code above I have hardcoded the pool_size as 1,4 ( tf.keras.layers.AveragePooling2D(pool_size=(1,4),strides=1, padding='valid',),
) because the test sample I had used above only have maximum 4 words, so the vectorization will produce vector of size 4, now problem is how to I ensure the right pool size when I pass the whole dataset (movies) to the model. How can I pass such external value (pool_size) to an sequential layer from outside?
The above code was run on google colab using tensorflow version 2.9.1
CodePudding user response:
Maybe something like this:
class MovieModel(tf.keras.Model):
def __init__(self):
super().__init__()
max_tokens = 10_000_00
self.title_vectorizer = tf.keras.layers.TextVectorization(
max_tokens=max_tokens)
self.title_text_embedding = tf.keras.Sequential([
# tf.keras.layers.Flatten(),
self.title_vectorizer,
tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
])
self.title_vectorizer.adapt(movies)
def call(self, titles, pool_size):
avg_layer = tf.keras.layers.AveragePooling2D(pool_size=pool_size,strides=1,padding='valid',)
return avg_layer(self.title_text_embedding(titles))
test_movie_titles = [["M*A*S*H (1970)", "Dances with Wolves (1990)", "Speed (1994)","Dances with Wolves (1990)", "Speed (1994)"]]
md = MovieModel()
test_ratings = md(tf.constant(tf.reshape(test_movie_titles,[1,5,1])), pool_size = (1, 4))
test_ratings