NOTE: Since I need to stream the data from disk rather than load it all into memory, please show the example using tf.data.experimental.make_csv_dataset. Also, please show an example using my exact dataset.
I'm trying to replicate this TensorFlow Recommenders tutorial with a toy dataset. However, I'm getting the error below:
Epoch 1/5
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/tmp/ipykernel_7920/1393870474.py in <module>
106
107 # Train.
--> 108 model.fit(interactions, epochs=5)
109
110 # Evaluate.
~/anaconda3/envs/srs/lib/python3.9/site-packages/keras/utils/traceback_utils.py in error_handler(*args, **kwargs)
65 except Exception as e: # pylint: disable=broad-except
66 filtered_tb = _process_traceback_frames(e.__traceback__)
---> 67 raise e.with_traceback(filtered_tb) from None
68 finally:
69 del filtered_tb
~/anaconda3/envs/srs/lib/python3.9/site-packages/tensorflow/python/framework/func_graph.py in autograph_handler(*args, **kwargs)
1127 except Exception as e: # pylint:disable=broad-except
1128 if hasattr(e, "ag_error_metadata"):
-> 1129 raise e.ag_error_metadata.to_exception(e)
1130 else:
1131 raise
ValueError: in user code:
File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/keras/engine/training.py", line 878, in train_function *
return step_function(self, iterator)
File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/keras/engine/training.py", line 867, in step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/keras/engine/training.py", line 860, in run_step **
outputs = model.train_step(data)
File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/tensorflow_recommenders/models/base.py", line 68, in train_step
loss = self.compute_loss(inputs, training=True)
File "/tmp/ipykernel_7920/1393870474.py", line 94, in compute_loss
return self.task(user_embeddings, channel_embeddings)
File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
raise e.with_traceback(filtered_tb) from None
ValueError: Exception encountered when calling layer "retrieval" (type Retrieval).
in user code:
File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/tensorflow_recommenders/tasks/retrieval.py", line 143, in call *
metric_update_ops.append(
File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/tensorflow_recommenders/metrics/factorized_top_k.py", line 84, in update_state *
top_k_predictions, _ = self._candidates(query_embeddings, k=self._k)
File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler **
raise e.with_traceback(filtered_tb) from None
ValueError: Exception encountered when calling layer "streaming" (type Streaming).
in user code:
File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/tensorflow_recommenders/layers/factorized_top_k.py", line 441, in top_k *
joined_scores = tf.concat([state_scores, x_scores], axis=1)
ValueError: Shape must be rank 2 but is rank 3 for '{{node concat}} = ConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32](args_0, args_2, concat/axis)' with input shapes: [1,0], [?,1,1], [].
Call arguments received:
• queries=tf.Tensor(shape=(1, 64), dtype=float32)
• k=100
Call arguments received:
• query_embeddings=tf.Tensor(shape=(1, 64), dtype=float32)
• candidate_embeddings=tf.Tensor(shape=(1, 64), dtype=float32)
• sample_weight=None
• candidate_sampling_probability=None
• candidate_ids=None
• compute_metrics=True
Here's my code:
from typing import Dict, Text
import pandas as pd
from pathlib import Path
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
# Toy interaction log: one row per (user_id, channel_id) pair, both kept as
# strings. Several IDs contain letters and dashes, not just digits.
df_interactions = pd.DataFrame({
'user_id': [
'00001446-da5f-4d17',
'00001446-da5f-4d17',
'00005ab5-c9e0-4b05-',
'00005ab5-c9e0-4b05-',
'000093dd-1a11-4600',
'000093dd-1a11-4600',
'00009b34-65b5-42c1',
'0000ae32-4a91-4bcd',
'0000ae32-4a91-4bcd',
'0000ae32-4a91-4bcd'
],
'channel_id': [
'1', '2', 'A56',
'3', 'B72', '2',
'M63', '2', '5', 'A56'
]
})
# Write the interactions to disk so they can be streamed back from the CSV.
df_interactions.to_csv('experiment_interactions.csv', index=False)
# Channel catalogue: one row per channel_id with its display name.
df_channels = pd.DataFrame({
'channel_id': [
'1', '2', '3', '5', 'A56', 'B72', 'M63'
],
'channel_name': [
'Popular',
'Best',
'Highest Rated',
'Large Following',
'Nice',
'Retro',
'Modern'
]
})
df_channels.to_csv('experiment_channels.csv', index=False)
# Stream both CSV files, reading every column as a string.
# batch_size=1 means each dataset element is already a batch of one row.
# NOTE(review): make_csv_dataset presumably repeats forever unless num_epochs
# is passed - confirm against the API docs.
interactions = tf.data.experimental.make_csv_dataset(
file_pattern='experiment_interactions.csv',
column_defaults=[tf.string, tf.string],
batch_size=1
)
channels = tf.data.experimental.make_csv_dataset(
file_pattern='experiment_channels.csv',
column_defaults=[tf.string, tf.string],
batch_size=1
)
# Select the basic features.
# NOTE(review): the IDs above are not purely numeric ('00001446-da5f-4d17',
# 'A56', ...), so tf.strings.to_number presumably cannot parse them - confirm.
interactions = interactions.map(lambda x: {
"user_id": tf.strings.to_number(x["user_id"]),
"channel_id": tf.strings.to_number(x["channel_id"])
})
# Keep only the channel ID column of the catalogue.
channels = channels.map(lambda x: tf.strings.to_number(x["channel_id"]))
# Build a model.
class Model(tfrs.Model):
    """Retrieval model with one embedding table for users and one for channels.

    The constructor reads the module-level ``channels`` dataset to build the
    candidate set for the top-K metrics, so ``channels`` must be defined
    before the model is instantiated.
    """

    def __init__(self):
        super().__init__()
        # Set up user representation: 2000 rows of 64-dimensional embeddings.
        self.user_model = tf.keras.layers.Embedding(
            input_dim=2000, output_dim=64)
        # Set up channel representation (same table size as the user side).
        self.item_model = tf.keras.layers.Embedding(
            input_dim=2000, output_dim=64)
        # Set up a retrieval task and evaluation metrics over the
        # entire dataset of candidates. The candidates are re-batched with
        # batch(1) and then embedded with item_model.
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=channels.batch(1).map(self.item_model)
            )
        )

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Embed one batch of interactions and return the retrieval task's result.

        Args:
            features: Mapping that provides the "user_id" and "channel_id"
                tensors of the batch.
            training: Accepted for interface compatibility; not used here.

        Returns:
            The value produced by calling the retrieval task on the user and
            channel embeddings.
        """
        user_embeddings = self.user_model(features["user_id"])
        channel_embeddings = self.item_model(features["channel_id"])
        return self.task(user_embeddings, channel_embeddings)
# Instantiate the model and attach the Adagrad optimizer (learning rate 0.5).
model = Model()
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))
# Fix the global TensorFlow seed. The shuffle / train-test split from the
# tutorial is disabled below, so training runs on the full interactions dataset.
tf.random.set_seed(42)
#shuffled = interactions.shuffle(100000, seed=42, reshuffle_each_iteration=False)
#train = shuffled.take(80000)
#test = shuffled.skip(80000).take(20000)
# Train. The dataset is passed as-is, i.e. with the batching applied when it
# was created.
model.fit(interactions, epochs=5)
Additional info:
- TensorFlow version: '2.7.0'
- TensorFlow Datasets version: '4.4.0'
- Pandas version: '1.3.4'
CodePudding user response:
You seem to be preprocessing your data incorrectly. For example, you cannot use tf.strings.to_number to convert 00001446-da5f-4d17 into a number: it will throw an error, since the string contains more than just digits. Also, each sample in your dataset was an array instead of a single value: channel 1, for example, was not 1 but [1]. This was the cause of the original error in your question. Here is a simplified working example based on your code:
from typing import Dict, Text
import pandas as pd
from pathlib import Path
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
# Toy interaction log with shortened IDs. Only the digits of each user_id
# are kept further down, where the ID is converted to a number.
df_interactions = pd.DataFrame({
    'user_id': [
        '4d17',
        '4d17',
        '4b05',
        '4b05',
        '93dd',
        '93dd',
        '9b34',
        '4bcd',
        '-4bcd',
        '4bcd'
    ],
    'channel_id': [
        '1', '2', '6',
        '3', '7', '2',
        '8', '2', '5', '6'
    ]
})

# Channel catalogue with purely numeric IDs (defined once).
df_channels = pd.DataFrame({
    'channel_id': [
        '1', '2', '3', '5', '6', '7', '8'
    ],
    'channel_name': [
        'Popular',
        'Best',
        'Highest Rated',
        'Large Following',
        'Nice',
        'Retro',
        'Modern'
    ]
})

# from_tensor_slices yields one row per element; batching is applied later,
# right where the dataset is consumed.
interactions = tf.data.Dataset.from_tensor_slices((dict(df_interactions)))
interactions = interactions.map(lambda x: {
    # Drop every non-digit character so that to_number can parse what is
    # left. The class is '[^0-9]': a second '^' inside the brackets would be
    # a literal caret and would let '^' characters through.
    "user_id": tf.strings.to_number(tf.strings.regex_replace(x["user_id"], '[^0-9]', "")),
    "channel_id": tf.strings.to_number(x["channel_id"])
})

# Candidate channels: keep only the numeric channel ID of each row.
channels = tf.data.Dataset.from_tensor_slices((dict(df_channels)))
channels = channels.map(lambda x: tf.strings.to_number(x["channel_id"]))
# Build a model.
class Model(tfrs.Model):
    """Retrieval model that embeds numeric user and channel IDs.

    The candidate set for the top-K metrics is built in the constructor from
    the module-level ``channels`` dataset, which therefore has to exist
    before ``Model()`` is called.
    """

    def __init__(self):
        super().__init__()
        # Set up user representation.
        self.user_model = tf.keras.layers.Embedding(
            input_dim=2000, output_dim=64)
        # Set up channel representation.
        self.item_model = tf.keras.layers.Embedding(
            input_dim=2000, output_dim=64)
        # Set up a retrieval task and evaluation metrics over the
        # entire dataset of candidates. channels yields one ID per element,
        # so batch(1) is applied before the IDs are embedded.
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=channels.batch(1).map(self.item_model)
            )
        )

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Return the retrieval task's result for one batch of interactions.

        Args:
            features: Mapping with the "user_id" and "channel_id" tensors.
            training: Part of the expected signature; not used by this
                implementation.

        Returns:
            The value returned by the retrieval task for the embedded users
            and channels.
        """
        user_embeddings = self.user_model(features["user_id"])
        channel_embeddings = self.item_model(features["channel_id"])
        return self.task(user_embeddings, channel_embeddings)
# Seed before the model is constructed: Model() already maps item_model over
# the candidate channels, which presumably creates the item embedding
# weights, so seeding afterwards would leave that initialisation unseeded.
tf.random.set_seed(42)
model = Model()
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))
# The interactions dataset yields single rows, so batch it here.
model.fit(interactions.batch(1), epochs=5)
Epoch 1/5
10/10 [==============================] - 1s 61ms/step - factorized_top_k/top_1_categorical_accuracy: 1.0000 - factorized_top_k/top_5_categorical_accuracy: 1.0000 - factorized_top_k/top_10_categorical_accuracy: 1.0000 - factorized_top_k/top_50_categorical_accuracy: 1.0000 - factorized_top_k/top_100_categorical_accuracy: 1.0000 - loss: 0.0000e+00 - regularization_loss: 0.0000e+00 - total_loss: 0.0000e+00
Epoch 2/5
10/10 [==============================] - 1s 61ms/step - factorized_top_k/top_1_categorical_accuracy: 1.0000 - factorized_top_k/top_5_categorical_accuracy: 1.0000 - factorized_top_k/top_10_categorical_accuracy: 1.0000 - factorized_top_k/top_50_categorical_accuracy: 1.0000 - factorized_top_k/top_100_categorical_accuracy: 1.0000 - loss: 0.0000e+00 - regularization_loss: 0.0000e+00 - total_loss: 0.0000e+00
Epoch 3/5
10/10 [==============================] - 1s 60ms/step - factorized_top_k/top_1_categorical_accuracy: 1.0000 - factorized_top_k/top_5_categorical_accuracy: 1.0000 - factorized_top_k/top_10_categorical_accuracy: 1.0000 - factorized_top_k/top_50_categorical_accuracy: 1.0000 - factorized_top_k/top_100_categorical_accuracy: 1.0000 - loss: 0.0000e+00 - regularization_loss: 0.0000e+00 - total_loss: 0.0000e+00
Epoch 4/5
10/10 [==============================] - 1s 60ms/step - factorized_top_k/top_1_categorical_accuracy: 1.0000 - factorized_top_k/top_5_categorical_accuracy: 1.0000 - factorized_top_k/top_10_categorical_accuracy: 1.0000 - factorized_top_k/top_50_categorical_accuracy: 1.0000 - factorized_top_k/top_100_categorical_accuracy: 1.0000 - loss: 0.0000e+00 - regularization_loss: 0.0000e+00 - total_loss: 0.0000e+00
Epoch 5/5
10/10 [==============================] - 1s 61ms/step - factorized_top_k/top_1_categorical_accuracy: 1.0000 - factorized_top_k/top_5_categorical_accuracy: 1.0000 - factorized_top_k/top_10_categorical_accuracy: 1.0000 - factorized_top_k/top_50_categorical_accuracy: 1.0000 - factorized_top_k/top_100_categorical_accuracy: 1.0000 - loss: 0.0000e+00 - regularization_loss: 0.0000e+00 - total_loss: 0.0000e+00
<keras.callbacks.History at 0x7fe480d22f50>
If you want to read your files into a dataset, try something like this:
################## ORIGINAL DATASET ##################
# Interaction log with the unmodified string IDs.
df_interactions = pd.DataFrame({
    'user_id': [
        '00001446-da5f-4d17',
        '00001446-da5f-4d17',
        '00005ab5-c9e0-4b05-',
        '00005ab5-c9e0-4b05-',
        '000093dd-1a11-4600',
        '000093dd-1a11-4600',
        '00009b34-65b5-42c1',
        '0000ae32-4a91-4bcd',
        '0000ae32-4a91-4bcd',
        '0000ae32-4a91-4bcd'
    ],
    'channel_id': [
        '1', '2', 'A56',
        '3', 'B72', '2',
        'M63', '2', '5', 'A56'
    ]
})
# Channel catalogue. The IDs have to cover every channel_id used in the
# interactions above ('A56', 'B72', 'M63' included); otherwise those
# channels are missing from the candidate set and from the vocabulary that
# is adapted on this catalogue.
df_channels = pd.DataFrame({
    'channel_id': [
        '1', '2', '3', '5', 'A56', 'B72', 'M63'
    ],
    'channel_name': [
        'Popular',
        'Best',
        'Highest Rated',
        'Large Following',
        'Nice',
        'Retro',
        'Modern'
    ]
})
# Alternative dataset with shortened IDs, disabled by wrapping it in a
# string literal. Remove the triple quotes to use it instead.
"""
################## MODIFIED DATASET ##################
df_interactions = pd.DataFrame({
'user_id': [
'4d17',
'4d17',
'4b05',
'4b05',
'93dd',
'93dd',
'9b34',
'4bcd',
'-4bcd',
'4bcd'
],
'channel_id': [
'1', '2', '6',
'3', '7', '2',
'8', '2', '5', '6'
]
})
df_channels = pd.DataFrame({
'channel_id': [
'1', '2', '3', '5', '6', '7', '8'
],
'channel_name': [
'Popular',
'Best',
'Highest Rated',
'Large Following',
'Nice',
'Retro',
'Modern'
]
})
"""
# Write both tables to disk, then stream them back row by row.
df_channels.to_csv('experiment_channels.csv', index=False)
df_interactions.to_csv('experiment_interactions.csv', index=False)
# Both columns are read as strings; header=True so that the column-name row
# is not treated as data.
channels = tf.data.experimental.CsvDataset('experiment_channels.csv', [tf.string, tf.string], header=True)
interactions = tf.data.experimental.CsvDataset('experiment_interactions.csv', [tf.string, tf.string], header=True)
def preprocess_channels(x, y):
    """Reduce one row of the channels CSV to its channel ID.

    Args:
        x: Value of the first CSV column (channel_id).
        y: Value of the second CSV column (channel_name); ignored.

    Returns:
        ``x`` unchanged.
    """
    return x
def preprocess_interactions(x, y):
    """Turn one row of the interactions CSV into a feature dictionary.

    Args:
        x: Value of the first CSV column (user_id).
        y: Value of the second CSV column (channel_id).

    Returns:
        A dict with "user_id" (``x`` with every non-digit character removed)
        and "channel_id" (``y`` unchanged).
    """
    return {
        # '[^0-9]' matches any non-digit. A second '^' inside the brackets
        # would be a literal caret and would leave '^' characters in the ID.
        "user_id": tf.strings.regex_replace(x, '[^0-9]', ""),
        "channel_id": y
    }
# Reduce both datasets to the features the model consumes.
channels = channels.map(preprocess_channels)
interactions = interactions.map(preprocess_interactions)

# Vocabulary of user IDs, learned from the interaction log.
interactions_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
user_ids = interactions.map(lambda row: row["user_id"])
interactions_vocabulary.adapt(user_ids)

# Vocabulary of channel IDs, learned from the channel catalogue.
channels_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
channels_vocabulary.adapt(channels)
# Build a model.
class Model(tfrs.Model):
    """Retrieval model that looks up string IDs before embedding them.

    Relies on the module-level ``interactions_vocabulary``,
    ``channels_vocabulary`` and ``channels`` objects, which must exist before
    the model is instantiated.
    """

    def __init__(self):
        super().__init__()
        # User tower: string ID -> vocabulary index -> 64-dimensional
        # embedding. The table has one row per vocabulary entry.
        self.user_model = tf.keras.Sequential([
            interactions_vocabulary,
            tf.keras.layers.Embedding(interactions_vocabulary.vocabulary_size(), 64)
        ])
        # Channel tower, built the same way from the channel vocabulary.
        self.item_model = tf.keras.Sequential([
            channels_vocabulary,
            tf.keras.layers.Embedding(channels_vocabulary.vocabulary_size(), 64)
        ])
        # Set up a retrieval task and evaluation metrics over the
        # entire dataset of candidates.
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=channels.batch(1).map(self.item_model)
            )
        )

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Return the retrieval task's result for one batch of interactions.

        Args:
            features: Mapping with the "user_id" and "channel_id" tensors.
            training: Part of the expected signature; not used by this
                implementation.

        Returns:
            The value returned by the retrieval task for the embedded users
            and channels.
        """
        user_embeddings = self.user_model(features["user_id"])
        channel_embeddings = self.item_model(features["channel_id"])
        return self.task(user_embeddings, channel_embeddings)
# Seed before the model is constructed: Model() already maps item_model over
# the candidate channels, which presumably creates the item embedding
# weights, so seeding afterwards would leave that initialisation unseeded.
tf.random.set_seed(42)
model = Model()
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))
# The interactions dataset yields single rows, so batch it here.
model.fit(interactions.batch(1), epochs=5)
Note that this example uses tf.keras.layers.StringLookup to map the string IDs to integer indices, so the IDs no longer have to be numeric.
The tf.data.experimental.CsvDataset class provides a minimal CSV Dataset interface. However, it gives you far more flexibility than a higher-level API like tf.data.experimental.make_csv_dataset. Check out the docs for more information.