TensorFlow Recommenders - ValueError: Shape must be rank 2 but is rank 3-CodePudding

NOTE - Since I need to stream the data from disk instead of loading it into memory, please show the example using tf.data.experimental.make_csv_dataset. Also, please show an example using my exact dataset.

I'm trying to replicate this TensorFlow Recommenders tutorial with a toy dataset. However, I'm getting the error below:

Epoch 1/5

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/tmp/ipykernel_7920/1393870474.py in <module>
    106 
    107 # Train.
--> 108 model.fit(interactions, epochs=5)
    109 
    110 # Evaluate.

~/anaconda3/envs/srs/lib/python3.9/site-packages/keras/utils/traceback_utils.py in error_handler(*args, **kwargs)
     65     except Exception as e:  # pylint: disable=broad-except
     66       filtered_tb = _process_traceback_frames(e.__traceback__)
---> 67       raise e.with_traceback(filtered_tb) from None
     68     finally:
     69       del filtered_tb

~/anaconda3/envs/srs/lib/python3.9/site-packages/tensorflow/python/framework/func_graph.py in autograph_handler(*args, **kwargs)
   1127           except Exception as e:  # pylint:disable=broad-except
   1128             if hasattr(e, "ag_error_metadata"):
-> 1129               raise e.ag_error_metadata.to_exception(e)
   1130             else:
   1131               raise

ValueError: in user code:

    File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/keras/engine/training.py", line 878, in train_function  *
        return step_function(self, iterator)
    File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/keras/engine/training.py", line 867, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/keras/engine/training.py", line 860, in run_step  **
        outputs = model.train_step(data)
    File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/tensorflow_recommenders/models/base.py", line 68, in train_step
        loss = self.compute_loss(inputs, training=True)
    File "/tmp/ipykernel_7920/1393870474.py", line 94, in compute_loss
        return self.task(user_embeddings, channel_embeddings)
    File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None

    ValueError: Exception encountered when calling layer "retrieval" (type Retrieval).
    
    in user code:
    
        File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/tensorflow_recommenders/tasks/retrieval.py", line 143, in call  *
            metric_update_ops.append(
        File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/tensorflow_recommenders/metrics/factorized_top_k.py", line 84, in update_state  *
            top_k_predictions, _ = self._candidates(query_embeddings, k=self._k)
        File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
    
        ValueError: Exception encountered when calling layer "streaming" (type Streaming).
        
        in user code:
        
            File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/tensorflow_recommenders/layers/factorized_top_k.py", line 441, in top_k  *
                joined_scores = tf.concat([state_scores, x_scores], axis=1)
        
            ValueError: Shape must be rank 2 but is rank 3 for '{{node concat}} = ConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32](args_0, args_2, concat/axis)' with input shapes: [1,0], [?,1,1], [].
        
        
        Call arguments received:
          • queries=tf.Tensor(shape=(1, 64), dtype=float32)
          • k=100
    
    
    Call arguments received:
      • query_embeddings=tf.Tensor(shape=(1, 64), dtype=float32)
      • candidate_embeddings=tf.Tensor(shape=(1, 64), dtype=float32)
      • sample_weight=None
      • candidate_sampling_probability=None
      • candidate_ids=None
      • compute_metrics=True

Here's my code:

from typing import Dict, Text
import pandas as pd
from pathlib import Path

import tensorflow as tf 
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

# Toy interaction log: one (user_id, channel_id) pair per row.
_interaction_rows = [
    ('00001446-da5f-4d17', '1'),
    ('00001446-da5f-4d17', '2'),
    ('00005ab5-c9e0-4b05-', 'A56'),
    ('00005ab5-c9e0-4b05-', '3'),
    ('000093dd-1a11-4600', 'B72'),
    ('000093dd-1a11-4600', '2'),
    ('00009b34-65b5-42c1', 'M63'),
    ('0000ae32-4a91-4bcd', '2'),
    ('0000ae32-4a91-4bcd', '5'),
    ('0000ae32-4a91-4bcd', 'A56'),
]
df_interactions = pd.DataFrame(
    _interaction_rows, columns=['user_id', 'channel_id'])

# Written without the index so the CSV holds exactly the two columns above.
df_interactions.to_csv('experiment_interactions.csv', index=False)

# Channel catalogue: one (channel_id, channel_name) pair per row.
_channel_rows = [
    ('1', 'Popular'),
    ('2', 'Best'),
    ('3', 'Highest Rated'),
    ('5', 'Large Following'),
    ('A56', 'Nice'),
    ('B72', 'Retro'),
    ('M63', 'Modern'),
]
df_channels = pd.DataFrame(
    _channel_rows, columns=['channel_id', 'channel_name'])

df_channels.to_csv('experiment_channels.csv', index=False)


# Stream both CSV files from disk, reading every column as a string.
# NOTE(review): batch_size=1 is passed here, so each element is presumably
# already a batch (tensors with a leading dimension of 1) rather than a single
# scalar row -- confirm against the make_csv_dataset documentation.
interactions = tf.data.experimental.make_csv_dataset(
    file_pattern='experiment_interactions.csv', 
    column_defaults=[tf.string, tf.string], 
    batch_size=1
)
channels = tf.data.experimental.make_csv_dataset(
    file_pattern='experiment_channels.csv', 
    column_defaults=[tf.string, tf.string], 
    batch_size=1
)


# Select the basic features.
# NOTE(review): both columns are parsed as numbers here, but the CSV written
# above contains IDs such as '00001446-da5f-4d17' and 'A56'; the response in
# this thread states that tf.strings.to_number raises on such values.
interactions = interactions.map(lambda x: {
    "user_id": tf.strings.to_number(x["user_id"]),
    "channel_id": tf.strings.to_number(x["channel_id"])
})
# Only the channel_id column is kept for the candidate dataset.
channels = channels.map(lambda x: tf.strings.to_number(x["channel_id"]))


# Build a model.
class Model(tfrs.Model):
  """Retrieval model with one embedding table for users and one for channels.

  Reads the module-level `channels` dataset when it is constructed, so that
  dataset must exist before `Model()` is called.
  """

  def __init__(self):
    """Create the two embedding layers and the retrieval task."""
    super().__init__()

    # Set up user representation.
    self.user_model = tf.keras.layers.Embedding(
        input_dim=2000, output_dim=64)
    # Set up channel (item) representation.
    self.item_model = tf.keras.layers.Embedding(
        input_dim=2000, output_dim=64)
    # Set up a retrieval task and evaluation metrics over the
    # entire dataset of candidates.
    # NOTE(review): `channels` comes from make_csv_dataset(batch_size=1) and
    # is batched again here; this extra dimension looks consistent with the
    # reported "Shape must be rank 2 but is rank 3" error -- confirm.
    self.task = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=channels.batch(1).map(self.item_model)
        )
    )

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Embed the user and channel IDs of `features` and run the retrieval task.

    Args:
      features: mapping that must contain the keys "user_id" and "channel_id".
      training: accepted for interface compatibility; not used in this body.

    Returns:
      Whatever `self.task` returns for the two embedding tensors.
    """

    user_embeddings = self.user_model(features["user_id"])
    channel_embeddings = self.item_model(features["channel_id"])

    return self.task(user_embeddings, channel_embeddings)


# Build the model and attach the optimizer (Adagrad, learning rate 0.5).
model = Model()
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))

# Randomly shuffle data and split between train and test.
# (The shuffle/split below is disabled; only the global seed is set, and it is
# set after the model has already been constructed.)
tf.random.set_seed(42)
#shuffled = interactions.shuffle(100000, seed=42, reshuffle_each_iteration=False)

#train = shuffled.take(80000)
#test = shuffled.skip(80000).take(20000)

# Train.
# NOTE(review): make_csv_dataset presumably repeats indefinitely unless
# num_epochs is given, in which case fit() would need steps_per_epoch --
# confirm against the make_csv_dataset documentation.
model.fit(interactions, epochs=5)

Additional info:

TensorFlow version: '2.7.0'
TensorFlow Datasets version: '4.4.0'
Pandas version: '1.3.4'

CodePudding user response：

You seem to be preprocessing your data incorrectly. For example, you cannot use tf.strings.to_number to convert 00001446-da5f-4d17 into a number; it will throw an error, since the string contains more than just digits. Also, each sample in your dataset was an array instead of a single value: channel 1, for example, was not 1 but [1], because make_csv_dataset already returns batches (here with batch_size=1), so the additional channels.batch(1) added one more dimension. This was the cause of the original error in your question. Here is a simplified working example based on your code:

from typing import Dict, Text
import pandas as pd
from pathlib import Path

import tensorflow as tf 
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

# Simplified toy interactions: one (user_id, channel_id) pair per row.
# The user IDs still contain non-digit characters on purpose; they are reduced
# to their digits when the tf.data pipeline is built.
df_interactions = pd.DataFrame({
    'user_id': [
        '4d17',
        '4d17',
        '4b05',
        '4b05',
        '93dd',
        '93dd',
        '9b34',
        '4bcd',
        '-4bcd',
        '4bcd',
    ],
    'channel_id': [
        '1', '2', '6',
        '3', '7', '2',
        '8', '2', '5', '6',
    ],
})

# Channel catalogue with purely numeric IDs, so they can be parsed as numbers.
# (Defined once; the second, identical definition was redundant.)
df_channels = pd.DataFrame({
    'channel_id': [
        '1', '2', '3', '5', '6', '7', '8',
    ],
    'channel_name': [
        'Popular',
        'Best',
        'Highest Rated',
        'Large Following',
        'Nice',
        'Retro',
        'Modern',
    ],
})

# Slice the DataFrames row by row so that every dataset element is a single,
# unbatched example; batching is left to the consumers of these datasets.
interactions = tf.data.Dataset.from_tensor_slices(dict(df_interactions))
interactions = interactions.map(lambda x: {
    # Keep only the digits of the user ID so tf.strings.to_number can parse it.
    # The character class is `[^0-9]`: the previous `[^0-9^]` also excluded a
    # literal '^' from removal, which would have left carets in the string.
    # NOTE: dropping characters is lossy -- distinct IDs can collapse to the
    # same number (e.g. '4bcd' and '-4bcd' both become 4).
    "user_id": tf.strings.to_number(
        tf.strings.regex_replace(x["user_id"], r"[^0-9]", "")),
    "channel_id": tf.strings.to_number(x["channel_id"]),
})

# The candidate dataset only needs the (numeric) channel IDs.
channels = tf.data.Dataset.from_tensor_slices(dict(df_channels))
channels = channels.map(lambda x: tf.strings.to_number(x["channel_id"]))

# Build a model.
class Model(tfrs.Model):
  """Retrieval model with separate embedding tables for users and channels.

  The module-level `channels` dataset is read at construction time to build
  the candidate set used by the evaluation metrics.
  """

  def __init__(self):
    """Create both embedding tables and the retrieval task."""
    super().__init__()

    # One 64-dimensional vector per numeric user ID / channel ID below 2000.
    self.user_model = tf.keras.layers.Embedding(input_dim=2000, output_dim=64)
    self.item_model = tf.keras.layers.Embedding(input_dim=2000, output_dim=64)

    # The retrieval task is evaluated against every channel in the catalogue,
    # embedded with the same item model that is being trained.
    candidate_embeddings = channels.batch(1).map(self.item_model)
    top_k_metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)
    self.task = tfrs.tasks.Retrieval(metrics=top_k_metrics)

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Embed the "user_id" and "channel_id" features and run the task on them.

    `training` is accepted for interface compatibility and is not used here.
    """
    query = self.user_model(features["user_id"])
    candidate = self.item_model(features["channel_id"])
    return self.task(query, candidate)


# Seed TensorFlow's global RNG *before* the model is constructed: the model
# builds its item embedding while mapping the candidate dataset in __init__,
# so seeding afterwards would leave the initial weights unseeded.
tf.random.set_seed(42)

model = Model()
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))

# Interactions are unbatched, so batch them here (one example per step).
# NOTE(review): the logged loss for this run is 0 -- presumably because a batch
# of one example offers no in-batch negatives; try a larger batch to confirm.
model.fit(interactions.batch(1), epochs=5)

Epoch 1/5
10/10 [==============================] - 1s 61ms/step - factorized_top_k/top_1_categorical_accuracy: 1.0000 - factorized_top_k/top_5_categorical_accuracy: 1.0000 - factorized_top_k/top_10_categorical_accuracy: 1.0000 - factorized_top_k/top_50_categorical_accuracy: 1.0000 - factorized_top_k/top_100_categorical_accuracy: 1.0000 - loss: 0.0000e+00 - regularization_loss: 0.0000e+00 - total_loss: 0.0000e+00
Epoch 2/5
10/10 [==============================] - 1s 61ms/step - factorized_top_k/top_1_categorical_accuracy: 1.0000 - factorized_top_k/top_5_categorical_accuracy: 1.0000 - factorized_top_k/top_10_categorical_accuracy: 1.0000 - factorized_top_k/top_50_categorical_accuracy: 1.0000 - factorized_top_k/top_100_categorical_accuracy: 1.0000 - loss: 0.0000e+00 - regularization_loss: 0.0000e+00 - total_loss: 0.0000e+00
Epoch 3/5
10/10 [==============================] - 1s 60ms/step - factorized_top_k/top_1_categorical_accuracy: 1.0000 - factorized_top_k/top_5_categorical_accuracy: 1.0000 - factorized_top_k/top_10_categorical_accuracy: 1.0000 - factorized_top_k/top_50_categorical_accuracy: 1.0000 - factorized_top_k/top_100_categorical_accuracy: 1.0000 - loss: 0.0000e+00 - regularization_loss: 0.0000e+00 - total_loss: 0.0000e+00
Epoch 4/5
10/10 [==============================] - 1s 60ms/step - factorized_top_k/top_1_categorical_accuracy: 1.0000 - factorized_top_k/top_5_categorical_accuracy: 1.0000 - factorized_top_k/top_10_categorical_accuracy: 1.0000 - factorized_top_k/top_50_categorical_accuracy: 1.0000 - factorized_top_k/top_100_categorical_accuracy: 1.0000 - loss: 0.0000e+00 - regularization_loss: 0.0000e+00 - total_loss: 0.0000e+00
Epoch 5/5
10/10 [==============================] - 1s 61ms/step - factorized_top_k/top_1_categorical_accuracy: 1.0000 - factorized_top_k/top_5_categorical_accuracy: 1.0000 - factorized_top_k/top_10_categorical_accuracy: 1.0000 - factorized_top_k/top_50_categorical_accuracy: 1.0000 - factorized_top_k/top_100_categorical_accuracy: 1.0000 - loss: 0.0000e+00 - regularization_loss: 0.0000e+00 - total_loss: 0.0000e+00
<keras.callbacks.History at 0x7fe480d22f50>

If you want to read your files into a dataset, try something like this:


################## ORIGINAL DATASET ################## 
# Interaction log exactly as posted in the question: one
# (user_id, channel_id) pair per row, all values kept as strings.
df_interactions = pd.DataFrame({
    'user_id': [
        '00001446-da5f-4d17', 
        '00001446-da5f-4d17',
        '00005ab5-c9e0-4b05-',
        '00005ab5-c9e0-4b05-',
        '000093dd-1a11-4600', 
        '000093dd-1a11-4600',
        '00009b34-65b5-42c1', 
        '0000ae32-4a91-4bcd',
        '0000ae32-4a91-4bcd',
        '0000ae32-4a91-4bcd'
    ], 
    'channel_id': [
        '1', '2', 'A56',
        '3', 'B72', '2', 
        'M63', '2', '5', 'A56'
    ]
})

# Channel catalogue of the original dataset. The IDs must be the ones the
# interactions refer to ('A56', 'B72', 'M63' -- not '6', '7', '8'); otherwise
# those channels are missing from both the vocabulary and the candidate set.
df_channels = pd.DataFrame({
    'channel_id': [
        '1', '2', '3', '5', 'A56', 'B72', 'M63' 
    ],
    'channel_name': [
        'Popular', 
        'Best',
        'Highest Rated',
        'Large Following',
        'Nice', 
        'Retro',
        'Modern'
    ]
})

# Disabled alternative: the simplified dataset is kept inside a bare string
# literal, so it is never executed. Move it out of the triple quotes (and
# disable the definitions above) to run the example with it instead.
"""
################## MODIFIED DATASET ##################
df_interactions = pd.DataFrame({
    'user_id': [
        '4d17', 
        '4d17',
        '4b05',
        '4b05',
        '93dd', 
        '93dd',
        '9b34', 
        '4bcd',
        '-4bcd',
        '4bcd'
    ], 
    'channel_id': [
        '1', '2', '6',
        '3', '7', '2', 
        '8', '2', '5', '6'
    ]
})

df_channels = pd.DataFrame({
    'channel_id': [
        '1', '2', '3', '5', '6', '7', '8' 
    ],
    'channel_name': [
        'Popular', 
        'Best',
        'Highest Rated',
        'Large Following',
        'Nice', 
        'Retro',
        'Modern'
    ]
})
"""

# Write both tables to disk (channels first, then interactions), without the
# index column, so they can be streamed back row by row.
for _frame, _csv_path in (
    (df_channels, 'experiment_channels.csv'),
    (df_interactions, 'experiment_interactions.csv'),
):
  _frame.to_csv(_csv_path, index=False)

# Both files have two string columns; header=True skips the column-name row.
_two_string_columns = [tf.string, tf.string]
channels = tf.data.experimental.CsvDataset(
    'experiment_channels.csv',
    record_defaults=_two_string_columns,
    header=True)
interactions = tf.data.experimental.CsvDataset(
    'experiment_interactions.csv',
    record_defaults=_two_string_columns,
    header=True)

def preprocess_channels(x, y):
  """Reduce a channels CSV row to its first column.

  Args:
    x: value of the first column (channel_id in experiment_channels.csv).
    y: value of the second column (channel_name); ignored.

  Returns:
    `x`, unchanged.
  """
  channel_id, _ignored_name = x, y
  return channel_id

def preprocess_interactions(x, y):
  """Turn an interactions CSV row into the feature dict the model consumes.

  Args:
    x: value of the first column (user_id), a string tensor.
    y: value of the second column (channel_id), a string tensor.

  Returns:
    A dict with the keys "user_id" (the digits of `x` only) and
    "channel_id" (`y`, unchanged).
  """
  return {
    # The character class is `[^0-9]`: the previous `[^0-9^]` also excluded a
    # literal '^' from removal, so carets would have survived the clean-up.
    # NOTE: dropping the non-digit characters is lossy (distinct IDs may
    # collapse into one). The IDs are later fed to a StringLookup, which
    # handles arbitrary strings, so this step is optional.
    "user_id": tf.strings.regex_replace(x, r"[^0-9]", ""),
    "channel_id": y
  }

# Apply the row-level preprocessing to both streamed datasets.
channels = channels.map(preprocess_channels)
interactions = interactions.map(preprocess_interactions)


def _select_user_id(example):
  """Pick the "user_id" entry out of an interaction feature dict."""
  return example["user_id"]


# Learn the user-ID vocabulary from the interaction log.
interactions_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
_user_ids = interactions.map(_select_user_id)
interactions_vocabulary.adapt(_user_ids)

# Learn the channel-ID vocabulary from the channel catalogue.
channels_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
channels_vocabulary.adapt(channels)

# Build a model.
class Model(tfrs.Model):
  """Retrieval model that looks up string IDs before embedding them.

  Relies on the module-level `interactions_vocabulary`, `channels_vocabulary`
  and `channels` objects, which must exist before the model is constructed.
  """

  def __init__(self):
    """Create the user tower, the channel tower and the retrieval task."""
    super().__init__()

    # Each tower maps a string ID to an integer index and then to a
    # 64-dimensional vector; the table size follows the adapted vocabulary.
    user_embedding = tf.keras.layers.Embedding(
        interactions_vocabulary.vocabulary_size(), 64)
    self.user_model = tf.keras.Sequential(
        [interactions_vocabulary, user_embedding])

    channel_embedding = tf.keras.layers.Embedding(
        channels_vocabulary.vocabulary_size(), 64)
    self.item_model = tf.keras.Sequential(
        [channels_vocabulary, channel_embedding])

    # The retrieval task is evaluated against every channel in the catalogue,
    # embedded with the same item model that is being trained.
    candidate_embeddings = channels.batch(1).map(self.item_model)
    top_k_metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)
    self.task = tfrs.tasks.Retrieval(metrics=top_k_metrics)

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Embed the "user_id" and "channel_id" features and run the task on them.

    `training` is accepted for interface compatibility and is not used here.
    """
    query = self.user_model(features["user_id"])
    candidate = self.item_model(features["channel_id"])
    return self.task(query, candidate)


# Seed TensorFlow's global RNG *before* the model is constructed: the model
# builds its item tower while mapping the candidate dataset in __init__, so
# seeding afterwards would leave the initial weights unseeded.
tf.random.set_seed(42)

model = Model()
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))

# CsvDataset yields one row at a time, so batch the interactions here.
model.fit(interactions.batch(1), epochs=5)

Note this example uses tf.keras.layers.StringLookup.

The tf.data.experimental.CsvDataset class provides a minimal CSV Dataset interface.

However, it gives you far more flexibility than a higher-level API like tf.data.experimental.make_csv_dataset. Check out the docs for more information.