I'm having issues with the categories in my data: I can't set the final Dense softmax layer to 3 units instead of 1 for my 3 categories.
I assume my issue is with vectorize_text, but I am not completely sure. It may also be that I am not setting up the label tensors correctly.
# Start of data generation
dummy_data = {'text': ['Love', 'Money', 'War'],
              'labels': [1, 2, 3]
              }
dummy_data['text'] = dummy_data['text'] * 500
dummy_data['labels'] = dummy_data['labels'] * 500
df_train_bogus = pd.DataFrame(dummy_data)

def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    ds = tf.data.Dataset.from_tensor_slices(dict(dataframe)).batch(batch_size)
    return ds
batch_size = 32
train_ds = df_to_dataset(df_train_bogus, batch_size=batch_size)
val_ds = df_to_dataset(df_train_bogus, batch_size=batch_size)
# Model constants (can be lower but that doesn't matter for this example)
sequence_length = 128
max_features = 20000 # vocab size
embedding_dim = 128
# End of data generation
# Start of vectorization
vectorize_layer = TextVectorization(
    standardize='lower_and_strip_punctuation',
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

def vectorize_text(text, labels):
    print(text)
    print(labels)
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), labels
vectorize_layer.adapt(df_train_bogus['text'])
train_ds_vectorized = train_ds.map(lambda x: (vectorize_text(x['text'], x['labels'])))
val_ds_vectorized = val_ds.map(lambda x: (vectorize_text(x['text'], x['labels'])))
"""
Output:
Tensor("args_1:0", shape=(None,), dtype=string)
Tensor("args_0:0", shape=(None,), dtype=int64)
Tensor("args_1:0", shape=(None,), dtype=string)
Tensor("args_0:0", shape=(None,), dtype=int64)
"""
# The model
model = Sequential()
model.add(Embedding(max_features, embedding_dim, input_length=sequence_length))
model.add(LSTM(embedding_dim, input_shape=(None, sequence_length)))
model.add(Dense(3, activation='softmax'))
# Fails with this error:
# ValueError: Shapes (None, 1) and (None, 3) are incompatible
model.summary()
model.compile(loss="categorical_crossentropy",
optimizer="adam",
metrics=["accuracy"]) # model 4
epochs = 10
# Fit the model using the train and test datasets.
history = model.fit(train_ds_vectorized, validation_data=val_ds_vectorized, epochs=epochs)
CodePudding user response:
Your labels from your dummy data are causing the problem. If they are not one-hot encoded, I would suggest using the sparse_categorical_crossentropy loss function instead, which works on the integer targets that you already have. Check out the docs for more information. Here is a complete working example:
import tensorflow as tf
import pandas as pd
dummy_data = {'text': ['Love', 'Money', 'War'],
              'labels': [0, 1, 2]
              }
dummy_data['text'] = dummy_data['text'] * 500
dummy_data['labels'] = dummy_data['labels'] * 500
df_train_bogus = pd.DataFrame(dummy_data)

def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    ds = tf.data.Dataset.from_tensor_slices(dict(dataframe)).batch(batch_size)
    return ds
batch_size = 32
train_ds = df_to_dataset(df_train_bogus, batch_size=batch_size)
val_ds = df_to_dataset(df_train_bogus, batch_size=batch_size)
# Model constants (can be lower but that doesn't matter for this example)
sequence_length = 128
max_features = 20000 # vocab size
embedding_dim = 128
# Start of vectorization
vectorize_layer = tf.keras.layers.TextVectorization(
    standardize='lower_and_strip_punctuation',
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

def vectorize_text(text, labels):
    print(text)
    print(labels)
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), labels
vectorize_layer.adapt(df_train_bogus['text'])
train_ds_vectorized = train_ds.map(lambda x: (vectorize_text(x['text'], x['labels'])))
val_ds_vectorized = val_ds.map(lambda x: (vectorize_text(x['text'], x['labels'])))
"""
Output:
Tensor("args_1:0", shape=(None,), dtype=string)
Tensor("args_0:0", shape=(None,), dtype=int64)
Tensor("args_1:0", shape=(None,), dtype=string)
Tensor("args_0:0", shape=(None,), dtype=int64)
"""
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(max_features, embedding_dim, input_length=sequence_length))
model.add(tf.keras.layers.LSTM(embedding_dim, input_shape=(None, sequence_length)))
model.add(tf.keras.layers.Dense(3, activation='softmax'))
model.summary()
model.compile(loss="sparse_categorical_crossentropy",
optimizer="adam",
metrics=["sparse_categorical_accuracy"]) # model 4
epochs = 10
history = model.fit(train_ds_vectorized, validation_data=val_ds_vectorized, epochs=epochs)
"""
Output:
Tensor("args_1:0", shape=(None,), dtype=string)
Tensor("args_0:0", shape=(None,), dtype=int64)
Tensor("args_1:0", shape=(None,), dtype=string)
Tensor("args_0:0", shape=(None,), dtype=int64)
"""
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(max_features, embedding_dim, input_length=sequence_length))
model.add(tf.keras.layers.LSTM(embedding_dim, input_shape=(None, sequence_length)))
model.add(tf.keras.layers.Dense(3, activation='softmax'))
model.summary()
model.compile(loss="sparse_categorical_crossentropy",
optimizer="adam",
metrics=["accuracy"]) # model 4
epochs = 10
history = model.fit(train_ds_vectorized, validation_data=val_ds_vectorized, epochs=epochs)
Note that your labels need to start from 0 (i.e. run from 0 to n - 1), since sparse_categorical_crossentropy works with the category index of the most likely class, which can be 0.
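For reference, here is a minimal standalone sketch (the labels and predictions below are made up purely for illustration) showing that sparse_categorical_crossentropy on integer class indices gives the same value as categorical_crossentropy on their one-hot encoding:
import tensorflow as tf
# Hypothetical integer labels (indexed from 0) and softmax-style predictions
y_true_int = tf.constant([0, 1, 2])
y_true_onehot = tf.one_hot(y_true_int, depth=3)  # the same labels, one-hot encoded
y_pred = tf.constant([[0.7, 0.2, 0.1],
                      [0.1, 0.8, 0.1],
                      [0.2, 0.2, 0.6]])
sparse_loss = tf.keras.losses.SparseCategoricalCrossentropy()(y_true_int, y_pred)
dense_loss = tf.keras.losses.CategoricalCrossentropy()(y_true_onehot, y_pred)
print(sparse_loss.numpy(), dense_loss.numpy())  # both print the same value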
Update: An accuracy of 0.333 is expected here, since you have 3 classes with an equal number of samples for each class. You need to use a larger dataset to see any reasonable results.
CodePudding user response:
Your issue is with your loss function. categorical_crossentropy in Keras expects the classes not as integer indices, but as one-hot encoded targets matching the model's softmax output. So your training labels should be of the form:
from tensorflow.keras.utils import to_categorical

n_classes = 3
y = [0, 1, 2]  # IMPORTANT: index from 0
cat_y = to_categorical(y, n_classes)
# cat_y:
# array([[1., 0., 0.],
#        [0., 1., 0.],
#        [0., 0., 1.]], dtype=float32)
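With the targets in this one-hot form, both the labels and the softmax output of the model have shape (batch, 3), which is exactly what the error "Shapes (None, 1) and (None, 3) are incompatible" was pointing at. A tiny standalone check (the predictions below are made up for illustration):
import tensorflow as tf
from tensorflow.keras.utils import to_categorical

cat_y = to_categorical([0, 1, 2], 3)     # one-hot labels, shape (3, 3)
y_pred = tf.constant([[0.8, 0.1, 0.1],   # hypothetical softmax outputs, shape (3, 3)
                      [0.2, 0.7, 0.1],
                      [0.1, 0.2, 0.7]])
loss = tf.keras.losses.CategoricalCrossentropy()(cat_y, y_pred)
print(loss.numpy())  # mean negative log-probability of the true classes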
To achieve this you need to make a few changes to how you process your data, as you can see below:
# Start of data generation
dummy_data = {'text': ['Love', 'Money', 'War'],
              'labels': [1, 2, 0]
              }
dummy_data['text'] = dummy_data['text'] * 500
dummy_data['labels'] = dummy_data['labels'] * 500
dummy_data['labels'] = to_categorical(dummy_data['labels'], 3)  # one-hot encode the labels

def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    # build (text, one-hot label) pairs from the passed-in data
    ds = tf.data.Dataset.from_tensor_slices((dataframe['text'], dataframe['labels']))
    return ds
batch_size = 32
train_ds = df_to_dataset(dummy_data, batch_size=batch_size)
val_ds = df_to_dataset(dummy_data, batch_size=batch_size)
# Model constants (can be lower but that doesn't matter for this example)
sequence_length = 128
max_features = 20000 # vocab size
embedding_dim = 128
# End of data generation
# Start of vectorization
vectorize_layer = TextVectorization(
    standardize='lower_and_strip_punctuation',
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

def vectorize_text(text, labels):
    print(text)
    print(labels)
    text = tf.expand_dims(text, -1)
    # expand the label dims too, so each one-hot label comes out with shape (1, 3)
    return vectorize_layer(text), tf.expand_dims(labels, 0)
vectorize_layer.adapt(dummy_data['text'])
train_ds_vectorized = train_ds.map(lambda x,y: vectorize_text(x,y))
val_ds_vectorized = val_ds.map(lambda x,y: vectorize_text(x,y))