I'm having issues with the categories in my data: I can't set the final Dense softmax layer to 3 units instead of 1 for my 3 categories.
I assume my issue is with vectorize_text, but I am not completely sure. It may also be that I am not setting up the label tensors correctly.
# Start of data generation
dummy_data = {'text': ['Love', 'Money', 'War'],
              'labels': [1, 2, 3]
              }
dummy_data['text'] = dummy_data['text'] * 500
dummy_data['labels'] = dummy_data['labels'] * 500
df_train_bogus = pd.DataFrame(dummy_data)

def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    ds = tf.data.Dataset.from_tensor_slices(dict(dataframe)).batch(batch_size)
    return ds
batch_size = 32
train_ds = df_to_dataset(df_train_bogus, batch_size=batch_size)
val_ds = df_to_dataset(df_train_bogus, batch_size=batch_size)
# Model constants (can be lower but that doesn't matter for this example)
sequence_length = 128
max_features = 20000 # vocab size
embedding_dim = 128
# End of data generation
# Start of vectorization
vectorize_layer = TextVectorization(
    standardize='lower_and_strip_punctuation',
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

def vectorize_text(text, labels):
    print(text)
    print(labels)
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), labels
vectorize_layer.adapt(df_train_bogus['text'])
train_ds_vectorized = train_ds.map(lambda x: (vectorize_text(x['text'], x['labels'])))
val_ds_vectorized = val_ds.map(lambda x: (vectorize_text(x['text'], x['labels'])))
"""
Output:
Tensor("args_1:0", shape=(None,), dtype=string)
Tensor("args_0:0", shape=(None,), dtype=int64)
Tensor("args_1:0", shape=(None,), dtype=string)
Tensor("args_0:0", shape=(None,), dtype=int64)
"""
# The model
model = Sequential()
model.add(Embedding(max_features, embedding_dim, input_length=sequence_length))
model.add(LSTM(embedding_dim, input_shape=(None, sequence_length)))
model.add(Dense(3, activation='softmax'))
# Fails with this error:
# ValueError: Shapes (None, 1) and (None, 3) are incompatible
model.summary()
model.compile(loss="categorical_crossentropy",
optimizer="adam",
metrics=["accuracy"]) # model 4
epochs = 10
# Fit the model using the train and test datasets.
history = model.fit(train_ds_vectorized, validation_data=val_ds_vectorized, epochs=epochs)
CodePudding user response:
Your labels from your dummy data are causing the problem. If they are not one-hot encoded, I would suggest using the sparse_categorical_crossentropy loss function instead, which works on the integer targets that you already have. Check out the docs for more information. Here is a complete working example:
import tensorflow as tf
import pandas as pd
dummy_data = {'text': ['Love', 'Money', 'War'],
              'labels': [0, 1, 2]
              }
dummy_data['text'] = dummy_data['text'] * 500
dummy_data['labels'] = dummy_data['labels'] * 500
df_train_bogus = pd.DataFrame(dummy_data)

def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    ds = tf.data.Dataset.from_tensor_slices(dict(dataframe)).batch(batch_size)
    return ds
batch_size = 32
train_ds = df_to_dataset(df_train_bogus, batch_size=batch_size)
val_ds = df_to_dataset(df_train_bogus, batch_size=batch_size)
# Model constants (can be lower but that doesn't matter for this example)
sequence_length = 128
max_features = 20000 # vocab size
embedding_dim = 128
# Start of vectorization
vectorize_layer = tf.keras.layers.TextVectorization(
    standardize='lower_and_strip_punctuation',
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

def vectorize_text(text, labels):
    print(text)
    print(labels)
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), labels
vectorize_layer.adapt(df_train_bogus['text'])
train_ds_vectorized = train_ds.map(lambda x: (vectorize_text(x['text'], x['labels'])))
val_ds_vectorized = val_ds.map(lambda x: (vectorize_text(x['text'], x['labels'])))
"""
Output:
Tensor("args_1:0", shape=(None,), dtype=string)
Tensor("args_0:0", shape=(None,), dtype=int64)
Tensor("args_1:0", shape=(None,), dtype=string)
Tensor("args_0:0", shape=(None,), dtype=int64)
"""
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(max_features, embedding_dim, input_length=sequence_length))
model.add(tf.keras.layers.LSTM(embedding_dim, input_shape=(None, sequence_length)))
model.add(tf.keras.layers.Dense(3, activation='softmax'))
model.summary()
model.compile(loss="sparse_categorical_crossentropy",
optimizer="adam",
metrics=["sparse_categorical_accuracy"]) # model 4
epochs = 10
history = model.fit(train_ds_vectorized, validation_data=val_ds_vectorized, epochs=epochs)
"""
Output:
Tensor("args_1:0", shape=(None,), dtype=string)
Tensor("args_0:0", shape=(None,), dtype=int64)
Tensor("args_1:0", shape=(None,), dtype=string)
Tensor("args_0:0", shape=(None,), dtype=int64)
"""
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(max_features, embedding_dim, input_length=sequence_length))
model.add(tf.keras.layers.LSTM(embedding_dim, input_shape=(None, sequence_length)))
model.add(tf.keras.layers.Dense(3, activation='softmax'))
model.summary()
model.compile(loss="sparse_categorical_crossentropy",
optimizer="adam",
metrics=["accuracy"]) # model 4
epochs = 10
history = model.fit(train_ds_vectorized, validation_data=val_ds_vectorized, epochs=epochs)
Note that your labels need to start from 0 (i.e. run from 0 to n - 1), since sparse_categorical_crossentropy works with the category index of the most likely class, which can be 0.
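For reference, here is a minimal standalone sketch (the labels and predictions below are made up purely for illustration) showing that sparse_categorical_crossentropy on integer class indices gives the same value as categorical_crossentropy on their one-hot encoding:
import tensorflow as tf
# Hypothetical integer labels (indexed from 0) and softmax-style predictions
y_true_int = tf.constant([0, 1, 2])
y_true_onehot = tf.one_hot(y_true_int, depth=3)  # the same labels, one-hot encoded
y_pred = tf.constant([[0.7, 0.2, 0.1],
                      [0.1, 0.8, 0.1],
                      [0.2, 0.2, 0.6]])
sparse_loss = tf.keras.losses.SparseCategoricalCrossentropy()(y_true_int, y_pred)
dense_loss = tf.keras.losses.CategoricalCrossentropy()(y_true_onehot, y_pred)
print(sparse_loss.numpy(), dense_loss.numpy())  # both print the same value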
Update: An accuracy of 0.333 is expected here, since you have 3 classes with an equal number of samples for each class. You need to use a larger dataset to see any reasonable results.
CodePudding user response:
Your issue is with your loss function. categorical_crossentropy in Keras expects the classes not as integer indices, but as one-hot encoded targets matching the model's softmax output. So your training labels should be of the form:
from tensorflow.keras.utils import to_categorical

n_classes = 3
y = [0, 1, 2]  # IMPORTANT: index from 0
cat_y = to_categorical(y, n_classes)
# cat_y:
# array([[1., 0., 0.],
#        [0., 1., 0.],
#        [0., 0., 1.]], dtype=float32)
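With the targets in this one-hot form, both the labels and the softmax output of the model have shape (batch, 3), which is exactly what the error "Shapes (None, 1) and (None, 3) are incompatible" was pointing at. A tiny standalone check (the predictions below are made up for illustration):
import tensorflow as tf
from tensorflow.keras.utils import to_categorical

cat_y = to_categorical([0, 1, 2], 3)     # one-hot labels, shape (3, 3)
y_pred = tf.constant([[0.8, 0.1, 0.1],   # hypothetical softmax outputs, shape (3, 3)
                      [0.2, 0.7, 0.1],
                      [0.1, 0.2, 0.7]])
loss = tf.keras.losses.CategoricalCrossentropy()(cat_y, y_pred)
print(loss.numpy())  # mean negative log-probability of the true classes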
To achieve this you need to make a few changes to how you process your data, as you can see below:
# Start of data generation
dummy_data = {'text': ['Love', 'Money', 'War'],
              'labels': [1, 2, 0]
              }
dummy_data['text'] = dummy_data['text'] * 500
dummy_data['labels'] = dummy_data['labels'] * 500
dummy_data['labels'] = to_categorical(dummy_data['labels'], 3)  # one-hot encode the labels

def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    # build (text, one-hot label) pairs from the passed-in data
    ds = tf.data.Dataset.from_tensor_slices((dataframe['text'], dataframe['labels']))
    return ds
batch_size = 32
train_ds = df_to_dataset(dummy_data, batch_size=batch_size)
val_ds = df_to_dataset(dummy_data, batch_size=batch_size)
# Model constants (can be lower but that doesn't matter for this example)
sequence_length = 128
max_features = 20000 # vocab size
embedding_dim = 128
# End of data generation
# Start of vectorization
vectorize_layer = TextVectorization(
    standardize='lower_and_strip_punctuation',
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

def vectorize_text(text, labels):
    print(text)
    print(labels)
    text = tf.expand_dims(text, -1)
    # expand the label dims too, so each one-hot label comes out with shape (1, 3)
    return vectorize_layer(text), tf.expand_dims(labels, 0)
vectorize_layer.adapt(dummy_data['text'])
train_ds_vectorized = train_ds.map(lambda x,y: vectorize_text(x,y))
val_ds_vectorized = val_ds.map(lambda x,y: vectorize_text(x,y))