Tensorflow target shape not matching - how to properly format data-CodePudding

I'm trying to build an NLP classifier. The data consists of 2 columns: one with the text, and the other with the target, which has 4 classes in total. I've one-hot encoded the target, but when I run the model.fit() method the shapes do not match.

Example of data structure:

text	target
'such a lovely day'	'a'
'not so great'	'b'
'hello world'	'c'

Below is the code that I used:

import re
import string

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the CSV and one-hot encode the string labels.
# LabelEncoder lives in sklearn.preprocessing and has to be imported at the
# top of the file; without that import this section raises NameError.
df = pd.read_csv('train.csv', index_col=[0])
encoder = LabelEncoder()
target_labels = encoder.fit_transform(df['target'])  # class strings -> integer ids
target_labels = tf.keras.utils.to_categorical(target_labels, 4)  # ids -> one-hot rows of width 4

# 80/10/10 split: hold out 20% first, then halve that into validation and test.
# df[['text']] (double brackets) is a one-column DataFrame, so every sample
# keeps a feature axis of length 1.
X_train, X_test, y_train, y_test = train_test_split(
    df[['text']], target_labels, test_size=0.2, random_state=1)
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, test_size=0.5, random_state=1)

# Wrap each split in a tf.data.Dataset of (text, one_hot_label) pairs.
# from_tensor_slices yields one sample per element; nothing in this section
# batches the datasets.
raw_train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train))
raw_val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val))
raw_test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test))

# text cleanup
def custom_standardization(input_data):
  """Clean raw text tensors before they are tokenized.

  Newlines become spaces, every character other than ASCII letters, digits,
  underscore and space is dropped, surrounding whitespace is stripped and
  the text is lowercased. A final pass removes any character listed in
  ``string.punctuation``, and the result gains a trailing axis of size 1.
  """
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)

  text = tf.strings.regex_replace(input_data, '\n', ' ')
  text = tf.strings.regex_replace(text, '[^a-zA-Z0-9_ ]', '')
  text = tf.strings.lower(tf.strings.strip(text))
  text = tf.strings.regex_replace(text, punctuation_pattern, '')
  return tf.expand_dims(text, -1)
# Text vectorization layer: turns each cleaned string into integer token ids.
max_features = 10_000   # passed to the layer as max_tokens
sequence_length = 250   # passed to the layer as output_sequence_length

vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=max_features,
    output_sequence_length=sequence_length,
    output_mode='int',
    standardize=custom_standardization,
)

# Fit the layer on the training texts only (labels are dropped first).
train_text = raw_train_ds.map(lambda text, _label: text)
vectorize_layer.adapt(train_text)

def vectorize_text(text, label):
  """Run ``text`` through ``vectorize_layer``; ``label`` is returned unchanged."""
  token_ids = vectorize_layer(text)
  return token_ids, label

# Vectorize every split, then cache and prefetch it.
# NOTE(review): none of these pipelines calls .batch(), so each element is a
# single sample, while Keras `fit` expects a Dataset to yield batches. This
# looks related to the "(4, 1) vs (None, 4)" shape error -- confirm whether
# batching was intended here.
train_ds = (raw_train_ds.map(vectorize_text)
            .cache()
            .prefetch(buffer_size=tf.data.AUTOTUNE))
val_ds = (raw_val_ds.map(vectorize_text)
          .cache()
          .prefetch(buffer_size=tf.data.AUTOTUNE))
test_ds = (raw_test_ds.map(vectorize_text)
           .cache()
           .prefetch(buffer_size=tf.data.AUTOTUNE))

# Model: embedding -> dropout -> average over the sequence axis -> dropout ->
# softmax over the 4 classes.
# NOTE(review): vectorize_layer is the first layer here, yet train_ds/val_ds
# have already been passed through it by vectorize_text -- confirm whether
# the text is meant to be vectorized in only one of the two places.
model = tf.keras.Sequential([
  vectorize_layer,
  # Embedding input size is max_features + 1 (the "+" had been lost, which
  # left this line as a syntax error).
  tf.keras.layers.Embedding(max_features + 1, 16),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.GlobalAveragePooling1D(),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(4, activation='softmax')
])

# Compile with categorical cross-entropy, Adam and an accuracy metric,
# then train for `epochs` passes while validating on val_ds.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

epochs = 10
history = model.fit(train_ds, epochs=epochs, validation_data=val_ds)

When running this code I get the following error: ValueError: Shapes (4, 1) and (None, 4) are incompatible

I've tried to flatten and reshape the data to (1, 4), but that doesn't work. If I change the dense layer to 1 neuron, the output is impossible to interpret, and it's not what's required anyway.

train_ds.take(1) <TakeDataset element_spec=(TensorSpec(shape=(None, 250), dtype=tf.int64, name=None), TensorSpec(shape=(4,), dtype=tf.float32, name=None))>

Has anybody faced the same problem before? I'd appreciate any help.

CodePudding user response：

Just write this

target_labels = target_labels.reshape(-1, 1, 4)  # -1 lets NumPy infer the number of samples

after

target_labels = tf.keras.utils.to_categorical(target_labels, 4)