I'm trying to build NLP classifier, data consists of 2 columns, one with text other one represents target with 4 classes in total. I've one-hot encoded target, but when running the model.fit() method shapes do not match.
Example of data structure:
text | target |
---|---|
'such a lovely day' | 'a' |
'not so great' | 'b' |
'hello world' | 'c' |
Below is the code that I used:
from sklearn.model_selection import train_test_split
import tensorflow as tf
import re
import numpy as np
import pandas as pd
import string
# load and split data
df = pd.read_csv('train.csv', index_col=[0])
encoder = LabelEncoder()
target_labels = encoder.fit_transform(df['target'])
target_labels = tf.keras.utils.to_categorical(target_labels, 4)
X_train, X_test, y_train, y_test = train_test_split(df[['text']], target_labels, test_size=0.2, random_state=1)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=1)
# convert to tf dataset
raw_train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train))
raw_val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val))
raw_test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test))
# text cleanup
def custom_standardization(input_data):
new_line_replace = tf.strings.regex_replace(input_data, '\n', ' ')
non_alphanum_replace = tf.strings.regex_replace(new_line_replace, '[^a-zA-Z0-9_ ]', '')
stripped = tf.strings.strip(non_alphanum_replace)
lowercase = tf.strings.lower(stripped)
return tf.expand_dims(tf.strings.regex_replace(lowercase,
'[%s]' % re.escape(string.punctuation),
''), -1)
# creating layer for text vectoriazation
max_features = 10000
sequence_length = 250
vectorize_layer = tf.keras.layers.TextVectorization(
standardize=custom_standardization,
max_tokens=max_features,
output_mode='int',
output_sequence_length=sequence_length)
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)
def vectorize_text(text, label):
return vectorize_layer(text), label
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)
train_ds = train_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
# modeling
model = tf.keras.Sequential([
vectorize_layer,
tf.keras.layers.Embedding(max_features 1, 16),
tf.keras.layers.Dropout(0.2),
tf.keras.layers.GlobalAveragePooling1D(),
tf.keras.layers.Dropout(0.2),
tf.keras.layers.Dense(4, activation='softmax')
])
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
epochs = 10
history = model.fit(
train_ds,
validation_data=val_ds,
epochs=epochs)
When running this code I get following error: ValueError: Shapes (4, 1) and (None, 4) are incompatible
I've tried to flatten out and reshape data to be (1,4) but that doesn't work, also if I change dense layer to 1 neuron, output is impossible to interpret and it's not what's required.
train_ds.take(1) <TakeDataset element_spec=(TensorSpec(shape=(None, 250), dtype=tf.int64, name=None), TensorSpec(shape=(4,), dtype=tf.float32, name=None))>
Anybody faced the same problem before, appreciate any help?
CodePudding user response:
Just write this
target_labels = target_labels.reshape(length of training set, 1, 4)
after
target_labels = tf.keras.utils.to_categorical(target_labels, 4)