I'm trying to apply text categorization using Keras. I have imported my data as a Pandas dataframe and have converted it to a tf.Dataset. The problem is that I cannot use the TextVectorization layer of Keras as the below code throws this error:
AttributeError: 'NoneType' object has no attribute 'ndims'
My CSV's headers:
- Class Index : int32
- Title: string
- Description: string
What have I missed ? Below is my code:
import re
import string
import tensorflow as tf
import pandas as pd
from tensorflow import keras
def create_dataset(csv_file, batch_size):
df = pd.read_csv(csv_file)
labels = df.pop('Class Index').transform(lambda x: x - 1)
n_labels = len(pd.unique(labels))
ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
ds = ds.batch(batch_size)
ds = ds.shuffle(10000)
return ds, n_labels
def load_data(data_dir, batch_size):
train_ds, n_labels = create_dataset(data_dir '/train.csv', batch_size)
train_total_batches = len(train_ds)
raw_test_ds, _ = create_dataset(data_dir '/test.csv', batch_size)
raw_train_ds = train_ds.take(int(round(0.8 * train_total_batches)));
raw_val_ds = train_ds.skip(int(round(0.8 * train_total_batches))).take(int(round(0.2 * train_total_batches)));
return raw_train_ds, raw_val_ds, raw_test_ds, n_labels
raw_train_ds, raw_val_ds, raw_test_ds, n_labels = load_data('.', 64)
def custom_standardization(input_data):
lowercase = tf.strings.lower(input_data)
stripped_backslash = tf.strings.regex_replace(lowercase, "\'", '' '')
return tf.strings.regex_replace(stripped_backslash, '[%s]' % re.escape(string.punctuation), '')
max_features = 5000
sequence_length = 250
vectorize_layer = keras.layers.experimental.preprocessing.TextVectorization(
standardize=custom_standardization,
max_tokens=max_features,
output_mode='int',
output_sequence_length=sequence_length)
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)
EDIT 1: Sample from train_test
:
{'Title': <tf.Tensor: shape=(64,), dtype=string, numpy=
array([b'Malaysia testing 3 people for bird flu; says outbreak isolated',
b"Kroger's Profit Climbs, Misses Forecast (Reuters)",
b'Blasts Shake Najaf as U.S. Planes Attack Rebels',
...(omitted)
b'Kerry Camp Makes Video to Defuse Attacks (AP)'], dtype=object)>, 'Description': <tf.Tensor: shape=(64,), dtype=string, numpy=
array([b'Malaysian officials on Saturday were testing three people who fell ill in a village hit by the deadly H5N1 bird flu strain, after international health officials warned that the virus appeared to be entrenched in parts of ',
b'Reuters - Kroger Co. , the top U.S.\\grocer, on Tuesday said quarterly profit rose 29 percent as it\\kept a tight rein on expenses and sales rebounded.',
b' NAJAF, Iraq (Reuters) - Strong blasts were heard in the besieged city of Najaf early Sunday as U.S. military planes unleashed cannon and howitzer fire and a heavy firefight erupted.',
b'Manny Ramirez homered and drove in five runs as the Red Sox earned their fifth straight victory.',
...(omitted)
dtype=object)>}
CodePudding user response:
Since you are using a internal dictionary, you can try something like this:
import tensorflow as tf
d ={"Title": ["Malaysia testing 3 people for bird flu; says outbreak isolate",
"Kroger's Profit Climbs, Misses Forecast (Reuters)",
"Blasts Shake Najaf as U.S. Planes Attack Rebels"],
"Description": [
"Kerry Camp Makes Video to Defuse Attacks (AP)",
"Malaysian officials on Saturday were testing three people who fell ill in a village hit by the deadly H5N1 bird flu strain, after international health officials warned that the virus appeared to be entrenched in parts of ",
" NAJAF, Iraq (Reuters) - Strong blasts were heard in the besieged city of Najaf early Sunday as U.S. military planes unleashed cannon and howitzer fire and a heavy firefight erupted."
]}
train_text = tf.data.Dataset.from_tensor_slices(d).batch(2)
max_features = 5000
sequence_length = 250
vectorize_layer = tf.keras.layers.TextVectorization(
max_tokens=max_features,
output_mode='int',
output_sequence_length=sequence_length)
#This example assumes that you have already excluded the labels.
#train_text = raw_train_ds.map(lambda x, y: x)
train_text = train_text.map(lambda x: tf.concat([x['Title'], x['Description']], axis=0))
vectorize_layer.adapt(train_text)
This example assumes that you have already excluded the labels.