I'm trying to apply text categorization using Keras. I have imported my data as a Pandas dataframe and have converted it to a tf.Dataset. The problem is that I cannot use the TextVectorization layer of Keras as the below code throws this error:
AttributeError: 'NoneType' object has no attribute 'ndims'
My CSV's headers:
- Class Index : int32
- Title: string
- Description: string
What have I missed? Below is my code:
import re
import string
import tensorflow as tf
import pandas as pd
from tensorflow import keras
def create_dataset(csv_file, batch_size):
    """Load a CSV into a batched tf.data.Dataset of (features_dict, label) pairs.

    Args:
        csv_file: Path to a CSV with a 'Class Index' label column plus
            feature columns (here: 'Title' and 'Description').
        batch_size: Number of examples per batch.

    Returns:
        (dataset, n_labels): the batched dataset and the number of distinct labels.
    """
    df = pd.read_csv(csv_file)
    # 'Class Index' in the CSV is 1-based; shift to 0-based labels for Keras.
    labels = df.pop('Class Index').transform(lambda x: x - 1)
    n_labels = len(pd.unique(labels))
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
    # Shuffle at the example level *before* batching (shuffling after .batch()
    # only permutes whole batches), and keep the shuffled order fixed across
    # iterations so a later take()/skip() split does not leak examples
    # between the train and validation sets.
    ds = ds.shuffle(10000, reshuffle_each_iteration=False)
    ds = ds.batch(batch_size)
    return ds, n_labels
def load_data(data_dir, batch_size):
    """Build train/validation/test datasets from '<data_dir>/train.csv' and '<data_dir>/test.csv'.

    The training CSV is split 80/20 (by batch) into train and validation sets.

    Args:
        data_dir: Directory containing 'train.csv' and 'test.csv'.
        batch_size: Number of examples per batch.

    Returns:
        (raw_train_ds, raw_val_ds, raw_test_ds, n_labels).
    """
    # BUG FIX: the original code read `data_dir '/train.csv'` — adjacent
    # non-literal operands with no '+' operator, which is a SyntaxError.
    train_ds, n_labels = create_dataset(data_dir + '/train.csv', batch_size)
    train_total_batches = len(train_ds)
    raw_test_ds, _ = create_dataset(data_dir + '/test.csv', batch_size)
    # 80/20 split measured in whole batches.
    n_train_batches = int(round(0.8 * train_total_batches))
    raw_train_ds = train_ds.take(n_train_batches)
    raw_val_ds = train_ds.skip(n_train_batches).take(
        int(round(0.2 * train_total_batches)))
    return raw_train_ds, raw_val_ds, raw_test_ds, n_labels
# Build the splits from CSVs in the current directory, 64 examples per batch.
raw_train_ds, raw_val_ds, raw_test_ds, n_labels = load_data('.', 64)
def custom_standardization(input_data):
    """Lowercase *input_data*, drop apostrophes, then strip remaining punctuation.

    Intended for use as the `standardize` callable of a TextVectorization layer.
    """
    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, "'", '')
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)
    return tf.strings.regex_replace(text, punctuation_pattern, '')
max_features = 5000      # vocabulary size cap for TextVectorization
sequence_length = 250    # pad/truncate every example to this many tokens

vectorize_layer = keras.layers.experimental.preprocessing.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

# BUG FIX: the dataset's feature element is a *dict* of string tensors
# ({'Title': ..., 'Description': ...}), and TextVectorization.adapt() cannot
# consume a dict — that is what raised
# "AttributeError: 'NoneType' object has no attribute 'ndims'".
# Flatten the two text columns into a single 1-D batch of strings first.
train_text = raw_train_ds.map(
    lambda features, label: tf.concat(
        [features['Title'], features['Description']], axis=0))
vectorize_layer.adapt(train_text)
EDIT 1: Sample batch from train_text:
{'Title': <tf.Tensor: shape=(64,), dtype=string, numpy=
array([b'Malaysia testing 3 people for bird flu; says outbreak isolated',
b"Kroger's Profit Climbs, Misses Forecast (Reuters)",
b'Blasts Shake Najaf as U.S. Planes Attack Rebels',
...(omitted)
b'Kerry Camp Makes Video to Defuse Attacks (AP)'], dtype=object)>, 'Description': <tf.Tensor: shape=(64,), dtype=string, numpy=
array([b'Malaysian officials on Saturday were testing three people who fell ill in a village hit by the deadly H5N1 bird flu strain, after international health officials warned that the virus appeared to be entrenched in parts of ',
b'Reuters - Kroger Co. , the top U.S.\\grocer, on Tuesday said quarterly profit rose 29 percent as it\\kept a tight rein on expenses and sales rebounded.',
b' NAJAF, Iraq (Reuters) - Strong blasts were heard in the besieged city of Najaf early Sunday as U.S. military planes unleashed cannon and howitzer fire and a heavy firefight erupted.',
b'Manny Ramirez homered and drove in five runs as the Red Sox earned their fifth straight victory.',
...(omitted)
dtype=object)>}
CodePudding user response:
Since you are using an internal dictionary, you can try something like this:
import tensorflow as tf
# Toy stand-in for the asker's data: a dict of two string columns, mirroring
# the {'Title': ..., 'Description': ...} structure of their dataset.
d ={"Title": ["Malaysia testing 3 people for bird flu; says outbreak isolate",
"Kroger's Profit Climbs, Misses Forecast (Reuters)",
"Blasts Shake Najaf as U.S. Planes Attack Rebels"],
"Description": [
"Kerry Camp Makes Video to Defuse Attacks (AP)",
"Malaysian officials on Saturday were testing three people who fell ill in a village hit by the deadly H5N1 bird flu strain, after international health officials warned that the virus appeared to be entrenched in parts of ",
" NAJAF, Iraq (Reuters) - Strong blasts were heard in the besieged city of Najaf early Sunday as U.S. military planes unleashed cannon and howitzer fire and a heavy firefight erupted."
]}
# Each dataset element is a dict of batched string tensors, like the asker's.
train_text = tf.data.Dataset.from_tensor_slices(d).batch(2)
max_features = 5000
sequence_length = 250
vectorize_layer = tf.keras.layers.TextVectorization(
max_tokens=max_features,
output_mode='int',
output_sequence_length=sequence_length)
#This example assumes that you have already excluded the labels.
#train_text = raw_train_ds.map(lambda x, y: x)
# Key step: adapt() cannot take a dict of tensors, so concatenate the two
# string columns into one flat batch of strings before adapting.
train_text = train_text.map(lambda x: tf.concat([x['Title'], x['Description']], axis=0))
vectorize_layer.adapt(train_text)
This example assumes that you have already excluded the labels.