I'm attempting to construct the input to an embedding layer for an NLP model, but I am having trouble converting raw text data into the numerical input that the embedding layer requires.
Here is some example data to illustrate what I wish to feed to the NLP model:
# 0 = negative
# 1 = positive
documents = [['topology freaking sucks man, what a waste of time!', 0], ['wow bro you a NLP fan? Tell me more I want to know', 1],
['you know, I will eventually die',0], ['the secret to happiness is to only be depresssed',0],
['what is the floor without feet', 1], ['regicide is permissable only in historical situations',1],
['I do not like delivering wehat based products for I am allergic to wheat', 0],
['Why does he ring the large bell every hour?',0],
['Wisdom comes not from experience but from knowing',1],
['Little is known of the inner workings of the feline mind', 1]]
Each document contains one sentence and one label. This data format was inspired by the tutorial prompt I am working on:
Your Task: Your task in this lesson is to design a small document classification problem with 10 documents of one sentence each and associated labels of positive and negative outcomes, and to train a network with word embedding on these data.
I use the TextVectorization layer from Keras:
# imports needed for the snippets below
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

# create preprocessing layer
VOCAB_SIZE = 500  # maximum vocabulary size across all documents
MAX_SEQUENCE_LENGTH = 50  # maximum number of words/tokens considered in each document

# output mode 'int' assigns a unique integer per token, so in the example below,
# 'topology' is assigned the value 19. The indices follow descending token
# frequency once the layer is adapted, and the vocabulary essentially acts as a
# hashmap from token to index
int_vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=MAX_SEQUENCE_LENGTH
)
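As a quick aside, you can see how the indices get assigned by adapting a small throwaway layer and printing its vocabulary. This is an illustrative sketch only; demo_layer and the toy sentences are not part of the original code:
demo_layer = TextVectorization(output_mode='int')
# Index 0 is reserved for padding and index 1 for OOV ('[UNK]');
# the remaining tokens are ordered by descending frequency.
demo_layer.adapt(['the the cat', 'the cat sat'])
print(demo_layer.get_vocabulary())  # ['', '[UNK]', 'the', 'cat', 'sat']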
The issue now becomes applying this vectorization layer to the raw data documents. Here is the code I have to convert the raw data into a TensorFlow Dataset object:
# Applies adapted layer to TensorFlow dataset
def int_vectorize_text(sentence, label):
    # note: this expand_dims/squeeze pair cancels out, leaving the scalar unchanged
    sentence = tf.expand_dims(sentence, -1)
    sentence = tf.squeeze(sentence, axis=-1)
    return int_vectorize_layer(sentence), label

# passes raw data as a generator to the Dataset from_generator constructor
def generate_data(sentences, labels):
    for s, l in zip(sentences, labels):
        yield s, l
# split raw data between training and validation set
train_docs = documents[:8]
val_docs = documents[8:]
# separate sentences and labels
train_sentences = [d[0] for d in train_docs]
train_labels = [d[1] for d in train_docs]
val_sentences = [d[0] for d in val_docs]
val_labels = [d[1] for d in val_docs]
# convert to tensors
train_sentences_tensor = tf.convert_to_tensor(train_sentences)
train_labels_tensor = tf.convert_to_tensor(train_labels)
val_sentences_tensor = tf.convert_to_tensor(val_sentences)
val_labels_tensor = tf.convert_to_tensor(val_labels)
# build TensorFlow Datasets using the above generator function on the newly constructed tensor objects
train_dataset = tf.data.Dataset.from_generator(
    generate_data, (tf.string, tf.int32), args=(train_sentences_tensor, train_labels_tensor))
val_dataset = tf.data.Dataset.from_generator(
    generate_data, (tf.string, tf.int32), args=(val_sentences_tensor, val_labels_tensor))
# adapt layer using training sentences
int_vectorize_layer.adapt(train_sentences)
# now here is where the error occurs
int_train_df = train_dataset.map(int_vectorize_text) # ERROR
int_val_df = val_dataset.map(int_vectorize_text)
As you can see, an error occurs when we attempt to map int_vectorize_text over the TensorFlow dataset. Specifically, I get the following error:
TypeError Traceback (most recent call last)
/home/akagi/Documents/Projects/MLMastery NLP Tutorial/Lesson 5 - Learned Embedding.ipynb Cell 7 in <cell line: 21>()
19 # Use the map method to apply the int_vectorize_text function to each element of the dataset
20 int_vectorize_layer.adapt(train_sentences)
---> 21 int_train_df = train_dataset.map(int_vectorize_text)
22 int_val_df = val_dataset.map(int_vectorize_text)
File ~/Documents/Projects/.venv/lib/python3.8/site-packages/tensorflow/python/data/ops/dataset_ops.py:2294, in DatasetV2.map(self, map_func, num_parallel_calls, deterministic, name)
2291 if deterministic is not None and not DEBUG_MODE:
2292 warnings.warn("The `deterministic` argument has no effect unless the "
2293 "`num_parallel_calls` argument is specified.")
-> 2294 return MapDataset(self, map_func, preserve_cardinality=True, name=name)
2295 else:
2296 return ParallelMapDataset(
2297 self,
2298 map_func,
(...)
2301 preserve_cardinality=True,
2302 name=name)
File ~/Documents/Projects/.venv/lib/python3.8/site-packages/tensorflow/python/data/ops/dataset_ops.py:5499, in MapDataset.__init__(self, input_dataset, map_func, use_inter_op_parallelism, preserve_cardinality, use_legacy_function, name)
5497 self._use_inter_op_parallelism = use_inter_op_parallelism
5498 self._preserve_cardinality = preserve_cardinality
-> 5499 self._map_func = structured_function.StructuredFunctionWrapper(
...
'>' not supported between instances of 'NoneType' and 'int'
Call arguments received by layer 'text_vectorization' (type TextVectorization):
• inputs=tf.Tensor(shape=<unknown>, dtype=string)
Which seems to imply that a NoneType is being passed. However, I checked the construction of train_dataset and it appears to be correct. Here is what it looks like:
(<tf.Tensor: shape=(), dtype=string, numpy=b'topology freaking sucks man, what a waste of time!'>, <tf.Tensor: shape=(), dtype=int32, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'wow bro you a NLP fan? Tell me more I want to know'>, <tf.Tensor: shape=(), dtype=int32, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'you know, I will eventually die'>, <tf.Tensor: shape=(), dtype=int32, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'the secret to happiness is to only be depresssed'>, <tf.Tensor: shape=(), dtype=int32, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'what is the floor without feet'>, <tf.Tensor: shape=(), dtype=int32, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'regicide is permissable only in historical situations'>, <tf.Tensor: shape=(), dtype=int32, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'I do not like delivering wehat based products for I am allergic to wheat'>, <tf.Tensor: shape=(), dtype=int32, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Why does he ring the large bell every hour?'>, <tf.Tensor: shape=(), dtype=int32, numpy=0>)
Furthermore, if I apply int_vectorize_text manually in a loop like so:
for x in train_dataset:
    print(int_vectorize_text(x[0], x[1]))
No error occurs and I get the desired output. What is going on here?
CodePudding user response:
After reviewing @AloneTogether's clean and more appropriate solution, it appears your issue stems from the train_dataset and val_dataset definitions. The documentation for the tf.data.Dataset.from_generator function recommends that one
... use the output_signature argument. In this case the output will be assumed to consist of objects with the classes, shapes and types defined by tf.TypeSpec objects from the output_signature argument.
As you didn't use the output_signature argument, it defaulted to the deprecated behaviour, which uses either the output_types argument alone or together with output_shapes. In your case, output_types was set to (tf.string, tf.int32), but because you left the output_shapes argument empty, it defaulted to "unknown".
Later, when you go to map the int_vectorize_text function, it attempts to check whether the input shape rank is greater than 1; however, it receives shape=<unknown>, which is of type NoneType, and so the TypeError manifests when comparing it with an int.
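You can see this for yourself by inspecting the dataset's element_spec before mapping. With the original from_generator call it reports an unknown shape; the commented output below is a sketch of what to expect:
print(train_dataset.element_spec)
# (TensorSpec(shape=<unknown>, dtype=tf.string, name=None),
#  TensorSpec(shape=<unknown>, dtype=tf.int32, name=None))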
Knowing all this, you can simply pass ((), ()) as the output_shapes argument in your from_generator call, after the output types (tf.string, tf.int32). Hence, replace these lines:
train_dataset = tf.data.Dataset.from_generator(
    generate_data, (tf.string, tf.int32), args=(train_sentences_tensor, train_labels_tensor))
val_dataset = tf.data.Dataset.from_generator(
    generate_data, (tf.string, tf.int32), args=(val_sentences_tensor, val_labels_tensor))
With:
train_dataset = tf.data.Dataset.from_generator(
    generate_data, output_types=(tf.string, tf.int32), output_shapes=((), ()),
    args=(train_sentences_tensor, train_labels_tensor))
val_dataset = tf.data.Dataset.from_generator(
    generate_data, output_types=(tf.string, tf.int32), output_shapes=((), ()),
    args=(val_sentences_tensor, val_labels_tensor))
Or, the TensorFlow recommended way as @AloneTogether demonstrated:
train_dataset = tf.data.Dataset.from_generator(
    generate_data, output_signature=(
        tf.TensorSpec(shape=(), dtype=tf.string),
        tf.TensorSpec(shape=(), dtype=tf.int32)),
    args=(train_sentences_tensor, train_labels_tensor))
val_dataset = tf.data.Dataset.from_generator(
    generate_data, output_signature=(
        tf.TensorSpec(shape=(), dtype=tf.string),
        tf.TensorSpec(shape=(), dtype=tf.int32)),
    args=(val_sentences_tensor, val_labels_tensor))
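With either version, the dataset now advertises scalar string/label elements, which you can confirm before mapping (again, the commented output is a sketch):
print(train_dataset.element_spec)
# (TensorSpec(shape=(), dtype=tf.string, name=None),
#  TensorSpec(shape=(), dtype=tf.int32, name=None))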
I've removed my original solution as I don't believe in propagating suboptimal code. Full credit to @AloneTogether for showing how it's supposed to be done. My intent with this edit is to explain the error and why it occurred so that you and future readers gain a better understanding.
CodePudding user response:
Here is an example without tf.py_function, as requested by @KyleFHartzenberg:
import tensorflow as tf
# 0 = negative
# 1 = positive
documents = [['topology freaking sucks man, what a waste of time!', 0], ['wow bro you a NLP fan? Tell me more I want to know', 1],
['you know, I will eventually die',0], ['the secret to happiness is to only be depresssed',0],
['what is the floor without feet', 1], ['regicide is permissable only in historical situations',1],
['I do not like delivering wehat based products for I am allergic to wheat', 0],
['Why does he ring the large bell every hour?',0],
['Wisdom comes not from experience but from knowing',1],
['Little is known of the inner workings of the feline mind', 1]]
VOCAB_SIZE = 500  # maximum vocabulary size across all documents
MAX_SEQUENCE_LENGTH = 50  # maximum number of words/tokens considered in each document

# output mode 'int' assigns a unique integer per token, so in the example below,
# 'topology' is assigned the value 19. The indices follow descending token
# frequency once the layer is adapted
int_vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=MAX_SEQUENCE_LENGTH,
)
def int_vectorize_text(sentence, label):
    return int_vectorize_layer(sentence), label

def generate_data(sentences, labels):
    for s, l in zip(sentences, labels):
        yield s, l
train_docs = documents[:8]
val_docs = documents[8:]
train_sentences = [d[0] for d in train_docs]
train_labels = [d[1] for d in train_docs]
val_sentences = [d[0] for d in val_docs]
val_labels = [d[1] for d in val_docs]
train_sentences_tensor = tf.convert_to_tensor(train_sentences)
train_labels_tensor = tf.convert_to_tensor(train_labels)
train_dataset = tf.data.Dataset.from_generator(
    generate_data, output_signature=(
        tf.TensorSpec(shape=(), dtype=tf.string),
        tf.TensorSpec(shape=(), dtype=tf.int32)),
    args=(train_sentences_tensor, train_labels_tensor))

# adapt the layer on the training sentences only (the labels are dropped)
int_vectorize_layer.adapt(train_dataset.map(lambda x, y: x))
int_train_df = train_dataset.map(int_vectorize_text)
for x, y in int_train_df:
    print(x, y)
    break
tf.Tensor(
[19 42 22 34 7 10 17 29 20 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0], shape=(50,), dtype=int64) tf.Tensor(0, shape=(), dtype=int32)
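From here, int_train_df can be fed straight into a model with an Embedding layer, which is what the original question was building toward. A minimal sketch follows; the layer sizes, batch size, and epoch count are illustrative choices, not prescribed by the tutorial:
# Minimal sketch: binary classifier with a learned word embedding.
# Hyperparameters here are illustrative, not from the original question.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, 8),        # (batch, 50) -> (batch, 50, 8)
    tf.keras.layers.GlobalAveragePooling1D(),        # (batch, 50, 8) -> (batch, 8)
    tf.keras.layers.Dense(1, activation='sigmoid'),  # positive/negative score
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(int_train_df.batch(2), epochs=10)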