I have a dataset made of tensors. A sample tensor looks like this:
(<tf.Tensor: shape=(1,), dtype=string, numpy=
array([b"Some text"],
dtype=object)>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
Instead of passing the whole dataset as input, I would like to iterate over it and feed the tensors to the model one at a time.
I tried the loop shown below, but it raises the following error:
IndexError: list index out of range
# Failing attempt from the question: every element of `dataset` is a
# (text, label) tuple, and the whole tuple is handed to fit() as `x`.
for element in dataset:
    model.fit(x=element)
What is the best way to achieve the desired output?
Thank you in advance!
You can find my model here:
import pandas as pd
import tensorflow as tf
# Pipeline settings.
VOCAB_SIZE = 1000
BUFFER_SIZE = 2
BATCH_SIZE = 1

# Load the labelled tweets; pop() removes the 'class' column from df and
# returns it, so df keeps only the feature column(s).
df = pd.read_csv('labeled_tweets_processed.csv')
labels = df.pop('class')
dataset = tf.data.Dataset.from_tensor_slices((df, labels))

# Fit the vectorizer's vocabulary on the text part of each (text, label) pair.
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
text_only = dataset.map(lambda text, label: text)
encoder.adapt(text_only)

# Shuffle, batch and prefetch, one step at a time.
shuffled = dataset.shuffle(BUFFER_SIZE)
batched = shuffled.batch(BATCH_SIZE)
train_dataset = batched.prefetch(tf.data.AUTOTUNE)

# Text -> token ids -> embeddings -> BiLSTM -> dense head with one logit.
layer_stack = [
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True,
    ),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1),
]
model = tf.keras.Sequential(layer_stack)

# The last Dense layer has no activation, so the loss is told to expect logits.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(1e-4),
    metrics=['accuracy'],
)
and some of my dataset here:
(<tf.Tensor: shape=(1,), dtype=string, numpy=
array([b'text1'],
dtype=object)>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(1,), dtype=string, numpy=
array([b"text2"],
dtype=object)>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(1,), dtype=string, numpy=
array([b"text3"],
dtype=object)>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
CodePudding user response:
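I'm not sure why you want to call model.fit in a loop, but if you do, unpack each dataset element into features and labels and pass both to fit (the loop in the question passes the whole (text, label) tuple as x).
You can try something like this: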
import pandas as pd
import tensorflow as tf
# Small in-memory DataFrame used in place of the CSV so the example runs as-is.
df = pd.DataFrame(data={'texts': ['Some text', 'Some text', 'Some text', 'Some text', 'Some text'],
                        'class': [0, 0, 1, 1, 1]})
# pop() removes the 'class' column from df and returns it as the labels.
labels = df.pop('class')
dataset = tf.data.Dataset.from_tensor_slices((df, labels))

VOCAB_SIZE = 1000
encoder = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE)
# Build the vocabulary from the text component only (labels are dropped).
encoder.adapt(dataset.map(lambda text, label: text))

BUFFER_SIZE = 2
BATCH_SIZE = 1
train_dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

# The final Dense layer has no activation, hence from_logits=True.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

# Iterate over the batched dataset and unpack every element into features
# and labels before calling fit(). Each call trains on that single batch
# for two epochs.
for x, y in train_dataset:
    model.fit(x, y, epochs=2)