Create a TensorFlow Dataset from a pandas DataFrame with multiple labels? - CodePudding

I am trying to load a pandas DataFrame into a TensorFlow Dataset. The columns are text [a string] and labels [a list stored as a string].

A row would look something like: text: "Hi, this is me in here, ...." labels: [0, 1, 1, 0, 1, 0, 0, 0, ...]

Each text has a probability for each of the 17 labels.

I can't find a way to load the data set as an array and then call model.fit(). I read numerous answers and tried using the following code in df_to_dataset().

I can't figure out what I am missing in this ..

labels = labels.apply(lambda x: np.asarray(literal_eval(x)))  # Attempt 1: parse each label string and wrap the result in a NumPy array
labels = labels.apply(lambda x: [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])  # Attempt 2: the same fixed literal list for every row

#  ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).

Printing one row (from the returned data set) shows:

({'text': <tf.Tensor: shape=(), dtype=string, numpy=b'Text in here'>}, <tf.Tensor: shape=(), dtype=string, numpy=b'[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0, 0]'>)

When I don't use any casting, model.fit raises an exception, because it can't work with string labels.

UnimplementedError:  Cast string to float is not supported
     [[node sparse_categorical_crossentropy/Cast (defined at <ipython-input-102-71a9fbf2d907>:4) ]] [Op:__inference_train_function_1193273]

def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  """Slice a DataFrame into a tf.data.Dataset of (features, label) pairs.

  The 'labels' column is removed from a copy of ``dataframe`` and used as
  the target; the remaining columns are passed as a dict keyed by column
  name. ``shuffle`` and ``batch_size`` are accepted but not used here, and
  the label values are forwarded exactly as stored in the frame.
  """
  features = dataframe.copy()  # pop() below mutates, so work on a copy
  targets = features.pop('labels')
  return tf.data.Dataset.from_tensor_slices((dict(features), targets))

# Convert the train / validation / test frames using the same batch size.
train_ds, val_ds, test_ds = (
    df_to_dataset(frame, batch_size=batch_size)
    for frame in (df_train, df_val, df_test)
)

def build_classifier_model():
  """Assemble the text classifier: TF-Hub preprocessing -> encoder -> dense head.

  Takes a scalar string input named 'text', runs it through the hub layers
  loaded from ``tfhub_handle_preprocess`` and ``tfhub_handle_encoder``
  (both defined outside this function), applies dropout to the encoder's
  'pooled_output', and ends in a 17-unit softmax layer named 'classifier'.

  NOTE(review): softmax produces one distribution across the 17 units; if
  each row carries a multi-hot label vector, a sigmoid head may be what is
  wanted - confirm against the training labels.
  """
  inputs = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

  tokens = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')(inputs)

  # The encoder is created trainable, so its weights are updated during fit().
  bert = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  pooled = bert(tokens)['pooled_output']

  regularized = tf.keras.layers.Dropout(0.2)(pooled)
  probabilities = tf.keras.layers.Dense(17, activation='softmax', name='classifier')(regularized)

  return tf.keras.Model(inputs, probabilities)


# Instantiate the model defined by build_classifier_model().
classifier_model = build_classifier_model()

# NOTE(review): the traceback quoted earlier fails inside this loss while
# casting string labels; whether this loss suits list-valued labels should
# also be confirmed.
loss = 'sparse_categorical_crossentropy'
metrics = ["accuracy"]
# `optimizer` is not defined in this snippet; it must already exist in scope.
classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)

# Train on train_ds with val_ds as validation data; `epochs` is defined elsewhere.
history = classifier_model.fit(x=train_ds,
                               validation_data=val_ds,
                               epochs=epochs)

CodePudding user response：

You could use the tf.strings functions in the map method.

import tensorflow as tf

# Sample input: each element is a label list stored as a string.
x = ['[0, 1, 0]', '[1, 1, 0]']


def splitter(string):
    """Parse a bracketed, comma-separated label string into an int32 vector.

    Intended for use inside ``tf.data.Dataset.map``: ``string`` is a scalar
    string tensor such as ``'[0, 1, 0]'`` and the result is a 1-D int32
    tensor with one entry per comma-separated item.

    Brackets and all whitespace are removed before splitting, so the
    spacing after commas does not matter, and items written as floats
    (e.g. ``'1.0'``) are accepted and truncated to integers.
    """
    # Strip '[' , ']' and whitespace in one pass instead of relying on the
    # brackets being the first/last characters and the separator being ', '.
    cleaned = tf.strings.regex_replace(string, r"[\[\]\s]", "")
    tokens = tf.strings.split(cleaned, ",")
    # Parse as float first: integer parsing rejects tokens such as "1.0".
    values = tf.strings.to_number(tokens, out_type=tf.float32)
    return tf.cast(values, tf.int32)


# One dataset element per string in x, each parsed by splitter.
ds = tf.data.Dataset.from_tensor_slices(x).map(splitter)

# Fetch the first parsed element.
next(iter(ds))

<tf.Tensor: shape=(3,), dtype=int32, numpy=array([0, 1, 0])>

CodePudding user response：

Just try preprocessing your dataframe before using tf.data.Dataset.from_tensor_slices. Here is a simple example:

# Example frame contents: six sentences, each paired with a label list
# stored as a string.
dummy_data = {
    'text': [
        "Improve the physical fitness of your goldfish by getting him a bicycle",
        "You are unsure whether or not to trust him but very thankful that you wore a turtle neck",
        "Not all people who wander are lost",
        "There is a reason that roses have thorns",
        "Charles ate the french fries knowing they would be his last meal",
        "He hated that he loved what she hated about hate",
    ],
    'labels': [
        '[0, 1, 1, 1, 1]',
        '[1, 1, 1, 0, 0]',
        '[1, 0, 1, 0, 0]',
        '[1, 0, 1, 0, 0]',
        '[1, 1, 1, 0, 0]',
        '[1, 1, 1, 0, 0]',
    ],
}
  
def remove_and_split(s):
  """Split a bracketed, comma-separated string into its trimmed items.

  Args:
    s: Text such as ``'[0, 1, 1]'``. Every '[' and ']' is removed before
      splitting on commas.

  Returns:
    A list of item strings with surrounding whitespace stripped. A string
    with nothing between the brackets (e.g. ``'[]'``) gives an empty list
    instead of a list holding one empty string, so converting the items
    with ``int`` does not fail on empty input.
  """
  inner = s.replace('[', '').replace(']', '').strip()
  if not inner:
    return []
  # Empty items between commas are kept so malformed rows still surface
  # as errors when the caller converts them.
  return [item.strip() for item in inner.split(',')]

# Build the example frame, then turn each stringified label list into a list of ints.
df = pd.DataFrame(dummy_data)
df["labels"] = df["labels"].apply(lambda raw: list(map(int, remove_and_split(raw))))

def df_to_dataset(dataframe, shuffle=True, batch_size=2):
  """Build a batched tf.data.Dataset of (features, labels) from a DataFrame.

  Args:
    dataframe: Frame with a 'labels' column holding one equal-length list
      of numbers per row; every other column becomes a feature keyed by
      its column name. The input frame is not modified.
    shuffle: When true, elements are shuffled before batching.
    batch_size: Number of rows per batch.

  Returns:
    A dataset yielding ``(features_dict, labels)`` batches.
  """
  dataframe = dataframe.copy()  # pop() below mutates, so work on a copy
  # A Series whose cells are lists is rejected by TensorFlow ("Unsupported
  # object type list"); a plain nested list converts to one dense tensor.
  labels = dataframe.pop('labels').tolist()

  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    # Buffer spans the whole frame; max() keeps the size valid for an empty frame.
    ds = ds.shuffle(buffer_size=max(len(dataframe), 1))
  return ds.batch(batch_size)

And don't forget to batch the dataset (chain .batch(batch_size) onto the result of tf.data.Dataset.from_tensor_slices) when using a BERT preprocessing layer.