I would like to split a TensorFlow 2 dataset into two datasets, one containing the "features" and the other containing the "labels".
In the dataset, each element is a Python dictionary:
{'id': 29, 'val1': [0.97, 0.52], 'val2': [627], 'val3': ['baseball', 'football', 'basketball'], 'label': 1.0}
{'id': 11, 'val1': [0.22, 0.36], 'val2': [81], 'val3': ['swimming', 'running', 'jumpoing'], 'label': 0.0}
The dataset is a prefetched dataset loaded from 1000 gzip-compressed text (.gz) files, each 500-600 KB in size. The data cannot all be loaded into memory at once, so I have to prefetch it in batches (batch size 200).
# Build a batched, prefetched dataset from the gzip-compressed files and try
# to split it into a "features" dataset and a "label" dataset.
def a_func(file_paths, batch_size=200):
dataset = tf.data.TFRecordDataset(file_paths, compression_type='GZIP')
dataset = dataset.batch(batch_size)
# NOTE(review): prefetch() is applied after batch(), so its buffer size
# presumably counts batches rather than single records -- confirm that
# buffering `batch_size` batches is intended.
dataset = dataset.prefetch(batch_size)
# Attempt to split the dataset into features and labels.
# NOTE(review): the argument to map() below is not a lambda containing a loop.
# Python parses it as a generator expression,
# `((lambda x: col) for col in x if col == 'label')`, so the `x` in
# `for col in x` is looked up outside the lambda, where no `x` exists.
label = dataset.map(lambda x: col for col in x if col=='label') # error: "x" not defined
features = dataset.map(lambda x: col for col in x if col!='label')
return features, label
How can I split the dataset into two datasets by column name?
UPDATE
# Read the gzip-compressed files as a TFRecord dataset and inspect what it yields.
ds = tf.data.TFRecordDataset(file_paths, compression_type='GZIP')
type(ds)
# tensorflow.python.data.ops.readers.TFRecordDatasetV2
# NOTE(review): the loop at the end prints raw bytes, so each element here is
# presumably a serialized record rather than a dictionary. That would explain
# why indexing it with 'label' fails; the records likely need to be parsed
# into a dict (e.g. with tf.io.parse_single_example and a feature spec)
# before they can be split by key -- confirm against the actual file format.
label = ds.map(lambda x: x['label'])
# TypeError: Only integers, slices (`:`), ellipsis (`...`), tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid indices, got 'label'
for row in ds:
print(row.numpy()) # show a lot of hex chars
CodePudding user response:
If I understand correctly, you can try something like this:
import tensorflow as tf
# Create dummy data: two single-element datasets, concatenated into one.
first_record = {'id': 11, 'val1': [0.22, 0.36], 'val2': [81], 'val3': ['swimming', 'running', 'jumpoing'], 'label': 0.0}
second_record = {'id': 29, 'val1': [0.97, 0.52], 'val2': [627], 'val3': ['baseball', 'football', 'basketball'], 'label': 1.0}
ds1 = tf.data.Dataset.from_tensors(first_record)
ds2 = tf.data.Dataset.from_tensors(second_record)
ds = ds1.concatenate(ds2)


def _select_label(x):
    """Return the value stored under the 'label' key of one element."""
    return x['label']


def _select_features(x):
    """Return the non-label values of one element as a tuple."""
    return (x['id'], x['val1'], x['val2'], x['val3'])


label = ds.map(_select_label)
features = ds.map(_select_features)

# Print every label first, then every feature tuple.
for split in (label, features):
    for item in split:
        print(item)
tf.Tensor(0.0, shape=(), dtype=float32)
tf.Tensor(1.0, shape=(), dtype=float32)
(<tf.Tensor: shape=(), dtype=int32, numpy=11>, <tf.Tensor: shape=(2,), dtype=float32, numpy=array([0.22, 0.36], dtype=float32)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([81], dtype=int32)>, <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'swimming', b'running', b'jumpoing'], dtype=object)>)
(<tf.Tensor: shape=(), dtype=int32, numpy=29>, <tf.Tensor: shape=(2,), dtype=float32, numpy=array([0.97, 0.52], dtype=float32)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([627], dtype=int32)>, <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'baseball', b'football', b'basketball'], dtype=object)>)
Or with dynamic keys:
import tensorflow as tf
import numpy as np
# Create dummy data: two single-element datasets, concatenated into one.
first_record = {'id': 11, 'val1': [0.22, 0.36], 'val2': [81], 'val3': ['swimming', 'running', 'jumpoing'], 'label': 0.0}
second_record = {'id': 29, 'val1': [0.97, 0.52], 'val2': [627], 'val3': ['baseball', 'football', 'basketball'], 'label': 1.0}
ds1 = tf.data.Dataset.from_tensors(first_record)
ds2 = tf.data.Dataset.from_tensors(second_record)
ds = ds1.concatenate(ds2)


def _select_label(x):
    """Return the value stored under the 'label' key of one element."""
    return x['label']


def _select_features(x):
    """Return the values of every key except 'label', without naming the keys."""
    # np.setdiff1d gives the sorted, unique keys of x that are not 'label'.
    feature_keys = np.setdiff1d(list(x.keys()), ['label'])
    return [x.get(key) for key in list(feature_keys)]


label = ds.map(_select_label)
features = ds.map(_select_features)