tensorflow2 split dataset into two datasets by column names-CodePudding

I would like to split a tensorflow2 dataset into two datasets such that one contains "features" and another one contains "labels".

In the dataset, each element is a python dictionary:

{'id': 29, 'val1': [0.97, 0.52], 'val2': [627], 'val3': ['baseball', 'football', 'basketball'], 'label': 1.0}

{'id': 11, 'val1': [0.22, 0.36], 'val2': [81], 'val3': ['swimming', 'running', 'jumpoing'], 'label': 0.0}

The dataset is a prefetched dataset loaded from 1000 gzipped text (.gz) files, each 500KB-600KB in size. The data from all files cannot fit in memory at once, so I have to load it in batches (batch size 200) with prefetching.

def a_func(file_paths, batch_size=200):
    """Attempt to build separate feature and label datasets from gzipped TFRecord files.

    Reads ``file_paths`` as a GZIP-compressed ``TFRecordDataset``, batches it
    by ``batch_size``, prefetches, and then tries to derive one dataset with
    the 'label' column and one with every other column.

    NOTE(review): as written this raises before returning; see the comments
    on the two ``map`` calls below.

    Args:
        file_paths: Passed straight to ``tf.data.TFRecordDataset``.
        batch_size: Used both as the batch size and as the prefetch buffer size.

    Returns:
        A ``(features, label)`` pair of datasets (intended; not reached today).
    """
    dataset = tf.data.TFRecordDataset(file_paths, compression_type='GZIP')
    dataset = dataset.batch(batch_size)
    # NOTE(review): prefetch is applied after batch, so this presumably buffers
    # up to `batch_size` *batches* rather than `batch_size` records - confirm intent.
    dataset = dataset.prefetch(batch_size)
    # Split the dataset into features and labels (open question: how?).
    # The call below fails with NameError ("x" not defined): Python parses the
    # argument as the generator expression
    #   (lambda x: col) for col in x if col == 'label'
    # so the `x` in the `for` clause is looked up outside the lambda.
    # NOTE(review): even with the syntax fixed, TFRecordDataset elements are
    # presumably serialized records that must be parsed before columns can be
    # selected by name - confirm the file format.
    label = dataset.map(lambda x: col for col in x if col=='label') # error: "x" not defined
    # Same parsing problem as the line above; never reached because that line raises first.
    features = dataset.map(lambda x: col for col in x if col!='label')
    return features, label

How can I split the dataset into two datasets by column name?

UPDATE

# Load the gzipped files without batching to inspect the raw elements.
ds = tf.data.TFRecordDataset(file_paths, compression_type='GZIP')

type(ds)
# tensorflow.python.data.ops.readers.TFRecordDatasetV2

# Selecting a column by name fails here: the TypeError quoted below says only
# integer/slice-style indices are valid, i.e. each element `x` is a tensor,
# not a dict keyed by column name.
# NOTE(review): the records presumably need to be parsed first (e.g. with
# tf.io.parse_single_example and a feature spec) - confirm the file format.
label = ds.map(lambda x: x['label'])

# TypeError: Only integers, slices (`:`), ellipsis (`...`), tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid indices, got 'label'

# Print each raw element as stored in the files.
for row in ds:
    print(row.numpy()) # show a lot of hex chars

CodePudding user response：

If I understand correctly, you can try something like this:

import tensorflow as tf

# Dummy data: two single-element datasets whose elements are dicts keyed by
# column name, joined into one two-element dataset.
first_record = {'id': 11, 'val1': [0.22, 0.36], 'val2': [81], 'val3': ['swimming', 'running', 'jumpoing'], 'label': 0.0}
second_record = {'id': 29, 'val1': [0.97, 0.52], 'val2': [627], 'val3': ['baseball', 'football', 'basketball'], 'label': 1.0}
ds1 = tf.data.Dataset.from_tensors(first_record)
ds2 = tf.data.Dataset.from_tensors(second_record)
ds = ds1.concatenate(ds2)


def _select_label(row):
    """Return only the 'label' entry of a dataset element."""
    return row['label']


def _select_features(row):
    """Return the explicitly named non-label entries of an element as a tuple."""
    return row['id'], row['val1'], row['val2'], row['val3']


# Two views over the same source dataset: one with labels, one with features.
label = ds.map(_select_label)
features = ds.map(_select_features)

for label_value in label:
    print(label_value)
for feature_values in features:
    print(feature_values)

tf.Tensor(0.0, shape=(), dtype=float32)
tf.Tensor(1.0, shape=(), dtype=float32)
(<tf.Tensor: shape=(), dtype=int32, numpy=11>, <tf.Tensor: shape=(2,), dtype=float32, numpy=array([0.22, 0.36], dtype=float32)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([81], dtype=int32)>, <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'swimming', b'running', b'jumpoing'], dtype=object)>)
(<tf.Tensor: shape=(), dtype=int32, numpy=29>, <tf.Tensor: shape=(2,), dtype=float32, numpy=array([0.97, 0.52], dtype=float32)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([627], dtype=int32)>, <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'baseball', b'football', b'basketball'], dtype=object)>)

Or with dynamic keys:

import tensorflow as tf
import numpy as np

# Dummy data: two single-element datasets whose elements are dicts keyed by
# column name, joined into one two-element dataset.
first_record = {'id': 11, 'val1': [0.22, 0.36], 'val2': [81], 'val3': ['swimming', 'running', 'jumpoing'], 'label': 0.0}
second_record = {'id': 29, 'val1': [0.97, 0.52], 'val2': [627], 'val3': ['baseball', 'football', 'basketball'], 'label': 1.0}
ds1 = tf.data.Dataset.from_tensors(first_record)
ds2 = tf.data.Dataset.from_tensors(second_record)
ds = ds1.concatenate(ds2)


def _select_label(row):
    """Return only the 'label' entry of a dataset element."""
    return row['label']


def _select_non_label(row):
    """Return every value except 'label', ordered by sorted column name.

    The column names are discovered from the element itself, so no feature
    name has to be spelled out.
    """
    feature_keys = sorted(set(row) - {'label'})
    return [row[key] for key in feature_keys]


label = ds.map(_select_label)
features = ds.map(_select_non_label)