I have a TensorFlow dataset that I would like to group by key. However, my keys are strings, not integers, so I can't do that:
person_tensor = tf.constant(["person1", "person2", "person1", "person3", "person3"])
value_tensor = tf.constant([1,2,3,4,5])
ds = tf.data.Dataset.from_tensor_slices((person_tensor, value_tensor)).map(lambda person, value: {'person': person, 'value': value})
# >> list(ds.as_numpy_iterator())
# [{'person': b'person1', 'value': 1},
# {'person': b'person2', 'value': 2},
# {'person': b'person1', 'value': 3},
# {'person': b'person3', 'value': 4},
# {'person': b'person3', 'value': 5}]
window_size = 10 # not very important; I don't mind getting multiple groups for one key as long as there is only one key per group
ds.group_by_window(
    key_func=lambda row: row['person'],
    window_size=window_size,
    reduce_func=lambda key, rows: rows.batch(window_size)
)
# ValueError: Tensor conversion requested dtype int64 for Tensor with dtype string: <tf.Tensor 'args_0:0' shape=() dtype=string>
This method complains that the string cannot be converted to int64. Indeed, key_func is supposed to return an int64, not a string like mine does.
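For what it's worth, grouping works fine as soon as key_func returns an int64. Here is a minimal toy example of my own, grouping integers by parity:
ints = tf.data.Dataset.range(6)  # elements are already int64
grouped = ints.group_by_window(
    key_func=lambda x: x % 2,  # int64 key, so no dtype error
    window_size=3,
    reduce_func=lambda key, rows: rows.batch(3)
)
# >> list(grouped.as_numpy_iterator())
# [array([0, 2, 4]), array([1, 3, 5])]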
Is there another method to group a dataset by key?
CodePudding user response:
You could try utilizing tf.lookup.StaticHashTable like this:
import tensorflow as tf

person_tensor = tf.constant(["person1", "person2", "person1", "person3", "person3"])
value_tensor = tf.constant([1,2,3,4,5])

# Assign each unique person string an int64 id
k_tensor = tf.unique(person_tensor)[0]
v_tensor = tf.cast(tf.range(tf.shape(k_tensor)[0]), dtype=tf.int64)

# string -> int64 id
table = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(k_tensor, v_tensor),
    default_value=-1)

# int64 id -> string, to recover the original names after grouping
reverse_table = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(v_tensor, k_tensor),
    default_value="")

# Replace the string key with its int64 id so group_by_window accepts it
ds = tf.data.Dataset.from_tensor_slices((person_tensor, value_tensor)).map(
    lambda person, value: {'person': table.lookup(person), 'value': value})
window_size = 10 # not very important; I don't mind getting multiple groups for one key as long as there is only one key per group
ds = ds.group_by_window(
    key_func=lambda row: row['person'],
    window_size=window_size,
    reduce_func=lambda key, rows: rows.batch(window_size)
)
# Map the ids back to strings; each window holds a single key, so tf.unique
# collapses the repeated names to one
ds = ds.map(lambda d: {'person': tf.unique(reverse_table.lookup(d['person']))[0],
                       'value': d['value']})
list(ds.as_numpy_iterator())
[{'person': array([b'person1'], dtype=object),
'value': array([1, 3], dtype=int32)},
{'person': array([b'person2'], dtype=object),
'value': array([2], dtype=int32)},
{'person': array([b'person3'], dtype=object),
'value': array([4, 5], dtype=int32)}]
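Note that this approach assumes you know all keys up front, since the lookup tables have to be built before the pipeline runs. As a quick sanity check of the two tables (reusing the tensors defined above):
ids = table.lookup(person_tensor)    # [0, 1, 0, 2, 2]
names = reverse_table.lookup(ids)    # back to the original strings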
Or, if your strings really do contain some unique number, you could also try using tf.strings.bytes_split and tf.strings.to_number:
person_tensor = tf.constant(["person1", "person2", "person1", "person3", "person3"])
value_tensor = tf.constant([1,2,3,4,5])
ds = tf.data.Dataset.from_tensor_slices((person_tensor, value_tensor)).map(lambda person, value: {'person': person, 'value': value})
window_size = 10 # not very important; I don't mind getting multiple groups for one key as long as there is only one key per group
ds = ds.group_by_window(
    # last byte of the name, parsed as an int64 (works for single-digit suffixes)
    key_func=lambda row: tf.strings.to_number(
        tf.strings.bytes_split(row['person'])[-1], tf.int64),
    window_size=window_size,
    reduce_func=lambda key, rows: rows.batch(window_size)
)
ds = ds.map(lambda d: {'person': tf.unique(d['person'])[0], 'value': d['value']})
list(ds.as_numpy_iterator())
[{'person': array([b'person1'], dtype=object),
'value': array([1, 3], dtype=int32)},
{'person': array([b'person2'], dtype=object),
'value': array([2], dtype=int32)},
{'person': array([b'person3'], dtype=object),
'value': array([4, 5], dtype=int32)}]
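One caveat: tf.strings.bytes_split(...)[-1] only looks at the very last byte, so multi-digit suffixes break it ("person10" and "person20" would both yield key 0 and end up grouped together). A sketch of a more robust key_func, assuming every name ends in a decimal number of any length, is to strip the non-digit prefix with tf.strings.regex_replace first:
ds = ds.group_by_window(
    # Assumption: every key ends in a decimal number; drop the non-digit
    # prefix and parse whatever remains
    key_func=lambda row: tf.strings.to_number(
        tf.strings.regex_replace(row['person'], r'\D+', ''), tf.int64),
    window_size=window_size,
    reduce_func=lambda key, rows: rows.batch(window_size)
)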
Or you could try using tf.strings.to_hash_bucket_fast:
person_tensor = tf.constant(["person1", "person2", "person1", "person3", "person3"])
value_tensor = tf.constant([1,2,3,4,5])
ds = tf.data.Dataset.from_tensor_slices((person_tensor, value_tensor)).map(lambda person, value: {'person': person, 'value': value})
window_size = 10 # not very important; I don't mind getting multiple groups for one key as long as there is only one key per group
ds = ds.group_by_window(
    # hash each name into one of 5 buckets and use the bucket id as the key
    key_func=lambda row: tf.strings.to_hash_bucket_fast(row['person'], 5),
    window_size=window_size,
    reduce_func=lambda key, rows: rows.batch(window_size)
)
ds = ds.map(lambda d: {'person': tf.unique(d['person'])[0], 'value': d['value']})
list(ds.as_numpy_iterator())
[{'person': array([b'person3'], dtype=object),
'value': array([4, 5], dtype=int32)},
{'person': array([b'person1'], dtype=object),
'value': array([1, 3], dtype=int32)},
{'person': array([b'person2'], dtype=object),
'value': array([2], dtype=int32)}]
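Keep in mind that hashing can collide: with only 5 buckets, two different names can land in the same bucket and would then share a window. Raising the bucket count (the value below is an assumption; pick something well above your expected number of distinct keys) makes that unlikely, but only the StaticHashTable approach guarantees exact grouping:
num_buckets = 1000  # assumption: much larger than the number of distinct keys
ds = ds.group_by_window(
    key_func=lambda row: tf.strings.to_hash_bucket_fast(row['person'], num_buckets),
    window_size=window_size,
    reduce_func=lambda key, rows: rows.batch(window_size)
)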