I have multiple time series data that looks something like this:
import pandas as pd
import numpy as np

df = pd.DataFrame({'Time': np.tile(np.arange(5), 2),
                   'Object': np.concatenate([[i] * 5 for i in [1, 2]]),
                   'Feature1': np.random.randint(10, size=10),
                   'Feature2': np.random.randint(10, size=10)})
   Time  Object  Feature1  Feature2
0     0       1         3         3
1     1       1         9         2
2     2       1         6         6
3     3       1         4         0
4     4       1         7         7
5     0       2         4         8
6     1       2         3         7
7     2       2         1         1
8     3       2         7         5
9     4       2         1         7
where each object (1 and 2) has its own data (about 2000 objects in the real data). I would like to feed this data chunkwise into an RNN/LSTM using tf.data.Dataset.window, but in such a way that data from different objects never ends up in the same window, which is what happens in this example:
dataset = tf.data.Dataset.from_tensor_slices(df)
for w in dataset.window(3, shift=1, drop_remainder=True):
    print(list(w.as_numpy_iterator()))
Output:
[array([0, 1, 3, 3]), array([1, 1, 9, 2]), array([2, 1, 6, 6])]
[array([1, 1, 9, 2]), array([2, 1, 6, 6]), array([3, 1, 4, 0])]
[array([2, 1, 6, 6]), array([3, 1, 4, 0]), array([4, 1, 7, 7])]
[array([3, 1, 4, 0]), array([4, 1, 7, 7]), array([0, 2, 4, 8])] # Mixed data from both objects
[array([4, 1, 7, 7]), array([0, 2, 4, 8]), array([1, 2, 3, 7])] # Mixed data from both objects
[array([0, 2, 4, 8]), array([1, 2, 3, 7]), array([2, 2, 1, 1])]
[array([1, 2, 3, 7]), array([2, 2, 1, 1]), array([3, 2, 7, 5])]
[array([2, 2, 1, 1]), array([3, 2, 7, 5]), array([4, 2, 1, 7])]
Expected output:
[array([0, 1, 3, 3]), array([1, 1, 9, 2]), array([2, 1, 6, 6])]
[array([1, 1, 9, 2]), array([2, 1, 6, 6]), array([3, 1, 4, 0])]
[array([2, 1, 6, 6]), array([3, 1, 4, 0]), array([4, 1, 7, 7])]
[array([0, 2, 4, 8]), array([1, 2, 3, 7]), array([2, 2, 1, 1])]
[array([1, 2, 3, 7]), array([2, 2, 1, 1]), array([3, 2, 7, 5])]
[array([2, 2, 1, 1]), array([3, 2, 7, 5]), array([4, 2, 1, 7])]
Maybe there is another way to do this. The main requirement is that my model should be able to tell that the (non-mixed) data chunks come from different objects (maybe via an embedding).
CodePudding user response:
Hmm, maybe just create two separate dataframes and then concatenate after windowing. That way, no window will mix data from both objects:
import tensorflow as tf
import pandas as pd
import numpy as np
df = pd.DataFrame({'Time': np.tile(np.arange(5), 2),
                   'Object': np.concatenate([[i] * 5 for i in [1, 2]]),
                   'Feature1': np.random.randint(10, size=10),
                   'Feature2': np.random.randint(10, size=10)})

# Split the frame per object, window each part separately,
# and concatenate the two windowed datasets.
df1 = df[df['Object'] == 1]
df2 = df[df['Object'] == 2]

dataset = tf.data.Dataset.from_tensor_slices(df1).window(3, shift=1, drop_remainder=True).concatenate(
    tf.data.Dataset.from_tensor_slices(df2).window(3, shift=1, drop_remainder=True))

for w in dataset:
    print(list(w.as_numpy_iterator()))
Output:
[array([0, 1, 3, 3]), array([1, 1, 9, 2]), array([2, 1, 6, 6])]
[array([1, 1, 9, 2]), array([2, 1, 6, 6]), array([3, 1, 4, 0])]
[array([2, 1, 6, 6]), array([3, 1, 4, 0]), array([4, 1, 7, 7])]
[array([0, 2, 4, 8]), array([1, 2, 3, 7]), array([2, 2, 1, 1])]
[array([1, 2, 3, 7]), array([2, 2, 1, 1]), array([3, 2, 7, 5])]
[array([2, 2, 1, 1]), array([3, 2, 7, 5]), array([4, 2, 1, 7])]
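Note that window returns a dataset of nested window datasets, so you cannot feed it to a Keras model directly. Here is a minimal sketch of how each window can be flattened into a dense (window_size, num_columns) tensor with flat_map and then batched for an LSTM; the window size of 3 comes from the example above, while the batch size of 2 is just an illustrative value I picked:

window_size = 3

# Turn every nested window dataset into a single (window_size, 4) tensor,
# then batch the resulting windows for training.
flat = dataset.flat_map(lambda w: w.batch(window_size))
batched = flat.batch(2)

for chunk in batched:
    print(chunk.shape)  # e.g. (2, 3, 4): batch x time steps x columns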
Another approach would be to use tf.data.Dataset.filter, like this:
import tensorflow as tf
import pandas as pd
import numpy as np
df = pd.DataFrame({'Time': np.tile(np.arange(5), 2),
                   'Object': np.concatenate([[i] * 5 for i in [1, 2]]),
                   'Feature1': np.random.randint(10, size=10),
                   'Feature2': np.random.randint(10, size=10)})

objects = df['Object'].unique()
dataset = tf.data.Dataset.from_tensor_slices(df)

new_datasets = None
for o in objects:
    # Keep only the rows of the current object (column 1 is 'Object')
    # and window them on their own.
    windowed = dataset.filter(lambda x: tf.math.equal(x[1], tf.constant(o))).window(3, shift=1, drop_remainder=True)
    if new_datasets is None:
        new_datasets = windowed
    else:
        new_datasets = new_datasets.concatenate(windowed)

for w in new_datasets:
    print(list(w.as_numpy_iterator()))
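This prints the same per-object windows as the first approach. As for marking which object a chunk comes from (the embedding idea from the question): below is a rough sketch, not a definitive implementation, of how the object ID could be split off from each window and routed through its own Embedding input while the features go into the LSTM. The helper name split_window, the layer sizes, and the Dense(1) head are placeholders I chose for illustration:

window_size = 3
num_objects = len(objects)

def split_window(w):
    # w has shape (window_size, 4): columns are Time, Object, Feature1, Feature2.
    features = tf.cast(w[:, 2:], tf.float32)   # (window_size, 2) feature values
    object_id = tf.cast(w[0, 1], tf.int64)     # scalar ID, constant within a window
    return {'features': features, 'object_id': object_id}

inputs = new_datasets.flat_map(lambda w: w.batch(window_size)).map(split_window).batch(2)

# Two-input model: the features go through the LSTM, the object ID through an Embedding.
# IDs here are 1 and 2; with arbitrary IDs you would remap them to 0..N-1 first.
feat_in = tf.keras.Input(shape=(window_size, 2), name='features')
obj_in = tf.keras.Input(shape=(), dtype=tf.int64, name='object_id')
obj_emb = tf.keras.layers.Embedding(input_dim=num_objects + 1, output_dim=4)(obj_in)
lstm_out = tf.keras.layers.LSTM(16)(feat_in)
merged = tf.keras.layers.Concatenate()([lstm_out, obj_emb])
output = tf.keras.layers.Dense(1)(merged)
model = tf.keras.Model([feat_in, obj_in], output)

for batch in inputs.take(1):
    print(model(batch).shape)  # (2, 1)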