I have a dataset which looks something like this:
time | src | a | b | c | d | e | Label
---------------------------------------
 0.  |  1  | # | # | # | # | # |   #
 1.  |  1  | # | # | # | # | # |   #
 2.  |  1  | # | # | # | # | # |   #
 3.  |  1  | # | # | # | # | # |   #
 4.  |  1  | # | # | # | # | # |   #
 ...
 0.  |  2  | # | # | # | # | # |   #
 1.  |  2  | # | # | # | # | # |   #
 2.  |  2  | # | # | # | # | # |   #
 3.  |  2  | # | # | # | # | # |   #
 4.  |  2  | # | # | # | # | # |   #
I'm training a model to predict Label against a window of [a, b, c, d, e] values, so my X is of shape (window_size, 5) and my y is the value of Label at the end of the window. All rows in a window must have the same value of src (i.e. a window of data should only come from a single source).
I've previously been compiling X/y pairs, with a little tf.keras.utils.Sequence to hack together semi-usable memory management. In looking for a better way, I found tf.keras.utils.timeseries_dataset_from_array, but, as far as I can tell, it has no concept of src, meaning a single X datum could contain rows from several src's. How can I leverage something like tf.keras.utils.timeseries_dataset_from_array, but have it only extract windows of data that have a single src value?
Note: I'd like a rolling window, i.e. every possible window, with overlap, from each source.
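For concreteness, this is the naive loop I want timeseries_dataset_from_array to replace (illustrative toy data only):
#every overlapping window of length window_size, never crossing a src boundary
import numpy as np
window_size = 5
X = np.random.random((100, 5))
y = np.random.random(100)
src = np.array([0]*50 + [1]*50)
for s in np.unique(src):
    X_s, y_s = X[src == s], y[src == s]
    for i in range(len(X_s) - window_size + 1):
        window = X_s[i:i + window_size]   #shape (window_size, 5)
        label = y_s[i + window_size - 1]  #the label at the end of the window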
Progress
1. I successfully used timeseries_dataset_from_array, but it doesn't respect src:
# ============= Prep ===========
import tensorflow as tf
import numpy as np
#creating numpy data structures representing the problem
X = np.random.random((100,5))
y = np.random.random((100))
src = np.array([0]*50 + [1]*50)
window_size = 5
#making a time series dataset which does not respect src
Xy_ds = tf.keras.utils.timeseries_dataset_from_array(X, y, batch_size=2, sequence_length=window_size,
                                                     sequence_stride=window_size, shuffle=True)
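(Note: sequence_stride=window_size gives non-overlapping windows; the rolling windows described above need sequence_stride=1, which the attempts below use.)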
# ============= Train ===========
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, InputLayer, LSTM, Flatten
#training a model, to validate the dataset is working correctly
model = Sequential()
model.add(InputLayer(input_shape=[window_size,5]))
model.add(LSTM(3))
model.add(Flatten())
model.add(Dense(1, activation='relu'))
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
history = model.fit(Xy_ds,epochs=1)
2. I implemented mdaoust's solution, but I'm getting shape errors when training:
# ============= Prep ===========
import tensorflow as tf
import numpy as np
#creating numpy data structures representing the problem
X = np.random.random((100,5))
y = np.random.random((100))
src = np.expand_dims(np.array([0]*50 + [1]*50), 1)
window_size = 5
#appending source information to X, for filtration
X = np.append(src, X, 1)
#making a time series dataset which does not respect src
Xy_ds = tf.keras.utils.timeseries_dataset_from_array(X, y, sequence_length=window_size,
                                                     sequence_stride=1, shuffle=True)
def single_source(x, y):
    source = x[:, 0]
    return tf.reduce_all(source == source[0])
#filtering by and removing src info
def drop_source(x, y):
    return x[:, 1:], y
def set_shapes(x, y, shape):
    x.set_shape(shape)
    return x, y
Xy_ds = Xy_ds.filter(single_source).map(drop_source)
# ============= Train ===========
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, InputLayer, LSTM, Flatten
#training a model, to validate the dataset is working correctly
model = Sequential()
model.add(InputLayer(input_shape=[window_size,5]))
model.add(LSTM(3))
model.add(Flatten())
model.add(Dense(1, activation='relu'))
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
history = model.fit(Xy_ds,epochs=1)
Error:
ValueError: Input 0 is incompatible with layer sequential_3: expected shape=(None, None, 5), found shape=(None, None, 6)
Presumably this is related to this GitHub thread.
I tried this and something like this, but no dice.
CodePudding user response:
The simplest thing you can do is:
1. Include the source as one of the columns of X.
2. Use timeseries_dataset_from_array.
3. Use filter to drop slices that have mixed sources.
Xy_ds = tf.keras.utils.timeseries_dataset_from_array(...)
def single_source(x, y):
    source = x[:, 0]
    return tf.reduce_all(source == source[0])
def drop_source(x, y):
    return x[:, 1:], y
Xy_ds = Xy_ds.filter(single_source).map(drop_source)
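Note that single_source and drop_source here operate on unbatched windows of shape (sequence_length, features), so x[:, 0] is the src column across timesteps; since timeseries_dataset_from_array batches by default, the filtering has to happen before batching (see the follow-up answer below).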
CodePudding user response:
Based on mdaoust's answer; this is the final working code.
Prep
This will create the time series dataset and do all the manipulation to format it correctly:
# ============= Prep ===========
import tensorflow as tf
import numpy as np
batch_size = 32
#creating numpy data structures representing the problem
X = np.random.random((100,5))
y = np.random.random((100))
src = np.expand_dims(np.array([0]*50 + [1]*50), 1)
window_size = 5
#appending source information to X, for filtration
X = np.append(src, X, 1)
#making a time series dataset which does not respect src
Xy_ds = tf.keras.utils.timeseries_dataset_from_array(X, y, sequence_length=window_size, batch_size=1,
                                                     sequence_stride=1, shuffle=True)
#filtering by and removing src info
def single_source(x, y):
    source = x[:, :, 0]
    #x still carries a batch dimension of 1 here, so compare every timestep's
    #src against the very first value (source[0] alone would just compare the
    #window with itself and always pass)
    return tf.reduce_all(source == source[0, 0])
def drop_source(x, y):
    x_ = x[:, :, 1:]
    print(x_)
    return x_, y
Xy_ds = Xy_ds.filter(single_source)
Xy_ds = Xy_ds.map(drop_source)
Xy_ds = Xy_ds.batch(batch_size)
#printing the dataset
i = 0
for x, y in Xy_ds:
    i += 1
    print(x)
    print(y)
print('total batches: {}'.format(i))
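As a sanity check on the counts: with this toy data, 100 rows at sequence_length 5 and stride 1 yield 96 windows, and the 4 windows that straddle the boundary between the two sources should be filtered out, leaving 92 windows, i.e. 3 batches at batch_size = 32.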
Training
Training the model, just to sanity-check that everything is working:
# ============= Train ===========
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, InputLayer, LSTM, Flatten
#training a model, to validate the dataset is working correctly
model = Sequential()
model.add(InputLayer(input_shape=[window_size,5]))
model.add(LSTM(3))
model.add(Flatten())
model.add(Dense(1, activation='relu'))
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
history = model.fit(Xy_ds,epochs=1)
Important note: in order for this to work, batching must occur after the filter and map are applied. That's why batch_size is 1 initially, and the real batching happens afterwards. If the dataset is batched first (as in attempt 2 above), x has shape (batch, sequence, features), so x[:, 0] and x[:, 1:] index the batch and time axes instead of the time and feature axes, which is what produced the (None, None, 6) shape error.
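As a possible simplification (an assumption worth verifying against your TF version: recent releases accept batch_size=None, which makes timeseries_dataset_from_array yield individual, unbatched windows), the batch-dimension gymnastics can be avoided and the filter written exactly as in mdaoust's answer:
# ============= Alternative (sketch) ===========
#assumes batch_size=None is supported and yields unbatched windows
import tensorflow as tf
import numpy as np
batch_size = 32
window_size = 5
X = np.random.random((100, 5))
y = np.random.random(100)
src = np.expand_dims(np.array([0]*50 + [1]*50), 1)
X = np.append(src, X, 1)
Xy_ds = tf.keras.utils.timeseries_dataset_from_array(X, y, sequence_length=window_size,
                                                     sequence_stride=1, shuffle=True,
                                                     batch_size=None)
def single_source(x, y):
    source = x[:, 0]  #src column of every timestep in the window
    return tf.reduce_all(source == source[0])
def drop_source(x, y):
    return x[:, 1:], y  #strip the src column
Xy_ds = Xy_ds.filter(single_source).map(drop_source).batch(batch_size)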