I am using TF2 for hyperparameter optimization. For example, I define a range of learning rates, lr = [0.0001, 0.001, 0.01], and pass each value into a Trainer function that contains a custom training loop (using tf.GradientTape). However, I get an error when I use @tf.function. My training structure looks like this:
def Trainer(lr):
    # Define the optimizer
    optim = tf.keras.optimizers.experimental.Nadam(learning_rate=lr)
    train_dataset, test_dataset, sample_size = dataset_load(arg)
    # define model
    model_config = {arg}
    net = Mymodel(model_config)
    step = 0
    with tqdm.tqdm(total=max_step, leave=True, desc='Training') as pbar:
        while step < max_step:
            for signal in train_dataset:
                # Calculate loss
                loss = train_batch(signal and other parameter)
                step += 1
                pbar.update()
                pbar.set_postfix(
                    {'loss': loss.numpy(),
                     'step': step})
The train_batch
function is:
@tf.function
def train_batch(signal, arg...):
    with tf.GradientTape() as tape:
        tape.watch(model.trainable_variables)
        loss = compute_loss([signal], model)
    grad = tape.gradient(loss, model.trainable_variables,
                         unconnected_gradients=tf.UnconnectedGradients.ZERO)
    optim.apply_gradients(
        zip(grad, model.trainable_variables))
    del grad
    return loss
For the outer loop, I define lr, then use:
for lr_current in lr:
    Trainer(lr_current)
The program executes normally for the first lr_current, but when the outer for loop moves to the second value of lr_current, this error comes out:
ValueError: tf.function only supports singleton tf.Variables created on the first call. Make sure the tf.Variable is only created once or created outside tf.function.
I do not understand why this error occurs. I think it is related to the for loop that calls the Trainer function. I also tried to del net after training finished, but that did not work. The program runs normally for all lr_current when I remove @tf.function.
I have uploaded a minimal reproducible example to Colab. Could anyone help me out? Thanks in advance!
CodePudding user response:
Here is a working version; copy this code and run it in your notebook. The key changes are that net, optim and diffusion_hyperparams are created in the outer loop before calling Trainer(), and after each learning rate the session is cleared with tf.keras.backend.clear_session() and net is deleted before the next model is built.
def Trainer():
    # loss_func = tf.keras.losses.RootMeanSquaredError()
    train_dataset, test_dataset, sample_size = dataset_load(time_len=100, batch_size=16)
    epoch = 0
    step = 0
    with tqdm.tqdm(total=10, leave=True, desc='Training') as pbar:
        while epoch < 10:
            for signal in train_dataset:
                obs_signal, obs_mask, impute_mask = genmask(signal=signal, missing_ratio=0.2,
                                                            missing_type='rm')
                # Calculate loss
                loss = train_batch(signal=obs_signal, obs_mask=obs_mask,
                                   impute_mask=impute_mask)
                step += 1
                pbar.set_postfix(
                    {'loss': loss.numpy(),
                     'step': step,
                     'epoch': epoch})
            epoch += 1
            pbar.update()
def compute_loss(signal_mask: List, diff_params):
    obs_mask = signal_mask[1]
    impute_mask = signal_mask[2]
    # [B, T], [B, T]
    epsilon_theta, eps = diffusion(signal_mask, diff_params)
    # MSE loss
    target_mask = obs_mask - impute_mask
    residual = (epsilon_theta - eps) * target_mask
    loss = tf.reduce_sum(residual**2) / (tf.reduce_sum(target_mask)
                                         if tf.reduce_sum(target_mask) > 0 else 1.0)
    return loss
def diffusion(signal_mask: List, diff_params, eps=None):
    assert len(signal_mask) == 3
    signal = signal_mask[0]
    cond_mask = signal_mask[2]
    B, L, C = signal.shape[0], signal.shape[1], signal.shape[2]  # B is batch size, C=1, L is signal length
    _dh = diff_params
    T, Alpha_bar = _dh["T"], _dh["Alpha_bar"]
    timesteps = tf.random.uniform(
        shape=[B, 1, 1], minval=0, maxval=T, dtype=tf.int32)  # [B], randomly sample diffusion steps from 1~T
    if eps is None:
        eps = tf.random.normal(tf.shape(signal))  # random noise
    extracted_alpha = tf.gather(Alpha_bar, timesteps)
    transformed_X = tf.sqrt(extracted_alpha) * signal + tf.sqrt(
        1 - extracted_alpha) * eps  # compute x_t from q(x_t|x_0)
    timesteps = tf.cast(timesteps, tf.float32)
    total_input = tf.stack([cond_mask * signal,
                            (1 - cond_mask) * transformed_X], axis=-1)  # B, L, K, 2
    obser_tp = tf.range(signal.shape[1])
    epsilon_theta = net(
        (total_input, obser_tp, cond_mask,
         tf.squeeze(timesteps, axis=-1)))  # predict \epsilon according to \epsilon_\theta
    return epsilon_theta, eps
def dataset_load(time_len, batch_size):
    train_data = np.random.randn(batch_size*10, time_len, 10)
    test_data = np.random.randn(batch_size, time_len, 10)
    shuffle_size_train = train_data.shape[0]
    train_dataset = tf.data.Dataset. \
        from_tensor_slices(train_data).shuffle(shuffle_size_train) \
        .batch(batch_size, drop_remainder=True)
    test_dataset = tf.convert_to_tensor(test_data)
    L = train_data.shape[-2]
    K = train_data.shape[-1]
    return (train_dataset, test_dataset, [L, K])
def genmask(signal: tf.Tensor, missing_ratio, missing_type):
    """Generate the masks.
    Returns:
        observed_values (tf.Tensor): [B, T, K], multivariate time series with K features
        observed_masks (tf.Tensor): [B, T, K], mask for observation points
        impute_mask (tf.Tensor): [B, T, K], mask for imputation target
    """
    miss_ratio = missing_ratio
    observed_values = signal.numpy().astype(np.single)
    observed_mask = ~np.isnan(observed_values)
    rand_for_mask = np.random.rand(*observed_mask.shape) * observed_mask
    rand_for_mask = rand_for_mask.reshape(len(rand_for_mask), -1)  # B, L*K
    for i in range(len(observed_mask)):  # Loop over batch
        sample_ratio = np.random.rand() if not missing_ratio else missing_ratio  # missing ratio
        num_observed = observed_mask[i].sum()
        num_masked = round(num_observed * sample_ratio)
        rand_for_mask[i][np.argpartition(rand_for_mask[i], -num_masked)[-num_masked:]] = -1
    gt_masks = (rand_for_mask > 0).reshape(observed_mask.shape).astype(np.single)
    observed_mask = observed_mask.astype(np.single)
    return observed_values, observed_mask, gt_masks
@tf.function
def train_batch(signal, obs_mask, impute_mask):
    """Wrapped training on a batch using a static graph.
    Args:
        signal (tf.Tensor): [B, T, K], multivariate time series with K features
        obs_mask (tf.Tensor): [B, T, K], mask for observation points
        impute_mask (tf.Tensor): [B, T, K], mask for imputation target
    Returns:
        loss (float): average loss over a batch
    """
    with tf.GradientTape() as tape:
        tape.watch(net.trainable_variables)
        loss = compute_loss([signal, obs_mask, impute_mask],
                            diffusion_hyperparams)
    grad = tape.gradient(loss, net.trainable_variables,
                         unconnected_gradients=tf.UnconnectedGradients.ZERO)
    optim.apply_gradients(
        zip(grad, net.trainable_variables))
    del grad
    return loss
lr = [0.001, 0.002, 0.01]
for lr_iter in lr:
    optim = tf.keras.optimizers.experimental.Nadam(lr_iter)
    model_config = {"res_channels": 64}
    net = mymodel(model_config)
    diffusion_hyperparams = calc_diffusion_hyperparams(T=50, beta_0=0.0001, beta_T=0.5, strategy="quadratic")
    Trainer()
    tf.keras.backend.clear_session()
    del net
Link to the Notebook
https://colab.research.google.com/drive/1uh3-q3hM4obKLbh93sfT25zoUbK4jfUJ?usp=sharing
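If you would rather keep something closer to your original Trainer(lr) structure instead of relying on clear_session, another pattern that avoids the singleton-variable error is to build a fresh tf.function for every run, so each model/optimizer pair gets its own trace. This is only a sketch, reusing compute_loss, calc_diffusion_hyperparams, mymodel and the global net/diffusion_hyperparams from the code above:
def make_train_step(net, optim):
    # A new tf.function object is returned for every call, so the variables the
    # optimizer lazily creates on its first apply_gradients belong to that
    # function's first (and only variable-creating) trace.
    @tf.function
    def train_step(signal, obs_mask, impute_mask):
        with tf.GradientTape() as tape:
            tape.watch(net.trainable_variables)
            loss = compute_loss([signal, obs_mask, impute_mask],
                                diffusion_hyperparams)
        grad = tape.gradient(loss, net.trainable_variables,
                             unconnected_gradients=tf.UnconnectedGradients.ZERO)
        optim.apply_gradients(zip(grad, net.trainable_variables))
        return loss
    return train_step

for lr_iter in [0.001, 0.002, 0.01]:
    net = mymodel({"res_channels": 64})          # kept global because diffusion() reads it
    optim = tf.keras.optimizers.experimental.Nadam(lr_iter)
    diffusion_hyperparams = calc_diffusion_hyperparams(T=50, beta_0=0.0001, beta_T=0.5, strategy="quadratic")
    train_step = make_train_step(net, optim)     # fresh trace for this learning rate
    # ... run the epoch/step loop from Trainer(), calling train_step(...) instead of train_batch(...)
The reason this helps (and why removing @tf.function also makes the error go away) is most likely that a tf.function is only allowed to create tf.Variables during its first call: when the loop reaches the second learning rate, the new optimizer (or a lazily built model) tries to create variables inside the already-traced train_batch, and TensorFlow raises the ValueError. Creating a new traced function per run, or clearing the session as in the code above, keeps every variable creation on a first call.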