Home > Enterprise > TensorFlow 2 For-Loop Graph Execution ValueError
TensorFlow 2 For-Loop Graph Execution ValueError

Time:01-25

I am using TF2 for hyperparameter optimization. For example, I define a range of learning rates lr = [0.0001, 0.001, 0.01] to pass into a Trainer function which includes a custom training loop (using GradientTape). However, I get an error when I use @tf.function. My training structure is like this:

def Trainer(lr):
    
    # Define the optimizer
    optim = tf.keras.optimizers.experimental.Nadam(learning_rate=lr)
    
    train_dataset, test_dataset, sample_size = dataset_load(arg)
    
    # define model
    model_config = {arg}
    net = Mymodel(model_config)
    step = 0
    with tqdm.tqdm(total=max_step, leave=True, desc='Training') as pbar:
        while step < max_step:
            for signal in train_dataset:
                
                # Calculate loss
                loss = train_batch(signal and other parameter)
    
                step  = 1
                pbar.update()
                pbar.set_postfix(
                    {'loss': loss.numpy(),
                     'step': step})

The train_batch function is:

@tf.function 
def train_batch(signal, arg...):
    """Run one optimizer step on a batch inside a compiled static graph.

    NOTE(review): "arg..." is a placeholder from the question, not valid
    Python; the real signature takes the masks and other tensors.
    """
    with tf.GradientTape() as tape:
        # Explicitly watch the trainables (they are watched by default;
        # this is belt-and-braces).
        tape.watch(model.trainable_variables)
        loss = compute_loss([signal], model)
    # Return ZERO (not None) for variables unconnected to the loss, so
    # apply_gradients below never sees a None gradient.
    grad = tape.gradient(loss, model.trainable_variables,
                         unconnected_gradients=tf.UnconnectedGradients.ZERO)
    optim.apply_gradients(
        zip(grad, model.trainable_variables))
    del grad

    return loss

For the outer loop, I define the lr list, then use: for lr_current in lr: Trainer(lr_current)

The program executes normally for the first lr_current. But when the outer for loop goes to the second value of lr_current, the error comes out:

ValueError: tf.function only supports singleton tf.Variables created on the first call. Make sure the tf.Variable is only created once or created outside tf.function.

I do not understand why this error comes out. I think it is related to the for loop of Trainer function. I also tried to del net when finishing the training but it did not work. The program runs normally for all lr_current when I removed @tf.function.

I have uploaded a minimal reproducible example to the Colab. Could anyone help me out? Thanks in advance!

CodePudding user response:

I have done your work; copy this code and run it in your notebook.

def Trainer():
    """Train for 10 epochs over the synthetic dataset, reporting loss via tqdm.

    Relies on module-level names: dataset_load, genmask, train_batch.
    Returns nothing; progress (loss/step/epoch) is shown on the tqdm bar.
    """
    train_dataset, test_dataset, sample_size = dataset_load(time_len=100, batch_size=16)

    epoch = 0
    step = 0
    with tqdm.tqdm(total=10, leave=True, desc='Training') as pbar:
        while epoch < 10:
            for signal in train_dataset:
                obs_signal, obs_mask, impute_mask = genmask(signal=signal, missing_ratio=0.2,
                                                            missing_type='rm')
                # Calculate loss for this batch
                loss = train_batch(signal=obs_signal, obs_mask=obs_mask,
                                   impute_mask=impute_mask)

                step += 1  # was "step =1": the "+" was lost in transcription
                pbar.set_postfix(
                    {'loss': loss.numpy(),
                     'step': step,
                     'epoch': epoch})

            epoch += 1  # was "epoch  = 1": same transcription loss
            pbar.update()

def compute_loss(signal_mask: List, diff_params):
    """Diffusion training loss: masked MSE between predicted and true noise.

    Args:
        signal_mask (List): [signal, obs_mask, impute_mask], each [B, T, K].
        diff_params (dict): diffusion hyperparameters forwarded to diffusion().

    Returns:
        tf.Tensor: scalar loss — sum of squared residuals on the target mask,
        normalized by the number of target points (guarded against /0).
    """
    obs_mask = signal_mask[1]
    impute_mask = signal_mask[2]
    # [B, T], [B, T]
    epsilon_theta, eps = diffusion(signal_mask, diff_params)
    # MSE restricted to positions that are observed but held out for imputation
    target_mask = obs_mask - impute_mask
    residual = (epsilon_theta - eps) * target_mask

    # Hoist the reduction (was computed twice) and avoid Python truthiness on
    # a tensor, which fails in graph mode under @tf.function. The masks are
    # 0/1-valued, so any non-zero count is >= 1 and
    # tf.maximum(n, 1.0) reproduces the original "n if n > 0 else 1.0".
    num_target = tf.reduce_sum(target_mask)
    loss = tf.reduce_sum(residual**2) / tf.maximum(num_target, 1.0)
    return loss

def diffusion(signal_mask: List, diff_params, eps=None):
    """Forward diffusion: sample timesteps, noise the signal, predict the noise.

    Args:
        signal_mask (List): [signal, obs_mask, cond_mask], each [B, L, K].
        diff_params (dict): must contain "T" (number of steps) and "Alpha_bar"
            (the cumulative noise schedule).
        eps: optional pre-drawn noise tensor; sampled from N(0, I) when None.

    Returns:
        tuple: (epsilon_theta, eps) — the network's noise prediction and the
        true noise used to corrupt the signal.
    """
    assert len(signal_mask) == 3

    signal = signal_mask[0]
    cond_mask = signal_mask[2]

    B, L, C = signal.shape[0], signal.shape[1], signal.shape[2]  # B is batchsize, C=1, L is signal length
    _dh = diff_params
    T, Alpha_bar = _dh["T"], _dh["Alpha_bar"]
    timesteps = tf.random.uniform(
        shape=[B, 1, 1], minval=0, maxval=T, dtype=tf.int32)  # [B], randomly sample diffusion steps from 1~T

    if eps is None:
        eps = tf.random.normal(tf.shape(signal))  # random noise

    extracted_alpha = tf.gather(Alpha_bar, timesteps)
    # NOTE(review): restored the "+" lost in transcription; this is the
    # standard q(x_t | x_0) sample: sqrt(a_bar)*x_0 + sqrt(1 - a_bar)*eps.
    transformed_X = tf.sqrt(extracted_alpha) * signal + tf.sqrt(
        1 - extracted_alpha) * eps  # compute x_t from q(x_t|x_0)
    timesteps = tf.cast(timesteps, tf.float32)
    total_input = tf.stack([cond_mask * signal,
                            (1 - cond_mask) * transformed_X], axis=-1)  # B, L, K, 2
    obser_tp = tf.range(signal.shape[1])

    epsilon_theta = net(
        (total_input, obser_tp, cond_mask,
         tf.squeeze(timesteps, axis=-1)))  # predict \epsilon according to \epsilon_\theta

    return epsilon_theta, eps


def dataset_load(time_len, batch_size):
    """Build a synthetic train/test split of random multivariate time series.

    Args:
        time_len (int): number of time steps per series.
        batch_size (int): batch size; the train split holds 10 full batches.

    Returns:
        tuple: (train tf.data.Dataset of [batch_size, time_len, 10] batches,
        test tf.Tensor of shape [batch_size, time_len, 10],
        [sequence_length, n_features]).
    """
    train_arr = np.random.randn(batch_size * 10, time_len, 10)
    test_arr = np.random.randn(batch_size, time_len, 10)

    # Shuffle over the entire training set, then batch (dropping any remainder
    # so every batch has a static shape).
    ds = tf.data.Dataset.from_tensor_slices(train_arr)
    ds = ds.shuffle(train_arr.shape[0])
    ds = ds.batch(batch_size, drop_remainder=True)

    seq_len = train_arr.shape[-2]
    n_features = train_arr.shape[-1]

    return (ds, tf.convert_to_tensor(test_arr), [seq_len, n_features])

def genmask(signal: "tf.Tensor", missing_ratio, missing_type):
    """Generate observation and imputation-target masks for a batch of series.

    Args:
        signal: [B, T, K] batch of multivariate time series (anything with a
            .numpy() method, e.g. an eager tf.Tensor).
        missing_ratio (float): fraction of observed points to hold out; if
            falsy (0/None), a random ratio is drawn independently per sample.
        missing_type (str): masking strategy; currently unused ('rm' assumed).

    Returns:
        observed_values (np.ndarray): [B, T, K] series values as float32.
        observed_masks (np.ndarray): [B, T, K] mask of observed (non-NaN) points.
        gt_masks (np.ndarray): [B, T, K] mask of points kept visible to the
            model; held-out imputation targets are zeroed.
    """
    observed_values = signal.numpy().astype(np.single)
    observed_mask = ~np.isnan(observed_values)

    # Random scores over observed entries; the largest scores get held out.
    rand_for_mask = np.random.rand(*observed_mask.shape) * observed_mask
    rand_for_mask = rand_for_mask.reshape(len(rand_for_mask), -1)  # B, T*K
    for i in range(len(observed_mask)):  # loop over batch
        sample_ratio = np.random.rand() if not missing_ratio else missing_ratio
        num_observed = observed_mask[i].sum()
        num_masked = round(num_observed * sample_ratio)
        # Guard num_masked == 0: the original "[-0:]" slice selects the WHOLE
        # row, erroneously flagging every entry as held out.
        if num_masked > 0:
            rand_for_mask[i][np.argpartition(rand_for_mask[i], -num_masked)[-num_masked:]] = -1
    gt_masks = (rand_for_mask > 0).reshape(observed_mask.shape).astype(np.single)
    observed_mask = observed_mask.astype(np.single)

    return observed_values, observed_mask, gt_masks
@tf.function
def train_batch(signal, obs_mask, impute_mask):
    """Wrapped training step on a batch, compiled to a static graph.

    Args:
        signal (tf.Tensor): [B, T, K], multivariate time series with K features
        obs_mask (tf.Tensor): [B, T, K], mask for observation points
        impute_mask (tf.Tensor): [B, T, K], mask for imputation target

    Returns:
        loss (tf.Tensor): scalar loss for this batch
    """
    # Record forward pass; trainables are watched explicitly (belt-and-braces).
    with tf.GradientTape() as tape:
        tape.watch(net.trainable_variables)
        batch_loss = compute_loss(
            [signal, obs_mask, impute_mask], diffusion_hyperparams)

    # ZERO (not None) gradients for variables unconnected to the loss.
    gradients = tape.gradient(
        batch_loss,
        net.trainable_variables,
        unconnected_gradients=tf.UnconnectedGradients.ZERO)
    optim.apply_gradients(zip(gradients, net.trainable_variables))
    del gradients

    return batch_loss


# Sweep learning rates. The optimizer, model, and diffusion schedule are
# rebuilt FRESH for each learning rate: the @tf.function-compiled train_batch
# only supports variables created on its first call (this is exactly the
# ValueError from the question), so all variable-owning state must exist
# before training starts and be torn down between runs.
lr = [0.001, 0.002, 0.01]
for lr_iter in lr:
    optim = tf.keras.optimizers.experimental.Nadam(lr_iter)
    model_config = { "res_channels": 64}
    net = mymodel(model_config)
    diffusion_hyperparams = calc_diffusion_hyperparams(T=50, beta_0=0.0001, beta_T=0.5, strategy="quadratic")
    Trainer()
    tf.keras.backend.clear_session()  # drop stale Keras graph state between runs
    del net

Link to the Notebook

https://colab.research.google.com/drive/1uh3-q3hM4obKLbh93sfT25zoUbK4jfUJ?usp=sharing

  • Related