tf2.0: Gradient Tape returns None gradient in RNN model


In a model with an embedding layer and a SimpleRNN layer, I would like to compute the partial derivative dh_t/dh_0 for each step t.
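To make the target quantity concrete, here is a toy sketch (made-up shapes, a SimpleRNNCell unrolled by hand rather than the actual model below) of the per-step gradients dh_t/dh_0 I am after; in this hand-unrolled form GradientTape has no trouble:

# Toy sketch only: hand-unrolled tf.keras.layers.SimpleRNNCell with made-up shapes
import tensorflow as tf

cell = tf.keras.layers.SimpleRNNCell(200)
x = tf.random.normal((100, 5, 100))        # (batch, time, embedding_dim)
h0 = tf.random.uniform((100, 200))         # initial hidden state

with tf.GradientTape(persistent=True) as tape:
    tape.watch(h0)
    h = h0
    step_states = []
    for t in range(5):
        h, _ = cell(x[:, t, :], [h])       # h_t depends on h_{t-1}, hence on h_0
        step_states.append(h)

# One (100, 200) tensor per step; tape.gradient sums over the units of h_t
grads = [tape.gradient(ht, h0) for ht in step_states]
del tape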

Below is the structure of my model, including imports and data preprocessing.
Toxic comment train data available: https://www.kaggle.com/c/jigsaw-multilingual-toxic-comment-classification/data?select=jigsaw-toxic-comment-train.csv
GloVe 6B 100d embeddings available: https://nlp.stanford.edu/projects/glove/

### 1. Imports 
from __future__ import print_function
import numpy as np
from numpy import array, asarray, zeros
import pandas as pd 
from tqdm import tqdm
from sklearn.model_selection import train_test_split

import tensorflow as tf
from keras import Input, Model
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU, SimpleRNN
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.layers.embeddings import Embedding
from tensorflow.keras.layers import BatchNormalization, PReLU
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.preprocessing import sequence, text
from keras import backend as k

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### 2. Text data tokenisation and GloVe-100d embeddings:
def data_pp():
    train= pd.read_csv('/Users/Toxic comment data/jigsaw-toxic-comment-train.csv')
    train.drop(['severe_toxic','obscene','threat','insult','identity_hate'], axis=1, inplace=True)
    train= train.iloc[:12000,:]
    xtr, xte, ytr, yte= train_test_split(train['comment_text'].values, 
                                        train['toxic'].values,
                                        stratify= train['toxic'].values,
                                        random_state= 42, test_size= 0.2, shuffle= True)
    
    # Tokenise data
    tok= text.Tokenizer(num_words= None)
    tok.fit_on_texts(list(xtr) + list(xte))
    input_dim= len(tok.word_index) + 1
    input_length= train['comment_text'].apply(lambda x: len(str(x).split())).max()
    xtr_seq= tok.texts_to_sequences(xtr); xte_seq= tok.texts_to_sequences(xte)
    xtr_pad= sequence.pad_sequences(xtr_seq, maxlen= input_length)
    xte_pad= sequence.pad_sequences(xte_seq, maxlen= input_length)
    print('Shape of tokenised training input:', xtr_pad.shape)
    return xtr_pad, ytr, xte_pad, yte, input_dim, input_length, tok
    
xtr_pad, ytr, xte_pad, yte, input_dim, input_length, tok= data_pp()

# Word embeddings
def embed_mat(input_dim, output_dim, tok):
    '''By default output_dim = 100 for GloVe 100d embeddings'''
    embedding_dict=dict()
    f= open('/Users/GloVe/glove.6B.100d.txt')
    for line in f:
        values= line.split()
        word= values[0]; coefs= asarray(values[1:], dtype= 'float32')
        embedding_dict[word]= coefs
    f.close()
    Emat= zeros((input_dim, output_dim))
    for word, i in tok.word_index.items():
        embedding_vector= embedding_dict.get(word)
        if embedding_vector is not None:
            Emat[i]= embedding_vector
    print('Embedding weight matrix has shape:', Emat.shape)
    return Emat

output_dim = 100
Emat= embed_mat(input_dim, output_dim, tok)

### 3. Define model and compute gradients:
# You can let it run for a few steps and stop the process. Then inspect the first step h_t, h_0 and the computed dh_t/dh_0.
# For the case in my comment, you can remove the for-loop over the steps t, comment out ht, and compute tape.gradient(states, h0) instead.

batch_size = 100
inp= Input(batch_shape= (batch_size, input_length), name= 'input') 
emb_out= Embedding(input_dim, output_dim, input_length= input_length, 
                         weights= [Emat], trainable= False, name= 'embedding')(inp)
rnn= SimpleRNN(200, return_sequences= True, return_state= False, stateful= True, name= 'simpleRNN')

h0 = tf.convert_to_tensor(np.random.uniform(size= (batch_size, 200)).astype(np.float32))
rnn_allstates= rnn(emb_out, initial_state=h0) 
model_rnn = Model(inputs=inp, outputs= rnn_allstates, name= 'model_rnn')
model_rnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

ds = tf.data.Dataset.from_tensor_slices((xtr_pad[:100], ytr[:100])).batch(100)
embedding_layer = model_rnn.layers[1]
rnn_layer = model_rnn.layers[2]

grads_allsteps= []
for b, (x_batch_train, y_batch_train) in enumerate(ds):
    for t in range(input_length):
        with tf.GradientTape() as tape:
            tape.watch(h0)
            et = embedding_layer(x_batch_train)
            states = rnn_layer(et, initial_state= h0)   # (100, 1403, 200)
            ht = states[:,t,:] 

        grad_t= tape.gradient(ht, h0)  # (100, 200)
        print('Computed gradient dht/dh0 at step', t + 1, 'in batch', b + 1)
        grads_allsteps.append(grad_t)

At each step t, h_t has shape (100, 200) and h_0 has shape (100, 200). However, tape.gradient(ht, h0) returns None for every t. Below is the output at the first step:

for t in range(1):
    with tf.GradientTape() as tape:
        tape.watch(h0)
        et = embedding_layer(x_batch_train)
        #tape.watch(et)
        states = rnn_layer(et, initial_state= h0)   # (100, 1403, 200)
        ht = states[:,t,:] 
        print(ht)
        print(h0)
    grad_t = tape.gradient(ht, h0)
    tf.print(grad_t)

>>
# h_t:
tf.Tensor(
[[ 0.25634336  0.5259362   0.60045886 ... -0.4978792   0.62755316
   0.09803997]
 [ 0.58387524  0.26037565  0.5646103  ...  0.31233114  0.4853201
   0.10877549]
 [ 0.17190906  0.68681747 -0.32054633 ... -0.6139967   0.48944488
   0.06301598]
 ...
 [ 0.1985917  -0.11821499 -0.47709295 ... -0.05718012  0.16089934
   0.20585683]
 [ 0.73872745  0.503326    0.25224414 ... -0.5771631   0.03748894
   0.09212588]
 [-0.6597108  -0.43926442 -0.23546427 ...  0.26760277  0.28221437
  -0.4039318 ]], shape=(100, 200), dtype=float32)

# h_0:
tf.Tensor(
[[0.51580787 0.51664346 0.70773274 ... 0.45973232 0.7760376  0.48297063]
 [0.61048764 0.26038417 0.60392565 ... 0.7426153  0.15507504 0.57494944]
 [0.11859739 0.33591187 0.68375146 ... 0.59409297 0.5302879  0.28876984]
 ...
 [0.12401487 0.39376178 0.9850304  ... 0.21582918 0.9592233  0.5257605 ]
 [0.9401199  0.2157638  0.6445949  ... 0.36316434 0.5799403  0.3749675 ]
 [0.37230062 0.18162128 0.0739954  ... 0.21624395 0.66291    0.7807376 ]], shape=(100, 200), dtype=float32)

# dh_t/dh_0:
None

GradientTape seems to have trouble watching this h_0 and computing the gradient. I have successfully used GradientTape to watch the inputs e_t to the RNN layer and compute the gradients dh_t/de_t (sketched below), but that does not tell me much about the quality of the model fit.
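For reference, this is roughly the variant that does work for me, watching the embedding output e_t instead of h_0 (shapes assume input_length = 1403 and 100-d embeddings):

t = 0
with tf.GradientTape() as tape:
    et = embedding_layer(x_batch_train)        # (100, 1403, 100)
    tape.watch(et)                             # watching the RNN inputs works
    states = rnn_layer(et, initial_state=h0)   # (100, 1403, 200)
    ht = states[:, t, :]
grad_et = tape.gradient(ht, et)                # same shape as et, not None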

How can I get GradientTape to watch the fixed-time quantity h_0 and thus compute the gradients dh_t/dh_0? Thanks in advance for any help.


Reproducible test case:

### 1. Imports 
from __future__ import print_function
import numpy as np
from numpy import array, asarray, zeros
import pandas as pd 
from tqdm import tqdm
from sklearn.model_selection import train_test_split

import tensorflow as tf
from keras import Input, Model
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU, SimpleRNN
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.layers.embeddings import Embedding
from tensorflow.keras.layers import BatchNormalization, PReLU
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.preprocessing import sequence, text
from keras import backend as k

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### 2. Simulated data and gradient computation:
batch_size = 100; input_length = 5
xtr_pad = tf.random.uniform((batch_size, input_length), maxval = 500, dtype=tf.int32)
ytr = tf.random.normal((batch_size, input_length, 200))


inp= Input(batch_shape= (batch_size, input_length), name= 'input') 
emb_out= Embedding(500, 100, input_length= input_length, trainable= False, name= 'embedding')(inp)
rnn= SimpleRNN(200, return_sequences= True, return_state= False, stateful= True, name= 'simpleRNN')

h0 = tf.convert_to_tensor(np.random.uniform(size= (batch_size, 200)).astype(np.float32))

rnn_allstates= rnn(emb_out, initial_state=h0) 
model_rnn = Model(inputs=inp, outputs= rnn_allstates, name= 'model_rnn')
model_rnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

ds = tf.data.Dataset.from_tensor_slices((xtr_pad, ytr)).batch(100)
embedding_layer = model_rnn.layers[1]
rnn_layer = model_rnn.layers[2]

grads_allsteps= []
for b, (x_batch_train, y_batch_train) in enumerate(ds):
    for t in range(input_length):
        with tf.GradientTape() as tape:
            tape.watch(h0)
            states= model_rnn(x_batch_train)
            ht = states[:,t,:] 

        grad_t= tape.gradient(ht, h0)  
        print('Computed gradient dht/dh0 at step', t + 1, 'in batch', b + 1)
        grads_allsteps.append(grad_t)
 

Something interesting: the first-step gradient is computed and looks fine, but the rest are None.

grads_allsteps

>>
[<tf.Tensor: shape=(100, 200), dtype=float32, numpy=
 array([[ 1.2307187 , -1.0343404 ,  0.52859926, ..., -0.09879799,
         -1.1407609 , -0.7241671 ],
        [ 1.142821  , -1.312029  ,  0.37148148, ...,  0.2300478 ,
         -1.1440411 , -0.36673146],
        [ 1.2778691 , -1.2225235 ,  0.69951147, ...,  0.17701946,
         -1.2816343 , -0.52648413],
        ...,
        [ 1.1717036 , -1.2444504 ,  0.5874837 , ..., -0.13161334,
         -1.3752006 , -0.376719  ],
        [ 1.1333262 , -1.0013355 ,  0.3363382 , ..., -0.22350994,
         -1.299541  , -0.5073889 ],
        [ 1.18489   , -0.90809333,  0.55045474, ..., -0.10550319,
         -1.0866506 , -0.58325446]], dtype=float32)>, None, None, None, None]

CodePudding user response:

You could try using tf.gradients instead (it only works in graph mode, i.e. inside a tf.function), and use a tf.Variable rather than a plain tensor for h0:

# Your imports
#-------
### 2. Simulated data and gradient computation:
batch_size = 100; input_length = 5
xtr_pad = tf.random.uniform((batch_size, input_length), maxval = 500, dtype=tf.int32)
ytr = tf.random.normal((batch_size, input_length, 200))


inp= Input(batch_shape= (batch_size, input_length), name= 'input') 
emb_out= Embedding(500, 100, input_length= input_length, trainable= False, name= 'embedding')(inp)
rnn= SimpleRNN(200, return_sequences= True, return_state= False, stateful= True, name= 'simpleRNN')

h0 = tf.Variable(tf.random.uniform((batch_size, 200)))

rnn_allstates= rnn(emb_out, initial_state=h0) 
model_rnn = Model(inputs=inp, outputs= rnn_allstates, name= 'model_rnn')
model_rnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

ds = tf.data.Dataset.from_tensor_slices((xtr_pad, ytr)).batch(100)
embedding_layer = model_rnn.layers[1]
rnn_layer = model_rnn.layers[2]


@tf.function
def calculate_t_gradients(t, x, h0):
  return tf.gradients(model_rnn(x)[:,t,:], h0)

grads_allsteps= []
for b, (x_batch_train, y_batch_train) in enumerate(ds):
    for t in range(input_length):  
      grads_allsteps.append(calculate_t_gradients(t, x_batch_train, h0))
 
print(grads_allsteps) 
[[<tf.Tensor: shape=(100, 200), dtype=float32, numpy=
array([[ 1.2034059 , -0.46448404,  0.6272926 , ..., -0.40906236,
         0.07618493,  0.6338958 ],
       [ 1.2781916 , -0.20411322,  0.6174417 , ..., -0.31636393,
        -0.23417974,  0.67499626],
       [ 1.113218  , -0.65086263,  0.63425934, ..., -0.66614366,
        -0.07726163,  0.53647137],
       ...,
       [ 1.3399608 , -0.54088974,  0.6213518 , ...,  0.00831087,
        -0.14397278,  0.2614633 ],
       [ 1.213171  , -0.42787278,  0.60535026, ..., -0.56198204,
        -0.09142771,  0.6212783 ],
       [ 1.1901733 , -0.5743524 ,  0.36872283, ..., -0.42522985,
        -0.0861398 ,  0.495057  ]], dtype=float32)>], [<tf.Tensor: shape=(100, 200), dtype=float32, numpy=
array([[ 0.3487598 ,  1.2738569 , -0.48500937, ...,  0.6011117 ,
        -0.20381093,  0.45596513],
       [ 0.37931004,  1.2778724 , -0.8682532 , ...,  0.8170228 ,
         0.1456329 ,  0.23715591],
       [ 0.5984771 ,  0.92434835, -0.8879645 , ...,  0.38756457,
        -0.17436962,  0.47174054],
       ...,
       [ 0.61081064,  0.99631476, -0.5104377 , ...,  0.5042721 ,
         0.02844866,  0.34626445],
       [ 0.7126102 ,  1.0205276 , -0.60710275, ...,  0.49418694,
        -0.16092762,  0.41363668],
       [ 0.8581749 ,  1.1259711 , -0.5824491 , ...,  0.45388597,
        -0.16205123,  0.72434616]], dtype=float32)>], [<tf.Tensor: shape=(100, 200), dtype=float32, numpy=
array([[ 3.8507193e-01,  1.2925258e+00,  1.2027258e+00, ...,
         3.2430276e-01,  2.2319333e-01, -2.5218868e-01],
       [ 5.9262186e-01,  1.4497797e+00,  1.2479483e+00, ...,
         4.6175608e-01,  2.5466472e-01, -2.4279505e-01],
       [ 2.5734475e-01,  1.4562432e+00,  1.1020679e+00, ...,
         6.6081107e-01,  1.9841105e-01, -2.5595558e-01],
       ...,
       [ 5.1541841e-01,  1.6206543e+00,  9.6205616e-01, ...,
         7.2725344e-01,  2.5501373e-01, -7.7709556e-04],
       [ 4.4518453e-01,  1.6381552e+00,  1.0112666e+00, ...,
         5.5238277e-01,  2.4137528e-01, -2.6242572e-01],
       [ 6.6721851e-01,  1.5826726e+00,  1.1282607e+00, ...,
         3.2301426e-01,  2.2295776e-01,  1.1724380e-01]], dtype=float32)>], [<tf.Tensor: shape=(100, 200), dtype=float32, numpy=
array([[ 0.14262576,  0.578709  ,  0.1149607 , ...,  0.1229499 ,
        -0.42344815,  0.8837458 ],
       [-0.09711604,  0.04376438, -0.11737494, ...,  0.00389774,
         0.01737173,  0.17246482],
       [ 0.24414796,  0.30101255, -0.12234146, ..., -0.04850931,
        -0.31790918,  0.21326394],
       ...,
       [-0.20562285,  0.21999156,  0.02703794, ..., -0.03547464,
        -0.59052145,  0.04695258],
       [ 0.2087476 ,  0.46558812, -0.18172565, ..., -0.01167884,
        -0.20868361,  0.09055485],
       [-0.22442941,  0.16119067,  0.10854454, ...,  0.14752978,
        -0.32307786,  0.343314  ]], dtype=float32)>], [<tf.Tensor: shape=(100, 200), dtype=float32, numpy=
array([[-1.1414615 ,  0.37376842, -1.0230722 , ...,  0.60619426,
         0.22550163, -0.6948315 ],
       [-1.0124328 ,  0.27892357, -0.96915233, ...,  0.7048603 ,
        -0.15284726, -0.6734605 ],
       [-0.8542529 ,  0.25970122, -0.90076745, ...,  0.8825682 ,
        -0.02474228, -0.55014515],
       ...,
       [-0.89430666,  0.68327624, -1.0109956 , ...,  0.31722566,
        -0.23703958, -0.6766514 ],
       [-0.8633691 ,  0.28742114, -0.9896866 , ...,  0.98315084,
         0.0115847 , -0.55474746],
       [-0.7229766 ,  0.62417865, -1.2342371 , ...,  0.85149145,
        -0.04468453, -0.60606724]], dtype=float32)>]]

You need to make sure the stateful parameter of the SimpleRNN is False, because according to the docs:

If True, the last state for each sample at index i in a batch will be used as initial state for the sample of index i in the following batch.

So, if you set stateful to False, your code will also calculate gradients for each timestep.
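Concretely, that is a one-line change to the layer definition used above; everything else in the code stays the same:

# stateful=False, so the layer always starts from the initial_state passed at call time
rnn = SimpleRNN(200, return_sequences=True, return_state=False,
                stateful=False, name='simpleRNN')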
