I am trying to train a mixture model, but I am unclear how to specify a trainable array argument so that the mixture weights can be updated. So if I have the following, with the weights hard-coded:
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd
weights = [0.2, 0.8]
dist = tfd.Mixture(
    cat=tfd.Categorical(probs=weights),
    components=[
        tfd.Normal(loc=tf.Variable(0., name='loc1'), scale=tf.Variable(1., name='scale1')),
        tfd.Normal(loc=tf.Variable(0., name='loc2'), scale=tf.Variable(1., name='scale2'))])
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
@tf.function
def train_step(X):
    with tf.GradientTape() as tape:
        loss = -tf.reduce_mean(dist.log_prob(X))
    gradients = tape.gradient(loss, dist.trainable_variables)
    optimizer.apply_gradients(zip(gradients, dist.trainable_variables))
    return loss

for i in range(20000):
    loss = train_step(X)
where X is a NumPy array with shape (272, 1).
Now let's say I want to learn the weights. If, in the Categorical constructor, I try
probs=[tf.Variable(0.2, name='weight1'),tf.Variable(0.8, name='weight2')]
then I get the error "No gradients provided for any variable".
If I try
probs=tf.Variable([tf.Variable(0.2, name='weight1'),tf.Variable(0.8, name='weight2')], trainable=True, name='weights')
then weight1 and weight2 do not appear in the list of trainable_variables; weights is listed but does not update.
What is the correct way to specify the weights to the probs argument so they will be updated during training?
CodePudding user response:
Maybe try passing the whole probability vector as a single tf.Variable:
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd
dist = tfd.Mixture(
    cat=tfd.Categorical(probs=tf.Variable([0.2, 0.8])),
    components=[
        tfd.Normal(loc=tf.Variable(0., name='loc1'), scale=tf.Variable(1., name='scale1')),
        tfd.Normal(loc=tf.Variable(0., name='loc2'), scale=tf.Variable(1., name='scale2'))])
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
@tf.function
def train_step(X):
    with tf.GradientTape() as tape:
        loss = -tf.reduce_mean(dist.log_prob(X))
    tf.print(dist.trainable_variables)
    gradients = tape.gradient(loss, dist.trainable_variables)
    optimizer.apply_gradients(zip(gradients, dist.trainable_variables))
    return loss

for i in range(10):
    loss = train_step(tf.random.normal((272, 1)))
At each step, tf.print shows the trainable variables (the probs vector, then the loc and scale of each component) updating:
([0.2 0.8], 0, 1, 0, 1)
([0.2 0.8], -0.00999249145, 1.00999844, -0.0099981213, 1.00999963)
([0.200921655 0.799828708], -0.00638755737, 1.00682414, -0.00639217719, 1.00682521)
([0.20176363 0.799696386], -0.000149463303, 1.00765562, -0.000160227064, 1.00764322)
([0.200775564 0.800094664], 0.000889031217, 1.00637043, 0.000898908474, 1.00636196)
([0.199177444 0.800768435], -0.00115872873, 1.0025779, -0.00113528164, 1.0025754)
([0.19703567 0.801662683], -0.000830670586, 0.998396218, -0.000778611051, 0.998392522)
([0.193336055 0.80336237], 0.00244163908, 0.993740082, 0.00255049323, 0.993718445)
([0.192727238 0.803925216], 0.00376213156, 0.989788294, 0.00386576797, 0.989756942)
([0.194845349 0.802922785], 0.0022987891, 0.986021399, 0.00232516858, 0.985970497)
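One caveat: a raw tf.Variable holding the probabilities is not constrained to stay non-negative and sum to 1 as the optimizer updates it. A common alternative (just a sketch, not part of the run above) is to parameterize the mixture weights with unconstrained logits, since tfd.Categorical also accepts a logits argument and applies the softmax internally:

import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd

# Unconstrained logits; Categorical applies the softmax internally, so the
# implied mixture weights always stay on the probability simplex.
mix_logits = tf.Variable([0., 0.], name='mix_logits')

dist = tfd.Mixture(
    cat=tfd.Categorical(logits=mix_logits),
    components=[
        tfd.Normal(loc=tf.Variable(0., name='loc1'), scale=tf.Variable(1., name='scale1')),
        tfd.Normal(loc=tf.Variable(0., name='loc2'), scale=tf.Variable(1., name='scale2'))])

# After training, recover the learned weights from the logits:
learned_weights = tf.nn.softmax(mix_logits)

The training loop stays the same, since mix_logits shows up in dist.trainable_variables just like the loc and scale variables.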