I build my model with the following code (tensorflow==1.14):
class Model(tf.keras.Model):
    """Sequence classifier: Embedding -> GRU -> Dense logits over 10 classes."""

    def __init__(self):
        super(Model, self).__init__()
        # Vocabulary of 10 token ids, embedded into 5-dim vectors.
        self.embedding = tf.keras.layers.Embedding(10, 5)
        self.rnn = tf.keras.layers.GRU(100)  # neither GRU nor LSTM works
        self.final_layer = tf.keras.layers.Dense(10)
        # Per-example loss (no reduction), computed directly on logits.
        self.loss_obj = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction='none')

    def call(self, inp):
        """Map integer token ids of shape (batch, seq_len) to class logits."""
        embedded = self.embedding(inp)   # (batch_size, seq_len, embedding_size)
        encoded = self.rnn(embedded)     # (batch_size, hidden_size)
        logits = self.final_layer(encoded)  # (batch_size, class_num)
        return logits
# Build the model and a random batch: 5 sequences of 50 token ids, 5 labels.
model = Model()
inp = np.random.randint(0, 10, [5, 50], dtype=np.int32)
out = np.random.randint(0, 10, [5], dtype=np.int32)

# Record the forward pass on the tape, then ask for gradients of the mean loss.
with tf.GradientTape() as tape:
    logits = model(inp)
    loss = model.loss_obj(out, logits)
    print(loss)

gradients = tape.gradient(tf.reduce_mean(loss), model.trainable_variables)

print('========== Trainable Variables ==========')
for var in model.trainable_variables:
    print(var)

print('========== Gradients ==========')
for grad in gradients:
    print(grad)
But when I print the gradients, the output is:
Tensor("categorical_crossentropy/weighted_loss/Mul:0", shape=(5,), dtype=float32)
========== Trainable Variables ==========
<tf.Variable 'model/embedding/embeddings:0' shape=(10, 5) dtype=float32>
<tf.Variable 'model/gru/kernel:0' shape=(5, 300) dtype=float32>
<tf.Variable 'model/gru/recurrent_kernel:0' shape=(100, 300) dtype=float32>
<tf.Variable 'model/gru/bias:0' shape=(300,) dtype=float32>
<tf.Variable 'model/dense/kernel:0' shape=(100, 10) dtype=float32>
<tf.Variable 'model/dense/bias:0' shape=(10,) dtype=float32>
========== Gradients ==========
None
None
None
None
Tensor("MatMul:0", shape=(100, 10), dtype=float32)
Tensor("BiasAddGrad:0", shape=(10,), dtype=float32)
The gradients for the final layer are computed correctly, but they are None for the GRU layer and every layer before it.
I have tried both tf.keras.layers.LSTM and tf.keras.layers.GRU; the same problem exists with each.
Update
Finally, I replaced tf.GradientTape().gradient() with tf.gradients():
# Same forward pass as before, but differentiate with the graph-mode
# tf.gradients API instead of GradientTape.
logits = model(inp)
loss = model.loss_obj(out, logits)
mean_loss = tf.reduce_mean(loss)
gradients = tf.gradients(mean_loss, model.trainable_variables)
With this change the gradients are computed for every variable. But I still don't understand the difference between these two implementations — why does tf.gradients work here while GradientTape returns None? (Note: in TF 1.x graph mode, GradientTape is intended for eager execution; tf.gradients is the graph-mode API.)