How can one calculate the gradient of a loss that depends on a variable with respect to another variable that was used in a linear combination assigned to it? The following code is executed in TensorFlow eager mode.
After some more digging in older questions, a similar question showed up; however, it is not clear from it how to solve this issue. Another related question is this one, but there the same variable is reused and it concerns TensorFlow v1. I also read in this question that tf.assign (v1?) does not support gradients, and a potential solution is provided there. However, I'd like to apply it to the internal model weights of neural networks, and I don't know how to apply that tensor approach in practice.
import tensorflow as tf

a = tf.Variable(1.0, name='a')
b = tf.Variable(2.0, name='b')
c = tf.Variable(3.0, name='c')

with tf.GradientTape() as tape:
    c.assign(a + b)
    loss = tf.reduce_mean(c**2)
print(tape.gradient(loss, b))  # prints None

# or another attempt
with tf.GradientTape(watch_accessed_variables=False) as tape:
    tape.watch([b, c])
    c.assign(a + b)
    loss = tf.reduce_mean(c**2)
print(tape.gradient(loss, b))  # also outputs None

# Working, but here c is a plain tensor, while it is a variable in my use case
with tf.GradientTape() as tape:
    c = a + b
    loss = tf.reduce_mean(c**2)
print(tape.gradient(loss, b))  # works
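The closest workaround I could come up with myself is to keep the linear combination as a tensor inside the tape, use that tensor in the loss, and only copy it into c afterwards (just a sketch of my own idea; I'm not sure it is the right pattern when c has to stay a variable):

import tensorflow as tf

a = tf.Variable(1.0, name='a')
b = tf.Variable(2.0, name='b')
c = tf.Variable(3.0, name='c')

with tf.GradientTape() as tape:
    c_tensor = a + b                    # differentiable tensor, not the variable
    loss = tf.reduce_mean(c_tensor**2)
c.assign(c_tensor)                      # c gets the new value, but outside the differentiable path
print(tape.gradient(loss, b))           # should be 2*(a+b) = 6.0 instead of None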
Extension:
import tensorflow as tf

a = [tf.Variable(1.0, name='a'), tf.Variable(4.0, name='aa')]
b = [tf.Variable(2.0, name='b'), tf.Variable(9.0, name='bb')]
c = [tf.Variable(3.0, name='c'), tf.Variable(0.0, name='cc')]
x = tf.Variable(0.01)

with tf.GradientTape(persistent=True) as tape:
    c_ = tf.nest.map_structure(lambda _a, _b: (1 - x) * _a + x * _b, a, b)
    tf.nest.map_structure(lambda _c, _t: _c.assign(_t), c, c_)
    loss = tf.norm(c)  # scalar

# This works as expected
print(tape.gradient(loss, c, output_gradients=tape.gradient(c_, b)))
# [<tf.Tensor: shape=(), dtype=float32, numpy=0.0024197185>, <tf.Tensor: shape=(), dtype=float32, numpy=0.009702832>]

# Here I would expect a single scalar gradient for x, to use with gradient descent?
print(tape.gradient(loss, c, output_gradients=tape.gradient(c_, x)))
# [<tf.Tensor: shape=(), dtype=float32, numpy=1.4518311>, <tf.Tensor: shape=(), dtype=float32, numpy=5.8216996>]
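# (My guess: tape.gradient(c_, x) already sums dc_i/dx into a single scalar,
#  which then just scales each dloss/dc_i, hence two numbers instead of one?)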
# Example of what I'd like to achieve:
with tf.GradientTape() as tape:
    c_ = tf.nest.map_structure(lambda _a, _b: (1 - x) * _a + x * _b, a, b)
    loss = tf.norm(c_)  # scalar
print(tape.gradient(loss, x))
# tf.Tensor(5.0933886, shape=(), dtype=float32)
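My best manual attempt at reconciling the two, assuming the chain rule dloss/dx = sum_i dloss/dc_i * dc_i/dx, looks like the following sketch (same setup as above; I'm not sure this is the intended use of output_gradients, so I compute the per-element products myself):

import tensorflow as tf

a = [tf.Variable(1.0, name='a'), tf.Variable(4.0, name='aa')]
b = [tf.Variable(2.0, name='b'), tf.Variable(9.0, name='bb')]
c = [tf.Variable(3.0, name='c'), tf.Variable(0.0, name='cc')]
x = tf.Variable(0.01)

with tf.GradientTape(persistent=True) as tape:
    c_ = tf.nest.map_structure(lambda _a, _b: (1 - x) * _a + x * _b, a, b)
    tf.nest.map_structure(lambda _c, _t: _c.assign(_t), c, c_)
    loss = tf.norm(c)

dl_dc = tape.gradient(loss, c)             # dloss/dc_i, read through the assigned values
dc_dx = [tape.gradient(t, x) for t in c_]  # dc_i/dx, one scalar per element
print(tf.add_n([g * d for g, d in zip(dl_dc, dc_dx)]))
# should match the ~5.09 from tape.gradient(loss, x) above, if my reasoning is correct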
A more sophisticated issue:
import tensorflow as tf

a = [tf.Variable([1.0, 2.0], name='a'), tf.Variable([5.0], name='aa'), tf.Variable(7.0, name='aaa')]
b = [tf.Variable([3.0, 4.0], name='b'), tf.Variable([6.0], name='bb'), tf.Variable(8.0, name='bbb')]
c = [tf.Variable([1.0, 1.0], name='c'), tf.Variable([1.0], name='cc'), tf.Variable(1.0, name='ccc')]
x = tf.Variable(0.5, name='x')

with tf.GradientTape(persistent=True) as tape:
    c_ = tf.nest.map_structure(lambda _a, _b: (1 - x) * _a + x * _b, a, b)
    tf.nest.map_structure(lambda _c, _t: _c.assign(_t), c, c_)
    loss = tf.norm(tf.nest.map_structure(lambda e: tf.norm(e), c))
    loss_without_assign = tf.norm(tf.nest.map_structure(lambda e: tf.norm(e), c_))

print(loss, loss_without_assign)
# tf.Tensor(9.974969, shape=(), dtype=float32) tf.Tensor(9.974969, shape=(), dtype=float32)
# Gives the same result

#partial_grads = tf.nest.map_structure(lambda d, e: tf.nest.map_structure(lambda f, g: tape.gradient(loss, f, output_gradients=tape.gradient(g, x)), d, e), c, c_)
partial_grads = tf.nest.map_structure(lambda d, e: tape.gradient(loss, d, output_gradients=tape.gradient(e, x)), c, c_)
# Should I not use the mean here?
print(tf.reduce_sum(tf.nest.map_structure(lambda z: tf.reduce_mean(z), partial_grads)))
print(tape.gradient(loss_without_assign, x))
# Rather close:
# tf.Tensor(2.3057716, shape=(), dtype=float32)
# tf.Tensor(2.3057709, shape=(), dtype=float32)
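The same manual idea applied to this case would need the per-element dc_e/dx, which I think tape.jacobian can provide (again only a sketch of my own reasoning, not necessarily the intended approach):

import tensorflow as tf

a = [tf.Variable([1.0, 2.0], name='a'), tf.Variable([5.0], name='aa'), tf.Variable(7.0, name='aaa')]
b = [tf.Variable([3.0, 4.0], name='b'), tf.Variable([6.0], name='bb'), tf.Variable(8.0, name='bbb')]
c = [tf.Variable([1.0, 1.0], name='c'), tf.Variable([1.0], name='cc'), tf.Variable(1.0, name='ccc')]
x = tf.Variable(0.5, name='x')

with tf.GradientTape(persistent=True) as tape:
    c_ = tf.nest.map_structure(lambda _a, _b: (1 - x) * _a + x * _b, a, b)
    tf.nest.map_structure(lambda _c, _t: _c.assign(_t), c, c_)
    loss = tf.norm(tf.nest.map_structure(lambda e: tf.norm(e), c))

dl_dc = tape.gradient(loss, c)              # dloss/dc_e, same structure as c
dc_dx = [tape.jacobian(t, x) for t in c_]   # dc_e/dx for every element, not summed
print(tf.add_n([tf.reduce_sum(g * j) for g, j in zip(dl_dc, dc_dx)]))
# should reproduce the ~2.3058 from tape.gradient(loss_without_assign, x), if I understand correctly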