Update: I misunderstood the question. This is the new answer.
For this purpose, you only need to update the connections between the hidden layer and the second output unit, while keeping those between the hidden layer and the first output unit intact.
The first approach is to introduce two sets of variables: one for the connections between the hidden layer and the first output unit, and one for the rest. Then you can combine them with tf.stack, and pass a var_list to get the corresponding derivatives. It looks like this (just for illustration, not tested, use with care):
# W_h_to_out1 / W_h_to_out2 are assumed to have shape [hidden_size, 1],
# so each matmul produces one output column of shape [batch, 1]
out1 = tf.matmul(hidden, W_h_to_out1) + b_h_to_out1
out2 = tf.matmul(hidden, W_h_to_out2) + b_h_to_out2
out = tf.stack([out1, out2])                  # shape [2, batch, 1]
out = tf.transpose(tf.reshape(out, [2, -1]))  # shape [batch, 2]
loss = some_function_of(out)
optimizer = tf.train.GradientDescentOptimizer(0.1)
# only W_h_to_out2 and b_h_to_out2 are updated by this training op
train_op_second_unit = optimizer.minimize(loss, var_list=[W_h_to_out2, b_h_to_out2])
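If you want the raw derivatives rather than a training op, the same var_list argument works with compute_gradients (again only a sketch on top of the untested snippet above, using the same illustrative variable names):

# compute_gradients returns (gradient, variable) pairs restricted to var_list
grads_and_vars = optimizer.compute_gradients(loss, var_list=[W_h_to_out2, b_h_to_out2])
# apply_gradients builds the same update as minimize(loss, var_list=...)
train_op_second_unit = optimizer.apply_gradients(grads_and_vars)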
Another approach is to use a mask. This is easier to implement and more flexible when you work with frameworks such as slim or Keras, and it is the way I'd recommend. The idea is to hide the first output unit from the loss function while leaving the second output unit untouched. This is done with a binary mask: multiply an output by 1 to keep it, and by 0 to drop it. Here's the code:
import tensorflow as tf
import numpy as np
# let's make our tiny dataset: (x, y) pairs, where x = (x1, x2, x3), y = (y1, y2),
# and y1 = x1+x2+x3, y2 = x1^2+x2^2+x3^2
# n_sample data points
n_sample = 8
data_x = np.random.random((n_sample, 3))
data_y = np.zeros((n_sample, 2))
data_y[:, 0] += np.sum(data_x, axis=1)
data_y[:, 1] += np.sum(data_x**2, axis=1)
data_y += 0.01 * np.random.random((n_sample, 2)) # add some noise
# build graph
# suppose we have a network of shape [3, 4, 2], i.e.: one hidden layer of size 4.
x = tf.placeholder(tf.float32, shape=[None, 3], name='x')
y = tf.placeholder(tf.float32, shape=[None, 2], name='y')
mask = tf.placeholder(tf.float32, shape=[None, 2], name='mask')
W1 = tf.Variable(tf.random_normal(shape=[3, 4], stddev=0.1), name='W1')
b1 = tf.Variable(tf.random_normal(shape=[4], stddev=0.1), name='b1')
hidden = tf.nn.sigmoid(tf.matmul(x, W1) + b1)
W2 = tf.Variable(tf.random_normal(shape=[4, 2], stddev=0.1), name='W2')
b2 = tf.Variable(tf.random_normal(shape=[2], stddev=0.1), name='b2')
out = tf.matmul(hidden, W2) + b2
loss = tf.reduce_mean(tf.square(out - y))
# multiply out by mask, so the first output column is zeroed out in loss2 and no gradient flows through it
masked_out = mask * out
loss2 = tf.reduce_mean(tf.square(masked_out - y))
optimizer = tf.train.GradientDescentOptimizer(0.1)
train_op_all = optimizer.minimize(loss) # update all variables in the network
train_op12 = optimizer.minimize(loss, var_list=[W2, b2]) # update hidden -> output layer
train_op2 = optimizer.minimize(loss2, var_list=[W2, b2]) # update hidden -> second output unit
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
mask_out1 = np.zeros((n_sample, 2))
mask_out1[:, 1] += 1.0
# print(mask_out1)
print(sess.run([hidden, out, loss, loss2], feed_dict={x: data_x, y: data_y, mask: mask_out1}))
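# (optional check, not part of the original code): the gradient of loss2 w.r.t. W2 and b2
# is zero in the first column / first element, which is why the first output unit stays frozen
grad_W2, grad_b2 = tf.gradients(loss2, [W2, b2])
print(sess.run([grad_W2, grad_b2], feed_dict={x: data_x, y: data_y, mask: mask_out1}))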
# Only the second output unit is trained here; you should see both loss and loss2 decrease.
sess.run(train_op2, feed_dict={x: data_x, y: data_y, mask: mask_out1})
print(sess.run([hidden, out, loss, loss2], feed_dict={x: data_x, y: data_y, mask: mask_out1}))
# Here both output units are trained (the whole hidden -> output layer); loss and loss2 decrease.
sess.run(train_op12, feed_dict={x: data_x, y: data_y, mask: mask_out1})
print(sess.run([hidden, out, loss, loss2], feed_dict={x: data_x, y: data_y, mask: mask_out1}))
# Here every variable in the network is trained; loss and loss2 decrease.
sess.run(train_op_all, feed_dict={x: data_x, y: data_y, mask: mask_out1})
print(sess.run([hidden, out, loss, loss2], feed_dict={x: data_x, y: data_y, mask: mask_out1}))
sess.close()
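A small variant you may prefer (my addition, not part of the original snippet): mask the residual instead of the raw output, reusing the placeholders and variables defined above. The gradients w.r.t. W2 and b2 are identical, but loss2 then no longer carries the constant (0 - y1)^2 term from the masked unit, so its value is easier to interpret:

# same gradients as loss2 above, but without the constant contribution from the masked column
loss2_alt = tf.reduce_mean(tf.square(mask * (out - y)))
train_op2_alt = optimizer.minimize(loss2_alt, var_list=[W2, b2])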
=======================Below is the old answer==============================
To get derivatives w.r.t. different variables, you can pass a var_list to decide which variables to update. Here is an example:
import tensorflow as tf
import numpy as np
# let's make our tiny dataset: (x, y) pairs, where x = (x1, x2, x3), y = (y1, y2),
# and y1 = x1+x2+x3, y2 = x1^2+x2^2+x3^2
# n_sample data points
n_sample = 8
data_x = np.random.random((n_sample, 3))
data_y = np.zeros((n_sample, 2))
data_y[:, 0] += np.sum(data_x, axis=1)
data_y[:, 1] += np.sum(data_x**2, axis=1)
data_y += 0.01 * np.random.random((n_sample, 2)) # add some noise
# build graph
# suppose we have a network of shape [3, 4, 2], i.e.: one hidden layer of size 4.
x = tf.placeholder(tf.float32, shape=[None, 3], name='x')
y = tf.placeholder(tf.float32, shape=[None, 2], name='y')
W1 = tf.Variable(tf.random_normal(shape=[3, 4], stddev=0.1), name='W1')
b1 = tf.Variable(tf.random_normal(shape=[4], stddev=0.1), name='b1')
hidden = tf.nn.sigmoid(tf.matmul(x, W1) + b1)
W2 = tf.Variable(tf.random_normal(shape=[4, 2], stddev=0.1), name='W2')
b2 = tf.Variable(tf.random_normal(shape=[2], stddev=0.1), name='b2')
out = tf.matmul(hidden, W2) + b2
loss = tf.reduce_mean(tf.square(out - y))
optimizer = tf.train.GradientDescentOptimizer(0.1)
# You can pass a variable list to decide which variable(s) to minimize.
train_op_second_layer = optimizer.minimize(loss, var_list=[W2, b2])
# If there is no var_list, all variables will be updated.
train_op_all = optimizer.minimize(loss)
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
print(sess.run([W1, b1, W2, b2, loss], feed_dict={x: data_x, y: data_y}))
# Only W2 and b2 are updated here; you should see the loss decrease.
sess.run(train_op_second_layer, feed_dict={x: data_x, y: data_y})
print(sess.run([W1, b1, W2, b2, loss], feed_dict={x: data_x, y: data_y}))
# Here all variables are updated; you should see the loss decrease again.
sess.run(train_op_all, feed_dict={x: data_x, y: data_y})
print(sess.run([W1, b1, W2, b2, loss], feed_dict={x: data_x, y: data_y}))
sess.close()