
I'm trying to write a deep Q-learning (DQN) algorithm, and I'm running the following graph in TensorFlow:

class DQN:

    def __init__(self, env, n_hidden, learning_rate):

        self.image_input = tf.placeholder(shape=[None, 128, 128, 3], dtype=tf.float32)
        self.conv1 = tf.contrib.layers.convolution2d(inputs=self.image_input, num_outputs=32,
                                                     kernel_size=[8,8], stride=[4,4], padding="VALID")
        self.conv2 = tf.contrib.layers.convolution2d(inputs=self.conv1, num_outputs=64,
                                                     kernel_size=[4,4], stride=[2,2], padding="VALID")
        self.conv3 = tf.contrib.layers.convolution2d(inputs=self.conv2, num_outputs=64,
                                                     kernel_size=[3,3], stride=[1,1], padding="VALID")
        self.conv4 = tf.contrib.layers.convolution2d(inputs=self.conv3, num_outputs=512,
                                                     kernel_size=[7,7], stride=[1,1], padding="VALID")

        self.conv_out = tf.contrib.layers.flatten(self.conv4)
        self.weights_1 = tf.Variable(tf.random_normal([18432, env.action_space.n], stddev=0.35), name="fully1_w")
        self.bias_1 = tf.Variable(tf.zeros(env.action_space.n), name="fully1_b")
        self.q_out = tf.add(tf.matmul(self.conv_out, self.weights_1), self.bias_1, name="q_out")
        self.predict = tf.argmax(self.q_out, 1)

        self.target_q = tf.placeholder(shape=[None], dtype=tf.float32)
        self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
        self.actions_onehot = tf.one_hot(self.actions, env.action_space.n, dtype=tf.float32)
        self.q_value = tf.reduce_sum(tf.multiply(self.q_out, self.actions_onehot), reduction_indices=1)

        self.td_error = tf.square(self.target_q - self.q_value)
        self.loss = tf.reduce_mean(self.td_error)
        self.trainer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
        self.grads_and_vars = self.trainer.compute_gradients(self.loss)
        self.trainer.apply_gradients(self.grads_and_vars)

And here is the training procedure:

tf.reset_default_graph()
main_qf = DQN(env, n_hidden=10, learning_rate=1.0)

init = tf.global_variables_initializer()
saver = tf.train.Saver()
trainables = tf.trainable_variables()

target_ops = update_target_graph(trainables,tau, mode="periodically")
grads=[]
experience_buffer = ExperienceReplay(exp_size)
total_rewards = np.zeros(num_episodes)
losses = np.zeros(num_episodes)

with tf.Session() as session:
    state = env.reset()
    session.run(init)
    update_target(target_ops, session)

    for _iter in range(num_episodes):
        state = env.reset()
        # play ===================================================================================
        done = False
        img = process_image(env.render(mode="rgb_array"))
        episode = []
        while not done:
            # e-greedy
            if np.random.rand() < epsilon:
                action = np.random.choice(range(env.action_space.n))
            else:
                feed_dict = {main_qf.image_input: img[None, :, :, :]}
                action = session.run(main_qf.predict, feed_dict=feed_dict)[0]

            new_state, reward, done, _ = env.step(action)
            new_img = process_image(env.render(mode="rgb_array"))

            experience_buffer.add((img, action, new_img, reward, done))
            # update results =====================================================================
            total_rewards[_iter] += reward

        # Adjust params (epsilon) ================================================================
        if epsilon >= min_epsilon:
            epsilon -= decay

        # train ==================================================================================
        prev_state, actions, new_state, rewards, is_terminal = experience_buffer.sample(batch_size)

        q_function = session.run([main_qf.q_out],
                                 feed_dict={main_qf.image_input: prev_state})

        q_target = session.run([main_qf.predict],
                               feed_dict={main_qf.image_input: new_state})
        q_target = rewards + gamma * q_target * is_terminal

        loss, weights, grad = session.run([main_qf.loss, main_qf.weights_1, main_qf.grads_and_vars],
                                          feed_dict={main_qf.image_input: prev_state,
                                                     main_qf.target_q: q_target,
                                                     main_qf.actions: actions})

        losses[_iter] = loss
        update_target(target_ops, session)

But for some reason that I do not understand, the training procedure is not updating the weights of the network. I tried fetching the gradients to check whether I had vanishing gradients (by getting grads_and_vars), but that is not the case; the gradients have large values. I also tried to manually assign values to the variables (by calling main_qf.weights_1.assign(val)), but that doesn't work either.

Is it something in the composition of my graph? Or in the way I'm running the session? I'm completely lost on this one.

lspinheiro

1 Answer


As it stands, your session.run() never requests minimization of the loss or an application of the gradients, so the weights are never updated.

The graph element that will update the weights is the "self.trainer.apply_gradients(self.grads_and_vars)" operation. I don't see where you have called this op from your session.run().

Assign this op to an attribute and add it to your run() fetches, and it should update the weights:

self.UpdateWeights = self.trainer.apply_gradients(self.grads_and_vars)
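
For example (a sketch using the question's variable names), the training call in the loop would then fetch this op alongside the loss:

loss, _, weights = session.run([main_qf.loss, main_qf.UpdateWeights, main_qf.weights_1],
                               feed_dict={main_qf.image_input: prev_state,
                                          main_qf.target_q: q_target,
                                          main_qf.actions: actions})
# Because UpdateWeights is in the fetch list, apply_gradients actually runs
# and the weights change on this step.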

If you just add "self.trainer" to your run() as it is currently built, it will not update the weights. Alternatively, you can build the trainer with minimize(self.loss); then you don't need the compute/apply gradients lines at all:

self.trainer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(self.loss)
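
With that variant, fetching "main_qf.trainer" in the training call is what applies the update (again, just a sketch with the question's names):

_, loss = session.run([main_qf.trainer, main_qf.loss],
                      feed_dict={main_qf.image_input: prev_state,
                                 main_qf.target_q: q_target,
                                 main_qf.actions: actions})
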
mazecreator
  • "self.trainer.apply_gradients(self.grads_and_vars)" operation is the last line in the first code block. – lspinheiro Mar 19 '17 at 21:35
  • Yes, but it is not executed until your session.run() call requests it. Right now it is part of the graph but never executed, so there will never be a weight update. Keep in mind, you are building a graph with the Python code; it isn't actually being executed, it is just being added to the computational graph. – mazecreator Mar 19 '17 at 22:07
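
To illustrate that last point, here is a minimal toy example (hypothetical variable names, TF1 graph mode): the assign op exists in the graph, but nothing happens until it is fetched in session.run().

import tensorflow as tf

v = tf.Variable(0, name="v")
increment = tf.assign_add(v, 1)  # this only *defines* the op in the graph

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(v))    # prints 0 -- `increment` was never fetched
    sess.run(increment)   # the op actually executes only now
    print(sess.run(v))    # prints 1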