I tried to code a Deep Q-Network (DQN) to play Atari games using TensorFlow and OpenAI Gym. Here's my code:
import tensorflow as tf
import gym
import numpy as np
import os

env_name = 'Breakout-v0'
env = gym.make(env_name)
num_episodes = 100

input_data = tf.placeholder(tf.float32,(None,)+env.observation_space.shape)
output_labels = tf.placeholder(tf.float32,(None,env.action_space.n))

def convnet(data):
    layer1 = tf.layers.conv2d(data,32,5,activation=tf.nn.relu)
    layer1_dropout = tf.nn.dropout(layer1,0.8)
    layer2 = tf.layers.conv2d(layer1_dropout,64,5,activation=tf.nn.relu)
    layer2_dropout = tf.nn.dropout(layer2,0.8)
    layer3 = tf.layers.conv2d(layer2_dropout,128,5,activation=tf.nn.relu)
    layer3_dropout = tf.nn.dropout(layer3,0.8)
    layer4 = tf.layers.dense(layer3_dropout,units=128,activation=tf.nn.softmax,kernel_initializer=tf.zeros_initializer)
    layer5 = tf.layers.flatten(layer4)
    layer5_dropout = tf.nn.dropout(layer5,0.8)
    layer6 = tf.layers.dense(layer5_dropout,units=env.action_space.n,activation=tf.nn.softmax,kernel_initializer=tf.zeros_initializer)
    return layer6

logits = convnet(input_data)
loss = tf.losses.sigmoid_cross_entropy(output_labels,logits)
train = tf.train.GradientDescentOptimizer(0.001).minimize(loss)
saver = tf.train.Saver()
init = tf.global_variables_initializer()
discount_factor = 0.5

with tf.Session() as sess:
    sess.run(init)
    for episode in range(num_episodes):
        x = []
        y = []
        state = env.reset()
        feed = {input_data:np.array([state])}
        print('episode:', episode+1)
        while True:
            x.append(state)
            if (episode+1)/num_episodes > np.random.uniform():
                Q = sess.run(logits,feed_dict=feed)[0]
                action = np.argmax(Q)
            else:
                action = env.action_space.sample()
            state,reward,done,info = env.step(action)
            Q = sess.run(logits,feed_dict=feed)[0]
            new_Q = np.zeros(Q.shape)
            new_Q[action] = reward+np.amax(Q)*discount_factor
            y.append(new_Q)
            if done:
                break
        for sample in range(len(x)):
            _,l = sess.run([train,loss],feed_dict={input_data:[x[sample]],output_labels:[y[sample]]})
            print('training loss on sample '+str(sample+1)+': '+str(l))
        saver.save(sess,os.getcwd()+'/'+env_name+'-DQN.ckpt')
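For reference, the shapes that those placeholders end up with can be checked like this; for Breakout-v0 that should give (210, 160, 3) RGB observations and 4 discrete actions:

import gym

env = gym.make('Breakout-v0')
print(env.observation_space.shape)  # (210, 160, 3) raw RGB frames
print(env.action_space.n)           # 4 discrete actions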
The problem is:
- The loss doesn't decrease during training; it always stays somewhere around 0.7 or 0.8.
- Even after training for 1000 episodes, when I test the network on the Breakout environment the actions still seem mostly random and it rarely hits the ball (my test loop is sketched right after this list).
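My test loop is essentially this (a simplified sketch; it reuses the graph definition from the training script above, restores the checkpoint, and always takes the greedy action):

with tf.Session() as sess:
    saver.restore(sess, os.getcwd()+'/'+env_name+'-DQN.ckpt')
    state = env.reset()
    done = False
    total_reward = 0
    while not done:
        env.render()
        # always pick the action with the highest predicted Q value
        Q = sess.run(logits, feed_dict={input_data: np.array([state])})[0]
        state, reward, done, info = env.step(np.argmax(Q))
        total_reward += reward
    print('episode reward:', total_reward)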
I have already tried different loss functions (softmax cross-entropy and mean squared error), a different optimizer (Adam), and a higher learning rate, but nothing changed.
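Concretely, those attempts were just drop-in swaps of the loss/optimizer lines in the script above, roughly like this (the exact learning-rate values I tried varied; 0.01 below is just an example):

# variants I tried in place of the original loss/optimizer definitions (one at a time)
loss = tf.losses.softmax_cross_entropy(output_labels, logits)      # instead of sigmoid_cross_entropy
# loss = tf.losses.mean_squared_error(output_labels, logits)
train = tf.train.AdamOptimizer(0.001).minimize(loss)               # instead of GradientDescentOptimizer
# train = tf.train.GradientDescentOptimizer(0.01).minimize(loss)   # higher learning rate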
Can someone tell me how to fix this?