I've attempted to implement a DQN from scratch, without importing the neural network from a library, and to get it working on the Lunar Lander environment. However, no matter the values for the parameters, even after 600 episodes the reward is always decreasing and frequently less than -300. Since the issue persists regardless of the parameters used, I must have made an error in how I implemented the algorithm, though I cannot find it.
I have even tried using the parameter values suggested in the paper Solving Lunar Lander with DQN, though that did not solve the issue. Initially I was getting exploding gradients, so I changed from MSE loss to Huber loss, and while that solved that specific problem, training did not go any better. The Adam optimizer was also used to prevent issues from the gradient becoming too small. My thought is that the problem may be in how the backpropagation algorithm was implemented. However, as I understand it, a layer's error is calculated as the next layer's weight matrix (transposed) multiplied by the next layer's error, all multiplied element-wise by the derivative of the layer's activation function, and the model's parameters are updated with this error matrix multiplied by the previous layer's output and the learning rate; that appears correct to me.
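Written out, the per-layer rule I believe I am implementing is the standard backpropagation update (shown here in its plain gradient-descent form, before Adam is applied):

$$\delta^{(l)} = \left((W^{(l+1)})^{T}\,\delta^{(l+1)}\right) \odot f'\!\left(z^{(l)}\right), \qquad W^{(l)} \leftarrow W^{(l)} - \eta\,\delta^{(l)}\,(a^{(l-1)})^{T}$$

where $\delta^{(l)}$ is the layer's error, $z^{(l)}$ its pre-activation output, $a^{(l-1)}$ the previous layer's output, and $\eta$ the learning rate.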
I even tested the algorithm on the CartPole environment, but with no luck either: the reward remained low while the loss stayed very small, fluctuating around 0.
See my code below
Import the libraries
import numpy as np
import gym #holds the environments to pick from
import matplotlib.pyplot as plt
import random #for picking a random element
import cv2
from collections import deque #similar to a list, but more efficient as it allows efficient appending and popping of items from both sides
import time
!pip install gym[box2d] #install the Box2D dependencies so the LunarLander environment is available
Setup the model
env = gym.make("LunarLander-v2", render_mode='rgb_array') #set up the LunarLander-v2 environment
env.reset()
#print out the initial state of this environment
im = env.render()
plt.imshow(im[0])
Implement the DQN algorithm
-Many different network layouts were also tried, such as three hidden layers with 32, 64 and 64 neurons, or two hidden layers with 128 neurons each; they did not help and only changed how long the model took to train (one of these variants is sketched below).
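For reference, one of the alternative layouts mentioned above looked roughly like this, a sketch of the build_network method using the NN and Dense classes defined further down (only the hidden-layer sizes differ from the version in the code below):

def build_network(self):
    # alternative layout tried: two hidden layers of 128 neurons each
    model = NN(self.learning_rate)
    model.network = []
    model.add(Dense(number_neurons=self.state_size, activation_value='relu')) # 8 input observations
    model.add(Dense(number_neurons=128, activation_value='relu'))
    model.add(Dense(number_neurons=128, activation_value='relu'))
    model.add(Dense(number_neurons=self.action_size, activation_value='linear')) # 4 output actions
    return model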
class DQN():
#initialize all the attributes of this class
def __init__(self, state_size, action_size, learning_rate, replay_buffer_size, gamma, epsilon, target_update_rate, batch_size, video_episodes):
self.state_size = state_size
self.action_size = action_size
self.learning_rate = learning_rate
self.replay_max_size = replay_buffer_size
self.replay_buffer = deque(maxlen = replay_buffer_size) # specify the max elements it holds before some get removed
# define the various hyper parameters
self.gamma = gamma # amount of value the optimal future reward has 0-1
self.epsilon = epsilon # used in the e-greedy policy to determine whether exploitation or exploration is to be performed 0-1
self.target_update_rate = target_update_rate # allow the target network's values to be updated every this many steps (or episodes)
self.batch_size = batch_size # size of the random replay buffer batch used in order to ensure limited correlation
self.video_episodes = video_episodes # the episode interval at which frames are saved for the video, as saving every frame quickly runs out of storage
# define the two neural networks, ensuring each are the same initially
self.main_network = None
self.target_network = None
def initilize_network(self):
# define the two neural networks, ensuring each are the same initially
self.main_network = self.build_network()
self.target_network = self.build_network()
# ensure the target network is a copy of the main by giving it the same weights
self.copy_to_target()
def build_network(self):
# the act of building and compiling the network
# also need to specify the activation function each uses
model = NN(self.learning_rate)
model.network = []
model.add(Dense(number_neurons=self.state_size, activation_value='relu')) # as 8 possible input observations to be tracked
model.add(Dense(number_neurons=32, activation_value='relu'))
model.add(Dense(number_neurons=self.action_size, activation_value='linear')) # as four possible output actions
return model
# add the elements to the replay buffer
def update_replay(self, current_state, action_performed, reward, next_state, is_terminal):
self.replay_buffer.append((current_state, action_performed, reward, next_state, is_terminal))
# determines the action to use at each step of the learning process
def EGreedyPolicy(self, current_state, current_episode, max_episodes):
#new_epsilon = max(0.1, (1 - current_episode / max_episodes) * self.epsilon) # linearly decrease the epsilon value towards 0.1
if random.uniform(0, 1) < self.epsilon:
action = random.randint(0, self.action_size - 1) # Explore the environment by picking a random action to perform
else:
Q_values = self.main_network.predict(current_state) # get all possible action values from the network
Q_values = np.hstack(Q_values).A1 # convert into an array so the best action can be found
action = np.argmax(Q_values, axis=0) # Exploit learned values to maximize the reward received
return action
def get_mini_batch(self):
if len(self.replay_buffer) >= self.batch_size:
current_batch_size = self.batch_size
else:
current_batch_size = len(self.replay_buffer)
return random.sample(self.replay_buffer, current_batch_size)
# train the model to improve its overall policy
def train(self, env, num_episodes, num_steps):
total_steps = 0 # used to determine when the target network gets its values updated
train_frames = [] # save a rendering of each frame of the training process
episode_rewards = []
episode_losses = []
while len(self.replay_buffer) < self.replay_max_size:
terminated = False
steps_complete = 0
current_state = env.reset()
# perform steps of the episode until the agent either terminates or reaches its max step count
while terminated == False and steps_complete < num_steps:
chosen_action = self.EGreedyPolicy(current_state, 0, num_episodes)
next_state, reward, terminated, info = env.step(chosen_action) # perform the chosen action in the environment
self.update_replay(current_state, chosen_action, reward, next_state, terminated)
current_state = next_state # update the state
steps_complete += 1
print("Replay Buffer Full")
for i in range(0, num_episodes):
terminated = False
steps_complete = 0
current_state = env.reset()
total_reward = [] # used to calculate the cumulative reward for each episode
total_loss = [] # used to calculate the mean loss for each episode
# perform steps of the episode until the agent either terminates or reaches its max step count
while terminated == False and steps_complete < num_steps:
chosen_action = self.EGreedyPolicy(current_state, i, num_episodes)
next_state, reward, terminated, info = env.step(chosen_action) # perform the chosen action in the environment
self.update_replay(current_state, chosen_action, reward, next_state, terminated)
# determine the training data to use and calculate the loss
mini_batch = self.get_mini_batch()
if len(self.replay_buffer) >= self.replay_max_size: # only update once the replay buffer is full
loss = self.huber_loss(mini_batch, True)
total_loss.append(loss)
if total_steps % self.target_update_rate == 0:
self.copy_to_target() # every so many steps update the target network
print("Target Network Updated")
current_state = next_state # update the state
steps_complete += 1
total_steps += 1
# update the lists for later cross checking and evaluation of the model
total_reward.append(reward)
# generate and save a frame of training every video_episodes episodes
if i % self.video_episodes == 0 or i == 0:
img = env.render()[0]
train_frames.append(img)
episode_cumulative_reward = np.sum(total_reward)
episode_rewards.append(episode_cumulative_reward)
episode_cumulative_loss = np.mean(total_loss)
episode_losses.append(episode_cumulative_loss)
print("Episode Complete: " + str(i) + ", Reward: " + str(episode_cumulative_reward)+ ", Loss: " + str(episode_cumulative_loss)
+ ", Epsilon: " + str(self.epsilon) + ", Target_Updates: " + str(total_steps))
if len(self.replay_buffer) >= self.replay_max_size: # only decay epsilon once the replay buffer is full
self.epsilon *= 0.9925
self.epsilon = max(0.1, self.epsilon)
return train_frames, episode_rewards, episode_losses
# over all of the mini-batch samples determine the prediction error (loss) using the Huber loss and update the main network
def huber_loss(self, mini_batch, use_derivative = False):
total_loss = 0
for i in range(0, len(mini_batch)):
# determine the target value, which is the current best guess of the actual expected value
target_value = mini_batch[i][2] # the current reward
if not mini_batch[i][4]: # if the next state is not terminal, add the discounted value of its best action
Q_values = self.target_network.predict(mini_batch[i][3])
Q_values = np.hstack(Q_values).A1
target_value += self.gamma * np.max(Q_values) # the max action's value
# the current guess of the values the network should generate, based on the batch items values
index_state = mini_batch[i][0]
index_action = mini_batch[i][1]
predicted_value = self.main_network.predict(index_state)[index_action] # get the Q-values for all actions in the state and select the one for the action taken
loss_gradient = self.main_network.huber_loss(predicted_value, target_value, True) # get the loss gradient for this data point's values
# update the networks parameters based on this loss using gradient descent
self.main_network.backpropagate(loss_gradient)
total_loss += loss_gradient
total_loss /= len(mini_batch)
return total_loss
# update the target network by copying over all of the main network's weight and bias values
def copy_to_target(self):
self.target_network.set_weight_bias(self.main_network.get_weight_bias())
Create the neural network
class NN():
def __init__(self, learning_rate = 0.0001):
self.learning_rate = learning_rate
self.network = None # a list holding all the layers of the network
# add a layer to the network
def add(self, layer):
# determine the number of input values the layer has so that the size of its weight matrix can be properly determined
if len(self.network) - 1 >= 0:
prev_layer = self.network[len(self.network) - 1]
layer_input_amount = prev_layer.output_amount # use the previous layers output as input into this layer
else:
layer_input_amount = layer.output_amount #as the first network layer, no previous layer inputs can be used
layer.learning_rate = self.learning_rate
layer.randomize_weight_bias(layer_input_amount) # randomly assign weights and biases initially
self.network.append(layer)
# loop through all layers and perform the feed forward method, passing each output to the next as it goes
def feed_forward(self, input):
for layer in self.network:
input = layer.feed_forward(input)
return input
def backpropagate(self, loss_grad):
for layer in self.network[::-1]:
loss_grad = layer.backpropagate(loss_grad)
# based on current values return the output of the network
def predict(self, input_state):
input_state = np.vstack(input_state)
Q_values = self.feed_forward(input_state)
return Q_values
# converted to huber loss in order to solve exploding gradient problems
def huber_loss(self, predicted_value, target_value, use_derivative):
# if the loss is small use MSE
# if the loss is quite large use MAE (mean absolute error) as this is less sensitive to outliers in the dataset; when just MSE was used, exploding gradients occurred
delta = 1 # the threshold; the idea is that this constrains the loss gradient to between -1 and 1
abs_loss = abs(target_value - predicted_value)
if use_derivative:
if abs_loss <= delta:
loss = 2*(predicted_value-target_value) / 4 # as there are 4 separate data points, with the other three's values always being 0 and so not affecting the loss calculations
else:
loss = -delta*np.sign(target_value - predicted_value) / 4
else:
huber_MSE = 0.5 * (target_value - predicted_value)**2
huber_MAE = delta * abs_loss - 0.5 * delta**2
if abs_loss <= delta:
loss = huber_MSE / 4
else:
loss = huber_MAE / 4
return loss.A1[0] # convert the 1,1 matrix to a scalar
# get all the weights and biases for the network
def get_weight_bias(self):
parameters = [[],[]]
for layer in self.network:
#if hasattr(layer, "weights"):
parameters[0].append(layer.weights)
#if hasattr(layer, "bias"):
parameters[1].append(layer.bias)
return parameters
# set all the weights and biases for the network
def set_weight_bias(self, new_values):
for i in range(0, len(self.network)):
#if hasattr(self.network[i], "weights"):
self.network[i].weights = new_values[0][i]
#if hasattr(self.network[i], "bias"):
self.network[i].bias = new_values[1][i]
class Layer():
def __init__(self, number_neurons, activation_value):
self.current_output = None
self.current_input = None
self.output_amount = number_neurons
self.activation_value = activation_value
def feed_forward(self, input):
return input
def backpropagate(self, loss_grad):
return loss_grad
# the various activation functions the network uses
def activation(self, value, use_derivative = False):
if self.activation_value == 'relu':
return self.relu(value, use_derivative)
elif self.activation_value == 'linear':
return self.linear(value, use_derivative)
else:
raise Exception('No appropriate activation picked, it must be either linear or relu')
# in both, when specified use the equation's derivative as the back-propagation algorithm is running; otherwise it's performing a forward pass and will use the normal function
# normally returns x
# derivative returns 1
def linear(self, value, use_derivative = False):
if use_derivative:
for i in range(len(value)):
for j in range(len(value[i])):
value[i][j] = 1
return value
# normally returns max(0,x)
# derivative returns 0 where x < 0, otherwise 1
def relu(self, value, use_derivative = False):
for i in range(len(value)):
for j in range(len(value[i])):
if value[i][j] < 0:
value[i][j] = 0
elif use_derivative:
value[i][j] = 1
return value
class Dense(Layer):
# when initializing the layer we need to specify the exact number of output neurons, and we also need to know the exact number of inputs
def __init__(self, number_neurons, activation_value):
super().__init__(number_neurons, activation_value)
# placeholders for the weights and biases, which are given random values later in randomize_weight_bias
self.weights = None #note that weight is a matrix
self.bias = None #note that bias is a number
self.learning_rate = None
self.first_moment = None
self.second_moment = None
self.bias1 = 0.9
self.bias2 = 0.999
self.e = 10**-8
self.time = 1
def randomize_weight_bias(self, input_amount):
#give each activation function its own weight initialization scheme
if self.activation_value == 'relu':
self.weights = np.random.rand(self.output_amount, input_amount) * np.sqrt(2/self.output_amount)
else:
self.weights = np.random.rand(self.output_amount, input_amount) * 0.01
self.weights = np.matrix(self.weights, dtype=np.float64, copy = True)
self.bias = 1 #give biases a large effect
self.first_moment = np.zeros((self.output_amount, input_amount))
self.second_moment = np.zeros((self.output_amount, input_amount))
def feed_forward(self, prev_layer_output):
self.current_input = prev_layer_output
self.current_output = self.weights * prev_layer_output + self.bias # perform the feed forward operation, producing a matrix of size (x, 1)
return self.activation(self.current_output) # modify the output value by the activation function
def backpropagate(self, loss_grad):
# now using the Adam optimizer with back-propagation method
if self.output_amount == env.action_space.n:
error = loss_grad * self.activation(self.current_output, True) # this is the actual error of the last layer of the network
else:
error = self.activation(self.current_output, True)
for i in range(len(loss_grad)):
for j in range(len(loss_grad[i])):
error[i][j] = loss_grad[i][j] * error[i][j]
self.first_moment = self.bias1 * self.first_moment + (1- self.bias1) * error
self.second_moment = self.bias2 * self.second_moment + (1- self.bias2) * np.power(error, 2)
m = self.first_moment/(1-np.power(self.bias1, self.time+0.1))
v = self.second_moment/(1-np.power(self.bias2, self.time+0.1))
self.weights -= (self.learning_rate * m) / (np.sqrt(v)+self.e)
self.bias -= np.mean((self.learning_rate * m) / (np.sqrt(v)+self.e))
self.time += 1
return self.weights.T * error
Training the agent
dqn = DQN(state_size=env.observation_space.shape[0], action_size=env.action_space.n, learning_rate = 0.0001, replay_buffer_size=10000, gamma = 0.999, epsilon = 1, target_update_rate = 1000, batch_size = 128, video_episodes = 20)
dqn.initilize_network()
train_frames, reward, loss = dqn.train(env, 600, 500)