Keras Double DQN average reward decreases over time and is unable to converge

Question

I am attempting to teach a Double DQN agent to run a gridworld where there is one seeker (the agent) who will try to collect all the hiders which are randomly spawned. Every step has a path_cost of -0.1 and if a hider is collected a reward of 1 is received. The DQN net receives an array with the shape (world_width,world_height,1) as the state which is a complete translation of the environment viewed from above where empty space is described as 0, seeker as 2, and hider as 3. The agent is then supposed to choose one action, either left, up, right, or down. An example configuration of the environment is shown in the image below.

gridworld

However, when training my agent the reward initially decreases in correlation to the decreasing exploration and therefore it can be assumed that when the agent follows the DQN net it will perform worse than when choosing actions randomly. Here are a few examples of the reward graphs I have received when training with different hyperparameters (y-axis is total steps where each episode is 100 steps unless it finishes).

Reward Graph

As seen the agent becomes worse at solving the environment and it is approximately when epsilon becomes equal to my min_epsilon the curve stabilizes (meaning almost no exploration or random moves).

I have tried different hyperparameters but without any apparent differences in results and would there appreciate it if someone could give me a pointer to where the problem might be.

The hyperparameters I have been mostly using is:

wandb.config.epsilon           = 1.0
wandb.config.epsilon_decay     = 0.99
wandb.config.batch_size        = 32
wandb.config.learning_rate     = 1e-3
wandb.config.gamma             = 0.8
wandb.config.min_epsilon       = 1e-1
wandb.config.buffersize        = 10000
wandb.config.epochs            = 1
wandb.config.reward_discount   = 0.01
wandb.config.episodes          = 1000

And here is my code:

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPooling2D, Flatten
from tensorflow.keras.optimizers import Adam
from collections import deque
from termcolor import colored
import wandb
from wandb.keras import WandbCallback
import numpy as np
import copy, os, random
from argparse import ArgumentParser

from plotter import plotter
from HNS import HNS

tf.keras.backend.set_floatx('float64')

wandb.init(name=name, project=project)


wandb.env.name                 = "HNS"
wandb.env.world_size           = (8, 8)
wandb.env.state_dim            = (8, 8, 1)
wandb.env.hider_count          = 2
wandb.env.action_dim           = 4
wandb.env.random_spawn         = True
wandb.env.max_steps            = 100 

wandb.config.node              = node

wandb.config.epsilon           = 1.0
wandb.config.epsilon_decay     = 0.99
wandb.config.batch_size        = 32
wandb.config.learning_rate     = 1e-3
wandb.config.gamma             = 0.8
wandb.config.min_epsilon       = 1e-1
wandb.config.buffersize        = 10000
wandb.config.epochs            = 1
wandb.config.reward_discount   = 0.01
wandb.config.episodes          = 1000

wandb.config.conv1_kernel      = (8,8)
wandb.config.conv1_filters     = 16
wandb.config.conv1_strides     = 4
wandb.config.conv1_activation  = "relu"
wandb.config.conv1_padding     = "same"

wandb.config.conv2_kernel      = (4,4)
wandb.config.conv2_filters     = 32
wandb.config.conv2_strides     = 4
wandb.config.conv2_activation  = "relu"
wandb.config.conv2_padding     = "same"

wandb.config.dense1_neurons    = 16
wandb.config.dense1_activation = "relu"

wandb.config.loss              = "mse"

parser = ArgumentParser()
parser.add_argument('--hider_count',     type=int,   default=wandb.env.hider_count)
parser.add_argument('--max_steps',       type=int,   default=wandb.env.max_steps)
parser.add_argument('--epsilon_decay',   type=float, default=wandb.config.epsilon_decay)
parser.add_argument('--min_epsilon',     type=float, default=wandb.config.min_epsilon)
parser.add_argument('--learning_rate',   type=float, default=wandb.config.learning_rate)
parser.add_argument('--gamma',           type=float, default=wandb.config.gamma)
parser.add_argument('--reward_discount', type=float, default=wandb.config.reward_discount)
parser.add_argument('--episodes',        type=int,   default=wandb.config.episodes)
parser.add_argument('--batch_size',      type=int,   default=wandb.config.batch_size)

args, unknown = parser.parse_known_args()
wandb.config.update(args, allow_val_change=True)


class ReplayBuffer:
    def __init__(self):
        self.buffer = deque(maxlen=wandb.config.buffersize)

    def put(self, state, action, reward, next_state, done):
        self.buffer.append([state, action, reward, next_state, done])

    def sample(self):
        sample = random.sample(self.buffer, wandb.config.batch_size)
        states, actions, rewards, next_states, done = map(np.asarray, zip(*sample))
        return states, actions, rewards, next_states, done

    def size(self):
        return len(self.buffer)


class ActionStatemodel:
    def __init__(self):
        self.epsilon = wandb.config.epsilon
        self.model = self.create_model()

    def create_model(self):
        # Init model
        model = tf.keras.Sequential()

        # Set up layers
        model.add(Conv2D(filters=wandb.config.conv1_filters, kernel_size=wandb.config.conv1_kernel, activation=wandb.config.conv1_activation,
                         strides=wandb.config.conv1_strides, padding=wandb.config.conv1_padding, name="conv_1", input_shape=wandb.env.state_dim))
        model.add(Conv2D(filters=wandb.config.conv2_filters, kernel_size=wandb.config.conv2_kernel, activation=wandb.config.conv2_activation,
                         strides=wandb.config.conv2_strides, padding=wandb.config.conv2_padding, name="conv_2"))

        model.add(Flatten())
        model.add(Dense(units=wandb.config.dense1_neurons, activation=wandb.config.dense1_activation,  name="dense_1"))
        model.add(Dense(wandb.env.action_dim, name="dense_2"))

        # Finalize model
        model.compile(loss=wandb.config.loss, optimizer=Adam(wandb.config.learning_rate))
        model.summary()

        return model

    # Get q-values from state
    def predict(self, state):
        return self.model.predict(state)

    # Get action from
    def get_action(self, state):
        # Predict action
        state = np.expand_dims(state, axis=0)
        q_value = self.predict(state)

        if np.random.random() < self.epsilon: return random.randint(0, wandb.env.action_dim - 1), 1
        else: return np.argmax(q_value), 0


    def train(self, states, targets):
        history = self.model.fit(states, targets, epochs=wandb.config.epochs, callbacks=[WandbCallback()], verbose=2, use_multiprocessing=True)
        return history.history["loss"][0]

class Agent:
    def __init__(self, env):
        self.env = env

        self.predict_net = ActionStatemodel()
        self.target_net = ActionStatemodel()
        self.target_update()

        self.buffer = ReplayBuffer()

    # Copy weights from model to target_model
    def target_update(self):
        weights = self.predict_net.model.get_weights()
        self.target_net.model.set_weights(weights)

    def replay(self):
        loss = 0
        for _ in range(5):
            states, actions, rewards, next_states, done = self.buffer.sample()

            # Collect predicted actions from predict_net
            predicted_q_values = self.predict_net.predict(next_states)
            predicted_actions = np.argmax(predicted_q_values, axis=1)

            # Get q values from target_net of above predicted actions
            target_q_values = self.target_net.predict(next_states)
            target_action_q_values = [np.take(target_q_values[i], predicted_actions[i]) for i in range(len(target_q_values))]

            # Create targets based on q values, reward and done
            targets = predicted_q_values.copy()
            targets[range(wandb.config.batch_size), actions] = rewards + (1 - done) * target_action_q_values * args.gamma

            loss += self.predict_net.train(states, targets)
        return loss

    def train(self):
        # Save weights for heatmap rendering

        # Main training loop
        for ep in range(wandb.config.episodes):

            # Initialization
            done, total_reward, step, loss, exploration = False, 0, 0, 0, 0
            state = self.env.reset()
            while not done and step < wandb.env.max_steps:

                # Predict and perform action
                action, e = self.predict_net.get_action(state)
                exploration += e
                next_state, reward, done, _ = self.env.step(action)
                self.buffer.put(state, action, reward * wandb.config.reward_discount, next_state, done)
                total_reward += reward

                if self.buffer.size() >= 1000 and step % 10 == 0: 
                    loss = self.replay()

                state = next_state
                step += 1

            self.target_update()

            # Update epsilon
            self.predict_net.epsilon = max(wandb.config.epsilon_decay * self.predict_net.epsilon, wandb.config.min_epsilon)


            # Calculate weights change and log weights
            pre_weights = self.get_weights(self.predict_net.model.layers)
            tar_weights = self.get_weights(self.target_net.model.layers)

            # LOG
            print(colored("EP" + str(ep) + "-Reward: " + str(total_reward) + " Done: " + str(done), "green"))
            wandb.log({"episode"      : ep,
                       "buffersize"   : self.buffer.size(),
                       "EpReward"     : total_reward,
                       "epsilon"      : self.predict_net.epsilon,
                       "done"         : int(done),
                       "Exploration"  : exploration / _,
                       "loss"         : loss,
                       "pre_weights"  : pre_weights,
                       "tar_weights"  : tar_weights
                       })
                       # "weigthUpdate" : wandb.Image(neuron_map),


    # Get weights and names for every layer of nn model
    def get_weights(self, layers):
        weigths = []
        names = []
        for layer in layers:
            wb = layer.get_weights()
            if wb:
                weigths.append(wb[0].flatten())
                names.append(layer.name)
        return weigths, names


if __name__ == "__main__":
    env = HNS(random_spawn=wandb.env.random_spawn, world_size=wandb.env.world_size, hider_count=wandb.env.hider_count)
    agent = Agent(env=env)
    agent.train()
    agent.target_net.model.save(os.path.join(wandb.run.dir, "model.h5"))

I'm unable to run your code. What's HNS in your code and how do I install it? — desert_ranger, Jul 19 '22 at 16:31

Keras Double DQN average reward decreases over time and is unable to converge

0 Answers0