I was using a DQN for something and it wasn't working, so I simplified the problem down to just 2 actions, 0 and 1, where each action always yields the same fixed reward: 0 for one action and -1 for the other. Even on this trivial setup, my Q-agent stays consistently confused, giving the two actions wild values in the thousands. Please, what am I doing wrong?
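For context, the stripped-down environment I'm testing on is essentially the following. This is only an illustrative sketch, not my exact code: the class name, the constant 4-dimensional observation, the one-step episodes, and the action-to-reward mapping are my simplification.

import numpy as np

class TwoActionEnv:
    # Toy problem: two actions, each tied to one fixed reward.
    # Assumed mapping: action 0 -> reward 0, action 1 -> reward -1.
    def __init__(self):
        self.observation = np.zeros(4, dtype=np.float32)  # constant dummy state

    def reset(self):
        return self.observation

    def step(self, action):
        reward = 0.0 if action == 0 else -1.0
        done = True  # assume one-step episodes for this toy test
        return self.observation, reward, done

Here is the agent code: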
import numpy as np
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model

# Hyperparameters such as memory_size, batch_size, learning_rate, gamma,
# epsilon_start, epsilon_dec, epsilon_end, fc1_dims..fc3_dims and model_name
# are module-level globals defined earlier in the script.

class ReplayBuffer():
    def __init__(self, input_dims):
        self.mem_size = memory_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size, *input_dims),
                                     dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_dims),
                                         dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.int32)

    def store_transition(self, state, action, reward, state_, done):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = 1 - int(done)  # 1 = non-terminal, 0 = terminal
        self.mem_cntr += 1

    def sample_buffer(self):
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)
        states = self.state_memory[batch]
        states_ = self.new_state_memory[batch]
        rewards = self.reward_memory[batch]
        actions = self.action_memory[batch]
        terminal = self.terminal_memory[batch]
        return states, actions, rewards, states_, terminal


def build_dqn(n_actions, input_dims):
    # Fully-connected network with a linear output layer, one unit per action.
    model = keras.Sequential([
        keras.layers.InputLayer(input_shape=input_dims),
        keras.layers.Dense(fc1_dims, activation="relu"),
        keras.layers.Dense(fc2_dims, activation="relu"),
        keras.layers.Dense(fc3_dims, activation="relu"),
        # keras.layers.Dense(fc4_dims, activation="relu"),
        keras.layers.Dense(n_actions, activation=None)])
    model.compile(optimizer=Adam(lr=learning_rate), loss="mean_squared_error")
    return model


class Agent():
    def __init__(self, n_actions, input_dims):
        self.action_space = [i for i in range(n_actions)]
        self.gamma = gamma
        self.epsilon = epsilon_start
        self.eps_dec = epsilon_dec
        self.eps_min = epsilon_end
        self.batch_size = batch_size
        self.model_file = model_name
        self.memory = ReplayBuffer(input_dims)
        self.q_eval = build_dqn(n_actions, input_dims)

    def store_transition(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, observation):
        # epsilon-greedy action selection
        if np.random.random() < self.epsilon:
            action = np.random.choice(self.action_space)
        else:
            state = np.array([observation])
            actions = self.q_eval.predict(state)
            action = np.argmax(actions[0])
            print(actions)
            print(action)
        return action

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        states, actions, rewards, states_, dones = \
            self.memory.sample_buffer()
        # Q-values for the sampled states and bootstrap values for the next
        # states (same network, no separate target net).
        q_eval = self.q_eval.predict(states)
        q_next = self.q_eval.predict(states_)
        q_target = np.copy(q_eval)
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        q_target[batch_index, actions] = rewards + \
            self.gamma + np.max(q_next, axis=1)*dones
        self.q_eval.train_on_batch(states, q_target)
        self.epsilon = self.epsilon - self.eps_dec if self.epsilon > self.eps_min else self.eps_min

    def save_model(self):
        self.q_eval.save(self.model_file)

    def load_model(self):
        self.q_eval = load_model(self.model_file)
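To be explicit about the intent: learn() is supposed to implement the standard Q-learning target, y = r + gamma * max_a' Q(s', a') for non-terminal transitions and y = r on terminal ones (which is why terminal_memory stores 1 - done, so the bootstrap term gets zeroed out).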
It's a super-standard DQN agent, most of it copied from a tutorial. I can't fathom where it could be going wrong.
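In case the driver code matters, the agent is run with a loop along these lines (a simplified sketch: the episode count and the TwoActionEnv from above are placeholders, and the hyperparameter globals like gamma, batch_size and learning_rate are set at the top of the real script):

agent = Agent(n_actions=2, input_dims=(4,))
env = TwoActionEnv()

for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        action = agent.choose_action(state)
        state_, reward, done = env.step(action)
        agent.store_transition(state, action, reward, state_, done)
        agent.learn()
        state = state_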