I was using a DQN for something and it wasn't working, so I simplified the problem down to just 2 actions, 0 and 1, where each action always yields the same fixed reward: 0 for one action and -1 for the other. Even on this trivial setup, my Q-agent stays consistently confused, giving the two actions wild values in the thousands. Please, what am I doing wrong?
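For context, the stripped-down environment I'm testing on is essentially the following. This is only an illustrative sketch, not my exact code: the class name, the constant 4-dimensional observation, the one-step episodes, and the action-to-reward mapping are my simplification.

import numpy as np

class TwoActionEnv:
    # Toy problem: two actions, each tied to one fixed reward.
    # Assumed mapping: action 0 -> reward 0, action 1 -> reward -1.
    def __init__(self):
        self.observation = np.zeros(4, dtype=np.float32)  # constant dummy state

    def reset(self):
        return self.observation

    def step(self, action):
        reward = 0.0 if action == 0 else -1.0
        done = True  # assume one-step episodes for this toy test
        return self.observation, reward, done

Here is the agent code: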
import numpy as np
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model

# Hyperparameters such as memory_size, batch_size, learning_rate, gamma,
# epsilon_start, epsilon_dec, epsilon_end, fc1_dims..fc3_dims and model_name
# are module-level globals defined earlier in the script.

class ReplayBuffer():
    def __init__(self, input_dims):
        self.mem_size = memory_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size, *input_dims),
                                     dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_dims),
                                         dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.int32)

    def store_transition(self, state, action, reward, state_, done):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = 1 - int(done)  # 1 = non-terminal, 0 = terminal
        self.mem_cntr += 1

    def sample_buffer(self):
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)
        states = self.state_memory[batch]
        states_ = self.new_state_memory[batch]
        rewards = self.reward_memory[batch]
        actions = self.action_memory[batch]
        terminal = self.terminal_memory[batch]
        return states, actions, rewards, states_, terminal


def build_dqn(n_actions, input_dims):
    # Fully-connected network with a linear output layer, one unit per action.
    model = keras.Sequential([
        keras.layers.InputLayer(input_shape=input_dims),
        keras.layers.Dense(fc1_dims, activation="relu"),
        keras.layers.Dense(fc2_dims, activation="relu"),
        keras.layers.Dense(fc3_dims, activation="relu"),
        # keras.layers.Dense(fc4_dims, activation="relu"),
        keras.layers.Dense(n_actions, activation=None)])
    model.compile(optimizer=Adam(lr=learning_rate), loss="mean_squared_error")
    return model


class Agent():
    def __init__(self, n_actions, input_dims):
        self.action_space = [i for i in range(n_actions)]
        self.gamma = gamma
        self.epsilon = epsilon_start
        self.eps_dec = epsilon_dec
        self.eps_min = epsilon_end
        self.batch_size = batch_size
        self.model_file = model_name
        self.memory = ReplayBuffer(input_dims)
        self.q_eval = build_dqn(n_actions, input_dims)

    def store_transition(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, observation):
        # epsilon-greedy action selection
        if np.random.random() < self.epsilon:
            action = np.random.choice(self.action_space)
        else:
            state = np.array([observation])
            actions = self.q_eval.predict(state)
            action = np.argmax(actions[0])
            print(actions)
            print(action)
        return action

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        states, actions, rewards, states_, dones = \
            self.memory.sample_buffer()
        # Q-values for the sampled states and bootstrap values for the next
        # states (same network, no separate target net).
        q_eval = self.q_eval.predict(states)
        q_next = self.q_eval.predict(states_)
        q_target = np.copy(q_eval)
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        q_target[batch_index, actions] = rewards + \
            self.gamma + np.max(q_next, axis=1)*dones
        self.q_eval.train_on_batch(states, q_target)
        self.epsilon = self.epsilon - self.eps_dec if self.epsilon > self.eps_min else self.eps_min

    def save_model(self):
        self.q_eval.save(self.model_file)

    def load_model(self):
        self.q_eval = load_model(self.model_file)
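To be explicit about the intent: learn() is supposed to implement the standard Q-learning target, y = r + gamma * max_a' Q(s', a') for non-terminal transitions and y = r on terminal ones (which is why terminal_memory stores 1 - done, so the bootstrap term gets zeroed out).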
It's a super-standard DQN agent, most of it copied from a tutorial. I can't fathom where it could be going wrong.
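In case the driver code matters, the agent is run with a loop along these lines (a simplified sketch: the episode count and the TwoActionEnv from above are placeholders, and the hyperparameter globals like gamma, batch_size and learning_rate are set at the top of the real script):

agent = Agent(n_actions=2, input_dims=(4,))
env = TwoActionEnv()

for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        action = agent.choose_action(state)
        state_, reward, done = env.step(action)
        agent.store_transition(state, action, reward, state_, done)
        agent.learn()
        state = state_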