I have an agent that has to explore a customized environment.
The environment is a grid (100 squares horizontally, 100 squares vertically, each square is 10 meters wide).
In the environment, there are a number of users (called ues) whose positions are randomized at the beginning of each episode and are stationary throughout the episode.
Each user requires a number of resources that varies with the agent's position: the closer the agent gets to a user, the fewer resources that user requires and the more satisfied it will be. A user's satisfaction is measured by how many of its required resources it was given; if it receives all the resources it needs, it is satisfied. The agent therefore has to find the position that satisfies the maximum number of users, and it is not aware of the users' positions.
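Roughly, the requirement/satisfaction rule works like this (an illustrative sketch only; the names and the exact scaling are placeholders, not my real environment code):

import numpy as np

def required_resources(agent_pos, ue_pos, base_requirement):
    # Placeholder rule: the further the agent is from the user, the more resources that user needs
    distance = np.linalg.norm(np.array(agent_pos) - np.array(ue_pos))
    return base_requirement * (1 + distance / 100.0)

def is_satisfied(resources_given, resources_required):
    # A user is satisfied only if it was given all the resources it needs
    return resources_given >= resources_required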
The state space contains the agent's current position, the number of satisfied users, and a list of requests (what type of resources each user is asking for).
The action space comprises 9 actions (move forward, backward, to the sides, stay still, etc.).
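The 9 actions map to movement offsets roughly like this (my own encoding may order them differently; this is only meant to show the 8 movement directions plus staying still):

ACTION_OFFSETS = {
    0: (0, 1),    # forward
    1: (0, -1),   # backward
    2: (1, 0),    # right
    3: (-1, 0),   # left
    4: (1, 1),    # forward-right
    5: (-1, 1),   # forward-left
    6: (1, -1),   # backward-right
    7: (-1, -1),  # backward-left
    8: (0, 0),    # stay still
}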
The agent's position is randomized at the beginning of each episode.
Epsilon is decayed from 1 to 0.1 over 550 episodes.
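A linear schedule is shown below just to illustrate the range and duration (the exact shape of the schedule in my code is not important here):

EPS_START, EPS_END, EPS_DECAY_EPISODES = 1.0, 0.1, 550

def epsilon_for_episode(episode):
    # Interpolate from 1.0 down to 0.1 over the first 550 episodes, then stay at 0.1
    frac = min(episode / EPS_DECAY_EPISODES, 1.0)
    return EPS_START + frac * (EPS_END - EPS_START)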
The reward function is as follows (a short code sketch follows the list):
- If the action takes the agent to a position that satisfies more users than the previous position, or to a position that satisfies the same number of users while no earlier position in the episode satisfied more users than the current one, the reward is 2.
- If the action takes the agent to a position that satisfies the same number of users as the previous position, but some earlier position in the episode satisfied more users than the current one, the penalty is -0.001.
- If the action takes the agent to a position that satisfies fewer users than the previous position, the penalty is -0.002.
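In code, this rule is roughly (sketch only; prev_count, new_count and best_so_far are placeholder names for the satisfied-user counts at the previous position, the new position, and the best earlier position in the episode):

def compute_reward(prev_count, new_count, best_so_far):
    # prev_count / new_count: satisfied users at the previous / new position
    # best_so_far: highest number of satisfied users seen at any earlier position this episode
    if new_count > prev_count or (new_count == prev_count and new_count >= best_so_far):
        return 2.0
    if new_count == prev_count and new_count < best_so_far:
        return -0.001
    return -0.002  # new_count < prev_count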
I use soft updates for the target network in my code, with tau = 1e-3.
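In other words, after every training call the target weights become target_weights = tau * online_weights + (1 - tau) * target_weights (this is what target_train() at the bottom of the code does).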
My problem is that Double DQN seems to perform much worse than vanilla DQN, and I do not know why. It's supposed to perform better, right? Is there something wrong with the reward function, maybe? Or is there something else I'm doing wrong?
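For reference, the only difference between the two target computations is which network picks the greedy action (a sketch with generic names; my code below uses model_uav_pos / target_model_uav_pos):

import numpy as np

def dqn_target(reward, next_state_inputs, target_model, discount=0.9, done=False):
    # Vanilla DQN: the target network both selects and evaluates the greedy action
    if done:
        return reward
    return reward + discount * float(np.max(target_model(next_state_inputs)[0]))

def ddqn_target(reward, next_state_inputs, online_model, target_model, discount=0.9, done=False):
    # Double DQN: the online network selects the greedy action, the target network evaluates it
    if done:
        return reward
    best_action = int(np.argmax(online_model(next_state_inputs)[0]))
    return reward + discount * float(target_model(next_state_inputs)[0][best_action])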
Here's what the average reward curve looks like so far:
Here's my code for Double DQN below:
import os
import time
import random
from collections import deque

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# PlotLearning, ModifiedTensorBoard, env, n_possible_movements and MODEL_NAME_POS_DDQN
# are defined elsewhere in my project.

DISCOUNT = 0.9 #0.99
REPLAY_MEMORY_SIZE = 10_000
MIN_REPLAY_MEMORY_SIZE = 10_000 # Minimum number of steps in a memory to start training
MINIBATCH_SIZE = 32 # How many steps (samples) to use for training
class DDQNAgent(object):
    def __init__(self):
        #self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_decay = 0.8
        self.epsilon_min = 0.1
        self.learning_rate = 10e-4 #0.0005 #0.25 #1e-4
        self.tau = 1e-3
        self.plot_loss_acc = PlotLearning()
        # Main models
        self.model_uav_pos = self._build_pos_model()
        # Target networks
        self.target_model_uav_pos = self._build_pos_model()
        # Copy weights
        self.target_model_uav_pos.set_weights(self.model_uav_pos.get_weights())
        # An array with last n steps for training
        self.replay_memory_pos_nn = deque(maxlen=REPLAY_MEMORY_SIZE)
        tboard_log_dir_pos = os.path.join("logs", MODEL_NAME_POS_DDQN)
        self.tensorboard_pos = ModifiedTensorBoard(MODEL_NAME_POS_DDQN, log_dir=tboard_log_dir_pos)

    def _build_pos_model(self): # compile the DNN
        # create the DNN model
        dnn = self.create_pos_dnn()
        opt = Adam(learning_rate=self.learning_rate) #, decay=self.epsilon_decay)
        dnn.compile(loss="categorical_crossentropy", optimizer=opt, metrics=['accuracy'])
        dnn.call = tf.function(dnn.call, jit_compile=True)
        return dnn

    ''' Don't forget to normalize the inputs '''
    def create_pos_dnn(self):
        # initialize the input shapes (the shape of an array is the number of elements in each dimension)
        pos_input_shape = (2,)
        requests_input_shape = (len(env.ues),)
        number_of_satisfied_ues_input_shape = (1,)
        # How many possible outputs we can have
        output_nodes = n_possible_movements
        # Initialize the inputs
        uav_current_position = Input(shape=pos_input_shape, name='pos')
        ues_requests = Input(shape=requests_input_shape, name='requests')
        number_of_satisfied_ues = Input(shape=number_of_satisfied_ues_input_shape, name='number_of_satisfied_ues')
        # Put them in a list
        list_inputs = [uav_current_position, ues_requests, number_of_satisfied_ues]
        # Merge all input features into a single large vector
        x = layers.concatenate(list_inputs)
        # Add a 1st Hidden (Dense) Layer
        dense_layer_1 = Dense(512, activation="relu")(x)
        # Add a 2nd Hidden (Dense) Layer
        dense_layer_2 = Dense(512, activation="relu")(dense_layer_1)
        # Add a 3rd Hidden (Dense) Layer
        dense_layer_3 = Dense(256, activation="relu")(dense_layer_2)
        # Output layer
        output_layer = Dense(output_nodes, activation="softmax")(dense_layer_3)
        model = Model(inputs=list_inputs, outputs=output_layer)
        # return the DNN
        return model

    def remember_pos_nn(self, state, action, reward, next_state, done):
        self.replay_memory_pos_nn.append((state, action, reward, next_state, done)) # list of previous experiences, enabling re-training later

    def act_upon_choosing_a_new_position(self, state): # state is a tuple (uav_position, requests_array, number_of_satisfied_ues)
        if np.random.rand() <= self.epsilon: # if acting randomly, take random action
            return random.randrange(n_possible_movements)
        pos = np.array([state[0]])
        reqs = np.array([state[1]])
        number_satisfaction = np.array([state[2]])
        act_values = self.model_uav_pos([pos, reqs, number_satisfaction]) # if not acting randomly, predict reward value based on current state
        return np.argmax(act_values[0]) #env.possible_positions[np.argmax(act_values[0])] # pick the action that will give the highest reward

    def train_pos_nn(self):
        print("In Training..")
        # Start training only if certain number of samples is already saved
        if len(self.replay_memory_pos_nn) < MIN_REPLAY_MEMORY_SIZE:
            print("Exiting Training: Replay Memory Not Full Enough...")
            return
        # Get a minibatch of random samples from memory replay table
        list_memory = list(self.replay_memory_pos_nn)
        random.shuffle(list_memory)
        minibatch = random.sample(list_memory, MINIBATCH_SIZE)
        start_time = time.time()
        # Enumerate our batches
        for index, (current_state, action, reward, new_current_state, done) in enumerate(minibatch):
            print('...Starting Training...')
            target = 0
            pos = np.array([current_state[0]])
            reqs = np.array([current_state[1]])
            number_satisfaction = np.array([current_state[2]])
            pos_next = np.array([new_current_state[0]])
            reqs_next = np.array([new_current_state[1]])
            number_satisfaction_next = np.array([new_current_state[2]])
            # If not a terminal state, get new q from future states, otherwise set it to 0
            # almost like with Q Learning, but we use just part of equation here
            if not done:
                max_action = np.argmax(self.model_uav_pos([pos_next, reqs_next, number_satisfaction_next])[0])
                target = reward + DISCOUNT * self.target_model_uav_pos([pos_next, reqs_next, number_satisfaction_next])[0][max_action]
            else:
                target = reward
            # Update Q value for a given state
            target_f = self.model_uav_pos([pos, reqs, number_satisfaction])
            target_f = np.array(target_f)
            target_f[0][action] = target
            self.model_uav_pos.fit([pos, reqs, number_satisfaction],
                                   target_f,
                                   verbose=2,
                                   shuffle=False,
                                   callbacks=None,
                                   epochs=1)
        end_time = time.time()
        print("Time", end_time - start_time)
        # Soft-update the target network after every training call
        self.target_train()

    def target_train(self):
        # Polyak averaging: target <- tau * online + (1 - tau) * target
        weights = self.model_uav_pos.get_weights()
        target_weights = self.target_model_uav_pos.get_weights()
        for i in range(len(target_weights)):
            target_weights[i] = weights[i] * self.tau + target_weights[i] * (1 - self.tau)
        self.target_model_uav_pos.set_weights(target_weights)
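For context, the per-episode loop that drives this agent looks roughly like this (a sketch only; env.reset()/env.step() and their return signatures are placeholders, not my exact environment API):

agent = DDQNAgent()
NUM_EPISODES = 600  # placeholder

for episode in range(NUM_EPISODES):
    state = env.reset()  # placeholder: however the environment produces (position, requests, n_satisfied)
    done = False
    while not done:
        action = agent.act_upon_choosing_a_new_position(state)
        next_state, reward, done = env.step(action)  # placeholder signature
        agent.remember_pos_nn(state, action, reward, next_state, done)
        agent.train_pos_nn()
        state = next_state
    # decay agent.epsilon toward agent.epsilon_min here (1.0 -> 0.1 over 550 episodes)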