I am currently trying to optimize the navigation of my robot. I first used a vanilla DQN and tuned its parameters; the simulated robot reached 8000 goals after 5000 episodes and showed a satisfying learning performance. Now, since vanilla DQN is not exactly state of the art in reinforcement learning, I added Double DQN (DDQN). Unfortunately, it performed very badly under the same conditions.

My first question is whether I implemented the DDQN correctly. My second is how often the target network should be updated: right now it is updated after every episode, and one episode can run up to 500 steps (if there is no crash). I can imagine updating the target much more often (e.g. every 20 steps), but I don't understand how the target network would then still be able to prevent the overestimation of the original network.
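The more frequent variant I have in mind would look roughly like the following (just a sketch; updateTargetModel, global_step and TARGET_UPDATE_FREQ are placeholder names, not my literal code):

TARGET_UPDATE_FREQ = 20  # the step interval I am considering instead of once per episode

def updateTargetModel(self):
    # Hard update: copy the online weights into the target network (Keras get_weights/set_weights)
    self.target_model.set_weights(self.model.get_weights())

# Inside the step loop, instead of calling updateTargetModel() only at episode end:
#     global_step += 1
#     if global_step % TARGET_UPDATE_FREQ == 0:
#         agent.updateTargetModel()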
Here is the normal DQN training part:
def getQvalue(self, reward, next_target, done):
    # Standard DQN target: bootstrap with the maximum Q-value of the next state
    if done:
        return reward
    else:
        return reward + self.discount_factor * np.amax(next_target)

def getAction(self, state):
    # Epsilon-greedy action selection
    if np.random.rand() <= self.epsilon:
        self.q_value = np.zeros(self.action_size)
        return random.randrange(self.action_size)
    else:
        q_value = self.model.predict(state.reshape(1, len(state)))
        self.q_value = q_value
        return np.argmax(q_value[0])

def trainModel(self, target=False):
    mini_batch = random.sample(self.memory, self.batch_size)
    X_batch = np.empty((0, self.state_size), dtype=np.float64)
    Y_batch = np.empty((0, self.action_size), dtype=np.float64)

    for i in range(self.batch_size):
        states = mini_batch[i][0]
        actions = mini_batch[i][1]
        rewards = mini_batch[i][2]
        next_states = mini_batch[i][3]
        dones = mini_batch[i][4]

        q_value = self.model.predict(states.reshape(1, len(states)))
        self.q_value = q_value

        # Use the target network for the next-state values only once it is active
        if target:
            next_target = self.target_model.predict(next_states.reshape(1, len(next_states)))
        else:
            next_target = self.model.predict(next_states.reshape(1, len(next_states)))

        next_q_value = self.getQvalue(rewards, next_target, dones)

        X_batch = np.append(X_batch, np.array([states.copy()]), axis=0)
        Y_sample = q_value.copy()
        Y_sample[0][actions] = next_q_value
        Y_batch = np.append(Y_batch, np.array([Y_sample[0]]), axis=0)

        if dones:
            X_batch = np.append(X_batch, np.array([next_states.copy()]), axis=0)
            Y_batch = np.append(Y_batch, np.array([[rewards] * self.action_size]), axis=0)

    self.model.fit(X_batch, Y_batch, batch_size=self.batch_size, epochs=1, verbose=0)
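For context, my outer training loop calls trainModel roughly like this (simplified sketch; agent, env and the bookkeeping variables are placeholder names, not my exact code):

global_step = 0
for e in range(5000):                               # training episodes
    state = env.reset()
    for t in range(500):                            # one episode runs for at most 500 steps
        action = agent.getAction(state)
        next_state, reward, done = env.step(action)
        agent.memory.append((state, action, reward, next_state, done))
        global_step += 1
        if len(agent.memory) >= agent.batch_size:
            # target=True only after 2000 global steps, as explained at the end
            agent.trainModel(target=(global_step >= 2000))
        state = next_state
        if done:
            break
    agent.updateTargetModel()                       # currently a hard update once per episode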
Here is the update for the Double DQN:
def getQvalue(self, reward, next_target, next_q_value_1, done):
    if done:
        return reward
    else:
        # Double DQN: the online network selects the action,
        # the target network evaluates it
        a = np.argmax(next_q_value_1[0])
        return reward + self.discount_factor * next_target[0][a]

def getAction(self, state):
    if np.random.rand() <= self.epsilon:
        self.q_value = np.zeros(self.action_size)
        return random.randrange(self.action_size)
    else:
        q_value = self.model.predict(state.reshape(1, len(state)))
        self.q_value = q_value
        return np.argmax(q_value[0])

def trainModel(self, target=False):
    mini_batch = random.sample(self.memory, self.batch_size)
    X_batch = np.empty((0, self.state_size), dtype=np.float64)
    Y_batch = np.empty((0, self.action_size), dtype=np.float64)

    for i in range(self.batch_size):
        states = mini_batch[i][0]
        actions = mini_batch[i][1]
        rewards = mini_batch[i][2]
        next_states = mini_batch[i][3]
        dones = mini_batch[i][4]

        q_value = self.model.predict(states.reshape(1, len(states)))
        self.q_value = q_value

        if target:
            next_q_value_1 = self.model.predict(next_states.reshape(1, len(next_states)))
            next_target = self.target_model.predict(next_states.reshape(1, len(next_states)))
        else:
            # Before the target network is in use, both estimates come from the online network
            next_q_value_1 = self.model.predict(next_states.reshape(1, len(next_states)))
            next_target = self.model.predict(next_states.reshape(1, len(next_states)))

        next_q_value = self.getQvalue(rewards, next_target, next_q_value_1, dones)

        X_batch = np.append(X_batch, np.array([states.copy()]), axis=0)
        Y_sample = q_value.copy()
        Y_sample[0][actions] = next_q_value
        Y_batch = np.append(Y_batch, np.array([Y_sample[0]]), axis=0)

        if dones:
            X_batch = np.append(X_batch, np.array([next_states.copy()]), axis=0)
            Y_batch = np.append(Y_batch, np.array([[rewards] * self.action_size]), axis=0)

    self.model.fit(X_batch, Y_batch, batch_size=self.batch_size, epochs=1, verbose=0)
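To make the intended update explicit, here is the Double DQN target computed in isolation on made-up numbers (a minimal numpy sketch, not part of my agent):

import numpy as np

discount_factor = 0.99
reward = 1.0

# Made-up Q-value rows for one next state, shape (1, action_size)
next_q_value_1 = np.array([[0.2, 0.9, 0.4]])  # online network: used only to pick the action
next_target = np.array([[0.3, 0.5, 0.8]])     # target network: used to evaluate that action

a = np.argmax(next_q_value_1[0])                               # online network picks action 1
ddqn_target = reward + discount_factor * next_target[0][a]     # 1.0 + 0.99 * 0.5
dqn_target = reward + discount_factor * np.amax(next_target)   # plain DQN: 1.0 + 0.99 * 0.8

# The DDQN target avoids always taking the maximum of a single (noisy) estimate,
# which is what drives the overestimation in plain DQN.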
Basically, the change happens in the getQvalue part: I choose the action with the original (online) network, but then take the action value for that action from the target network. The target flag makes sure that the target network is only used after 2000 global steps (roughly the first 10 episodes), before which it wouldn't make sense. Best regards and thanks in advance!