I am trying to code my own DQN in Python, using pytorch. I am trying it on the CartPole environment.
Although the Q-loss converaged, the model performed poorly.
Replay buffer was also used in the model with a size of 2000 and the double networks were also used.I updated the target network when the q_net network was updated 100 times.(ps: q_net was used to decide which action to choose.)
I tried to use different network architectures and different combinations of hyper-parameters, but the model still performed poorly, which just kept swaying and never kept itself balanced.
Appreciate for your help, sincerely!!
Here is the figure of Q-loss result:
Here is the code for action taking.
def take_action(self,state):
if self.learn_count<=10:
self.eposilon = 0.1
else:
self.eposilon = 0.9
decision = np.random.choice([0,1],p = [1-self.eposilon,self.eposilon])
if decision == 1:
#final_decision = self.q_net(state.cuda().detach()).argmax()
final_decision = self.q_net(torch.Tensor(state).to(self.device))
final_decision = torch.max(final_decision, 0)[1].data.numpy()
else:
final_decision = np.random.choice([i for i in range(self.action_space)])
return final_decision.item()
Here is the code for training the network.
def update_nn(self):
if self.learn_count%100 ==0:
self.synchronous_NN()
for i in range(self.num_epoches):
self.learn_count = self.learn_count+1
self.training_nn()
return None
def training_nn(self):
index = random.sample(range(self.replay_buffer.shape[0]),self.minibatch_size)
chosen_sample = self.replay_buffer[index,:]
last_state = copy.deepcopy(chosen_sample[np.isnan(chosen_sample[:,-1:]).squeeze(),:])
not_last_state = copy.deepcopy(chosen_sample[~np.isnan(chosen_sample[:,-1:]).squeeze(),:])
input_not_last_state = torch.FloatTensor(not_last_state[:,:4]).to(self.device)
action_index = torch.LongTensor(not_last_state[:,4].reshape(-1,1)).to(self.device)
action_value = self.q_net(input_not_last_state).gather(1,action_index)
max_action_value = not_last_state[:,5]+self.gamma*self.fixed_q_net(input_not_last_state).detach().max(1).values.numpy()
last_state = np.nan_to_num(last_state)
input_last_state = torch.FloatTensor(last_state[:,:4]).to(self.device)
last_action_index = torch.LongTensor(last_state[:,4].reshape(-1,1)).to(self.device)
last_action_value = self.q_net(input_last_state).gather(1,last_action_index)
last_max_action_value = last_state[:,5]
X = torch.cat([action_value,last_action_value])
y = torch.FloatTensor(np.hstack([max_action_value,last_max_action_value]).reshape(-1,1)).detach()
loss = self.loss(X, y)
self.optimizer.zero_grad() # reset the gradient to zero
loss.backward()
self.optimizer.step() # execute back propagation for one step
self.loss_curve.append(loss)
return None
here is the part of playing:
def start_to_play(self):
agent = Agent()
agent.initial_Q_network(2)
agent.initial_replay_buffer()
self.env = gym.make('CartPole-v0')
self.env = self.env.unwrapped
for i in range(self.episode):
if i%50 ==1:
agent.save_model(i)
step = 0
state = self.env.reset()
ep_r = 0
while(True):
action = agent.take_action(state)
observation, reward, done, info = self.env.step(action)
self.env.render()
#next_state = self.capture_state()
next_state = observation
x, x_dot, theta, theta_dot = observation
r1 = (self.env.x_threshold - abs(x)) / self.env.x_threshold - 0.8
r2 = (self.env.theta_threshold_radians - abs(theta)) / self.env.theta_threshold_radians - 0.5
reward = r1 + r2
ep_r = reward+ep_r
if done:
reward = reward-20
state1_np = np.array(state)
state2_np = np.array([np.nan,np.nan,np.nan,np.nan])
agent.add_replay_buffer(np.hstack([state1_np,np.array([action,reward]),state2_np]))
if agent.replay_buffer.shape[0]>300:
agent.update_nn()
print(i,step,round(ep_r, 2))
break
else:
state1_np = np.array(state)
state2_np = np.array(next_state)
agent.add_replay_buffer(np.hstack([state1_np,np.array([action,reward]),state2_np]))
if agent.replay_buffer.shape[0]>300:
agent.update_nn()
state = next_state
step = step+1
self.plot_curve(agent)
return None
Thanks for your time!!