I have two agents, a and b, which use the same network structure, and I use a replay buffer to update the parameters of their networks. If I update a and then update b, I get the error: "one of the variables needed for gradient computation has been modified by an inplace operation". I tried to find the inplace operation, but I couldn't.
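As far as I understand, this error usually means that backward() needs a tensor that was changed in place after the graph was built, for example when optimizer.step() updates parameters in place and a later backward() reuses a graph that was kept alive with retain_graph=True. A toy example (made-up network and data, not my actual code) that reproduces the message:

import torch

# toy network and optimizer, just to illustrate the error pattern
net = torch.nn.Sequential(torch.nn.Linear(2, 8), torch.nn.ReLU(), torch.nn.Linear(8, 1))
opt = torch.optim.SGD(net.parameters(), lr=0.1)

x = torch.randn(4, 2)
out = net(x)                        # graph built with the current weights
loss1 = out.pow(2).mean()
loss1.backward(retain_graph=True)   # keep this graph alive
opt.step()                          # updates the weights in place

loss2 = out.mean()                  # still refers to the old, retained graph
loss2.backward()                    # RuntimeError: one of the variables needed for
                                    # gradient computation has been modified by an
                                    # inplace operation

My training loop is below.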
while total_timesteps < episodes:
    total_timesteps = total_timesteps + 1
    episodes_reward = []
    state = env.reset()
    state = state.to(device)
    for j in range(max_steps):
        action = torch.zeros(env.num_agents, env.dim_world)
        action[0] = policy_agent1.select_action(state)
        action[1] = policy_agent2.select_action(state)
        action_agent0 = action[0]
        action_agent1 = action[1]
        next_state, reward, done = env.step(action)
        next_state = next_state.to(device)
        action = action.to(device)
        reward = reward.to(device)
        reward_agent = reward[0]
        reward_agent1 = reward[1]
        replay_buffer1.add((state, next_state, action_agent0, reward_agent, done))
        replay_buffer2.add((state, next_state, action_agent1, reward_agent1, done))
        # print(next_state, reward, action)
        state = next_state
        if done:
            continue
        policy_agent1.train(replay_buffer1.sample(batch_size), gamma)
        policy_agent2.train(replay_buffer2.sample(batch_size), gamma)
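The ReplayBuffer class is not shown here; for reproducing the setup, assume something minimal like this (a simplified sketch, not my exact code), where sample() returns lists of the stored items:

import random

class ReplayBuffer:
    # minimal FIFO buffer, simplified for this question
    def __init__(self, max_size=100000):
        self.storage = []
        self.max_size = max_size

    def add(self, transition):
        # transition = (state, next_state, action, reward, done)
        if len(self.storage) >= self.max_size:
            self.storage.pop(0)
        self.storage.append(transition)

    def sample(self, batch_size):
        batch = random.sample(self.storage, min(batch_size, len(self.storage)))
        state, next_state, action, reward, done = zip(*batch)
        return list(state), list(next_state), list(action), list(reward), list(done)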
And here is the train part:
def train(self, replay_buffer, gamma):
    state, next_state, action, reward, done = replay_buffer
    q = torch.zeros(len(reward)).to(device)
    q_ = torch.zeros(len(reward)).to(device)
    q_target = torch.zeros(len(reward)).to(device)
    done = torch.Tensor(done).to(device)
    for j, r in enumerate(reward):
        q1_target = torch.zeros(len(reward)).to(device)
        q_[j] = self.critic_network(torch.transpose(next_state[j].to(device), 0, 1),
                                    self.actor_network(torch.transpose(next_state[j].to(device), 0, 1)).view(1, 1))
        q_target[j] = r.to(device) + (done[j] * gamma * q_[j])
        q1_target = q1_target + q_target
        # q_target[j] = r.to(device) + (done[j] * gamma * q_[j]).detach().clone()
        q1 = torch.zeros(len(reward)).to(device)
        q1[j] = self.critic_network(torch.transpose(state[j].to(device), 0, 1),
                                    action[j].view(1, 1).to(device))
        q = q + q1
    loss_critic = F.mse_loss(q, q1_target)
    self.critic_optimizer.zero_grad()
    loss_critic.backward(retain_graph=True)
    self.critic_optimizer.step()

    b = torch.zeros(len(reward)).to(device)
    for j, _ in enumerate(reward):
        b1 = torch.zeros(len(reward)).to(device)
        b1[j] = self.critic_network(torch.transpose(state[j].to(device), 0, 1),
                                    self.actor_network(torch.transpose(state[j].to(device), 0, 1)).view(1, 1))
        b = b + b1
        # b[j] = self.critic_network(torch.transpose(state[j].to(device), 0, 1), self.actor_network(torch.transpose(state[j].to(device), 0, 1)).view(1, 1))
    loss_actor = -torch.mean(b)
    self.actor_optimizer.zero_grad()
    loss_actor.backward(retain_graph=True)
    self.actor_optimizer.step()
I have already changed a few places where I thought the problem might be, which is why the code looks a bit prolix.