I have two agents, a and b, that use the same network structure, and I use a replay buffer to update each network's parameters. If I update a and then update b, I get the error: "one of the variables needed for gradient computation has been modified by an inplace operation". I have tried to find the in-place operation, but I can't.

    while total_timesteps < episodes:
        total_timesteps = total_timesteps + 1
        episodes_reward = []
        state = env.reset()
        state.to(device)
        for j in range(max_steps):
            action = torch.zeros(env.num_agents, env.dim_world)
            action[0] = policy_agent1.select_action(state)
            action[1] = policy_agent2.select_action(state)
            action_agent0 = action[0]
            action_agent1 = action[1]
            next_state, reward, done = env.step(action)
            next_state.to(device)
            action.to(device)
            reward.to(device)
            reward_agent = reward[0]
            reward_agent1 = reward[1]
            replay_buffer1.add((state, next_state, action_agent0, reward_agent, done))
            replay_buffer2.add((state, next_state, action_agent1, reward_agent1, done))
            # print(next_state, reward, action)
            state = next_state
            if done:
                continue
        policy_agent1.train(replay_buffer1.sample(batch_size), gamma)
        policy_agent2.train(replay_buffer2.sample(batch_size), gamma)

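For context, a minimal replay buffer with the same add/sample interface as used above would look roughly like this (a simplified sketch, not necessarily my exact implementation):

    import random
    from collections import deque


    class ReplayBuffer:
        """Minimal sketch of a buffer with the add/sample interface used above."""

        def __init__(self, max_size=100000):
            self.storage = deque(maxlen=max_size)

        def add(self, transition):
            # transition = (state, next_state, action, reward, done)
            self.storage.append(transition)

        def sample(self, batch_size):
            # returns five lists: states, next_states, actions, rewards, dones,
            # which train() then unpacks and iterates over with enumerate()
            batch = random.sample(self.storage, batch_size)
            states, next_states, actions, rewards, dones = zip(*batch)
            return list(states), list(next_states), list(actions), list(rewards), list(dones)
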
And here is the train method:

    def train(self, replay_buffer, gamma):
        state, next_state, action, reward, done = replay_buffer
        q = torch.zeros(len(reward)).to(device)
        q_ = torch.zeros(len(reward)).to(device)
        q_target = torch.zeros(len(reward)).to(device)
        done = torch.Tensor(done).to(device)

        for j, r in enumerate(reward):
            q1_target = torch.zeros(len(reward)).to(device)
            q_[j] = self.critic_network(torch.transpose(next_state[j].to(device), 0, 1), self.actor_network(torch.transpose(next_state[j].to(device), 0, 1)).view(1, 1))
            q_target[j] = r.to(device) + (done[j] * gamma * q_[j])
            q1_target = q1_target + q_target
            # q_target[j] = r.to(device) + (done[j] * gamma * q_[j]).detach().clone()
            q1 = torch.zeros(len(reward)).to(device)
            q1[j] = self.critic_network(torch.transpose(state[j].to(device), 0, 1), action[j].view(1, 1).to(device))
            q = q + q1
        loss_critic = F.mse_loss(q, q1_target)
        self.critic_optimizer.zero_grad()
        loss_critic.backward(retain_graph=True)
        self.critic_optimizer.step()

        b = torch.zeros(len(reward)).to(device)
        for j, _ in enumerate(reward):
            b1 = torch.zeros(len(reward)).to(device)
            b1[j] = self.critic_network(torch.transpose(state[j].to(device), 0, 1), self.actor_network(torch.transpose(state[j].to(device), 0, 1)).view(1, 1))
            b = b + b1
            # b[j] = self.critic_network(torch.transpose(state[j].to(device), 0, 1), self.actor_network(torch.transpose(state[j].to(device), 0, 1)).view(1, 1))
        loss_actor = -torch.mean(b)
        self.actor_optimizer.zero_grad()
        loss_actor.backward(retain_graph=True)
        self.actor_optimizer.step()

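In case the per-sample indexing above obscures it, this is what train() is meant to compute, written in batched form with small placeholder networks (hypothetical stand-ins for my actor/critic; the done * gamma factor is kept exactly as in the loop):

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    # placeholder sizes and networks, only to spell out the intended losses
    state_dim, batch_size, gamma = 4, 8, 0.99
    actor = nn.Linear(state_dim, 1)       # stands in for self.actor_network
    critic = nn.Linear(state_dim + 1, 1)  # stands in for self.critic_network

    state = torch.randn(batch_size, state_dim)
    next_state = torch.randn(batch_size, state_dim)
    action = torch.randn(batch_size, 1)
    reward = torch.randn(batch_size, 1)
    done = torch.randint(0, 2, (batch_size, 1)).float()

    q_next = critic(torch.cat([next_state, actor(next_state)], dim=1))  # Q(s', mu(s'))
    q_target = reward + done * gamma * q_next                           # TD target, as in the loop
    loss_critic = F.mse_loss(critic(torch.cat([state, action], dim=1)), q_target)
    loss_actor = -critic(torch.cat([state, actor(state)], dim=1)).mean()
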
I have also changed a few places where I thought the problem might be, so the code may look a bit verbose.
