I am trying to implement a DeepQLearning class and a DQN class, but I am running into a problem with the Q-value calculation.

The agent takes an action in a state, and each state is represented by an integer, for example 0, 1, 2, 3, ... I need to compute the q_values from the state, but I get an error: "RuntimeError: mat1 and mat2 shapes cannot be multiplied".

My class code is below. In the greedy method, state_tensor looks like tensor([2.]) (for example).

I know the problem is in the forward method of the DQN class because of the matrix sizes, but I don't know what to change. Please help me; I've been stuck on this issue for a while.
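To illustrate, here is a minimal snippet that reproduces the same kind of error outside my classes (states_dim = 5 is just an example value for illustration, not my real setting):

import torch
import torch.nn as nn

states_dim = 5                       # example value; my real states_dim is also > 1
fc1 = nn.Linear(states_dim, 64)      # this layer expects inputs with states_dim features

state = 2                            # states from the environment are plain integers
state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
print(state_tensor)                  # tensor([2.]) -- only 1 feature, not states_dim

q_values = fc1(state_tensor)         # raises the shape-mismatch RuntimeError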

import random
from collections import deque

import torch
import torch.nn as nn
import torch.nn.functional as F


# Agent is my own base class, defined elsewhere.
class DeepQLearning(Agent):
    def __init__(self, states_dim, actions_dim, discount=1, update_rate=0.001, eps_init=0.8, eps_lower_bound=0.1, anneal=0.001, buffer_size=1000, device='cpu'):
        super().__init__(states_dim, actions_dim, discount,
                         update_rate, eps_init, eps_lower_bound, anneal)
        
    ###
        self.buffer_size = buffer_size
        self.replay_buffer = deque(maxlen=buffer_size)

        # Create the policy network
        self.policy_net = DQN(states_dim, actions_dim)
        self.target_net = DQN(states_dim, actions_dim)

        # Set the device
        self.device = device
        self.policy_net.to(device)
        self.target_net.to(device)

        # Define the optimizer
        self.optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=update_rate)

    def greedy(self, state):
        # Convert the state to a tensor
        state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        # Pass the state through the policy network to get the Q-values
        q_values = self.policy_net(state_tensor)
        # Select the action with the highest Q-value
        action = q_values.argmax(dim=1).item()
        return action

    def update_q(self, batch_size):
        # Check if the replay buffer contains enough transitions
        if len(self.replay_buffer) < batch_size:
            return

        # Sample a batch of transitions from the replay buffer
        # (the buffer is a deque, so draw a random batch and unpack the transitions)
        batch = random.sample(self.replay_buffer, batch_size)
        states, actions, rewards, next_states = zip(*batch)

        # Convert the states, actions, rewards, and next states to tensors
        states_tensor = torch.tensor(states, dtype=torch.float32)
        actions_tensor = torch.tensor(actions, dtype=torch.long)
        rewards_tensor = torch.tensor(rewards, dtype=torch.float32)
        next_states_tensor = torch.tensor(next_states, dtype=torch.float32)

        # Calculate the Q-values for the current states and the next states
        q_values = self.policy_net(states_tensor).gather(1, actions_tensor.unsqueeze(1)).squeeze(1)
        next_q_values = self.target_net(next_states_tensor).max(dim=1).values.detach()

        # Calculate the target Q-values
        target_q_values = rewards_tensor + self.discount * next_q_values

        # Calculate the loss
        loss = F.mse_loss(q_values, target_q_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    ###        


class DQN(nn.Module):
    def __init__(self, input_size, output_size):
        super(DQN, self).__init__()
        ###

        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, output_size)

        ###
        
    def forward(self, x):
        ###
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

        ###

I don't know how to fix the input size or the input data...
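For context, this is roughly how the agent is used (the numbers and names here are placeholders, not my real training code):

agent = DeepQLearning(states_dim=5, actions_dim=2)   # example sizes, just for illustration
state = 0                                            # the environment gives integer states: 0, 1, 2, 3, ...
action = agent.greedy(state)                         # this call fails with the RuntimeError above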
