I am trying to use my DeepQLearning and DQN classes, but there is a problem with the Q-value calculation.
The agent acts in an environment whose state is a single integer (0, 1, 2, 3, ...). When I calculate the q_values from that state, I get this error: "RuntimeError: mat1 and mat2 shapes cannot be multiplied".
My class code is below; inside the greedy method, state_tensor looks like tensor([2.]) (for example).
I know the problem is in the forward method of the DQN class because of the matrix sizes, but I don't know what to change. Please help me, I have been stuck on this for a while.
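To show what I mean, here is a minimal reproduction of the mismatch (states_dim = 5 is just an example value I picked):

import torch
import torch.nn as nn

states_dim = 5                       # example: suppose there are 5 discrete states
fc1 = nn.Linear(states_dim, 64)      # fc1 expects inputs whose last dimension is 5

state = 2                            # the environment gives the state as a plain int
state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
print(state_tensor.shape)            # torch.Size([1]) -> does not match in_features=5

q_values = fc1(state_tensor)         # this line raises the shape error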
import random
from collections import deque

import torch
import torch.nn as nn
import torch.nn.functional as F

class DeepQLearning(Agent):  # Agent is my own base class, defined elsewhere
    def __init__(self, states_dim, actions_dim, discount=1, update_rate=0.001, eps_init=0.8, eps_lower_bound=0.1, anneal=0.001, buffer_size=1000, device='cpu'):
        super().__init__(states_dim, actions_dim, discount,
                         update_rate, eps_init, eps_lower_bound, anneal)
        ###
        self.buffer_size = buffer_size
        self.replay_buffer = deque(maxlen=buffer_size)
        # Create the policy network and the target network
        self.policy_net = DQN(states_dim, actions_dim)
        self.target_net = DQN(states_dim, actions_dim)
        # Set the device
        self.device = device
        self.policy_net.to(device)
        self.target_net.to(device)
        # Define the optimizer
        self.optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=update_rate)
    def greedy(self, state):
        # Convert the state to a tensor
        state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        # Pass the state through the policy network to get the Q-values
        q_values = self.policy_net(state_tensor)
        # Select the action with the highest Q-value
        action = q_values.argmax(dim=1).item()
        return action
    def update_q(self, batch_size):
        # Check if the replay buffer contains enough transitions
        if len(self.replay_buffer) < batch_size:
            return
        # Sample a batch of transitions from the replay buffer
        batch = random.sample(self.replay_buffer, batch_size)
        states, actions, rewards, next_states = zip(*batch)
        # Convert the states, actions, rewards, and next states to tensors
        states_tensor = torch.tensor(states, dtype=torch.float32)
        actions_tensor = torch.tensor(actions, dtype=torch.long)
        rewards_tensor = torch.tensor(rewards, dtype=torch.float32)
        next_states_tensor = torch.tensor(next_states, dtype=torch.float32)
        # Calculate the Q-values for the current states and the next states
        q_values = self.policy_net(states_tensor).gather(1, actions_tensor.unsqueeze(1)).squeeze(1)
        next_q_values = self.target_net(next_states_tensor).max(dim=1).values.detach()
        # Calculate the target Q-values
        target_q_values = rewards_tensor + self.discount * next_q_values
        # Calculate the loss
        loss = F.mse_loss(q_values, target_q_values)
        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
    ###
class DQN(nn.Module):
    def __init__(self, input_size, output_size):
        super(DQN, self).__init__()
        ###
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, output_size)
        ###

    def forward(self, x):
        ###
        # fc1 expects x to have last dimension == input_size
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
        ###
I don't know whether I should change the input size of the network or the way the state is fed in.
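The only idea I have so far (I am not sure it is the right approach) is to one-hot encode the integer state so the vector length matches states_dim before it goes into the network, roughly like this (encode_state is just a helper name I made up):

import torch
import torch.nn.functional as F

def encode_state(state, states_dim):
    # turn the integer state into a one-hot float vector of length states_dim
    return F.one_hot(torch.tensor(state), num_classes=states_dim).float()

state_tensor = encode_state(2, 5).unsqueeze(0)   # shape: (1, 5)

Would that be the right fix, or should fc1 be nn.Linear(1, 64) so the raw integer can be fed in directly?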