
I want to train a deep Q-learning agent to solve a Rubik's cube given 10 possible moves (I am implementing this for an Arduino project and don't have access to every side of the cube, which is why I only allow 10 moves). I have trained this model over the last few days with almost no improvement, and I want to know whether there is something wrong with my update_policy_network function or with how I am calculating the q_values in my DQN. (I have also included a quick shape check of the network's two forward paths after the code.) This is my code:

import torch.nn as nn
from function import shuffle, newTurn, checkSolved, checkAlign
import numpy as np
import random
from collections import namedtuple
import torch.optim as optim
import torch.nn.functional as F
import math
import torch

#Format of experiences to insert into our replaybuffer
Experience = namedtuple('Experience',['state','action','reward','next_state','done'])

#Stores and dispenses batches of experience to train our model on
class ReplayBuffer(object):
    def __init__(self,capacity, inputSize):
        self.capacity = capacity
        self.buffer = []
        self.position = 0
        self.inputSize = inputSize
    
    def addExperience(self,experience):
        #Insert the experience, overwriting the oldest one once the buffer is full
        if len(self.buffer) < self.capacity:
            self.buffer.append(experience)
        else:
            self.buffer[self.position] = experience
        self.position = (self.position + 1) % self.capacity
    
    def sampleBatch(self,batch_size):
        batch = random.sample(self.buffer,batch_size)
        return batch
    
    def __len__(self):
        return len(self.buffer)


class Environment():
    def __init__(self):
        self.state = shuffle()
        self.oneHotEncoding = np.eye(6)

    def shuffle(self):
        self.state = shuffle()

    def encode(self):
        return torch.tensor(self.state)
    
    def checkGoal(self):
        global maxGoal
        h = checkAlign(self.state)
        if h > maxGoal:
            maxGoal = h
            print(f"{maxGoal} GOAL")
            return h

    def turn(self,action):
        reward=-1
        done=False
        self.state=newTurn(self.state,action)
        h = self.checkGoal()
        if h:
            reward+=(30*maxGoal)
        if h==6:
            done=True
            reward+=1000
        return reward, done

class DQN(nn.Module):
    def __init__(self,out,embedding_dim):
        super(DQN, self).__init__()
        self.embedding = nn.Embedding(6*9,embedding_dim)
        self.convolutional_layers = nn.Sequential(
            nn.Conv2d(in_channels=9, out_channels=32, kernel_size=3), 
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3),
            nn.ReLU(),
        )

        self.feature_layers = nn.Sequential(
            nn.Linear(640, 256),
            nn.ReLU(),
            nn.Linear(256, out),
            nn.ReLU()
        )

    def forward(self, x):
        x=x.long()
        embedded = self.embedding(x)
        if len(embedded.size())==4:
            embedded = embedded.view(x.size(0),9,6,-1)
            x = self.convolutional_layers(embedded)
            x = x.view(x.size(0),10,640)
            x = self.feature_layers(x)
            x = x.transpose(1,2).sum(dim=2)
        else:
            embedded = embedded.view(x.size(0),9,6,-1)
            x = self.convolutional_layers(embedded)
            x = x.view(x.size(0),-1)
            x = self.feature_layers(x)
            x = x.t().sum()
        return x

def randomTurn():
    global cube, totalReward, done
    cubeState = cube.encode().to('cuda')
    action = random.randint(0,9)
    reward,done = cube.turn(action)
    totalReward+=reward
    newState = cube.encode().to('cuda')
    newExperience = Experience(cubeState,action,reward,newState,done)
    return newExperience

def makeAMove():
    global stepsDone, totalReward, done, cube
    sample=random.random()
    stepsDone+=1
    state = cube.encode().to('cuda')
    if sample > epsilon:
        #Exploitation
        with torch.no_grad():
            pns=policy_net(state)
            move=pns.argmax().item()
            cubeState = cube.encode().to('cuda')
            reward,done = cube.turn(move)
            newState = cube.encode().to('cuda')
            totalReward+=reward
            newExperience = Experience(cubeState,move,reward,newState,done)
            return newExperience
    else:
        #Exploration
        return randomTurn()

def update_policy_network(policy_net, target_net, optimizer, batch, gamma):
    states = torch.stack([experience.state for experience in batch]).to('cuda')
    actions = torch.tensor([experience.action for experience in batch], dtype=torch.long).to('cuda')
    rewards = torch.tensor([experience.reward for experience in batch], dtype=torch.float32).to('cuda')
    next_states = torch.stack([experience.next_state for experience in batch]).to('cuda')
    dones = torch.tensor([experience.done for experience in batch], dtype=torch.float32).to('cuda')

    # Compute Q-values for current states using the policy network
    q_values = policy_net(states)
    q_values = q_values.gather(1,actions.unsqueeze(1)).squeeze()

    # Compute target Q-values using the target network
    with torch.no_grad():
        next_q_values = target_net(next_states).max(1)[0]
        target_q_values = rewards + gamma * next_q_values * (1 - dones)

    # Calculate the loss and perform gradient descent
    loss = nn.functional.smooth_l1_loss(q_values, target_q_values)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

if __name__ == "__main__":
    #Training Phase
    cube = Environment()
    replay_buffer = ReplayBuffer(capacity=2000,inputSize=(6,9))
    batchsize = 128
    gamma = 0.99
    epsilon=0.1
    TAU = 0.005
    LR = 1e-4
    episodeLength=150
    numEps=5000
    target_network_update_interval = 200
    numFinish = 0
    maxReward=-500

    policy_net = DQN(10,54).to('cuda')
    target_net = DQN(10,54).to('cuda')
    policy_net.load_state_dict(torch.load('model.pth'))
    target_net.load_state_dict(torch.load('target.pth'))
    #target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True)
    episodeDuration = []
    for z in range(5):
        numFinish = 0
        maxReward = -500
        for episode in range(numEps):
            stepsDone=0
            totalReward=0
            cube.shuffle()
            done = False
            maxGoal = 0

            for step in range(episodeLength):
                ep=makeAMove()
                replay_buffer.addExperience(ep)

                if len(replay_buffer)>=batchsize:
                    batch = replay_buffer.sampleBatch(batchsize)
                    update_policy_network(policy_net,target_net,optimizer,batch,gamma)
                
                if done:
                    numFinish+=1
                    break

            #Sync the target network once every target_network_update_interval episodes
            if episode % target_network_update_interval == 0:
                target_net.load_state_dict(policy_net.state_dict())
                target_net.eval()
            if totalReward > maxReward:
                maxReward = totalReward
        
        torch.save(policy_net.state_dict(), "model.pth")
        torch.save(target_net.state_dict(),"target.pth")
        print("Finished training and saved model")

I tried a one-hot encoding approach before switching to an embedding; I read that embeddings may work better for this type of environment (a rough sketch of the one-hot encoding is below for reference). I have also tried different values for batch_size, the target network update interval, the reward values, etc., with little to no difference in performance. If my code looks sketchy, it's probably because I asked ChatGPT to help me write it.
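
For reference, a one-hot version of the state encoding would look roughly like this, assuming shuffle() returns a 6x9 array of colour indices 0-5 (the exact layout of my state may differ):

def encodeOneHot(state):
    #One-hot each of the 54 stickers (colour indices 0-5) and flatten to a (324,) float vector
    onehot = np.eye(6)[np.asarray(state).reshape(-1)]   #(54, 6)
    return torch.tensor(onehot, dtype=torch.float32).flatten()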
