
I am using Q-learning, and the program should be able to play the game after some tries, but it is not learning even when the epsilon value is 0.1.

I have tried changing the batch size and the memory size. I have also changed the code to give a -1 reward when the player dies.

import gym
import random

import keyboard
import numpy as np
import tensorflow as tf


env = gym.make("Breakout-ram-v4")
observationSpace = env.observation_space
actionSpace = env.action_space
episode = 500

class Model_QNN:
    def __init__(self):
        self.memory = []
        self.MAX_MEMORY_TO_USE = 60_000
        self.gamma = 0.9
        self.model = tf.keras.Sequential([
            tf.keras.layers.Flatten(input_shape=(128, 1)),
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(actionSpace.n, activation="softmax")
        ])
        self.model.compile(optimizer="adam", loss="mse", metrics=["accuracy"])

    def remember(self, steps, done):
        # steps = [state, action, reward, next_state]
        self.memory.append([steps, done])
        if len(self.memory) >= self.MAX_MEMORY_TO_USE:
            del self.memory[0]

    def replay(self, batch_size=32):
        if len(self.memory) < batch_size:
            return
        mini = random.sample(self.memory, batch_size)
        states, targets = [], []
        for steps, done in mini:
            state, action, reward, nextState = steps
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(self.model.predict(nextState.reshape(1, 128, 1))[0])
            target_f = self.model.predict(state.reshape(1, 128, 1))
            target_f[0][action] = target
            states.append(state)
            targets.append(target_f[0])
        self.model.fit(np.array(states).reshape(len(states), 128, 1), np.array(targets), verbose=0, epochs=10)

    def act(self, state, ep):
        # Epsilon-greedy: explore with probability ep, otherwise take the greedy action
        if random.random() < ep:
            action = actionSpace.sample()
        else:
            qValues = self.model.predict(state.reshape(1, 128, 1))
            action = np.argmax(qValues)
        return action

    def saveModel(self):
        print("Saving")
        self.model.save("NEWNAMEDONE")

    def saveBackup(self, num):
        self.model.save("NEWNAME" + str(int(num)))

def main():
    agent = Model_QNN()
    epsilon = 0.9
    score = 0
    for e in range(2000):
        print("Working on episode : " + str(e) + " eps " + str(epsilon) + " Score  " + str(score))
        preState = env.reset()
        preState, reward, done, _ = env.step(1)  # action 1 (FIRE) launches the ball
        mainLife = 5
        done = False
        score = 0
        icount = 0
        render = False
        if e % 400 == 0 and e != 0:
            render = True
        while not done:
            icount += 1
            if render:
                env.render()
            if keyboard.is_pressed('q'):
                agent.saveBackup(100)
                agent.saveModel()
                quit()
            rewrd = 0
            if _["ale.lives"] < mainLife:
                # A life was lost: force FIRE to relaunch the ball and mark a -1 reward
                mainLife -= 1
                rewrd = -1
                action = 1
            else:
                action = agent.act(preState, epsilon)
            newState, reward, done, _ = env.step(action)
            if rewrd == -1:
                reward = -1
            agent.remember([preState / 255, action, reward, newState / 255], done)
            preState = newState
            score += reward
            if done:
                break
        agent.replay(1024)
        if epsilon >= 0.18:
            epsilon = epsilon * 0.995
        if (e + 1) % 500 == 0:
            agent.saveBackup((e + 1) / 20)
    agent.saveModel()


if __name__=='__main__':
    main()

There is no error message; the program should learn, but it is not.


1 Answer

Why are you using softmax on your output layer? If you want to use softmax, you should use cross-entropy as your loss. However, it looks like you're trying to implement a value-based learning system, so the activation on your output layer should be linear.
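With a softmax output, the network's outputs are forced to sum to 1, so it can never match the Q-value targets you write into target_f, which is one reason it appears not to learn. As a rough sketch (layer sizes copied from your code, so treat them as placeholders), the model definition with a linear Q-value head might look like this:

import tensorflow as tf

# Sketch of a Q-network head for value-based learning: the final Dense layer
# has no activation (linear), so each output is an unbounded Q-value estimate
# for one action rather than a probability.
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(128, 1)),
    tf.keras.layers.Dense(256, activation="relu"),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(actionSpace.n)  # linear output: raw Q-values
])
model.compile(optimizer="adam", loss="mse")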

I suggest you try your implementation on CartPole-v0 and then LunarLander-v2 first. Those are solved environments and a great place to sanity-check your code.
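For example, a minimal smoke test along these lines (random actions only, as a stand-in for your agent) confirms the environment wiring; note that CartPole-v0 observations have shape (4,), not (128,), so the network's input_shape would need to change accordingly:

import gym

env = gym.make("CartPole-v0")
state = env.reset()
done = False
total_reward = 0
while not done:
    action = env.action_space.sample()  # swap in agent.act(state, epsilon) once the agent is adapted
    state, reward, done, info = env.step(action)
    total_reward += reward
print("Episode reward:", total_reward)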

"There is no error message the program should learn and it is not." Welcome to ML where things fail silently.
