import tensorflow as tf
import keras
import numpy as np
import gym
import random
from keras.layers import *
model = keras.models.Sequential()
model.add(Dense(12,activation = 'tanh',input_shape = (4,)))
model.add(Dense(2,activation = 'linear'))
model.compile(optimizer = 'adam',loss = 'MSE',metrics = ['accuracy'])
env = gym.make("CartPole-v1")
def preprocessing(state):
return np.reshape(state,(1,4))
replay_mem = []
replay_size = 32
reward_list = []
local_mean = list()
for episode in range(2000):
done = False
state = env.reset()
count = 0
reward_tot = 0
model.save("cartpole-dqn.h5")
while not done:
count += 1
e = (1/(episode/200+1))
#epslion greedy search
Q = model.predict(preprocessing(state))
if e>np.random.rand(1):
action = env.action_space.sample()
else:
action = np.argmax(Q)
#take_action and set reward
state_next, reward, done, info = env.step(action)
if done:
reward = - 100
#replay_mem
replay_mem.append([state,action,reward,state_next,done])
if len(replay_mem)>2048:
del replay_mem[0]
state = state_next
reward_tot += reward
state = state_next
Q_list = []
state_list = []
#set this_replay_Size
if len(replay_mem)<replay_size:
this_replay_size = len(replay_mem)
else:
this_replay_size = replay_size
#sample random batch from replay_memory
for sample in random.sample(replay_mem,this_replay_size):
state_m,action_m,reward_m,state_next_m,done_m = sample
if done:
Q[0,action] = reward_m
else:
Q_new = model.predict(preprocessing(state_next_m))
Q[0,action] = reward_m + 0.97*np.max(Q_new)
Q_list.append(Q)
state_list.append(state_m)
#convert to nupmy array and train
Q_list = np.array(Q_list)
state_list = np.array(state_list)
hist = model.fit(state_list,Q_list,epochs = 5,verbose = 0)
#print("Done :",done," Reward :",reward," Reward_total :",reward_tot)
local_mean.append(reward_tot)
reward_list.append(reward_tot)
if episode%10 == 0:
print("Episode :",episode+1," Reward_total :", reward_tot," Reward_local_mean :",np.mean(local_mean))
local_mean = list()
print("*******End Learning")
this is my full code of my model
i implement dqn algorithm
what's wrong with my code? i've train this model abbout 1000episode but there no progress
what should i change? should i train more episode? or is there anything wrong with my implementation? i've been working on this project soo long help me please