I am currently trying to learn the concepts of reinforcement learning. As an exercise, I implemented the SARSA algorithm for the CartPole environment using TensorFlow. I compared my implementation to algorithms that use a linear approximation of the Q-value function, and to me they look very similar. Unfortunately, my implementation seems to be wrong or inefficient, because the learning success is rather limited. Can anyone tell me whether I am doing something wrong, and if so, what it is?
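For reference, the update I am trying to implement is the usual one-step SARSA target: for a transition (s, a, r, s', a') the target is y = r + discount * Q(s', a'), or just y = r on the terminal step, and I take one gradient step on the squared error (y - Q(s, a))^2. The code of my implementation is: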
```
import tensorflow as tf
import numpy as np
import matplotlib.pylab as plt
import random
import gym

# define a neural network which returns two action-dependent Q-values given a state
neural_net = tf.keras.Sequential([
    tf.keras.layers.Dense(10, activation='relu', input_shape=[4]),
    tf.keras.layers.Dense(2)
])
# return the neural network's Q-value for a specific action
def q_value(state, action):
    return neural_net(tf.convert_to_tensor([state]))[0, action]
# act either randomly or choose the action which maximizes the Q-value
def policy(state, epsilon):
    values = neural_net(tf.convert_to_tensor([state]))
    if np.random.rand() < epsilon:
        return random.choice([0, 1])
    else:
        return np.argmax(values)
# initialize gym environment
env = gym.make('CartPole-v0')

# hyperparameters
discount = 0.99
optimizer = tf.keras.optimizers.Adam()
episodes = 1000
epsilon = 0.30

# collect the reward for each episode
rewards = []
for episode in range(episodes):
    # start a new trajectory for this episode
    state = env.reset()
    # record the reward collected during the episode
    sum_returns = 0
    # decrease the exploration rate after the first 100 episodes
    if episode == 100:
        epsilon = 0.10
    # SARSA updates along the trajectory
    while True:
        action = policy(state, epsilon)
        next_state, reward, done, _ = env.step(action)
        next_action = policy(next_state, epsilon)
        sum_returns += 1
        if done:
            # terminal step: the target is just the immediate reward
            with tf.GradientTape() as tape:
                tape.watch(neural_net.trainable_variables)
                q_hat = q_value(state, action)
                y = reward
                loss = tf.square(y - q_hat)
            gradients = tape.gradient(loss, neural_net.trainable_variables)
            optimizer.apply_gradients(zip(gradients, neural_net.trainable_variables))
            break
        else:
            # non-terminal step: bootstrap from the Q-value of the next state-action pair
            with tf.GradientTape() as tape:
                tape.watch(neural_net.trainable_variables)
                q_hat = q_value(state, action)
                y = reward + discount * q_value(next_state, next_action)
                loss = tf.square(y - q_hat)
            gradients = tape.gradient(loss, neural_net.trainable_variables)
            optimizer.apply_gradients(zip(gradients, neural_net.trainable_variables))
            state = next_state
    rewards.append(sum_returns)
# plot learning over time
plt.plot([episode for episode in range(episodes)], rewards)
plt.show()
```
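For context, the linear-approximation versions I compared against compute their Q-values roughly like this (a minimal sketch of the general form I mean, not the exact code I compared to; `weights` and `linear_q_value` are just illustrative names):

```
import numpy as np

# one weight vector per action: Q(s, a) is a linear function of the raw 4-dimensional state
weights = np.zeros((2, 4))

def linear_q_value(state, action):
    return np.dot(weights[action], state)
```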