Using this code:
import gym
import numpy as np
import time
"""
SARSA on policy learning python implementation.
This is a python implementation of the SARSA algorithm in the Sutton and Barto's book on
RL. It's called SARSA because - (state, action, reward, state, action). The only difference
between SARSA and Qlearning is that SARSA takes the next action based on the current policy
while qlearning takes the action with maximum utility of next state.
Using the simplest gym environment for brevity: https://gym.openai.com/envs/FrozenLake-v0/
"""
def init_q(s, a, type="ones"):
    """
    @param s the number of states
    @param a the number of actions
    @param type "random", "ones" or "zeros" for the initialization
    """
    if type == "ones":
        return np.ones((s, a))
    elif type == "random":
        return np.random.random((s, a))
    elif type == "zeros":
        return np.zeros((s, a))
def epsilon_greedy(Q, epsilon, n_actions, s, train=False):
    """
    @param Q Q values, state x action -> value
    @param epsilon exploration parameter (here the probability of acting greedily)
    @param n_actions number of actions
    @param s the current state
    @param train if true then no random actions are selected
    """
    # Note: as written, epsilon is the probability of taking the greedy action,
    # so epsilon = 0.9 means 90% exploitation and 10% random exploration.
    if train or np.random.rand() < epsilon:
        action = np.argmax(Q[s, :])
    else:
        action = np.random.randint(0, n_actions)
    return action
def sarsa(alpha, gamma, epsilon, episodes, max_steps, n_tests, render=True, test=False):
    """
    @param alpha learning rate
    @param gamma discount factor
    @param epsilon for exploration
    @param episodes number of training episodes
    @param max_steps max steps in each episode
    @param n_tests number of test episodes
    @param render render the environment and print progress
    @param test run greedy test episodes after training
    """
    env = gym.make('Taxi-v3')
    n_states, n_actions = env.observation_space.n, env.action_space.n
    Q = init_q(n_states, n_actions, type="ones")
    print('Q shape:', Q.shape)
    timestep_reward = []
    for episode in range(episodes):
        print(f"Episode: {episode}")
        total_reward = 0
        s = env.reset()
        print('s:', s)
        a = epsilon_greedy(Q, epsilon, n_actions, s)
        t = 0
        done = False
        while t < max_steps:
            if render:
                env.render()
            t += 1
            s_, reward, done, info = env.step(a)
            total_reward += reward
            a_ = epsilon_greedy(Q, epsilon, n_actions, s_)
            if done:
                # terminal transition: no bootstrap term
                Q[s, a] += alpha * (reward - Q[s, a])
            else:
                Q[s, a] += alpha * (reward + (gamma * Q[s_, a_]) - Q[s, a])
            s, a = s_, a_
            if done:
                if render:
                    print(f"This episode took {t} timesteps and reward {total_reward}")
                # note: only episodes that finish within max_steps get recorded here
                timestep_reward.append(total_reward)
                break
        # print('Updated Q values:', Q)
    if render:
        print(f"Here are the Q values:\n{Q}\nTesting now:")
    if test:
        test_agent(Q, env, n_tests, n_actions)
    return timestep_reward
def test_agent(Q, env, n_tests, n_actions, delay=0.1):
    for test in range(n_tests):
        print(f"Test #{test}")
        s = env.reset()
        done = False
        epsilon = 0
        total_reward = 0
        while True:
            time.sleep(delay)
            env.render()
            a = epsilon_greedy(Q, epsilon, n_actions, s, train=True)
            print(f"Chose action {a} for state {s}")
            s, reward, done, info = env.step(a)
            total_reward += reward
            if done:
                print(f"Episode reward: {total_reward}")
                time.sleep(1)
                break
if __name__ == "__main__":
    alpha = 0.4
    gamma = 0.999
    epsilon = 0.9
    episodes = 200
    max_steps = 20
    n_tests = 20
    timestep_reward = sarsa(alpha, gamma, epsilon, episodes, max_steps, n_tests)
    print(timestep_reward)
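One note on running it: as written, the __main__ block never reaches test_agent because test defaults to False in sarsa. To see the greedy test episodes, I believe the call just needs the extra flag (a small tweak, not part of the run that produced the output below):

    timestep_reward = sarsa(alpha, gamma, epsilon, episodes, max_steps, n_tests, test=True)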
A sample Q table generated by a run of this code is:
[[ 1. 1. 1. 1. 1. 1. ]
[ 0.5996 0.5996 0.5996 0.35936 0.5996 1. ]
[ 0.19936016 0.35936 0.10336026 0.35936 0.35936 -5.56063984]
...
[ 0.35936 0.5996 0.35936 0.5996 1. 1. ]
[ 1. 0.5996 1. 1. 1. 1. ]
[ 0.35936 0.5996 1. 1. 1. 1. ]]
The columns represent the actions and the rows represent the corresponding states.
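For instance, this is how I read a single cell and pick the greedy action for a state (reusing the Q table produced by the code above, nothing new):

    state = 1
    value_of_action_5 = Q[state, 5]         # row = state, column = action
    greedy_action = np.argmax(Q[state, :])  # action a greedy policy would take in state 1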
Can the state be represented by a vector? The Q table's cells are indexed by a single state number rather than by vectors of size > 1, so how should such states be represented? For example, if I'm in state [2], can this be represented as an n-dimensional vector?
Put another way: if Q[1,3] = 4, can the Q state 1 with action 3 be represented as a vector [1,3,2,12,3]? If so, is the state_number->state_attributes mapping stored in a separate lookup table?
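To make the question concrete, this is the kind of mapping I have in mind (just a sketch: to_vector is a made-up helper name, and I'm assuming Taxi-v3's built-in decode returns the underlying state attributes):

    import gym
    import numpy as np

    env = gym.make('Taxi-v3')
    s = env.reset()

    def to_vector(state_index):
        # hypothetical lookup: integer state index -> attribute vector
        # for Taxi-v3 the attributes are (taxi_row, taxi_col, passenger_location, destination)
        return np.array(list(env.unwrapped.decode(state_index)))

    print(s, to_vector(s))  # e.g. state 328 decodes to [3 1 2 0]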