I need help with the Q-value method. I have most of the code working, but the computed Q values are not right. I know that if an arrow points toward an area where there is no state it needs to bounce back, and if it points into the gray box it is also supposed to bounce back. I would like some guidance.
I know the Q equation is supposed to look like this:

Q(s, a) = R(s, a) + γ · Σ_s' T(s, a, s') · V(s')

where R(s, a) is the action reward, γ the discount factor, T(s, a, s') the transition probability, and V(s') the value of the resulting state.
Here is my code:
from cell import states
import pygame
import drawfn
ACTION_EAST=0
ACTION_SOUTH=1
ACTION_WEST=2
ACTION_NORTH=3
TRANSITION_SUCCEED=0.8 #The probability that taking action A moves the agent to the intended destination state S'.
TRANSITION_FAIL=0.2 #The probability that action A slips to an unintended neighboring state. For example, taking action East may move the agent North or South instead; the two perpendicular directions evenly split TRANSITION_FAIL, so each has probability 0.1.
GAMMA=0.9 #The discount factor
ACTION_REWARD=-0.1 #The instantaneous reward for taking each action (we assume all four actions (N/E/W/S) have the same reward)
CONVERGENCE=0.0000001 #The convergence threshold that determines when to stop iterating
cur_convergence=100
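#A worked example of the transition model above: taking East from state s,
#  Q(s, East) = ACTION_REWARD + GAMMA * (0.8 * V(east neighbor)
#                                        + 0.1 * V(north neighbor)
#                                        + 0.1 * V(south neighbor))
#where a neighbor that is off the grid or the gray box should contribute V(s) instead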
#####Implement the below functions ############################
#make sure the move bounces back if the arrow points off the grid (no state there) or into the gray box
def computeQValue(s,action):
    #s is the state (cell) to update; action is 0-east, 1-south, 2-west, 3-north
    #updates s.q_values[action] in place; does not return anything
    #valueIteration() already visits every cell and every action, so this
    #function only needs to handle the single (s, action) pair it is given
    #(x, y) offset of the intended destination for each action
    transitions = {0:(1,0), 1:(0,-1), 2:(-1,0), 3:(0,1)}
    #this should be the value of the destination state, not of s itself;
    #looking up the destination (with bounce-back) is the part I am stuck on
    next_state_value = s.state_value
    if action == ACTION_EAST:
        #intended move east (0.8); the slip terms below currently read this
        #cell's own q_values for south and north rather than neighbor values
        s.q_values[0] = ACTION_REWARD + GAMMA * (TRANSITION_SUCCEED * next_state_value + TRANSITION_FAIL * (s.q_values[1] + s.q_values[3]) / 2)
    elif action == ACTION_SOUTH:
        s.q_values[1] = ACTION_REWARD + GAMMA * (TRANSITION_SUCCEED * next_state_value + TRANSITION_FAIL * (s.q_values[2] + s.q_values[0]) / 2)
    elif action == ACTION_WEST:
        s.q_values[2] = ACTION_REWARD + GAMMA * (TRANSITION_SUCCEED * next_state_value + TRANSITION_FAIL * (s.q_values[1] + s.q_values[3]) / 2)
    else: #ACTION_NORTH
        s.q_values[3] = ACTION_REWARD + GAMMA * (TRANSITION_SUCCEED * next_state_value + TRANSITION_FAIL * (s.q_values[0] + s.q_values[2]) / 2)
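#Here is my sketch of the destination lookup with bounce-back. destinationValue
#is a helper I made up (not part of the starter code); it assumes each cell in
#states has a location (x, y) tuple and a state_value, and that the gray
#box sits at (1,1) as in valueIteration below.
def destinationValue(s, direction):
    dx, dy = {0:(1,0), 1:(0,-1), 2:(-1,0), 3:(0,1)}[direction]
    target = (s.location[0] + dx, s.location[1] + dy)
    for row in states:
        for cell in row:
            if cell.location == target and cell.location != (1,1):
                return cell.state_value #normal move into the neighboring cell
    return s.state_value #off the grid or into the gray box: bounce back to s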
def valueIteration():
    print('Value Iteration.')
    #called in a loop; performs one sweep of value iteration over all cells
    #uses computeQValue to refresh q_values, then updates each state value
    #ideally the policy should be obtained in fewer than 100 iterations
    #the gray box (1,1) and the states at (3,0) and (3,1) keep their values
    global cur_convergence
    if cur_convergence >= CONVERGENCE:
        for r in states:
            for s in r:
                for i in range(4):
                    computeQValue(s, i)
        cur_convergence = 0
        for r in states:
            for s in r:
                if s.location == (1,1) or s.location == (3,0) or s.location == (3,1):
                    continue
                old_value = s.state_value
                s.state_value = max(s.q_values)
                #track the largest value change in this sweep for the stop test
                cur_convergence = max(cur_convergence, abs(s.state_value - old_value))
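Based on the equation above, I think the corrected computeQValue should look something like this, using the destinationValue helper sketched after computeQValue (the helper name and the perpendicular-index trick are my own, not part of the assignment):

def computeQValue(s, action):
    #with the 0-E, 1-S, 2-W, 3-N ordering, (action+1) % 4 and (action+3) % 4
    #are always the two perpendicular (slip) directions
    side1 = (action + 1) % 4
    side2 = (action + 3) % 4
    s.q_values[action] = ACTION_REWARD + GAMMA * (
        TRANSITION_SUCCEED * destinationValue(s, action)
        + (TRANSITION_FAIL / 2) * destinationValue(s, side1)
        + (TRANSITION_FAIL / 2) * destinationValue(s, side2))

Each term then uses the state value of the cell the move would actually land in, and destinationValue falls back to s's own value when the move would leave the grid or hit the gray box, which is the bounce-back behavior I described. Does this look right?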