My teacher gave the following problem:
Consider the following MDP with 3 states and rewards. There are two possible actions, RED and BLUE. The state transition probabilities are given on the edges, and S2 is a terminal state. Assume that the initial policy is: π(S0) = B; π(S1) = R. (The diagram isn't reproduced here, but the transition probabilities and rewards appear in the script below.)
We were asked for which values of γ (0 < γ < 1) the optimal policy would be:
(a) π∗(S0) = R; π∗(S1) = B;
(b) π∗(S0) = B; π∗(S1) = R;
(c) π∗(S0) = R; π∗(S1) = R;
I've shown that for (a) the answer is γ = 0.1, but I couldn't find any γ values for (b) and (c). The teacher said that for (b) any γ > 0.98 would work, and that for (c) γ = 0.5 works. I think he's wrong, so I wrote the following Python script, which follows the policy iteration algorithm from the textbook (Russell and Norvig, AIMA), and indeed for every γ value I try, the only policy I get is (a). However, the teacher says he's not wrong and that my script must be buggy. How can I definitively show that such policies are impossible?
S0 = "S0"
S1 = "S1"
S2 = "S2"
BLUE = "blue"
RED = "red"
gamma = 0.5 # TODO MODIFY GAMMA HERE
# P(s'|s,a)
P_destination_start_action = {
    (S0, S0, BLUE): 0.5, (S0, S0, RED): 0.9,
    (S0, S1, BLUE): 0.8, (S0, S1, RED): 0,
    (S0, S2, BLUE): 0,   (S0, S2, RED): 0,
    (S1, S0, BLUE): 0.5, (S1, S0, RED): 0,
    (S1, S1, BLUE): 0.2, (S1, S1, RED): 0.6,
    (S1, S2, BLUE): 0,   (S1, S2, RED): 0,
    (S2, S0, BLUE): 0,   (S2, S0, RED): 0.1,
    (S2, S1, BLUE): 0,   (S2, S1, RED): 0.4,
    (S2, S2, BLUE): 1,   (S2, S2, RED): 1,
}
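
# Sanity check (my addition, not part of the original script): the outgoing
# probabilities for every (state, action) pair should sum to 1, so a typo in
# the table above gets caught immediately instead of silently skewing results.
def check_transition_table(P, states, actions):
    for s in states:
        for a in actions:
            total = sum(P[(s_tag, s, a)] for s_tag in states)
            assert abs(total - 1.0) < 1e-9, "P(.|%s,%s) sums to %s" % (s, a, total)

check_transition_table(P_destination_start_action, [S0, S1, S2], [BLUE, RED])
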
class MDP:
    def __init__(self):
        self.states = [S0, S1, S2]
        self.actions = [BLUE, RED]
        self.P_dest_start_action = P_destination_start_action
        self.rewards = {S0: -2, S1: -5, S2: 0}
def POLICY_EVALUATION(policy_vec, utility_vec, mdp):
    new_utility_vector = {}
    for s in mdp.states:
        to_sum = [mdp.P_dest_start_action[(s_tag, s, policy_vec[s])] * utility_vec[s_tag]
                  for s_tag in mdp.states]
        new_utility_vector[s] = mdp.rewards[s] + gamma * sum(to_sum)
    return new_utility_vector
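
# Cross-check helper (my addition, not part of the original script):
# POLICY_EVALUATION above applies a single simplified Bellman backup per call
# rather than solving the linear equations for the current policy. The helper
# below evaluates a fixed policy "exactly" by iterating that backup to
# convergence, with gamma passed in explicitly so different discount factors
# can be compared without touching the global.
def EXACT_POLICY_EVALUATION(policy_vec, mdp, g, tol=1e-12):
    U = {state: 0.0 for state in mdp.states}
    while True:
        new_U = {}
        for s in mdp.states:
            expected = sum(mdp.P_dest_start_action[(s_tag, s, policy_vec[s])] * U[s_tag]
                           for s_tag in mdp.states)
            new_U[s] = mdp.rewards[s] + g * expected
        if max(abs(new_U[s] - U[s]) for s in mdp.states) < tol:
            return new_U
        U = new_U
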
def POLICY_ITERATION(mdp):
    utility_vector = {state: 0 for state in mdp.states}
    policy_vector = {S0: BLUE, S1: RED, S2: RED}  # initial policy: pi(S0) = B, pi(S1) = R
    unchanged = False
    while not unchanged:
        utility_vector = POLICY_EVALUATION(policy_vector, utility_vector, mdp)
        unchanged = True
        # Policy improvement: in each state, switch to the other action if it
        # has a higher expected utility under the current utility estimates.
        for s in mdp.states:
            BLUE_sum = sum(mdp.P_dest_start_action[(s_tag, s, BLUE)] * utility_vector[s_tag]
                           for s_tag in mdp.states)
            RED_sum = sum(mdp.P_dest_start_action[(s_tag, s, RED)] * utility_vector[s_tag]
                          for s_tag in mdp.states)
            if policy_vector[s] == RED and BLUE_sum > RED_sum:
                policy_vector[s] = BLUE
                unchanged = False
            elif policy_vector[s] == BLUE and RED_sum > BLUE_sum:
                policy_vector[s] = RED
                unchanged = False
    return policy_vector
if __name__ == "__main__":
    Q2_mdp = MDP()
    new_policy_vec = POLICY_ITERATION(Q2_mdp)
    print("===========================END===============================")
    print("S_0 policy =", new_policy_vec[S0], ", S_1 policy =", new_policy_vec[S1])