
I am trying to develop a Q-learning algorithm for reinforcement learning. This is my code:

import numpy as np
R = np.matrix ([[-1, 0, -1, -1, 0, -1, -1, -1, -1], 
            [-1, -1, 100, 0, -1, -1, -1, -1, -1], 
            [-1, -1, 100, -1, -1, -1, -1, -1, -1],
            [-1, -1, -1, -1, -1, -1, -1, -1, -1], 
            [-1, -1, -1, -1, -1, 100, 0, -1, -1], 
            [-1, -1, -1, -1, -1, 100, -1, -1, -1], 
            [-1, -1, -1, -1, -1, -1, -1, 100, 0], 
            [-1, -1, -1, -1, -1, -1, -1, 100, -1],
            [-1, -1, -1, -1, -1, -1, -1, -1, -1]])
    # Q matrix
Q = np.matrix(np.zeros([9,9]))

# Gamma (learning parameter)
gamma = 0.4

# Initial state. (Usually to be chosen at random)
initial_state = 1

# This function returns all available actions in the state given as an argument
def available_actions(state):
    current_state_row = R[state,]
    av_act = np.where(current_state_row >= 0) [1]
    return av_act

# Get available actions in the current state
available_act = available_actions(initial_state)

# This function chooses at random which action to be performed within the range of all the available actions.
def sample_next_action(available_actions_range):
    next_action = int(np.random.choice(available_act, 1))
    return next_action

#sample next action to be performed
action = sample_next_action(available_act)

# This function updates the Q matrix according to the path selected and the Q learning algorithm
def update(current_state, action, gamma):
    max_index = np.where(Q[action,] == np.max(Q[action,]))[1]

    if max_index.shape[0] > 1:
        max_index = int(np.random.choice(max_index, size=1))
    else:
        max_index = int(max_index)
    max_value = Q[action, max_index]

    # Q learning formula
    Q[current_state, action] = R[current_state, action] + gamma * max_value

# Update Q matrix
update(initial_state, action, gamma)

# Training
# Train over 10000 iterations. (Re-iterate the process above)
for i in range(10000):
    current_state = np.random.randint(0, int(Q.shape[0]))
    available_act = available_actions(current_state)
    action = sample_next_action(available_act)
    update(current_state, action, gamma)

# Normalize the trained Q matrix
print ("Trained Q matrix:")
print (Q / np.max(Q) * 100)

# Testing 

# Goal state = 2

current_state = 1
steps = [current_state]

while current_state != 2:
    next_step_index = np.where(Q[current_state,] == np.max(Q[current_state,]))[1]

    if next_step_index.shape[0] > 1:
        next_step_index = int(np.random.choice(next_step_index, size=1))
    else:
        next_step_index = int(next_step_index)

    steps.append(next_step_index)
    current_state = next_step_index

# Print selected sequence of steps
print("Selected path:")
print(steps)

but I always get this error, which I don't understand:

ValueError                                Traceback (most recent call last)
in
     46 current_state = np.random.randint(0, int(Q.shape[0]))
     47 available_act = available_actions(current_state)
---> 48 action = sample_next_action(available_act)
     49 update(current_state, action, gamma)
     50

in sample_next_action(available_actions_range)
     19 # This function chooses at random which action to be performed within the range of all the available actions.
     20 def sample_next_action(available_actions_range):
---> 21     next_action = int(np.random.choice(available_act, 1))
     22     return next_action
     23

mtrand.pyx in mtrand.RandomState.choice()

ValueError: 'a' cannot be empty unless no samples are taken

Any help would be appreciated!

  • Try printing `available_act`; it seems like it may be empty. – BenedictWilkins Oct 29 '19 at 10:58
  • Yes, that's the problem. But according to the matrix, when I'm in state 1 there are two possible actions, either 2 or 3. – student Oct 29 '19 at 11:03
  • R = np.matrix([[-1, -1, -1, -1, 0, -1], [-1, -1, -1, 0, -1, 100], [-1, -1, -1, 0, -1, -1], [-1, 0, 0, -1, 0, -1], [0, -1, -1, 0, -1, 100], [-1, 0, -1, -1, 0, 100]]) and when I use this matrix, I don't get this error! – student Oct 29 '19 at 11:05
  • Possibly `R[state,]` should be `R[state]`. Try printing `current_state_row` and see if it is what you expect. – BenedictWilkins Oct 29 '19 at 11:08
  • I am not sure of the meaning of R. Is this the reward matrix? What is the state? An index? 1-D or 2-D? – BenedictWilkins Oct 29 '19 at 11:11
  • R is the reward matrix; a state is a row in the matrix. I am trying to follow this tutorial: https://www.youtube.com/watch?v=LzaWrmKL1Z4, but when I change the matrix I get this error. – student Oct 29 '19 at 11:59
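
A minimal check (my sketch, not part of the original thread) confirms the empty-action diagnosis with the question's R and available_actions:

    for s in range(R.shape[0]):
        print(s, available_actions(s))
    # States 3 and 8 print an empty array. Whenever the training loop
    # samples one of those states, np.random.choice receives an empty
    # array and raises the ValueError shown above.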

1 Answer


There are quite a few flaws with the code:

  1. Change the data structure of R and Q from np.matrix to plain NumPy arrays. With np.matrix a row stays 2-D, so np.where returns (row, column) index pairs and the column indices have to be taken from [1]; with np.array a row is 1-D and np.where returns a single index array at [0]:

    R = np.array(...)  # same rows as before, as a plain array
    Q = np.zeros([9, 9])

  2. Change the R matrix for state 3 and state 8 so that at least one action is available, i.e. add at least one entry greater than or equal to zero in each of those rows (see the sketch after this list). In the original matrix those rows are all -1, so available_actions returns an empty array, and calling np.random.choice on an empty array is exactly what raises the ValueError above.

  3. Change the available_actions definition to be:

    def available_actions(state):
        current_state_row = R[state, :]
        av_act = np.where(current_state_row >= 0)[0]
        return av_act
    
  4. Change the max_index lookup in update for proper indexing:

    max_index = np.where(Q[action, :] == np.max(Q[action, :]))[0]

  5. Change the next_step_index lookup in the testing loop for proper indexing:

    next_step_index = np.where(Q[current_state, :] == np.max(Q[current_state, :]))[0]

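As a sketch of fix 2 (my illustration; the rows come from the corrected R in the full listing below), these are the two rows that change, plus a quick sanity check that every state now has at least one available action:

    R[3] = [-1, -1, -1, -1, -1,  0, -1, -1, -1]   # state 3 can now reach state 5
    R[8] = [-1, -1, -1, -1,  0, -1, -1, -1, -1]   # state 8 can now reach state 4

    # No row of R should produce an empty action set any more.
    for s in range(R.shape[0]):
        assert available_actions(s).size > 0, "state %d has no actions" % s
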
With these changes, the code runs end-to-end and the test loop reaches the goal.
The final result is:

Selected path: [1, 2]

import numpy as np
R = np.array([[-1, 0, -1, -1, 0, -1, -1, -1, -1],
            [-1, -1, 100, 0, -1, -1, -1, -1, -1],
            [-1, -1, 100, -1, -1, -1, -1, -1, -1],
            [-1, -1, -1, -1, -1, 0, -1, -1, -1],
            [-1, -1, -1, -1, -1, 100, 0, -1, -1],
            [-1, -1, -1, -1, -1, 100, -1, -1, -1],
            [-1, -1, -1, -1, -1, -1, -1, 100, 0],
            [-1, -1, -1, -1, -1, -1, -1, 100, -1],
            [-1, -1, -1, -1, 0, -1, -1, -1, -1]])
# Q matrix
Q = np.zeros([9,9])

# Gamma (learning parameter)
gamma = 0.4

# Initial state. (Usually to be chosen at random)
initial_state = 1

# This function returns all available actions in the state given as an argument
def available_actions(state):
    current_state_row = R[state, :]
    av_act = np.where(current_state_row >= 0)[0]
    return av_act

# Get available actions in the current state
available_act = available_actions(initial_state)

# This function chooses at random which action to perform from the available actions.
def sample_next_action(available_actions_range):
    # Use the function's argument rather than the global available_act.
    next_action = int(np.random.choice(available_actions_range))
    return next_action

#sample next action to be performed
action = sample_next_action(available_act)

# This function updates the Q matrix according to the path selected and the Q learning algorithm
def update(current_state, action, gamma):
    max_index = np.where(Q[action, :] == np.max(Q[action, :]))[0]

    if max_index.shape[0] > 1:
        max_index = int(np.random.choice(max_index, size=1))
    else:
        max_index = int(max_index)
    max_value = Q[action, max_index]

    # Q learning formula
    Q[current_state, action] = R[current_state, action] + gamma * max_value

# Update Q matrix
update(initial_state, action, gamma)

# Training
# Train over 10000 iterations. (Re-iterate the process above)
for i in range(10000):
    current_state = np.random.randint(0, int(Q.shape[0]))
    available_act = available_actions(current_state)
    action = sample_next_action(available_act)
    update(current_state, action, gamma)

# Normalize the trained Q matrix
print ("Trained Q matrix:")
print (Q / np.max(Q) * 100)

# Testing

# Goal state = 2

current_state = 1
steps = [current_state]

while current_state != 2:
    next_step_index = np.where(Q[current_state,:] == np.max(Q[current_state,:]))[0]

    if next_step_index.shape[0] > 1:
        next_step_index = int(np.random.choice(next_step_index, size=1))
    else:
        next_step_index = int(next_step_index)

    steps.append(next_step_index)
    current_state = next_step_index

# Print selected sequence of steps
print("Selected path:")
print(steps)
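
One optional hardening, not part of the original answer: have sample_next_action raise a clear error when a state has no available actions, so a bad reward matrix is reported immediately instead of through the opaque np.random.choice message:

    def sample_next_action(available_actions_range):
        # Fail loudly if the state has no available actions; this is
        # exactly the situation that caused the original ValueError.
        if len(available_actions_range) == 0:
            raise ValueError("No available actions; check the corresponding row of R.")
        return int(np.random.choice(available_actions_range))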