I am training an agent from demonstrations of another agent, provided as (state, action, reward, next_state) tuples. I am using Keras and scikit-learn.
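For reference, the loss I want to minimize is the squared TD error, with the target built from the max over next-state Q-values (this is exactly what the code below computes):

y = r + GAMMA * max_a' Q(s', a')
L = 0.5 * (y - Q(s, a))^2

where GAMMA is the discount factor (0.99 below).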
This is how the Q-learning model is built:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import KFold

def q_learning_model():
    NUM_STATES = len(states)  # relies on the global `states` loaded below
    NUM_ACTIONS = 4
    GAMMA = 0.99

    # Q-function: tabular lookup implemented as one-hot -> Dense (no bias)
    model_in = tf.keras.layers.Input(shape=(1,), dtype=tf.int32)
    tmp = tf.one_hot(model_in, NUM_STATES)
    tmp = tf.keras.layers.Dense(NUM_ACTIONS, use_bias=False)(tmp)
    model_out = tf.squeeze(tmp, axis=1)
    q_function = tf.keras.Model(model_in, model_out)

    # Training model: consumes full (state, action, reward, next_state) transitions
    state = tf.keras.layers.Input(shape=(1,), dtype=tf.int32, name="State")
    action = tf.keras.layers.Input(shape=(1,), dtype=tf.int32, name="Action")
    reward = tf.keras.layers.Input(shape=(1,), name="Reward")
    next_state = tf.keras.layers.Input(shape=(1,), dtype=tf.int32, name="Next_State")

    td_target = reward + GAMMA * tf.reduce_max(q_function(next_state), axis=-1)
    predictions = tf.gather(q_function(state), action, axis=-1)

    train_model = tf.keras.Model(
        inputs=[state, action, reward, next_state],
        outputs=[predictions, td_target]
    )

    # to date it still feels as if tf.stop_gradient is a horrible
    # hack, similar to DDQN, to stabilize the algorithm
    td_error = 0.5 * tf.abs(tf.stop_gradient(td_target) - predictions) ** 2
    train_model.add_loss(td_error, [state, action, reward, next_state])

    # metric: how often the greedy action matches the demonstrated action
    predicted_action = tf.argmax(q_function(state), axis=-1)
    correct_predictions = tf.keras.metrics.categorical_accuracy(
        action, predicted_action)
    train_model.add_metric(correct_predictions,
                           name="Matched_Actions", aggregation="mean")

    return q_function, train_model
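As a quick sanity check (dummy data, purely illustrative), I verify that q_function maps a batch of integer states to one row of Q-values per sample; this only works once the global states has been loaded:

q_fn, _ = q_learning_model()
dummy_states = np.zeros((8, 1), dtype=np.int32)
print(q_fn(dummy_states).shape)  # I expect (8, NUM_ACTIONS), i.e. (8, 4)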
In the main function I load the data from an external file as follows:
states, actions, rewards, next_states = load_data("data.csv")
indices = np.arange(len(states))
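load_data itself is just a thin pandas wrapper along these lines (the column names are specific to my CSV, so treat them as illustrative):

import pandas as pd

def load_data(path):
    df = pd.read_csv(path)
    # columns assumed: state, action, reward, next_state
    return (df["state"].to_numpy(np.int32),
            df["action"].to_numpy(np.int32),
            df["reward"].to_numpy(np.float32),
            df["next_state"].to_numpy(np.int32))

Each array comes back 1-D, which matches the shapes I report further down.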
Then I train my agent:
q_scores = list()
policy_scores = list()

for train_idx, test_idx in KFold(shuffle=True).split(indices):
    train_data = [
        states[train_idx, ...],
        actions[train_idx, ...],
        rewards[train_idx, ...],
        next_states[train_idx, ...],
    ]
    test_data = [
        states[test_idx, ...],
        actions[test_idx, ...],
        rewards[test_idx, ...],
        next_states[test_idx, ...],
    ]

    q_function, train_q = q_learning_model()
    del q_function
    train_q.compile(optimizer="sgd", experimental_run_tf_function=False)
    train_q.fit(train_data)
    _, score = train_q.evaluate(test_data)
    q_scores.append(score)

    policy_fn, train_policy = q_learning_model()
    del policy_fn
    train_policy.compile(optimizer="sgd", experimental_run_tf_function=False)
    train_policy.fit(train_data)
    _, score = train_policy.evaluate(test_data)
    policy_scores.append(score)
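After the loop I just intend to report the mean fold scores:

print("Mean Q score:", np.mean(q_scores))
print("Mean policy score:", np.mean(policy_scores))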
Everything seems to work, but I get the following error:
self.results[0] += batch_outs[0] * (batch_end - batch_start)
ValueError: operands could not be broadcast together with shapes (32,32,32) (3,3,3) (32,32,32)
Even though my train_data shapes (for state, action, reward, next_state) are the following:
train_data[0].shape -> (1123,)
train_data[1].shape -> (1123,)
train_data[2].shape -> (1123,)
train_data[3].shape -> (1123,)
Let me know if you have faced similar problems and how you solved them. Feel free to reply also if you notice any other bug in the code.
Thank you for your time and support.