I'm trying to understand how to use @tf.function properly in an A2C problem.
I constantly get the following error:
Cannot convert a symbolic Keras input/output to a numpy array. This error may indicate that you're trying to pass a symbolic value to a NumPy call, which is not supported. Or, you may be trying to pass Keras symbolic inputs/outputs to a TF API that does not register dispatching, preventing Keras from automatically converting the API call to a lambda layer in the Functional Model.
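If I understand the message correctly, it shows up whenever a symbolic Keras tensor (like the one returned by Input) reaches a NumPy call. A toy case that reproduces that kind of error (not my real code, just to illustrate what I think "symbolic" means here):

import numpy as np
import tensorflow as tf

symbolic = tf.keras.Input(shape=(4,))  # a KerasTensor: a placeholder with no concrete values yet
np.array(symbolic)  # raises the "Cannot convert a symbolic Keras input/output to a numpy array" error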
The agent is built as follows:
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

class Agent():
    learning_rate = 0.0001
    CLIP_EDGE = 1e-8
    entropy = 0.0001
    critic_weight = 0.95

    def __init__(self, state_shape, action_size, hidden_neurons, memory,
                 learning_rate=learning_rate, CLIP_EDGE=CLIP_EDGE, entropy=entropy,
                 critic_weight=critic_weight, actor_name="actor", critic_name="critic",
                 policy_name="policy", main_folder="main_folder"):
        self.state_shape = state_shape
        self.action_size = action_size
        self.hidden_neurons = hidden_neurons
        self.memory = memory
        self.learning_rate = learning_rate
        self.CLIP_EDGE = CLIP_EDGE
        self.entropy = entropy
        self.critic_weight = critic_weight
        self.actor_name = actor_name
        self.critic_name = critic_name
        self.policy_name = policy_name
        self.main_folder = main_folder
        self.actor, self.critic, self.policy = self.build_networks()

    def act(self, state):
        """Selects an action for the agent to take given a game state.

        Args:
            state (list of numbers): The state of the environment to act on.
            training (bool): True if the agent is training.

        Returns:
            (int) The index of the action to take.
        """
        # If not acting randomly, take the action with the highest predicted value.
        state_batch = np.expand_dims(state, axis=0)
        probabilities = self.policy.predict(state_batch)[0]
        action = np.random.choice(self.action_size, p=probabilities)
        return action

    def learn(self, print_variables=False):
        """Trains the Deep Q Network based on stored experiences."""
        gamma = self.memory.gamma
        experiences = self.memory.sample()
        state_mb, action_mb, reward_mb, dones_mb, next_value = experiences

        # One-hot encode actions
        actions = np.zeros([len(action_mb), self.action_size])
        actions[np.arange(len(action_mb)), action_mb] = 1

        # Apply TD(0)
        discount_mb = reward_mb + next_value * gamma * (1 - dones_mb)
        state_values = self.critic.predict([state_mb])
        advantages = discount_mb - np.squeeze(state_values)
        if print_variables:
            print("discount_mb", discount_mb)
            print("next_value", next_value)
            print("state_values", state_values)
            print("advantages", advantages)
        else:
            self.actor.train_on_batch(
                [state_mb, advantages], [actions, discount_mb])

    def build_networks(self):
        """Creates Actor Critic Neural Networks.

        Creates a hidden-layer Policy Gradient Neural Network. The loss
        function is altered to be a log-likelihood function weighted
        by an action's advantage.
        """
        state_input = Input(shape=self.state_shape, name='frames')
        advantages = Input((1,), name='advantages')  # PG, A instead of G

        # PG
        actor_1 = Dense(units=self.hidden_neurons, activation="relu", name='actor1')(state_input)
        actor_3 = Dense(units=int(self.hidden_neurons), activation="relu", name='actor3')(actor_1)
        adrop_1 = Dropout(0.2, name='actor_drop_1')(actor_3)
        actor_4 = Dense(units=self.hidden_neurons, activation="relu")(adrop_1)
        probabilities = Dense(self.action_size, activation='softmax', name='actor_output')(actor_4)

        # DQN
        critic_1 = Dense(units=self.hidden_neurons, activation="relu", name='critic1')(state_input)
        critic_3 = Dense(units=int(self.hidden_neurons), activation="relu", name='critic3')(critic_1)
        cdrop_1 = Dropout(0.2, name='critic_drop_1')(critic_3)
        critic_4 = Dense(units=self.hidden_neurons, activation="relu")(cdrop_1)  # activation was relu by mistake... changed to elu, MONITOR
        values = Dense(1, activation='linear', name='critic_output')(critic_4)

        def actor_loss(y_true, y_pred):  # PG
            y_pred_clipped = K.clip(y_pred, self.CLIP_EDGE, 1 - self.CLIP_EDGE)
            log_lik = y_true * K.log(y_pred_clipped)
            entropy_loss = y_pred * K.log(K.clip(y_pred, self.CLIP_EDGE, 1 - self.CLIP_EDGE))  # New
            return K.sum(-log_lik * advantages) - (self.entropy * K.sum(entropy_loss))

        # Train both actor and critic at the same time.
        actor = Model(
            inputs=[state_input, advantages], outputs=[probabilities, values])
        actor.compile(
            loss=[actor_loss, 'mean_squared_error'],  # [PG, DQN]
            loss_weights=[1, self.critic_weight],     # [PG, DQN]
            optimizer=Adam(learning_rate=self.learning_rate))  # ,clipnorm=1.0))
        critic = Model(inputs=[state_input], outputs=[values])
        policy = Model(inputs=[state_input], outputs=[probabilities])

        tf.keras.utils.plot_model(actor, f"{self.main_folder}/Agents/{self.actor_name}.png", show_shapes=True)
        tf.keras.utils.plot_model(critic, f"{self.main_folder}/Agents/{self.critic_name}.png", show_shapes=True)
        tf.keras.utils.plot_model(policy, f"{self.main_folder}/Agents/{self.policy_name}.png", show_shapes=True)

        return actor, critic, policy
The loop where the agent interacts with the environment is this:
with tf.Graph().as_default():
    agent = Agent()
    environment = Environment()
    state = environment.reset()
    done = False
    while not done:
        action = agent.act(state)
        state, reward, done, info = environment.step(action)
        next_value = agent.critic.predict([[state]])
        agent.memory.add((state, action, reward, done, next_value))
        if agent.memory.full():
            agent.learn()
This works fine. My problem comes when I try to switch to @tf.function, because (as far as I know) it increases training speed; I also ran a small benchmark in a Jupyter notebook and it is indeed faster.
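The kind of comparison I mean is roughly this (a minimal sketch with a dummy model and placeholder layer sizes, not my actual benchmark):

import timeit
import numpy as np
import tensorflow as tf

# Throwaway model just for timing (placeholder sizes).
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation="relu", input_shape=(8,)),
    tf.keras.layers.Dense(2, activation="softmax"),
])

@tf.function
def fast_predict(x):
    # Traced on the first call, then runs as a graph.
    return model(x)

x = np.random.rand(1, 8).astype(np.float32)
fast_predict(x)  # first call triggers tracing

print("eager:      ", timeit.timeit(lambda: model(x), number=1000))
print("tf.function:", timeit.timeit(lambda: fast_predict(x), number=1000))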
The "refactored" code is this:
The main loop:
agent = Agent()
environment = Environment()
state = environment.reset()
done = False
while not done:
    action = agent.act(state)
    state, reward, done, info = environment.step(action)
    next_value = agent.model_predict(agent.critic, [[state]]).numpy()  # REMOVED .predict FROM MODEL
    agent.memory.add((state, action, reward, done, next_value))
    if agent.memory.full():
        agent.learn()
The modified functions in the Agent class:
@tf.function  # NEW FUNCTION ADDED USING @tf.function
def model_predict(self, model, x):
    return model(x)

def act(self, state):  # MODIFIED FUNCTION, NOW USES self.model_predict
    """Selects an action for the agent to take given a game state.

    Args:
        state (list of numbers): The state of the environment to act on.
        training (bool): True if the agent is training.

    Returns:
        (int) The index of the action to take.
    """
    # If not acting randomly, take the action with the highest predicted value.
    state_batch = np.expand_dims(state, axis=0)
    probabilities = self.model_predict(self.policy, state_batch).numpy()[0]
    action = np.random.choice(self.action_size, p=probabilities)
    return action

def learn(self, print_variables=False):  # MODIFIED FUNCTION, NOW USES self.model_predict
    """Trains the Deep Q Network based on stored experiences."""
    gamma = self.memory.gamma
    experiences = self.memory.sample()
    state_mb, action_mb, reward_mb, dones_mb, next_value = experiences

    # One-hot encode actions
    actions = np.zeros([len(action_mb), self.action_size])
    actions[np.arange(len(action_mb)), action_mb] = 1

    # Apply TD(0)
    discount_mb = reward_mb + next_value * gamma * (1 - dones_mb)
    state_values = self.model_predict(self.critic, [state_mb]).numpy()
    advantages = discount_mb - np.squeeze(state_values)
    if print_variables:
        print("discount_mb", discount_mb)
        print("next_value", next_value)
        print("state_values", state_values)
        print("advantages", advantages)
    else:
        self.actor.train_on_batch(
            [state_mb, advantages], [actions, discount_mb])
The error is triggered when self.actor.train_on_batch is executed, giving me the message quoted above. Why does this happen, and what am I doing wrong?