
I'm trying to understand how to use @tf.function properly in an A2C problem.

I constantly get the following error:

Cannot convert a symbolic Keras input/output to a numpy array. This error may indicate that you're trying to pass a symbolic value to a NumPy call, which is not supported. Or, you may be trying to pass Keras symbolic inputs/outputs to a TF API that does not register dispatching, preventing Keras from automatically converting the API call to a lambda layer in the Functional Model.
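From what I understand, this error appears when a symbolic Keras tensor ends up inside a NumPy call. A contrived sketch of the kind of call that triggers it (not my actual code):

import numpy as np
import tensorflow as tf

x = tf.keras.Input(shape=(4,))   # symbolic Keras input
h = tf.keras.layers.Dense(2)(x)  # Keras layers accept symbolic tensors just fine
# np.asarray(x)                  # a NumPy call on the symbolic tensor raises this TypeError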

The agent is built as follows:

import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

class Agent():
    learning_rate = 0.0001
    CLIP_EDGE = 1e-8
    entropy = 0.0001
    critic_weight = 0.95
    def __init__(self,state_shape,action_size,hidden_neurons,memory,learning_rate = learning_rate, CLIP_EDGE = CLIP_EDGE, entropy = entropy, 
                 critic_weight = critic_weight, actor_name = "actor",critic_name = "critic", policy_name = "policy",main_folder = "main_folder"):
        
        self.state_shape = state_shape
        self.action_size = action_size
        self.hidden_neurons = hidden_neurons
        self.memory = memory
        self.learning_rate = learning_rate
        self.CLIP_EDGE = CLIP_EDGE
        self.entropy = entropy
        self.critic_weight = critic_weight
        self.actor_name = actor_name
        self.critic_name = critic_name
        self.policy_name = policy_name
        self.main_folder = main_folder
        
        self.actor, self.critic, self.policy = self.build_networks()
        
     
            
    def act(self, state):
        """Selects an action for the agent to take given a game state.

        Args:
            state (list of numbers): The state of the environment to act on.
            training (bool): True if the agent is training.

        Returns:
            (int) The index of the action to take.
        """
        # If not acting randomly, take action with highest predicted value.
        state_batch = np.expand_dims(state, axis=0)
        probabilities = self.policy.predict(state_batch)[0]
        action = np.random.choice(self.action_size, p=probabilities)
        return action
    
    
    def learn(self, print_variables=False):
        """Trains the Deep Q Network based on stored experiences."""
        gamma = self.memory.gamma
        experiences = self.memory.sample()
        state_mb, action_mb, reward_mb, dones_mb, next_value = experiences
        
        # One-hot encode actions
        actions = np.zeros([len(action_mb), self.action_size])
        actions[np.arange(len(action_mb)), action_mb] = 1

        #Apply TD(0)
        discount_mb = reward_mb + next_value * gamma * (1 - dones_mb)
        state_values = self.critic.predict([state_mb])
        advantages = discount_mb - np.squeeze(state_values)
        
        
        if print_variables:
            print("discount_mb", discount_mb)
            print("next_value", next_value)
            print("state_values", state_values)
            print("advantages", advantages)
        else:
            self.actor.train_on_batch(
                [state_mb, advantages], [actions, discount_mb])
        
            
    def build_networks(self):
        """Creates Actor Critic Neural Networks.

        Creates a hidden-layer Policy Gradient Neural Network. The loss
        function is altered to be a log-likelihood function weighted
        by an action's advantage.

        """

        state_input = Input(shape=self.state_shape, name='frames')
        advantages = Input((1,), name='advantages')  # PG, A instead of G

        # PG
        actor_1 = Dense(units=self.hidden_neurons, activation="relu",name='actor1')(state_input)
        actor_3 = Dense(units=int(self.hidden_neurons), activation="relu",name='actor3')(actor_1)
        adrop_1 = Dropout(0.2,name='actor_drop_1')(actor_3)
        actor_4 = Dense(units = self.hidden_neurons, activation="relu")(adrop_1)
        probabilities = Dense(self.action_size, activation='softmax',name='actor_output')(actor_4)

        # DQN
        critic_1 = Dense(units = self.hidden_neurons,activation="relu",name='critic1')(state_input)
        critic_3 = Dense(units = int(self.hidden_neurons), activation="relu",name='critic3')(critic_1)
        cdrop_1 = Dropout(0.2,name='critic_drop_1')(critic_3)
        critic_4 = Dense(units = self.hidden_neurons, activation="relu")(cdrop_1) # activation was relu by mistake... changed to elu, MONITOR THIS
        values = Dense(1, activation='linear',name='critic_output')(critic_4)

        def actor_loss(y_true, y_pred):  # PG
            y_pred_clipped = K.clip(y_pred, self.CLIP_EDGE, 1-self.CLIP_EDGE)
            log_lik = y_true*K.log(y_pred_clipped)
            entropy_loss = y_pred * K.log(K.clip(y_pred, self.CLIP_EDGE, 1-self.CLIP_EDGE))  # New
            return K.sum(-log_lik * advantages) - (self.entropy * K.sum(entropy_loss))

        # Train both actor and critic at the same time.
        actor = Model(
            inputs=[state_input, advantages], outputs=[probabilities, values])
        actor.compile(
            loss=[actor_loss, 'mean_squared_error'],  # [PG, DQN]
            loss_weights=[1, self.critic_weight],  # [PG, DQN]
            optimizer=Adam(learning_rate=self.learning_rate))#,clipnorm=1.0))

        critic = Model(inputs=[state_input], outputs=[values])
        policy = Model(inputs=[state_input], outputs=[probabilities])

        tf.keras.utils.plot_model(actor,f"{self.main_folder}/Agents/{self.actor_name}.png",show_shapes=True)
        tf.keras.utils.plot_model(critic,f"{self.main_folder}/Agents/{self.critic_name}.png",show_shapes=True)
        tf.keras.utils.plot_model(policy,f"{self.main_folder}/Agents/{self.policy_name}.png",show_shapes=True)
        
        return actor, critic, policy

The loop where the agent interacts with the environment is this:

with tf.Graph().as_default():
    agent = Agent()
    environment = Environment()
    state = environment.reset()
    done = False
    while not done:
        action = agent.act(state)
        state,reward,done,info = environment.step(action)
        next_value = agent.critic.predict([[state]])
        agent.memory.add((state,action,reward,done,next_value))
        if agent.memory.full():
            agent.learn()

This works fine. My problem comes when I try to switch to @tf.function, because (as far as I know) it increases training speed. I also ran a small benchmark in a Jupyter notebook, and it is indeed faster.
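For reference, the benchmark was roughly along these lines (a minimal sketch with a made-up toy model; the layer sizes and input shape are placeholders, not my actual networks):

import time

import numpy as np
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation="relu", input_shape=(8,)),
    tf.keras.layers.Dense(4, activation="softmax"),
])

@tf.function
def graph_predict(x):
    return model(x)

x = np.random.random((1, 8)).astype("float32")
graph_predict(x)  # trace once so the timed loop only measures the compiled call

start = time.perf_counter()
for _ in range(1000):
    model(x)  # eager call
eager_time = time.perf_counter() - start

start = time.perf_counter()
for _ in range(1000):
    graph_predict(x)  # tf.function (graph) call
graph_time = time.perf_counter() - start

print(f"eager: {eager_time:.3f}s, tf.function: {graph_time:.3f}s")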

The "refactored" code is this:

The main loop:

agent = Agent()
environment = Environment()
state = environment.reset()
done = False
while not done:
    action = agent.act(state)
    state,reward,done,info = environment.step(action)
    next_value = agent.model_predict(agent.critic,[[state]]).numpy() #REMOVED .predict FROM MODEL
    agent.memory.add((state,action,reward,done,next_value))
    if agent.memory.full():
        agent.learn()

The modified functions in the Agent class:

@tf.function #NEW FUNCTION ADDED USING @tf.function
def model_predict(self,model,x):
    return model(x)

def act(self, state): #MODIFIED FUNCTION, NOW USES self.model_predict
    """Selects an action for the agent to take given a game state.
    
    Args:
        state (list of numbers): The state of the environment to act on.
        training (bool): True if the agent is training.
    
    Returns:
        (int) The index of the action to take.
    """
    # If not acting randomly, take action with highest predicted value.
    state_batch = np.expand_dims(state, axis=0)
    probabilities = self.model_predict(self.policy,state_batch).numpy()[0]
    action = np.random.choice(self.action_size, p=probabilities)
    return action


def learn(self, print_variables=False): #MODIFIED FUNCTION, NOW USES self.model_predict
    """Trains the Deep Q Network based on stored experiences."""
    gamma = self.memory.gamma
    experiences = self.memory.sample()
    state_mb, action_mb, reward_mb, dones_mb, next_value = experiences
            
    # One-hot encode actions
    actions = np.zeros([len(action_mb), self.action_size])
    actions[np.arange(len(action_mb)), action_mb] = 1
    
    #Apply TD(0)
    discount_mb = reward_mb + next_value * gamma * (1 - dones_mb)
    state_values = self.model_predict(self.critic,[state_mb]).numpy()
    advantages = discount_mb - np.squeeze(state_values)
            
            
    if print_variables:
        print("discount_mb", discount_mb)
        print("next_value", next_value)
        print("state_values", state_values)
        print("advantages", advantages)
    else:
        self.actor.train_on_batch(
            [state_mb, advantages], [actions, discount_mb])

The error is triggered when self.actor.train_on_batch is executed, giving me the error mentioned above. Why does this happen, and what am I doing wrong?

Comment from I_Al-thamary (Feb 17 '22): See this answer: [Cannot convert a symbolic Keras input/output to a numpy array TypeError when using sampled_softmax in tensorflow 2.4](https://stackoverflow.com/questions/65366442/cannot-convert-a-symbolic-keras-input-output-to-a-numpy-array-typeerror-when-usi)
