I am using the code below (adapted from https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/rl/ipynb/actor_critic_cartpole.ipynb) to try to calibrate two continuous variables. The variables are passed through a dummy evaluation function, and the loop stops once they are close enough to the objective values. The problem I have is that the gradient only updates the second variable, while the first one stays the same. What do I need to change so that both variables are updated according to their respective gradients?
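The snippet assumes the standard setup from the linked example (imports plus the constants gamma, eps, and max_steps_per_episode); something like:

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Values assumed to match the linked actor-critic example
gamma = 0.99  # discount factor for past rewards
max_steps_per_episode = 10000
eps = np.finfo(np.float32).eps.item()  # smallest number such that 1.0 + eps != 1.0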
num_inputs = 4
num_actions = 2
num_hidden = 128
inputs = layers.Input(shape=(num_inputs,))
common = layers.Dense(num_hidden, activation="relu")(inputs)
action = layers.Dense(num_actions, activation="sigmoid")(common)
critic = layers.Dense(1)(common)
model = keras.Model(inputs=inputs, outputs=[action, critic])
def gaussian_noise_layer(input_layer, std=1):
    noise = tf.random.normal(shape=tf.shape(input_layer), mean=0.0, stddev=std, dtype=tf.float32)
    return input_layer + noise

def evaluate_estimation(X):
    minB = np.array([0, 0])
    maxB = np.array([195, 60])
    correctVal = np.array([130, 20])
    # Add noise to the network output (in [0, 1]) and scale it into [minB, maxB]
    X = minB + gaussian_noise_layer(X) * (maxB - minB)
    # Done once both values are within 1% of the objective values
    done = (max((abs(correctVal - X) / correctVal) * 100) < 1)
    # New state: absolute and relative errors for both variables
    rld_e = np.array([X[0] - correctVal[0], (X[0] - correctVal[0]) / correctVal[0],
                      X[1] - correctVal[1], (X[1] - correctVal[1]) / correctVal[1]])
    # Reward grows as X gets closer to correctVal
    reward = sum(((1 / ((X - correctVal) + eps)) ** 2) ** 0.5)
    return rld_e, reward, done
optimizer = keras.optimizers.Adam(learning_rate=0.01)
huber_loss = keras.losses.Huber()
action_probs_historys = [[],[]]
critic_value_history = []
rewards_history = []
running_reward = 0
episode_count = 0
while True:  # Run until solved
    state = np.zeros(num_inputs)
    episode_reward = 0
    with tf.GradientTape() as tape:
        for timestep in range(1, max_steps_per_episode):
            # env.render(); Adding this line would show the attempts
            # of the agent in a pop up window.
            state = tf.convert_to_tensor(state)
            state = tf.expand_dims(state, 0)

            # Predict action probabilities and estimated future rewards
            # from environment state
            action_probs, critic_value = model(state)
            critic_value_history.append(critic_value[0, 0])

            # here action == action_probs
            # used a sigmoid activation, so no need for tf.math.log?
            action_probs_historys[0].append(action_probs[0, 0])
            action_probs_historys[1].append(action_probs[0, 1])
            #action_probs_history.append(tf.math.log(action_probs[0,0]))
            #for i, policy_branch in enumerate(policy):
            #    action_probs_history.append(policy_branch)
            #action_probs_history = torch.stack(action_probs_history, dim=1)

            # Apply the sampled action in our environment
            state, reward, done = evaluate_estimation(action_probs[0])
            print(timestep, action_probs[0])
            rewards_history.append(reward)
            episode_reward += reward
            if done:
                print("correct value was found", action_probs[0])
                raise Exception
                break
        # Update running reward to check condition for solving
        running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward
        print("Update running reward to check condition for solving", running_reward)

        # Calculate expected value from rewards
        # - At each timestep what was the total reward received after that timestep
        # - Rewards in the past are discounted by multiplying them with gamma
        # - These are the labels for our critic
        returns = []
        discounted_sum = 0
        for r in rewards_history[::-1]:
            discounted_sum = r + gamma * discounted_sum
            returns.insert(0, discounted_sum)

        # Normalize
        returns = np.array(returns)
        returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
        returns = returns.tolist()
        print("normalize", returns)

        # Calculating loss values to update our network
        actor_losses = [[], []]
        critic_losses = []
        for i, action_probs_history in enumerate(action_probs_historys):
            history = zip(action_probs_history, critic_value_history, returns)
            print("get huber loss")
            for log_prob, value, ret in history:
                # At this point in history, the critic estimated that we would get a
                # total reward = `value` in the future. We took an action with log probability
                # of `log_prob` and ended up receiving a total reward = `ret`.
                # The actor must be updated so that it predicts an action that leads to
                # high rewards (compared to critic's estimate) with high probability.
                diff = ret - value
                actor_losses[i].append(-log_prob * diff)  # actor loss
                #print(actor_losses, -log_prob, diff)

                # The critic must be updated so that it predicts a better estimate of
                # the future rewards.
                if i == 0:
                    critic_losses.append(
                        huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
                    )
        # Backpropagation
        #print("backpropagation", np.sum(critic_losses), critic_losses)
        #loss_value = [[],[]]
        #for i, actor_losse in enumerate(actor_losses):
        loss_value1 = sum(actor_losses[0]) + sum(critic_losses)
        loss_value2 = sum(actor_losses[1]) + sum(critic_losses)
        print("loss_value", [loss_value1, loss_value2])
        grads = tape.gradient([loss_value1, loss_value2], model.trainable_variables)
        #print(loss_value, actor_losses, critic_losses)
        #raise Exception
        #print(loss_value, np.sum(critic_losses), model.trainable_variables, actor_losses)
        print("grads", grads)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Clear the loss and reward history
        print("clear")
        action_probs_history.clear()
        critic_value_history.clear()
        rewards_history.clear()