I am using the code below (adapted from https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/rl/ipynb/actor_critic_cartpole.ipynb) to try to calibrate two continuous variables. The variables are passed through a dummy evaluation function, and the loop stops once they are close enough to the objective values. The problem I have is that the gradient only updates the second variable, while the first one stays the same. What do I need to change so that both variables are updated according to their respective gradients?
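The snippet assumes the standard setup from the linked example (imports plus the constants gamma, eps, and max_steps_per_episode); something like:

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Values assumed to match the linked actor-critic example
gamma = 0.99  # discount factor for past rewards
max_steps_per_episode = 10000
eps = np.finfo(np.float32).eps.item()  # smallest number such that 1.0 + eps != 1.0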
num_inputs = 4
num_actions = 2
num_hidden = 128
inputs = layers.Input(shape=(num_inputs,))
common = layers.Dense(num_hidden, activation="relu")(inputs)
action = layers.Dense(num_actions, activation="sigmoid")(common)
critic = layers.Dense(1)(common)
model = keras.Model(inputs=inputs, outputs=[action, critic])
def gaussian_noise_layer(input_layer, std=1):
    noise = tf.random.normal(shape=tf.shape(input_layer), mean=0.0, stddev=std, dtype=tf.float32)
    return input_layer + noise

def evaluate_estimation(X):
    minB = np.array([0, 0])
    maxB = np.array([195, 60])
    correctVal = np.array([130, 20])
    # Add noise to the network output (in [0, 1]) and scale it into [minB, maxB]
    X = minB + gaussian_noise_layer(X) * (maxB - minB)
    # Done once both values are within 1% of the objective values
    done = (max((abs(correctVal - X) / correctVal) * 100) < 1)
    # New state: absolute and relative errors for both variables
    rld_e = np.array([X[0] - correctVal[0], (X[0] - correctVal[0]) / correctVal[0],
                      X[1] - correctVal[1], (X[1] - correctVal[1]) / correctVal[1]])
    # Reward grows as X gets closer to correctVal
    reward = sum(((1 / ((X - correctVal) + eps)) ** 2) ** 0.5)
    return rld_e, reward, done
optimizer = keras.optimizers.Adam(learning_rate=0.01)
huber_loss = keras.losses.Huber()
action_probs_historys = [[],[]]
critic_value_history = []
rewards_history = []
running_reward = 0
episode_count = 0
while True:  # Run until solved
    state = np.zeros(num_inputs)
    episode_reward = 0
    with tf.GradientTape() as tape:
        for timestep in range(1, max_steps_per_episode):
            # env.render(); Adding this line would show the attempts
            # of the agent in a pop up window.
            state = tf.convert_to_tensor(state)
            state = tf.expand_dims(state, 0)

            # Predict action probabilities and estimated future rewards
            # from environment state
            action_probs, critic_value = model(state)
            critic_value_history.append(critic_value[0, 0])

            # here action == action_probs
            # used a sigmoid activation, so no need for tf.math.log?
            action_probs_historys[0].append(action_probs[0, 0])
            action_probs_historys[1].append(action_probs[0, 1])
            #action_probs_history.append(tf.math.log(action_probs[0,0]))
            #for i, policy_branch in enumerate(policy):
            #    action_probs_history.append(policy_branch)
            #action_probs_history = torch.stack(action_probs_history, dim=1)

            # Apply the sampled action in our environment
            state, reward, done = evaluate_estimation(action_probs[0])
            print(timestep, action_probs[0])
            rewards_history.append(reward)
            episode_reward += reward
            if done:
                print("correct value was found", action_probs[0])
                raise Exception
                break
        # Update running reward to check condition for solving
        running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward
        print("Update running reward to check condition for solving", running_reward)

        # Calculate expected value from rewards
        # - At each timestep what was the total reward received after that timestep
        # - Rewards in the past are discounted by multiplying them with gamma
        # - These are the labels for our critic
        returns = []
        discounted_sum = 0
        for r in rewards_history[::-1]:
            discounted_sum = r + gamma * discounted_sum
            returns.insert(0, discounted_sum)

        # Normalize
        returns = np.array(returns)
        returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
        returns = returns.tolist()
        print("normalize", returns)

        # Calculating loss values to update our network
        actor_losses = [[], []]
        critic_losses = []
        for i, action_probs_history in enumerate(action_probs_historys):
            history = zip(action_probs_history, critic_value_history, returns)
            print("get huber loss")
            for log_prob, value, ret in history:
                # At this point in history, the critic estimated that we would get a
                # total reward = `value` in the future. We took an action with log probability
                # of `log_prob` and ended up receiving a total reward = `ret`.
                # The actor must be updated so that it predicts an action that leads to
                # high rewards (compared to critic's estimate) with high probability.
                diff = ret - value
                actor_losses[i].append(-log_prob * diff)  # actor loss
                #print(actor_losses, -log_prob, diff)

                # The critic must be updated so that it predicts a better estimate of
                # the future rewards.
                if i == 0:
                    critic_losses.append(
                        huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
                    )
        # Backpropagation
        #print("backpropagation", np.sum(critic_losses), critic_losses)
        #loss_value = [[],[]]
        #for i, actor_losse in enumerate(actor_losses):
        loss_value1 = sum(actor_losses[0]) + sum(critic_losses)
        loss_value2 = sum(actor_losses[1]) + sum(critic_losses)
        print("loss_value", [loss_value1, loss_value2])
        grads = tape.gradient([loss_value1, loss_value2], model.trainable_variables)
        #print(loss_value, actor_losses, critic_losses)
        #raise Exception
        #print(loss_value, np.sum(critic_losses), model.trainable_variables, actor_losses)
        print("grads", grads)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Clear the loss and reward history
        print("clear")
        action_probs_history.clear()
        critic_value_history.clear()
        rewards_history.clear()