in MTRF/algorithms/softlearning/algorithms/phased_sac.py [0:0]
def _init_actor_updates(self):
"""Create minimization operations for policies and entropies.
Creates a `tf.optimizer.minimize` operations for updating
policy and entropy with gradient descent, and adds them to
`self._training_ops` attribute.
See Section 4.2 in [1], for further information of the policy update,
and Section 5 in [1] for further information of the entropy update.
"""
    self._log_alphas = []
    self._alpha_optimizers = []
    self._alphas = []
    self._policy_optimizers = []
    self._policy_losses = []
    for i, policy in enumerate(self._policies):
        policy_inputs = flatten_input_structure({
            name: self._placeholders['observations'][name]
            for name in policy.observation_keys
        })
        actions = policy.actions(policy_inputs)
        log_pis = policy.log_pis(policy_inputs, actions)
        assert log_pis.shape.as_list() == [None, 1]

        # Per-policy temperature, parameterized in log-space so that
        # alpha = exp(log_alpha) stays positive.
        log_alpha = tf.compat.v1.get_variable(
            f'log_alpha_{i}',
            dtype=tf.float32,
            initializer=0.0)
        alpha = tf.exp(log_alpha)
        self._log_alphas.append(log_alpha)
        self._alphas.append(alpha)
        if isinstance(self._target_entropy, Number):
            # Dual update: adjust alpha so the policy's entropy tracks
            # the target entropy (Section 5 in [1]).
            alpha_loss = -tf.reduce_mean(
                log_alpha * tf.stop_gradient(log_pis + self._target_entropy))

            alpha_optimizer = tf.compat.v1.train.AdamOptimizer(
                self._policy_lr, name=f'alpha_optimizer_{i}')
            self._alpha_optimizers.append(alpha_optimizer)
            alpha_train_op = alpha_optimizer.minimize(
                loss=alpha_loss, var_list=[log_alpha])
            self._training_ops_per_policy[i].update({
                f'temperature_alpha_{i}': alpha_train_op
            })
        if self._action_prior == 'normal':
            policy_prior = tfp.distributions.MultivariateNormalDiag(
                loc=tf.zeros(self._action_shape),
                scale_diag=tf.ones(self._action_shape))
            policy_prior_log_probs = policy_prior.log_prob(actions)
        elif self._action_prior == 'uniform':
            policy_prior_log_probs = 0.0
        Q_observations = {
            name: self._placeholders['observations'][name]
            for name in self._Qs_per_policy[i][0].observation_keys
        }
        Q_inputs = flatten_input_structure({
            **Q_observations, 'actions': actions})
        # Clipped double-Q: take the minimum over this policy's Q ensemble.
        Q_log_targets = tuple(Q(Q_inputs) for Q in self._Qs_per_policy[i])
        min_Q_log_target = tf.reduce_min(Q_log_targets, axis=0)
        if self._reparameterize:
            policy_kl_losses = (
                alpha * log_pis
                - min_Q_log_target
                - policy_prior_log_probs)
        else:
            raise NotImplementedError

        assert policy_kl_losses.shape.as_list() == [None, 1]
        self._policy_losses.append(policy_kl_losses)
        policy_loss = tf.reduce_mean(policy_kl_losses)

        policy_optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=self._policy_lr,
            name=f"policy_optimizer_{i}")
        self._policy_optimizers.append(policy_optimizer)
        policy_train_op = policy_optimizer.minimize(
            loss=policy_loss,
            var_list=policy.trainable_variables)
        self._training_ops_per_policy[i].update({
            f'policy_train_op_{i}': policy_train_op})
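For reference, here is a sketch of the two per-policy objectives the ops above minimize, read directly off the code (the notation is introduced here and is not from the source: \bar{H} denotes self._target_entropy, p(a) the action prior, Q_{i,k} the k-th Q-function in self._Qs_per_policy[i], and expectations are over the sampled batch with a ~ pi_i(. | s)):

    J(\alpha_i) = \mathbb{E}\big[ -\log\alpha_i \,\big( \log\pi_i(a \mid s) + \bar{H} \big) \big]
                  \quad \text{(the parenthesized term is under stop-gradient)}
    J(\pi_i)    = \mathbb{E}\big[ \alpha_i \log\pi_i(a \mid s) - \min_k Q_{i,k}(s, a) - \log p(a) \big]

Each policy gets its own temperature variable and its own Adam optimizers, so the entropy coefficient and the policy parameters are adapted independently per policy.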