def _init_actor_updates()

in MTRF/algorithms/softlearning/algorithms/multi_sac.py


    def _init_actor_updates(self):
        """Create minimization operations for policies and entropies.

        Creates a `tf.optimizer.minimize` operations for updating
        policy and entropy with gradient descent, and adds them to
        `self._training_ops` attribute.

        See Section 4.2 in [1], for further information of the policy update,
        and Section 5 in [1] for further information of the entropy update.
        """

        self._log_alphas = []
        self._alpha_optimizers = []
        self._alphas = []
        self._policy_optimizers = []
        self._policy_losses = []

        for i, policy in enumerate(self._policies):
            policy_inputs = flatten_input_structure({
                name: self._placeholders['observations'][name]
                for name in policy.observation_keys
            })
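            # Sample actions from the current policy and compute their
            # log-probabilities (one scalar per batch element); both are used
            # in the temperature and policy losses below.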
            actions = policy.actions(policy_inputs)
            log_pis = policy.log_pis(policy_inputs, actions)

            assert log_pis.shape.as_list() == [None, 1]

            log_alpha = tf.compat.v1.get_variable(
                f'log_alpha_{i}',
                dtype=tf.float32,
                initializer=0.0)
            alpha = tf.exp(log_alpha)
            self._log_alphas.append(log_alpha)
            self._alphas.append(alpha)

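            # Automatic temperature (alpha) adjustment (Section 5 in [1]):
            # minimizing
            #   -E[ log_alpha * stop_gradient(log_pi + target_entropy) ]
            # decreases alpha when the policy entropy exceeds the target
            # (-E[log_pi] > target_entropy) and increases it otherwise.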
            if isinstance(self._target_entropy, Number):
                alpha_loss = -tf.reduce_mean(
                    log_alpha * tf.stop_gradient(log_pis + self._target_entropy))

                alpha_optimizer = tf.compat.v1.train.AdamOptimizer(
                    self._policy_lr, name=f'alpha_optimizer_{i}')
                self._alpha_optimizers.append(alpha_optimizer)
                alpha_train_op = alpha_optimizer.minimize(
                    loss=alpha_loss, var_list=[log_alpha])
                self._training_ops_per_policy[i].update({
                    f'temperature_alpha_{i}': alpha_train_op
                })

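            # Optional prior over actions. Only 'normal' and 'uniform' priors
            # are handled here; a uniform prior contributes only a constant to
            # the objective, so its log-probability is set to 0.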
            if self._action_prior == 'normal':
                policy_prior = tfp.distributions.MultivariateNormalDiag(
                    loc=tf.zeros(self._action_shape),
                    scale_diag=tf.ones(self._action_shape))
                policy_prior_log_probs = policy_prior.log_prob(actions)
            elif self._action_prior == 'uniform':
                policy_prior_log_probs = 0.0

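            # Evaluate every Q-function of this policy at the sampled actions
            # and take the element-wise minimum (clipped double-Q) to reduce
            # overestimation bias.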
            Q_observations = {
                name: self._placeholders['observations'][name]
                for name in self._Qs_per_policy[i][0].observation_keys
            }
            Q_inputs = flatten_input_structure({
                **Q_observations, 'actions': actions})
            Q_log_targets = tuple(Q(Q_inputs) for Q in self._Qs_per_policy[i])
            min_Q_log_target = tf.reduce_min(Q_log_targets, axis=0)

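            # Reparameterized policy objective (Section 4.2 in [1]):
            #   E[ alpha * log_pi(a|s) - min_k Q_k(s, a) - log p_prior(a) ],
            # minimized over the policy parameters, with gradients flowing
            # through the sampled actions.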
            if self._reparameterize:
                policy_kl_losses = (
                    alpha * log_pis
                    - min_Q_log_target
                    - policy_prior_log_probs)
            else:
                raise NotImplementedError

            assert policy_kl_losses.shape.as_list() == [None, 1]

            self._policy_losses.append(policy_kl_losses)
            policy_loss = tf.reduce_mean(policy_kl_losses)

            policy_optimizer = tf.compat.v1.train.AdamOptimizer(
                learning_rate=self._policy_lr,
                name=f"policy_optimizer_{i}")

            self._policy_optimizers.append(policy_optimizer)

            policy_train_op = policy_optimizer.minimize(
                loss=policy_loss,
                var_list=policy.trainable_variables)

            self._training_ops_per_policy[i].update({
                f'policy_train_op_{i}': policy_train_op
            })
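
A minimal sketch of how the ops created above might be consumed in a training
step. This is a hypothetical illustration: `_do_actor_update`, `self._session`,
and `feed_dict` are assumed names rather than identifiers taken from
multi_sac.py; only `_training_ops_per_policy` comes from the code above.

    # Hypothetical helper; `self._session` and `feed_dict` are assumed names.
    def _do_actor_update(self, i, feed_dict):
        """Run the policy and temperature ops built for policy `i`."""
        return self._session.run(self._training_ops_per_policy[i], feed_dict)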