MTRF/algorithms/softlearning/algorithms/multi_sac.py [187:253]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        self._init_diagnostics_ops()

    def _init_external_rewards(self):
        self._unscaled_ext_rewards = [
            self._placeholders['rewards'] for _ in range(self._num_goals)]

    def _get_Q_targets(self):
        Q_targets = []

        self._placeholders['reward'].update({
            f'running_ext_rew_std_{i}': tf.compat.v1.placeholder(
                tf.float32, shape=(), name=f'running_ext_rew_std_{i}')
            for i in range(self._num_goals)
        })

        (self._unscaled_int_rewards,
         self._int_rewards,
         self._normalized_ext_rewards,
         self._ext_rewards,
         self._total_rewards) = [], [], [], [], []

        for i, policy in enumerate(self._policies):
            policy_inputs = flatten_input_structure({
                name: self._placeholders['next_observations'][name]
                for name in policy.observation_keys
            })
            next_actions = policy.actions(policy_inputs)
            next_log_pis = policy.log_pis(policy_inputs, next_actions)

            next_Q_observations = {
                name: self._placeholders['next_observations'][name]
                for name in self._Qs_per_policy[i][0].observation_keys
            }
            next_Q_inputs = flatten_input_structure(
                {**next_Q_observations, 'actions': next_actions})
            next_Qs_values = tuple(Q(next_Q_inputs) for Q in self._Q_targets_per_policy[i])

            min_next_Q = tf.reduce_min(next_Qs_values, axis=0)
            next_values = min_next_Q - self._alphas[i] * next_log_pis

            terminals = tf.cast(self._placeholders['terminals'], next_values.dtype)

            if self._rnd_int_rew_coeffs[i] > 0:
                self._unscaled_int_rewards.append(tf.clip_by_value(
                    self._rnd_errors[i] / self._placeholders['reward'][f'running_int_rew_std_{i}'],
                    0, 1000
                ))
            else:
                self._unscaled_int_rewards.append(0)
            self._int_rewards.append(self._rnd_int_rew_coeffs[i] * self._unscaled_int_rewards[i])

            if self._ext_reward_coeffs[i] > 0:
                self._normalized_ext_rewards.append(
                    self._unscaled_ext_rewards[i] / self._placeholders['reward'][f'running_ext_rew_std_{i}'])
            else:
                self._normalized_ext_rewards.append(0)
            self._ext_rewards.append(self._ext_reward_coeffs[i] * self._normalized_ext_rewards[i])

            self._total_rewards.append(self._ext_rewards[i] + self._int_rewards[i])

            Q_target = td_target(
                reward=self._reward_scale * self._total_rewards[i],
                discount=self._discount,
                next_value=(1 - terminals) * next_values)
            Q_targets.append(tf.stop_gradient(Q_target))

        return Q_targets
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
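
Both fragments build one soft Bellman target per goal/policy: the clipped-double-Q minimum over that policy's target critics, an entropy term weighted by the per-policy alpha, a terminal mask, and a total reward that mixes the normalized extrinsic reward with the RND intrinsic reward, all fed through td_target. Below is a minimal NumPy sketch of that backup, assuming td_target(reward, discount, next_value) = reward + discount * next_value (the helper itself is not shown in the excerpt); the function name soft_bellman_target is ours, not the repository's.

import numpy as np

def soft_bellman_target(next_q_values, next_log_pis, rewards, terminals,
                        alpha=0.2, discount=0.99, reward_scale=1.0):
    """Sketch of the per-policy backup in _get_Q_targets.

    next_q_values: (num_critics, batch) target-critic estimates of Q(s', a')
                   for next actions a' sampled from the policy.
    next_log_pis:  (batch,) log pi(a'|s') for those sampled next actions.
    rewards:       (batch,) total reward, i.e. the weighted sum of the
                   normalized extrinsic and RND intrinsic terms.
    terminals:     (batch,) 1.0 where the episode ended, else 0.0.
    """
    min_next_q = np.min(next_q_values, axis=0)          # clipped double Q
    next_values = min_next_q - alpha * next_log_pis     # soft state value
    next_values = (1.0 - terminals) * next_values       # mask terminal states
    # Assumed semantics of softlearning's td_target helper:
    return reward_scale * rewards + discount * next_values

# Tiny usage example: a batch of 3 transitions and 2 target critics.
rng = np.random.default_rng(0)
targets = soft_bellman_target(
    next_q_values=rng.normal(size=(2, 3)),
    next_log_pis=rng.normal(size=3),
    rewards=rng.normal(size=3),
    terminals=np.array([0.0, 0.0, 1.0]))
print(targets.shape)  # (3,)

In the TensorFlow code above, tf.stop_gradient around the result plays the role of treating these targets as constants when the critic loss is minimized.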



MTRF/algorithms/softlearning/algorithms/phased_sac.py [184:250]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        self._init_diagnostics_ops()

    def _init_external_rewards(self):
        self._unscaled_ext_rewards = [
            self._placeholders['rewards'] for _ in range(self._num_goals)]

    def _get_Q_targets(self):
        Q_targets = []

        self._placeholders['reward'].update({
            f'running_ext_rew_std_{i}': tf.compat.v1.placeholder(
                tf.float32, shape=(), name=f'running_ext_rew_std_{i}')
            for i in range(self._num_goals)
        })

        (self._unscaled_int_rewards,
         self._int_rewards,
         self._normalized_ext_rewards,
         self._ext_rewards,
         self._total_rewards) = [], [], [], [], []

        for i, policy in enumerate(self._policies):
            policy_inputs = flatten_input_structure({
                name: self._placeholders['next_observations'][name]
                for name in policy.observation_keys
            })
            next_actions = policy.actions(policy_inputs)
            next_log_pis = policy.log_pis(policy_inputs, next_actions)

            next_Q_observations = {
                name: self._placeholders['next_observations'][name]
                for name in self._Qs_per_policy[i][0].observation_keys
            }
            next_Q_inputs = flatten_input_structure(
                {**next_Q_observations, 'actions': next_actions})
            next_Qs_values = tuple(Q(next_Q_inputs) for Q in self._Q_targets_per_policy[i])

            min_next_Q = tf.reduce_min(next_Qs_values, axis=0)
            next_values = min_next_Q - self._alphas[i] * next_log_pis

            terminals = tf.cast(self._placeholders['terminals'], next_values.dtype)

            if self._rnd_int_rew_coeffs[i] > 0:
                self._unscaled_int_rewards.append(tf.clip_by_value(
                    self._rnd_errors[i] / self._placeholders['reward'][f'running_int_rew_std_{i}'],
                    0, 1000
                ))
            else:
                self._unscaled_int_rewards.append(0)
            self._int_rewards.append(self._rnd_int_rew_coeffs[i] * self._unscaled_int_rewards[i])

            if self._ext_reward_coeffs[i] > 0:
                self._normalized_ext_rewards.append(
                    self._unscaled_ext_rewards[i] / self._placeholders['reward'][f'running_ext_rew_std_{i}'])
            else:
                self._normalized_ext_rewards.append(0)
            self._ext_rewards.append(self._ext_reward_coeffs[i] * self._normalized_ext_rewards[i])

            self._total_rewards.append(self._ext_rewards[i] + self._int_rewards[i])

            Q_target = td_target(
                reward=self._reward_scale * self._total_rewards[i],
                discount=self._discount,
                next_value=(1 - terminals) * next_values)
            Q_targets.append(tf.stop_gradient(Q_target))

        return Q_targets
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
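
The fragments consume running_int_rew_std_{i} and running_ext_rew_std_{i} via feed-dict placeholders, but only the extrinsic placeholders are created here; the intrinsic ones, and the statistics themselves, are maintained elsewhere in the class. As a hedged illustration of how such running standard deviations are commonly tracked, the sketch below uses a Welford-style estimator (an assumption, not the MTRF estimator) together with the normalize-and-clip step the excerpt applies to the RND error; RunningStd and normalized_int_reward are hypothetical names.

import numpy as np

class RunningStd:
    """Welford-style running variance; an assumed stand-in for whatever
    maintains the values fed into the running_*_rew_std_{i} placeholders."""

    def __init__(self, eps=1e-8):
        self.count, self.mean, self.m2, self.eps = 0, 0.0, 0.0, eps

    def update(self, xs):
        for x in np.asarray(xs, dtype=np.float64).ravel():
            self.count += 1
            delta = x - self.mean
            self.mean += delta / self.count
            self.m2 += delta * (x - self.mean)

    @property
    def std(self):
        var = self.m2 / max(self.count, 1)
        return float(np.sqrt(var) + self.eps)

def normalized_int_reward(rnd_errors, running_std, clip_max=1000.0):
    # Mirrors the excerpt: divide the RND error by the running std,
    # then clip the result to [0, 1000].
    return np.clip(np.asarray(rnd_errors) / running_std, 0.0, clip_max)

# Usage: feed the tracked std into the per-goal placeholder at train time.
tracker = RunningStd()
tracker.update([0.3, 0.7, 1.2, 0.1])
print(normalized_int_reward([0.5, 2.0], tracker.std))

The extrinsic path in the excerpt is the same division without clipping, and each normalized term is then scaled by its per-goal coefficient before being summed into the total reward.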



