MTRF/algorithms/softlearning/algorithms/multi_sac.py [68:167]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
            **kwargs,
    ):
        """
        Args:
            env (`SoftlearningEnv`): Environment used for training.
            policy: A policy function approximator.
            initial_exploration_policy: ('Policy'): A policy that we use
                for initial exploration which is not trained by the algorithm.
            Qs: Q-function approximators. The min of these
                approximators will be used. Usage of at least two Q-functions
                improves performance by reducing overestimation bias.
            pool (`PoolBase`): Replay pool to add gathered samples to.
            plotter (`QFPolicyPlotter`): Plotter instance to be used for
                visualizing Q-function during training.
            lr (`float`): Learning rate used for the function approximators.
            discount (`float`): Discount factor for Q-function updates.
            tau (`float`): Soft value function target update weight.
            target_update_interval ('int'): Frequency at which target network
                updates occur in iterations.
            reparameterize ('bool'): If True, we use a gradient estimator for
                the policy derived using the reparameterization trick. We use
                a likelihood ratio based estimator otherwise.
        """

        super(SAC, self).__init__(**kwargs)

        self._training_environment = training_environment
        self._evaluation_environment = evaluation_environment
        assert hasattr(self._training_environment, "set_goal"), (
            "Need to implement `set_goal(goal_index)` method in training environment.")

        self._policies = policies
        self._Qs_per_policy = Qs_per_policy
        self._samplers = samplers
        self._pools = pools

        self._num_goals = num_goals
        self._goal_index = 0
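        # One policy is trained per goal, so the per-epoch length and total
        # epoch count are scaled by the number of goals.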
        self._epoch_length *= num_goals
        self._n_epochs *= num_goals

        error_msg = 'Mismatch between number of policies, Qs, and samplers'
        assert len(self._policies) == num_goals, error_msg
        assert len(self._Qs_per_policy) == num_goals, error_msg
        assert len(self._samplers) == num_goals, error_msg

        self._Q_targets_per_policy = Q_targets_per_policy
        self._training_ops_per_policy = [{} for _ in range(num_goals)]

        self._policy_lr = lr
        self._Q_lr = lr

        self._reward_scale = reward_scale
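        # 'auto' applies the standard SAC heuristic: target entropy equal to
        # the negative dimensionality of the action space.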
        self._target_entropy = (
            -np.prod(self._training_environment.action_space.shape)
            if target_entropy == 'auto'
            else target_entropy)

        self._discount = discount
        self._tau = tau
        self._target_update_interval = target_update_interval
        self._action_prior = action_prior

        self._reparameterize = reparameterize
        self._plotter = plotter

        self._her_iters = her_iters
        self._base_env = training_environment.unwrapped

        self._save_full_state = save_full_state
        self._save_eval_paths = save_eval_paths

        self._n_episodes_elapsed = 0
        self._num_grad_steps_taken_per_policy = [0 for _ in range(self._num_goals)]

        self._normalize_ext_reward_gamma = normalize_ext_reward_gamma
        if ext_reward_coeffs:
            assert len(ext_reward_coeffs) == num_goals, (
                "Mismatch between number of goals and provided extrinsic reward coeffs")
            self._ext_reward_coeffs = ext_reward_coeffs
        else:
            self._ext_reward_coeffs = [1 for _ in range(num_goals)]
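        # Per-goal running standard deviations used for extrinsic reward
        # normalization, initialized to 1.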
        self._running_ext_rew_stds = [1 for _ in range(num_goals)]

        self._rnd_targets, self._rnd_predictors = None, None
        assert len(rnd_int_rew_coeffs) == num_goals, (
            "Mismatch between number of goals and provided intrinsic reward coeffs")
        self._rnd_int_rew_coeffs = rnd_int_rew_coeffs
        self._rnd_lr = rnd_lr
        self._rnd_gamma = rnd_gamma
        self._running_int_rew_stds = [1 for _ in range(num_goals)]
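        # RND networks, if provided, come as one (target, predictor) pair per goal.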
        if rnd_networks:
            assert len(rnd_networks) == num_goals
            self._rnd_targets = [rnd_network_pair[0] for rnd_network_pair in rnd_networks]
            self._rnd_predictors = [rnd_network_pair[1] for rnd_network_pair in rnd_networks]
        else:
            # RND reward coefficients must be 0 if there are no RND networks passed in
            for i in range(len(self._rnd_int_rew_coeffs)):
                self._rnd_int_rew_coeffs[i] = 0
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



MTRF/algorithms/softlearning/algorithms/phased_sac.py [69:169]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
            **kwargs,
    ):
        """
        Args:
            env (`SoftlearningEnv`): Environment used for training.
            policy: A policy function approximator.
            initial_exploration_policy: ('Policy'): A policy that we use
                for initial exploration which is not trained by the algorithm.
            Qs: Q-function approximators. The min of these
                approximators will be used. Usage of at least two Q-functions
                improves performance by reducing overestimation bias.
            pool (`PoolBase`): Replay pool to add gathered samples to.
            plotter (`QFPolicyPlotter`): Plotter instance to be used for
                visualizing Q-function during training.
            lr (`float`): Learning rate used for the function approximators.
            discount (`float`): Discount factor for Q-function updates.
            tau (`float`): Soft value function target update weight.
            target_update_interval ('int'): Frequency at which target network
                updates occur in iterations.
            reparameterize ('bool'): If True, we use a gradient estimator for
                the policy derived using the reparameterization trick. We use
                a likelihood ratio based estimator otherwise.
        """

        super(SAC, self).__init__(**kwargs)

        self._training_environment = training_environment
        self._evaluation_environment = evaluation_environment
        assert hasattr(self._training_environment, "set_goal"), (
            "Need to implement `set_goal(goal_index)` method in training environment.")

        self._policies = policies
        self._Qs_per_policy = Qs_per_policy
        self._samplers = samplers
        self._pools = pools

        self._num_goals = num_goals
        self._goal_index = 0
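        # One policy is trained per goal, so the per-epoch length and total
        # epoch count are scaled by the number of goals.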
        self._epoch_length *= num_goals
        self._n_epochs *= num_goals

        error_msg = 'Mismatch between number of policies, Qs, and samplers'
        assert len(self._policies) == num_goals, error_msg
        assert len(self._Qs_per_policy) == num_goals, error_msg
        assert len(self._samplers) == num_goals, error_msg

        self._Q_targets_per_policy = Q_targets_per_policy
        self._training_ops_per_policy = [{} for _ in range(num_goals)]

        self._policy_lr = lr
        self._Q_lr = lr

        self._reward_scale = reward_scale
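        # 'auto' applies the standard SAC heuristic: target entropy equal to
        # the negative dimensionality of the action space.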
        self._target_entropy = (
            -np.prod(self._training_environment.action_space.shape)
            if target_entropy == 'auto'
            else target_entropy)

        self._discount = discount
        self._tau = tau
        self._target_update_interval = target_update_interval
        self._action_prior = action_prior

        self._reparameterize = reparameterize
        self._plotter = plotter

        self._her_iters = her_iters
        self._base_env = training_environment.unwrapped

        self._save_full_state = save_full_state
        self._save_eval_paths = save_eval_paths

        self._n_episodes_elapsed = 0
        self._num_grad_steps_taken_per_policy = [0 for _ in range(self._num_goals)]

        self._normalize_ext_reward_gamma = normalize_ext_reward_gamma

        if ext_reward_coeffs:
            assert len(ext_reward_coeffs) == num_goals, (
                "Mismatch between number of goals and provided extrinsic reward coeffs")
            self._ext_reward_coeffs = ext_reward_coeffs
        else:
            self._ext_reward_coeffs = [1 for _ in range(num_goals)]
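        # Per-goal running standard deviations used for extrinsic reward
        # normalization, initialized to 1.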
        self._running_ext_rew_stds = [1 for _ in range(num_goals)]

        self._rnd_targets, self._rnd_predictors = None, None
        assert len(rnd_int_rew_coeffs) == num_goals, (
            "Mismatch between number of goals and provided intrinsic reward coeffs")
        self._rnd_int_rew_coeffs = rnd_int_rew_coeffs
        self._rnd_lr = rnd_lr
        self._rnd_gamma = rnd_gamma
        self._running_int_rew_stds = [1 for _ in range(num_goals)]
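        # RND networks, if provided, come as one (target, predictor) pair per goal.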
        if rnd_networks:
            assert len(rnd_networks) == num_goals
            self._rnd_targets = [rnd_network_pair[0] for rnd_network_pair in rnd_networks]
            self._rnd_predictors = [rnd_network_pair[1] for rnd_network_pair in rnd_networks]
        else:
            # RND reward coefficients must be 0 if there are no RND networks passed in
            for i in range(len(self._rnd_int_rew_coeffs)):
                self._rnd_int_rew_coeffs[i] = 0
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
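
The RND setup at the end of both constructors is identical. The following self-contained sketch restates what that shared block computes, purely for illustration; the function name, signature, and return tuple are hypothetical and do not exist in the repository.

from typing import Any, List, Optional, Sequence, Tuple


def init_rnd_settings(
        rnd_int_rew_coeffs: List[float],
        num_goals: int,
        rnd_networks: Sequence[Tuple[Any, Any]] = (),
) -> Tuple[Optional[List[Any]], Optional[List[Any]], List[float], List[float]]:
    """Illustrative restatement of the duplicated RND initialization above."""
    assert len(rnd_int_rew_coeffs) == num_goals, (
        "Mismatch between number of goals and provided intrinsic reward coeffs")
    running_int_rew_stds = [1 for _ in range(num_goals)]
    if rnd_networks:
        # One (target, predictor) network pair is expected per goal.
        assert len(rnd_networks) == num_goals
        rnd_targets = [pair[0] for pair in rnd_networks]
        rnd_predictors = [pair[1] for pair in rnd_networks]
    else:
        # Without RND networks, the intrinsic reward coefficients are zeroed.
        rnd_targets, rnd_predictors = None, None
        rnd_int_rew_coeffs = [0 for _ in rnd_int_rew_coeffs]
    return rnd_targets, rnd_predictors, rnd_int_rew_coeffs, running_int_rew_stds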



