in tf_agents/bandits/policies/boltzmann_reward_prediction_policy.py
def __init__(self,
time_step_spec: types.TimeStep,
action_spec: types.NestedTensorSpec,
reward_network: types.Network,
temperature: types.FloatOrReturningFloat = 1.0,
boltzmann_gumbel_exploration_constant: Optional[
types.Float] = None,
observation_and_action_constraint_splitter: Optional[
types.Splitter] = None,
accepts_per_arm_features: bool = False,
constraints: Tuple[constr.NeuralConstraint, ...] = (),
emit_policy_info: Tuple[Text, ...] = (),
num_samples_list: Sequence[tf.Variable] = (),
name: Optional[Text] = None):
"""Builds a BoltzmannRewardPredictionPolicy given a reward network.
This policy takes a tf_agents.Network predicting rewards and chooses an
action with weighted probabilities (i.e., using a softmax over the network
estimates of value for each action).

Args:
time_step_spec: A `TimeStep` spec of the expected time_steps.
action_spec: A nest of BoundedTensorSpec representing the actions.
reward_network: An instance of a `tf_agents.networks.Network`,
callable via `network(observation, step_type) -> (output, final_state)`.
temperature: float or callable that returns a float. The temperature used
in the Boltzmann exploration.
boltzmann_gumbel_exploration_constant: optional positive float. When
provided, the policy implements Neural Bandit with Boltzmann-Gumbel
exploration from the paper:
N. Cesa-Bianchi et al., "Boltzmann Exploration Done Right", NIPS 2017.
observation_and_action_constraint_splitter: A function used for masking
valid/invalid actions with each state of the environment. The function
takes in a full observation and returns a tuple consisting of 1) the
part of the observation intended as input to the network and 2) the
mask. The mask should be a 0-1 `Tensor` of shape
`[batch_size, num_actions]`. This function should also work with a
`TensorSpec` as input, and should output `TensorSpec` objects for the
observation and mask.
accepts_per_arm_features: (bool) Whether the policy accepts per-arm
features.
constraints: iterable of constraint objects that are instances of
`tf_agents.bandits.policies.constraints.NeuralConstraint`.
emit_policy_info: (tuple of strings) The side information to emit as part
of the policy info. Allowed values can be found in
`policy_utilities.PolicyInfo`.
num_samples_list: list or tuple of `tf.Variable` objects holding per-action
sample counts. Used only in Boltzmann-Gumbel exploration; otherwise,
empty.
name: The name of this policy. All variables in this module will fall
under that name. Defaults to the class name.

Raises:
NotImplementedError: If `action_spec` contains more than one
`BoundedTensorSpec` or the `BoundedTensorSpec` is not valid.
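
Example:
A minimal construction sketch; the network, specs, and splitter below are
illustrative placeholders chosen for this example, not part of this module:

  import tensorflow as tf
  from tf_agents.networks import q_network
  from tf_agents.specs import tensor_spec
  from tf_agents.trajectories import time_step as ts

  num_actions = 3
  # Observation: 4 context features plus a [num_actions] 0/1 action mask.
  obs_spec = (tensor_spec.TensorSpec([4], tf.float32),
              tensor_spec.TensorSpec([num_actions], tf.int32))
  action_spec = tensor_spec.BoundedTensorSpec(
      (), tf.int32, minimum=0, maximum=num_actions - 1)

  def splitter(obs):
    return obs[0], obs[1]  # (network input, action mask)

  reward_net = q_network.QNetwork(
      input_tensor_spec=obs_spec[0], action_spec=action_spec)
  policy = BoltzmannRewardPredictionPolicy(
      ts.time_step_spec(obs_spec), action_spec, reward_net,
      temperature=10.0,
      observation_and_action_constraint_splitter=splitter)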
"""
policy_utilities.check_no_mask_with_arm_features(
accepts_per_arm_features, observation_and_action_constraint_splitter)
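# The policy supports a single bounded, discrete, single-element action spec;
# validate that before reading its bounds below.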
flat_action_spec = tf.nest.flatten(action_spec)
if len(flat_action_spec) > 1:
raise NotImplementedError(
'action_spec can only contain a single BoundedTensorSpec.')
self._temperature = temperature
action_spec = flat_action_spec[0]
if (not tensor_spec.is_bounded(action_spec) or
not tensor_spec.is_discrete(action_spec) or
action_spec.shape.rank > 1 or
action_spec.shape.num_elements() != 1):
raise NotImplementedError(
'action_spec must be a bounded, discrete TensorSpec containing a single '
'element. Found {}.'.format(action_spec))
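# The number of actions is inferred from the inclusive action bounds; a
# non-zero minimum is kept as an offset that is added back to the chosen
# arm index when actions are emitted.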
self._expected_num_actions = action_spec.maximum - action_spec.minimum + 1
self._action_offset = action_spec.minimum
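# Create the reward network's variables up front so the policy can be used
# without a prior forward pass.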
reward_network.create_variables()
self._reward_network = reward_network
self._constraints = constraints
self._boltzmann_gumbel_exploration_constant = (
boltzmann_gumbel_exploration_constant)
self._num_samples_list = num_samples_list
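# Boltzmann-Gumbel exploration (Cesa-Bianchi et al., 2017) perturbs each
# action's predicted reward with Gumbel noise scaled by C / sqrt(N_a), where
# C is the exploration constant and N_a is the per-action sample count
# tracked in `num_samples_list`; the checks below validate the inputs this
# scheme requires.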
if self._boltzmann_gumbel_exploration_constant is not None:
if self._boltzmann_gumbel_exploration_constant <= 0.0:
raise ValueError(
'The Boltzmann-Gumbel exploration constant is expected to be '
'positive. Found: {}.'.format(
self._boltzmann_gumbel_exploration_constant))
if self._action_offset > 0:
raise NotImplementedError(
'Action offset is not supported when Boltzmann-Gumbel exploration '
'is enabled.')
if accepts_per_arm_features:
raise NotImplementedError(
'Boltzmann-Gumbel exploration is not supported for the per-arm '
'features case.')
if len(self._num_samples_list) != self._expected_num_actions:
raise ValueError(
'Size of num_samples_list: {} does not match the expected number of '
'actions: {}.'.format(
len(self._num_samples_list), self._expected_num_actions))
self._emit_policy_info = emit_policy_info
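# Assemble the policy info spec; optional fields default to empty tuples and
# are given real specs only when requested via `emit_policy_info`.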
predicted_rewards_mean = ()
if policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN in emit_policy_info:
predicted_rewards_mean = tensor_spec.TensorSpec(
[self._expected_num_actions])
bandit_policy_type = ()
if policy_utilities.InfoFields.BANDIT_POLICY_TYPE in emit_policy_info:
bandit_policy_type = (
policy_utilities.create_bandit_policy_type_tensor_spec(shape=[1]))
if accepts_per_arm_features:
# The features of the chosen arm are saved to policy_info.
chosen_arm_features_info = (
policy_utilities.create_chosen_arm_features_info_spec(
time_step_spec.observation))
info_spec = policy_utilities.PerArmPolicyInfo(
predicted_rewards_mean=predicted_rewards_mean,
bandit_policy_type=bandit_policy_type,
chosen_arm_features=chosen_arm_features_info)
else:
info_spec = policy_utilities.PolicyInfo(
predicted_rewards_mean=predicted_rewards_mean,
bandit_policy_type=bandit_policy_type)
self._accepts_per_arm_features = accepts_per_arm_features
super(BoltzmannRewardPredictionPolicy, self).__init__(
time_step_spec, action_spec,
policy_state_spec=reward_network.state_spec,
clip=False,
info_spec=info_spec,
emit_log_probability='log_probability' in emit_policy_info,
observation_and_action_constraint_splitter=(
observation_and_action_constraint_splitter),
name=name)