in tf_agents/bandits/agents/neural_linucb_agent.py [0:0]
def __init__(
self,
time_step_spec: types.TimeStep,
action_spec: types.BoundedTensorSpec,
encoding_network: types.Network,
encoding_network_num_train_steps: int,
encoding_dim: int,
optimizer: types.Optimizer,
variable_collection: Optional[NeuralLinUCBVariableCollection] = None,
alpha: float = 1.0,
gamma: float = 1.0,
epsilon_greedy: float = 0.0,
observation_and_action_constraint_splitter: Optional[
types.Splitter] = None,
accepts_per_arm_features: bool = False,
distributed_train_encoding_network: bool = False,
# Params for training.
error_loss_fn: types.LossFn = tf.compat.v1.losses.mean_squared_error,
gradient_clipping: Optional[float] = None,
# Params for debugging.
debug_summaries: bool = False,
summarize_grads_and_vars: bool = False,
train_step_counter: Optional[tf.Variable] = None,
emit_policy_info: Sequence[Text] = (),
emit_log_probability: bool = False,
dtype: tf.DType = tf.float64,
name: Optional[Text] = 'neural_linucb_agent'):
"""Initialize an instance of `NeuralLinUCBAgent`.
Args:
time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
describing the number of actions for this agent.
encoding_network: a Keras network that encodes the observations.
encoding_network_num_train_steps: how many training steps to run for
training the encoding network before switching to LinUCB. If negative,
the encoding network is assumed to be already trained.
encoding_dim: the dimension of encoded observations.
optimizer: The optimizer to use for training.
variable_collection: Instance of `NeuralLinUCBVariableCollection`.
Collection of variables to be updated by the agent. If `None`, a new
instance of `NeuralLinUCBVariableCollection` will be created. Note that
this collection excludes the variables owned by the encoding network.
alpha: (float) positive scalar. This is the exploration parameter that
scales the width of the confidence intervals: larger values of `alpha`
lead to more exploration.
gamma: a float forgetting factor in [0.0, 1.0]. When set to
1.0, the algorithm does not forget.
epsilon_greedy: A float representing the probability of choosing a random
action instead of the greedy action.
observation_and_action_constraint_splitter: A function used for masking
valid/invalid actions with each state of the environment. The function
takes in a full observation and returns a tuple consisting of 1) the
part of the observation intended as input to the bandit agent and
policy, and 2) the boolean mask. This function should also work with a
`TensorSpec` as input, and should output `TensorSpec` objects for the
observation and mask. See the splitter sketch after this argument list.
accepts_per_arm_features: (bool) Whether the policy accepts per-arm
features.
distributed_train_encoding_network: (bool) Whether to train the encoding
network. This applies only in the distributed training setting. When set
to `True`, this agent trains the encoding network; otherwise, it assumes
the encoding network is already trained and trains LinUCB on top of it.
error_loss_fn: A function for computing the error loss, taking parameters
labels, predictions, and weights (any function from tf.losses would
work). The default is `tf.losses.mean_squared_error`.
gradient_clipping: A float representing the norm length to clip gradients
(or None for no clipping.)
debug_summaries: A Python bool, default False. When True, debug summaries
are gathered.
summarize_grads_and_vars: A Python bool, default False. When True,
gradients and network variable summaries are written during training.
train_step_counter: An optional `tf.Variable` to increment every time the
train op is run. Defaults to the `global_step`.
emit_policy_info: (tuple of strings) what side information we want to get
as part of the policy info. Allowed values can be found in
`policy_utilities.PolicyInfo`.
emit_log_probability: Whether the NeuralLinUCBPolicy emits
log-probabilities or not. Since the policy is deterministic, the
probability is just 1.
dtype: The type of the parameters stored and updated by the agent. Should
be one of `tf.float32` and `tf.float64`. Defaults to `tf.float64`.
name: a name for this instance of `NeuralLinUCBAgent`.
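
As an illustration, a (hypothetical) splitter for an environment whose
observation is a dict with an 'observation' entry and a boolean 'mask'
entry could look like the sketch below; the dict keys are assumptions made
for this example, not requirements of the agent:

  def observation_and_action_constraint_splitter(obs):
    # Returns (observation passed to the agent/policy, boolean action mask).
    return obs['observation'], obs['mask']
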
Raises:
TypeError: If `variable_collection` is not an instance of
`NeuralLinUCBVariableCollection`.
ValueError: If `gamma` is not in [0.0, 1.0], or if `dtype` is not one of
`tf.float32` or `tf.float64`.
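
Example usage (a minimal sketch; the observation shape, network layers,
number of actions, training step count and learning rate below are
placeholder choices for illustration, not defaults of this agent):

  import tensorflow as tf
  from tf_agents.bandits.agents import neural_linucb_agent
  from tf_agents.networks import encoding_network
  from tf_agents.specs import tensor_spec
  from tf_agents.trajectories import time_step as ts

  observation_spec = tf.TensorSpec(shape=[10], dtype=tf.float32)
  encoder = encoding_network.EncodingNetwork(
      input_tensor_spec=observation_spec,
      fc_layer_params=(32, 16))
  agent = neural_linucb_agent.NeuralLinUCBAgent(
      time_step_spec=ts.time_step_spec(observation_spec),
      action_spec=tensor_spec.BoundedTensorSpec(
          shape=(), dtype=tf.int32, minimum=0, maximum=4),
      encoding_network=encoder,
      encoding_network_num_train_steps=1000,
      encoding_dim=16,  # Must match the last layer of `encoder`.
      optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.001))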
"""
tf.Module.__init__(self, name=name)
common.tf_agents_gauge.get_cell('TFABandit').set(True)
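# With per-arm features a single reward model is shared across all arms;
# otherwise the agent keeps one model per action.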
self._num_actions = policy_utilities.get_num_actions_from_tensor_spec(
action_spec)
self._num_models = 1 if accepts_per_arm_features else self._num_actions
self._observation_and_action_constraint_splitter = (
observation_and_action_constraint_splitter)
self._accepts_per_arm_features = accepts_per_arm_features
self._alpha = alpha
if variable_collection is None:
variable_collection = NeuralLinUCBVariableCollection(
self._num_models, encoding_dim, dtype)
elif not isinstance(variable_collection, NeuralLinUCBVariableCollection):
raise TypeError('Parameter `variable_collection` should be '
'of type `NeuralLinUCBVariableCollection`.')
self._variable_collection = variable_collection
self._gamma = gamma
if self._gamma < 0.0 or self._gamma > 1.0:
raise ValueError('Forgetting factor `gamma` must be in [0.0, 1.0].')
self._dtype = dtype
if dtype not in (tf.float32, tf.float64):
raise ValueError(
'Agent dtype should be either `tf.float32` or `tf.float64`.')
self._epsilon_greedy = epsilon_greedy
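# Linear layer on top of the encoded observations that predicts a reward
# per model. The policy uses it while the encoding network is still being
# trained, before switching to LinUCB.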
reward_layer = tf.keras.layers.Dense(
self._num_models,
kernel_initializer=tf.random_uniform_initializer(
minval=-0.03, maxval=0.03),
use_bias=False,
activation=None,
name='reward_layer')
encoding_network.create_variables()
self._encoding_network = encoding_network
reward_layer.build(input_shape=tf.TensorShape([None, encoding_dim]))
self._reward_layer = reward_layer
self._encoding_network_num_train_steps = encoding_network_num_train_steps
self._encoding_dim = encoding_dim
self._optimizer = optimizer
self._error_loss_fn = error_loss_fn
self._gradient_clipping = gradient_clipping
if train_step_counter is None:
  train_step_counter = tf.compat.v1.train.get_or_create_global_step()
self._distributed_train_encoding_network = (
distributed_train_encoding_network)
policy = neural_linucb_policy.NeuralLinUCBPolicy(
encoding_network=self._encoding_network,
encoding_dim=self._encoding_dim,
reward_layer=self._reward_layer,
epsilon_greedy=self._epsilon_greedy,
actions_from_reward_layer=self.actions_from_reward_layer,
cov_matrix=self.cov_matrix,
data_vector=self.data_vector,
num_samples=self.num_samples,
time_step_spec=time_step_spec,
alpha=alpha,
emit_policy_info=emit_policy_info,
emit_log_probability=emit_log_probability,
accepts_per_arm_features=accepts_per_arm_features,
distributed_use_reward_layer=distributed_train_encoding_network,
observation_and_action_constraint_splitter=(
observation_and_action_constraint_splitter))
training_data_spec = None
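# With per-arm features, the per-arm observations are dropped from the
# training data spec; training is expected to read the chosen arm's
# features from the policy info instead.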
if accepts_per_arm_features:
training_data_spec = bandit_spec_utils.drop_arm_observation(
policy.trajectory_spec)
super(NeuralLinUCBAgent, self).__init__(
time_step_spec=time_step_spec,
action_spec=policy.action_spec,
policy=policy,
collect_policy=policy,
train_sequence_length=None,
training_data_spec=training_data_spec,
debug_summaries=debug_summaries,
summarize_grads_and_vars=summarize_grads_and_vars,
train_step_counter=train_step_counter)
self._as_trajectory = data_converter.AsTrajectory(
self.data_context, sequence_length=None)