def __init__()

in tf_agents/bandits/agents/neural_linucb_agent.py


  def __init__(
      self,
      time_step_spec: types.TimeStep,
      action_spec: types.BoundedTensorSpec,
      encoding_network: types.Network,
      encoding_network_num_train_steps: int,
      encoding_dim: int,
      optimizer: types.Optimizer,
      variable_collection: Optional[NeuralLinUCBVariableCollection] = None,
      alpha: float = 1.0,
      gamma: float = 1.0,
      epsilon_greedy: float = 0.0,
      observation_and_action_constraint_splitter: Optional[
          types.Splitter] = None,
      accepts_per_arm_features: bool = False,
      distributed_train_encoding_network: bool = False,
      # Params for training.
      error_loss_fn: types.LossFn = tf.compat.v1.losses.mean_squared_error,
      gradient_clipping: Optional[float] = None,
      # Params for debugging.
      debug_summaries: bool = False,
      summarize_grads_and_vars: bool = False,
      train_step_counter: Optional[tf.Variable] = None,
      emit_policy_info: Sequence[Text] = (),
      emit_log_probability: bool = False,
      dtype: tf.DType = tf.float64,
      name: Optional[Text] = 'neural_linucb_agent'):
    """Initialize an instance of `NeuralLinUCBAgent`.

    Args:
      time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
      action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
        describing the number of actions for this agent.
      encoding_network: a Keras network that encodes the observations.
      encoding_network_num_train_steps: how many training steps to run for
        training the encoding network before switching to LinUCB. If negative,
        the encoding network is assumed to be already trained.
      encoding_dim: the dimension of encoded observations.
      optimizer: The optimizer to use for training.
      variable_collection: Instance of `NeuralLinUCBVariableCollection`.
        Collection of variables to be updated by the agent. If `None`, a new
        instance of `NeuralLinUCBVariableCollection` will be created. Note that
        this collection excludes the variables owned by the encoding network.
      alpha: (float) positive scalar. This is the exploration parameter that
        multiplies the confidence intervals.
      gamma: a float forgetting factor in [0.0, 1.0]. When set to
        1.0, the algorithm does not forget.
      epsilon_greedy: A float representing the probability of choosing a random
        action instead of the greedy action.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit agent and
        policy, and 2) the boolean mask. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      accepts_per_arm_features: (bool) Whether the policy accepts per-arm
        features.
      distributed_train_encoding_network: (bool) Whether to train the encoding
        network. This applies only in the distributed training setting. When
        set to `True`, this agent will train the encoding network. Otherwise,
        it will assume the encoding network is already trained and will train
        LinUCB on top of it.
      error_loss_fn: A function for computing the error loss, taking parameters
        labels, predictions, and weights (any function from
        `tf.compat.v1.losses` would work). The default is
        `tf.compat.v1.losses.mean_squared_error`.
      gradient_clipping: A float representing the norm length to clip gradients
        (or None for no clipping).
      debug_summaries: A Python bool, default False. When True, debug summaries
        are gathered.
      summarize_grads_and_vars: A Python bool, default False. When True,
        gradients and network variable summaries are written during training.
      train_step_counter: An optional `tf.Variable` to increment every time the
        train op is run.  Defaults to the `global_step`.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      emit_log_probability: Whether the NeuralLinUCBPolicy emits
        log-probabilities or not. Since the policy is deterministic, the
        probability is just 1.
      dtype: The type of the parameters stored and updated by the agent. Should
        be one of `tf.float32` and `tf.float64`. Defaults to `tf.float64`.
      name: a name for this instance of `NeuralLinUCBAgent`.

    Raises:
      TypeError: If `variable_collection` is not an instance of
        `NeuralLinUCBVariableCollection`.
      ValueError: If `gamma` is not in [0.0, 1.0], or if `dtype` is not one of
        `tf.float32` or `tf.float64`.
    """
    tf.Module.__init__(self, name=name)
    common.tf_agents_gauge.get_cell('TFABandit').set(True)
    self._num_actions = policy_utilities.get_num_actions_from_tensor_spec(
        action_spec)
    self._num_models = 1 if accepts_per_arm_features else self._num_actions
    self._observation_and_action_constraint_splitter = (
        observation_and_action_constraint_splitter)
    self._accepts_per_arm_features = accepts_per_arm_features
    self._alpha = alpha
    if variable_collection is None:
      variable_collection = NeuralLinUCBVariableCollection(
          self._num_models, encoding_dim, dtype)
    elif not isinstance(variable_collection, NeuralLinUCBVariableCollection):
      raise TypeError('Parameter `variable_collection` should be '
                      'of type `NeuralLinUCBVariableCollection`.')
    self._variable_collection = variable_collection
    self._gamma = gamma
    if self._gamma < 0.0 or self._gamma > 1.0:
      raise ValueError('Forgetting factor `gamma` must be in [0.0, 1.0].')
    self._dtype = dtype
    if dtype not in (tf.float32, tf.float64):
      raise ValueError(
          'Agent dtype should be either `tf.float32` or `tf.float64`.')
    self._epsilon_greedy = epsilon_greedy

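    # Linear reward head on top of the encoder: one output per model (one per
    # action, or a single shared model when per-arm features are used).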
    reward_layer = tf.keras.layers.Dense(
        self._num_models,
        kernel_initializer=tf.random_uniform_initializer(
            minval=-0.03, maxval=0.03),
        use_bias=False,
        activation=None,
        name='reward_layer')

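    # Build the encoder's and the reward layer's variables up front so the
    # policy created below can capture and reuse them.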
    encoding_network.create_variables()
    self._encoding_network = encoding_network
    reward_layer.build(input_shape=tf.TensorShape([None, encoding_dim]))
    self._reward_layer = reward_layer
    self._encoding_network_num_train_steps = encoding_network_num_train_steps
    self._encoding_dim = encoding_dim
    self._optimizer = optimizer
    self._error_loss_fn = error_loss_fn
    self._gradient_clipping = gradient_clipping
    if train_step_counter is None:
      train_step_counter = tf.compat.v1.train.get_or_create_global_step()
    self._distributed_train_encoding_network = (
        distributed_train_encoding_network)

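    # The policy shares the encoder, the reward layer, and the LinUCB
    # statistics (covariance matrices, data vectors, sample counters) with the
    # agent, and switches between reward-layer and LinUCB action selection
    # based on `actions_from_reward_layer`.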
    policy = neural_linucb_policy.NeuralLinUCBPolicy(
        encoding_network=self._encoding_network,
        encoding_dim=self._encoding_dim,
        reward_layer=self._reward_layer,
        epsilon_greedy=self._epsilon_greedy,
        actions_from_reward_layer=self.actions_from_reward_layer,
        cov_matrix=self.cov_matrix,
        data_vector=self.data_vector,
        num_samples=self.num_samples,
        time_step_spec=time_step_spec,
        alpha=alpha,
        emit_policy_info=emit_policy_info,
        emit_log_probability=emit_log_probability,
        accepts_per_arm_features=accepts_per_arm_features,
        distributed_use_reward_layer=distributed_train_encoding_network,
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter))

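    # With per-arm features, the full per-arm observations are dropped from
    # the training data spec; the chosen arm's features are carried in the
    # policy info instead.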
    training_data_spec = None
    if accepts_per_arm_features:
      training_data_spec = bandit_spec_utils.drop_arm_observation(
          policy.trajectory_spec)
    super(NeuralLinUCBAgent, self).__init__(
        time_step_spec=time_step_spec,
        action_spec=policy.action_spec,
        policy=policy,
        collect_policy=policy,
        train_sequence_length=None,
        training_data_spec=training_data_spec,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=train_step_counter)

    self._as_trajectory = data_converter.AsTrajectory(
        self.data_context, sequence_length=None)
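
Usage sketch (not part of the source above): a minimal construction of the
agent, assuming an `EncodingNetwork` from `tf_agents.networks` and illustrative
specs; the layer sizes, training-step budget, and learning rate below are
arbitrary placeholders, not values taken from the library.

import tensorflow as tf
from tf_agents.bandits.agents import neural_linucb_agent
from tf_agents.networks import encoding_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

# Illustrative problem: 8-dimensional observations, 5 arms.
observation_spec = tensor_spec.TensorSpec(shape=[8], dtype=tf.float32)
action_spec = tensor_spec.BoundedTensorSpec(
    shape=(), dtype=tf.int32, minimum=0, maximum=4)

# Encoder whose final layer width must equal `encoding_dim` below.
encoder = encoding_network.EncodingNetwork(
    input_tensor_spec=observation_spec,
    fc_layer_params=(32, 16))

agent = neural_linucb_agent.NeuralLinUCBAgent(
    time_step_spec=ts.time_step_spec(observation_spec),
    action_spec=action_spec,
    encoding_network=encoder,
    encoding_network_num_train_steps=1000,
    encoding_dim=16,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.01),
    alpha=1.0,
    gamma=1.0,
    dtype=tf.float32)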