in lingvo/core/base_model.py [0:0]
def Params(cls):
p = super().Params()
p.Define('input', None, 'Input generator Params.')
p.Define('encoder', None, 'Encoder Params.')
p.Define('online_encoder', None, 'Online Encoder Params.')
p.Define('decoder', None, 'Decoder Params.')
p.Define(
'task_global_step', False,
'Whether or not to use task-specific global steps, which causes each '
'task to use its own global_step instead of the true global_step. '
'NOTE: this may be severely broken. Verify carefully!')
p.Define(
'defer_global_step_update', False,
'Whether or not to defer the global step update. This is used when '
'doing gradient accumulation, which updates the global step only when '
'weights are updated. Currently only the true global step is supported.')
p.Define('train', hyperparams.Params(),
'Params to control how this task should be trained.')
p.Define('ml_perf', hyperparams.Params(), 'MlPerf configuration.')
tp = p.train
tp.Define(
'start_up_delay_steps', 200, 'The i-th replica starts training after '
'i*(i+1)/2*start_up_delay_steps steps.')
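# Worked example of the delay formula above (illustrative only): with
# start_up_delay_steps=200, replica 0 starts immediately, replica 1 after
# 1*2/2*200 = 200 steps, replica 2 after 2*3/2*200 = 600 steps, and
# replica 3 after 3*4/2*200 = 1200 steps.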
tp.Define('max_steps', 4 * 10**6, 'Maximum number of training steps.')
tp.Define(
'tpu_steps_per_loop', 1000, 'The number of training steps per '
'training loop for TPUs. Note that this is not used by '
'ExecutorTpu, which relies on ProgramSchedule.')
tp.Define(
'tpu_device_order_mode', None,
'A device_assignment_lib.DeviceOrderMode enum that determines whether '
'to assign devices so that the order of replicas or model-parallel '
'cores forms a ring or mesh, or to let the library choose. Defaults to '
'None (AUTO).')
tp.Define(
'tpu_computation_shape', None,
'A 4-element list that describes how virtual cores (which we specify '
'in the TF computation) should be mapped to one or more logical cores.')
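# Hedged example (assuming the TF device_assignment convention that the four
# elements are [x, y, z, core]): [1, 1, 1, 1] maps each virtual core to a
# single logical core, while [1, 1, 1, 2] spans both cores of a chip, e.g.
# for model parallelism.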
tp.Define(
'vn_start_step', 200000000,
'Step starting from which variational noise is added to '
'parameter values during training.')
tp.Define('vn_std', 0.0, 'Std of the variational noise.')
tp.Define('early_stop', early_stop.EarlyStop.Params(),
'Early stopping based on dev-set performance.')
tp.Define(
'ema_decay', 0.0,
'If > 0, enable ExponentialMovingAverage during training '
'with the given decay. '
'Must be < 1. Disabled if <= 0. '
'Note that TPU embedding does not support EMA, so if the two are used '
'together the model will contain a mix of EMA and non-EMA variables '
'and quality may be affected; use them together at your own risk.')
tp.Define(
'ema_decay_moving_vars', None,
'If True, include variables from collection "moving_vars" in ema.')
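# Hedged usage sketch (values are illustrative, not recommendations): a model
# config might set
#   p.train.ema_decay = 0.999
#   p.train.ema_decay_moving_vars = True
# to shadow trainable variables (and the "moving_vars" collection, e.g. batch
# norm statistics) with exponential moving averages during training.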
tp.Define(
'init_from_checkpoint_rules', {},
'If not None, expects a dictionary whose keys are checkpoint paths and '
'whose values are variable loading rules. '
'Each key is a path to a checkpoint from which to (re-)initialize part '
'of the model. Variables are only loaded from this path during '
'initialization and override the values produced by the variable '
'initializers defined in the graph.\n'
'Each value (loading_rules) is expected to be a tuple of two lists: '
'loading rules and ignore rules, respectively.\n'
'The first list (loading rules) contains the variables '
'which should be initialized from the checkpoint: each element in the '
'list is a pair of strings. The first element is a regex and the '
'second is a python format string. If a variable in the model matches '
'the regex, we rename it using the format string to determine the '
'corresponding variable in the checkpoint. If a model variable matches '
'multiple loading rules, the first rule that matches is used.\n'
'The second list (ignore rules) is a list of regexes which specify '
'variables in the model which should not be initialized using the '
'loading rules. Thus, if a variable in the model to be trained matches '
'one of the loading rules as well as one of the regular expressions in '
'the ignore rules, the variable will not be initialized from the '
'checkpoint, but will instead be initialized from the variable '
'initializer defined in the graph.\n'
'Example:\n'
'{"checkpoint_path": ([("(.*)", "%s")], [])} initializes all the '
'model parameters from checkpoint_path.')
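# Hedged example (checkpoint path and variable names are hypothetical): load
# only encoder variables from a warm-start checkpoint, keeping their names,
# while leaving the encoder embedding on its graph initializer:
#   p.train.init_from_checkpoint_rules = {
#       '/path/to/warmstart/ckpt-100000': (
#           [('encoder/(.*)', 'encoder/%s')],  # loading rules: (regex, format)
#           ['encoder/emb.*'],                 # ignore rules: regexes to skip
#       ),
#   }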
tp.Define(
'init_from_checkpoint_override', None,
'If set, overrides the key in init_from_checkpoint_rules with this '
'value. Once set, only one key is expected in '
'init_from_checkpoint_rules. This makes param overrides easier '
'when using --model_params_override or in xm. '
'Default is None, which overrides nothing; if set to an empty string, '
'the defined init_from_checkpoint_rules will be ignored.')
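# Hedged example (paths are hypothetical): if init_from_checkpoint_rules has
# a single entry keyed by '/old/ckpt', setting
# init_from_checkpoint_override='/new/ckpt' (e.g. via --model_params_override)
# reuses the same loading and ignore rules but reads variables from
# '/new/ckpt' instead.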
tp.Define(
'pruning_hparams_dict', None, 'Pruning-related hyperparameters. A dict '
'with hyperparameter: value pairs. See google-research.model_pruning.')
tp.Define(
'enqueue_max_steps', -1, 'Max enqueue steps. -1 means no limit.'
' This flag should be set only for unit tests.')
tp.Define('save_interval_seconds', 60 * 10,
'Generates a checkpoint roughly once every this many seconds.')
tp.Define(
'save_interval_steps', None,
'Generates a checkpoint roughly once every this many training '
'steps. Supersedes save_interval_seconds if not None.')
tp.Define('save_max_to_keep', 100,
'Maximum number of recent checkpoints to keep.')
tp.Define('save_keep_checkpoint_every_n_hours', 0.5,
'How often, in hours, to permanently keep a checkpoint.')
tp.Define('async_checkpointing', False,
'Checkpoint asynchronously. Currently only supported by the executor.')
tp.Define(
'checkpoint_finite_check', False,
'Whether to sanity-check that variables are finite when saving '
'checkpoints. Currently only supported by the custom saver.')
tp.Define(
'keep_per_example_loss', False,
'If True, checks if per-example metrics contain a key named \'loss\', '
'and if so copies it to the main metrics dictionary under key '
'\'per_example_loss\'.')
tp.Define('summary_interval_steps', 100,
'Generates a summary roughly once every this many steps.')
# The following params must mirror those in Learner.Params().
# TODO(rpang): migrate existing params to use learner and
# delete legacy params.
# LINT.IfChange
tp.Define(
'learner', None, 'One or a list of optimization programs. '
'If None, uses a Learner created from the legacy params '
'defined below: learning_rate, lr_schedule, optimizer, etc.')
tp.Define(
'l2_regularizer_weight', None,
'If not None, L2 regularization to apply to the weights. '
'Otherwise, disable L2 regularization.')
tp.Define(
'l1_regularizer_weight', None,
'If not None, L1 regularization to apply to the weights. '
'Otherwise, disable L1 regularization.')
tp.Define('learning_rate', 0.0, 'Learning rate to use.')
tp.Define(
'clip_gradient_norm_to_value', 0.0,
'Clip gradients by global norm to this value. This is similar to '
'the behaviour of tf.clip_by_global_norm; if you are looking for '
'tf.clip_by_norm, refer to clip_gradient_single_norm_to_value. Note '
'that these are mutually exclusive.')
tp.Define(
'clip_gradient_single_norm_to_value', 0.0,
'Clip gradients by single-tensor norm to this value. This is '
'similar to the behaviour of tf.clip_by_norm. Note this is mutually '
'exclusive with clip_gradient_norm_to_value.')
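# Illustrative arithmetic (assuming standard tf.clip_by_global_norm
# semantics): with clip_gradient_norm_to_value=5.0 and a global gradient
# norm of 10.0, every gradient is scaled by 5.0 / 10.0 = 0.5; if the global
# norm is already <= 5.0, gradients are left unchanged.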
tp.Define('grad_norm_to_clip_to_zero', 0.0,
'Clip gradient to 0 if its norm exceeds this value.')
tp.Define('grad_norm_tracker', None, 'Params for GradNormTracker.')
tp.Define('optimizer', optimizer.Adam.Params(), 'Params for the optimizer.')
tp.Define('lr_schedule', schedule.ContinuousSchedule.Params(),
'Learning rate decay schedule.')
tp.Define(
'bprop_variable_filter', None,
'If set, only backprop variables whose names partially match '
'this regexp (re.search).')
tp.Define(
'bprop_variable_exclusion', None,
'If set, do not backprop variables whose names partially match '
'this regexp (re.search).')
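# Hedged example (variable names are hypothetical): setting
#   p.train.bprop_variable_filter = 'decoder'
# restricts backprop to variables whose names re.search-match 'decoder',
# while additionally setting
#   p.train.bprop_variable_exclusion = 'decoder/emb'
# freezes the decoder embedding even though it matches the filter.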
tp.Define(
'grad_aggregation_method', tf.AggregationMethod.EXPERIMENTAL_TREE,
'Specifies the method used to combine gradient terms. Accepted '
'values are constants defined in the class AggregationMethod.')
tp.Define(
'gate_gradients', False,
'If True, add a tuple around the gradients returned for an '
'operation. This avoids some race conditions.')
tp.Define('colocate_gradients_with_ops', True,
'If True, try colocating gradients with the corresponding op.')
tp.Define('scale_gradients', True,
'Whether to apply gradients adjustment and scaling.')
tp.Define(
'learner_use_variable_scope', True,
'Create children of learner in tf.variable_scope. This may need '
'to be set to False for compatibility with the existing '
'checkpoints trained from legacy code. New models should always '
'set this to True.')
# LINT.ThenChange(learner.py)
p.Define('eval', hyperparams.Params(),
'Params to control how this task should be evaluated.')
ep = p.eval
ep.Define(
'samples_per_summary', 1000,
'If > 0, generates one summary after this many samples, at most. '
'If == 0 or the dataset has fewer examples, evaluate the whole set.')
ep.Define(
'decoder_samples_per_summary', None,
'If > 0, each decoder summary will contain at most this many samples. '
'If None, defaults to the actual value of `p.eval.samples_per_summary` '
'for backwards compatibility.')
ep.Define(
'load_checkpoint_from', '',
'If not empty, specifies a location for the checkpoint that '
'should be used for eval. One example format is a '
'checkpoint directory of a training run.')
ep.Define('start_eval_after', 0,
'Start evaluation after specified number of steps.')
ep.Define('start_decoder_after', 0,
'Only decode checkpoints after this step.')
ep.Define(
'eval_all_checkpoints', False,
'Compute evaluation metrics for every checkpoint saved by the Trainer.')
ep.Define(
'decode_all_checkpoints', False,
'Compute decoder metrics for every checkpoint saved by the Trainer.')
return p