in lingvo/core/base_model.py [0:0]
def Params(cls):
p = super().Params()
p.Define('input', None, 'Input generator Params.')
p.Define('encoder', None, 'Encoder Params.')
p.Define('online_encoder', None, 'Online Encoder Params.')
p.Define('decoder', None, 'Decoder Params.')
p.Define(
'task_global_step', False,
'Whether or not to use task-specific global steps, which causes each '
'task to use its own global_step instead of the true global_step. '
'NOTE: this may be severely broken. Verify carefully!')
p.Define(
'defer_global_step_update', False,
'Whether or not to defer the global step update. This is used when '
'doing gradient accumulation, which updates the global step only when '
'weights are updated. Currently only the true global step is supported.')
p.Define('train', hyperparams.Params(),
'Params to control how this task should be trained.')
p.Define('ml_perf', hyperparams.Params(), 'MlPerf configuration.')
tp = p.train
tp.Define(
'start_up_delay_steps', 200, 'The i-th replica starts training after '
'i*(i+1)/2*start_up_delay_steps steps.')
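# Worked example of the delay formula above (illustrative only): with
# start_up_delay_steps=200, replica 0 starts immediately, replica 1 after
# 1*2/2*200 = 200 steps, replica 2 after 2*3/2*200 = 600 steps, and
# replica 3 after 3*4/2*200 = 1200 steps.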
tp.Define('max_steps', 4 * 10**6, 'Maximum number of training steps.')
tp.Define(
'tpu_steps_per_loop', 1000, 'The number of training steps per '
'training loop for TPUs. Note that this is not used by '
'ExecutorTpu, which relies on ProgramSchedule.')
tp.Define(
'tpu_device_order_mode', None,
'A device_assignment_lib.DeviceOrderMode enum that determines whether '
'to assign devices so that the order of replicas or model-parallel '
'cores forms a ring or mesh, or to let the library choose. Defaults to '
'None (AUTO).')
tp.Define(
'tpu_computation_shape', None,
'A 4-element list that describes how virtual cores (which we specify '
'in the TF computation) should be mapped to one or more logical cores.')
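# Hedged example (assuming the TF device_assignment convention that the four
# elements are [x, y, z, core]): [1, 1, 1, 1] maps each virtual core to a
# single logical core, while [1, 1, 1, 2] spans both cores of a chip, e.g.
# for model parallelism.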
tp.Define(
'vn_start_step', 200000000,
'Step starting from which variational noise is added to '
'parameter values during training.')
tp.Define('vn_std', 0.0, 'Std of the variational noise.')
tp.Define('early_stop', early_stop.EarlyStop.Params(),
'Early stopping based on dev-set performance.')
tp.Define(
'ema_decay', 0.0,
'If > 0, enable ExponentialMovingAverage during training '
'with the given decay. '
'Must be < 1. Disabled if <= 0. '
'Note that TPU embedding does not support EMA, so if the two are used '
'together the model will contain a mix of EMA and non-EMA variables '
'and quality may be affected; use them together at your own risk.')
tp.Define(
'ema_decay_moving_vars', None,
'If True, include variables from collection "moving_vars" in ema.')
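# Hedged usage sketch (values are illustrative, not recommendations): a model
# config might set
#   p.train.ema_decay = 0.999
#   p.train.ema_decay_moving_vars = True
# to shadow trainable variables (and the "moving_vars" collection, e.g. batch
# norm statistics) with exponential moving averages during training.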
tp.Define(
'init_from_checkpoint_rules', {},
'If not None, expects a dictionary whose keys are checkpoint paths and '
'whose values are variable loading rules. '
'Each key is a path to a checkpoint from which to (re-)initialize part '
'of the model. Variables are only loaded from this path during '
'initialization and override the values produced by the variable '
'initializers defined in the graph.\n'
'Each value (loading_rules) is expected to be a tuple of two lists: '
'loading rules and ignore rules, respectively.\n'
'The first list (loading rules) contains the variables '
'which should be initialized from the checkpoint: each element in the '
'list is a pair of strings. The first element is a regex and the '
'second is a python format string. If a variable in the model matches '
'the regex, we rename it using the format string to determine the '
'corresponding variable in the checkpoint. If a model variable matches '
'multiple loading rules, the first rule that matches is used.\n'
'The second list (ignore rules) is a list of regexes which specify '
'variables in the model which should not be initialized using the '
'loading rules. Thus, if a variable in the model to be trained matches '
'one of the loading rules as well as one of the regular expressions in '
'the ignore rules, the variable will not be initialized from the '
'checkpoint, but will instead be initialized from the variable '
'initializer defined in the graph.\n'
'Example:\n'
'{"checkpoint_path": ([("(.*)", "%s")], [])} initializes all the '
'model parameters from checkpoint_path.')
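# Hedged example (checkpoint path and variable names are hypothetical): load
# only encoder variables from a warm-start checkpoint, keeping their names,
# while leaving the encoder embedding on its graph initializer:
#   p.train.init_from_checkpoint_rules = {
#       '/path/to/warmstart/ckpt-100000': (
#           [('encoder/(.*)', 'encoder/%s')],  # loading rules: (regex, format)
#           ['encoder/emb.*'],                 # ignore rules: regexes to skip
#       ),
#   }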
tp.Define(
'init_from_checkpoint_override', None,
'If set, overrides the key in init_from_checkpoint_rules with this '
'value. Once set, only one key is expected in '
'init_from_checkpoint_rules. This makes param overrides easier '
'when using --model_params_override or in xm. '
'Default is None, which overrides nothing; if set to an empty string, '
'the defined init_from_checkpoint_rules will be ignored.')
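# Hedged example (paths are hypothetical): if init_from_checkpoint_rules has
# a single entry keyed by '/old/ckpt', setting
# init_from_checkpoint_override='/new/ckpt' (e.g. via --model_params_override)
# reuses the same loading and ignore rules but reads variables from
# '/new/ckpt' instead.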
tp.Define(
'pruning_hparams_dict', None, 'Pruning-related hyperparameters. A dict '
'with hyperparameter: value pairs. See google-research.model_pruning.')
tp.Define(
'enqueue_max_steps', -1, 'Max enqueue steps. -1 means no limit.'
' This flag should be set only for unit tests.')
tp.Define('save_interval_seconds', 60 * 10,
'Generates a checkpoint roughly once every this many seconds.')
tp.Define(
'save_interval_steps', None,
'Generates a checkpoint roughly once every this many training '
'steps. Supersedes save_interval_seconds if not None.')
tp.Define('save_max_to_keep', 100,
'Maximum number of recent checkpoints to keep.')
tp.Define('save_keep_checkpoint_every_n_hours', 0.5,
'How often, in hours, to permanently keep a checkpoint.')
tp.Define('async_checkpointing', False,
'Checkpoint asynchronously. Currently only supported by the executor.')
tp.Define(
'checkpoint_finite_check', False,
'Whether to sanity-check that variables are finite when saving '
'checkpoints. Currently only supported by the custom saver.')
tp.Define(
'keep_per_example_loss', False,
'If True, checks if per-example metrics contain a key named \'loss\', '
'and if so copies it to the main metrics dictionary under key '
'\'per_example_loss\'.')
tp.Define('summary_interval_steps', 100,
'Generates a summary roughly once every this many steps.')
# The following params must mirror those in Learner.Params().
# TODO(rpang): migrate existing params to use learner and
# delete legacy params.
# LINT.IfChange
tp.Define(
'learner', None, 'One or a list of optimization programs. '
'If None, uses a Learner created from the legacy params '
'defined below: learning_rate, lr_schedule, optimizer, etc.')
tp.Define(
'l2_regularizer_weight', None,
'If not None, L2 regularization to apply to the weights. '
'Otherwise, disable L2 regularization.')
tp.Define(
'l1_regularizer_weight', None,
'If not None, L1 regularization to apply to the weights. '
'Otherwise, disable L1 regularization.')
tp.Define('learning_rate', 0.0, 'Learning rate to use.')
tp.Define(
'clip_gradient_norm_to_value', 0.0,
'Clip gradients by global norm to this value. This is similar to '
'the behaviour of tf.clip_by_global_norm; if you are looking for '
'tf.clip_by_norm, refer to clip_gradient_single_norm_to_value. Note '
'that these are mutually exclusive.')
tp.Define(
'clip_gradient_single_norm_to_value', 0.0,
'Clip gradients by single-tensor norm to this value. This is '
'similar to the behaviour of tf.clip_by_norm. Note this is mutually '
'exclusive with clip_gradient_norm_to_value.')
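# Illustrative arithmetic (assuming standard tf.clip_by_global_norm
# semantics): with clip_gradient_norm_to_value=5.0 and a global gradient
# norm of 10.0, every gradient is scaled by 5.0 / 10.0 = 0.5; if the global
# norm is already <= 5.0, gradients are left unchanged.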
tp.Define('grad_norm_to_clip_to_zero', 0.0,
'Clip gradient to 0 if its norm exceeds this value.')
tp.Define('grad_norm_tracker', None, 'Params for GradNormTracker.')
tp.Define('optimizer', optimizer.Adam.Params(), 'Params for the optimizer.')
tp.Define('lr_schedule', schedule.ContinuousSchedule.Params(),
'Learning rate decay schedule.')
tp.Define(
'bprop_variable_filter', None,
'If set, only backprop variables whose names partially match '
'this regexp (re.search).')
tp.Define(
'bprop_variable_exclusion', None,
'If set, do not backprop variables whose names partially match '
'this regexp (re.search).')
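# Hedged example (variable names are hypothetical): setting
#   p.train.bprop_variable_filter = 'decoder'
# restricts backprop to variables whose names re.search-match 'decoder',
# while additionally setting
#   p.train.bprop_variable_exclusion = 'decoder/emb'
# freezes the decoder embedding even though it matches the filter.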
tp.Define(
'grad_aggregation_method', tf.AggregationMethod.EXPERIMENTAL_TREE,
'Specifies the method used to combine gradient terms. Accepted '
'values are constants defined in the class AggregationMethod.')
tp.Define(
'gate_gradients', False,
'If True, add a tuple around the gradients returned for an '
'operation. This avoids some race conditions.')
tp.Define('colocate_gradients_with_ops', True,
'If True, try colocating gradients with the corresponding op.')
tp.Define('scale_gradients', True,
'Whether to apply gradients adjustment and scaling.')
tp.Define(
'learner_use_variable_scope', True,
'Create children of learner in tf.variable_scope. This may need '
'to be set to False for compatibility with the existing '
'checkpoints trained from legacy code. New models should always '
'set this to True.')
# LINT.ThenChange(learner.py)
p.Define('eval', hyperparams.Params(),
'Params to control how this task should be evaluated.')
ep = p.eval
ep.Define(
'samples_per_summary', 1000,
'If > 0, generates one summary after this many samples, at most. '
'If == 0 or the dataset has fewer examples, evaluate the whole set.')
ep.Define(
'decoder_samples_per_summary', None,
'If > 0, each decoder summary will contain at most this many samples. '
'If None, defaults to the actual value of `p.eval.samples_per_summary` '
'for backwards compatibility.')
ep.Define(
'load_checkpoint_from', '',
'If not empty, specifies a location for the checkpoint that '
'should be used for eval. One example format is a '
'checkpoint directory of a training run.')
ep.Define('start_eval_after', 0,
'Start evaluation after specified number of steps.')
ep.Define('start_decoder_after', 0,
'Only decode checkpoints after this step.')
ep.Define(
'eval_all_checkpoints', False,
'Compute evaluation metrics for every checkpoint saved by the Trainer.')
ep.Define(
'decode_all_checkpoints', False,
'Compute decoder metrics for every checkpoint saved by the Trainer.')
return p