in sockeye/arguments.py
def add_training_args(params):
train_params = params.add_argument_group("Training parameters")
add_batch_args(train_params)
# TODO(migration): Update after removing MXNet code
train_params.add_argument('--loss',
default=C.CROSS_ENTROPY_WITOUT_SOFTMAX_OUTPUT,
choices=[C.CROSS_ENTROPY, C.CROSS_ENTROPY_WITOUT_SOFTMAX_OUTPUT],
help='Loss to optimize. Default: %(default)s.')
train_params.add_argument('--label-smoothing',
default=0.1,
type=float,
help='Smoothing constant for label smoothing. Default: %(default)s.')
train_params.add_argument('--label-smoothing-impl',
default='mxnet',
choices=['mxnet', 'fairseq', 'torch'],
help='Choose label smoothing implementation. Default: %(default)s. '
'`torch` requires PyTorch 1.10.')
train_params.add_argument('--length-task',
type=str,
default=None,
choices=[C.LENGTH_TASK_RATIO, C.LENGTH_TASK_LENGTH],
help='If specified, adds an auxiliary task during training to predict source/target length ratios '
'(mean squared error loss) or absolute lengths (Poisson loss). Default: %(default)s.')
train_params.add_argument('--length-task-weight',
type=float_greater_or_equal(0.0),
default=1.0,
help='The weight of the auxiliary --length-task loss. Default: %(default)s.')
train_params.add_argument('--length-task-layers',
type=int_greater_or_equal(1),
default=1,
help='Number of fully-connected layers for predicting the length ratio. Default: %(default)s.')
train_params.add_argument('--target-factors-weight',
type=float,
nargs='+',
default=[1.0],
help='Weights of target factor losses. If one value is given, it applies to all '
'secondary target factors. For multiple values, the number of weights given has '
'to match the number of target factors. Default: %(default)s.')
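# Hypothetical usage sketch (values are illustrative, not recommendations): for a model with two
# secondary target factors, `--target-factors-weight 0.5` applies 0.5 to both factor losses, while
# `--target-factors-weight 0.5 0.2` assigns one weight per factor.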
train_params.add_argument('--optimized-metric',
default=C.PERPLEXITY,
choices=C.METRICS,
help='Metric to optimize with early stopping {%(choices)s}. Default: %(default)s.')
train_params.add_argument(C.TRAIN_ARGS_CHECKPOINT_INTERVAL,
type=int_greater_or_equal(1),
default=4000,
help='Checkpoint and evaluate every x updates (update-interval * batches). '
'Default: %(default)s.')
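# Illustrative reading of the interval (hypothetical values, assuming `--update-interval` controls
# gradient accumulation as referenced in the help string above): with `--checkpoint-interval 4000`
# and `--update-interval 2`, a checkpoint is written every 4000 parameter updates, i.e. roughly
# 8000 batches per checkpoint.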
train_params.add_argument('--min-samples',
type=int,
default=None,
help='Minimum number of samples before training can stop. Default: %(default)s.')
train_params.add_argument('--max-samples',
type=int,
default=None,
help='Maximum number of samples. Default: %(default)s.')
train_params.add_argument('--min-updates',
type=int,
default=None,
help='Minimum number of updates before training can stop. Default: %(default)s.')
train_params.add_argument('--max-updates',
type=int,
default=None,
help='Maximum number of updates. Default: %(default)s.')
train_params.add_argument('--max-seconds',
type=int,
default=None,
help='Training will stop on the next checkpoint after reaching the maximum seconds. '
'Default: %(default)s.')
train_params.add_argument('--max-checkpoints',
type=int,
default=None,
help='Maximum number of checkpoints to continue training the model '
'before training is stopped. '
'Default: %(default)s.')
train_params.add_argument('--max-num-checkpoint-not-improved',
type=int,
default=None,
help='Maximum number of checkpoints without improvement in <optimized-metric> on '
'validation data before training is stopped. '
'Default: %(default)s.')
train_params.add_argument('--checkpoint-improvement-threshold',
type=float,
default=0.,
help='Improvement in <optimized-metric> over specified number of checkpoints must exceed '
'this value to be considered actual improvement. Default: %(default)s.')
train_params.add_argument('--min-num-epochs',
type=int,
default=None,
help='Minimum number of epochs (passes through the training data) '
'before training can stop. Default: %(default)s.')
train_params.add_argument('--max-num-epochs',
type=int,
default=None,
help='Maximum number of epochs (passes through the training data). Default: %(default)s.')
train_params.add_argument('--embed-dropout',
type=multiple_values(2, data_type=float),
default=(.0, .0),
help='Dropout probability for source & target embeddings. Use "x:x" to specify separate '
'values. Default: %(default)s.')
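# Hypothetical example of the "x:x" format parsed by multiple_values(2, ...):
# `--embed-dropout 0.3:0.1` uses 0.3 for the source embeddings and 0.1 for the target embeddings.
# The same format applies to the transformer dropout options and --optimizer-betas below.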
train_params.add_argument('--transformer-dropout-attention',
type=multiple_values(2, data_type=float),
default=(0.1, 0.1),
help='Dropout probability for multi-head attention. Use "x:x" to specify separate '
'values for encoder & decoder. Default: %(default)s.')
train_params.add_argument('--transformer-dropout-act',
type=multiple_values(2, data_type=float),
default=(0.1, 0.1),
help='Dropout probability before activation in feed-forward block. Use "x:x" to specify '
'separate values for encoder & decoder. Default: %(default)s.')
train_params.add_argument('--transformer-dropout-prepost',
type=multiple_values(2, data_type=float),
default=(0.1, 0.1),
help='Dropout probability for pre/postprocessing blocks. Use "x:x" to specify separate '
'values for encoder & decoder. Default: %(default)s.')
train_params.add_argument('--optimizer',
default=C.OPTIMIZER_ADAM,
choices=C.OPTIMIZERS,
help='Optimizer to use. Default: %(default)s.')
# TODO(migration): Remove after removing MXNet code
train_params.add_argument('--optimizer-params',
type=simple_dict(),
default=None,
help='Additional optimizer params as dictionary. Format: key1:value1,key2:value2,...')
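# Hypothetical example of the simple_dict() format described in the help string:
# `--optimizer-params key1:0.9,key2:1e-4` is intended to yield a dict along the lines of
# {'key1': 0.9, 'key2': 0.0001}; the key names here are placeholders, not real optimizer options.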
train_params.add_argument('--optimizer-betas',
type=multiple_values(2, data_type=float),
default=(0.9, 0.999),
help='Beta1 and beta2 for Adam-like optimizers, specified "x:x". Default: %(default)s.')
train_params.add_argument('--optimizer-eps',
type=float_greater_or_equal(0),
default=1e-08,
help='Optimizer epsilon. Default: %(default)s.')
# TODO(migration): Remove after removing MXNet code
train_params.add_argument('--horovod',
action='store_true',
help='Use Horovod/MPI for distributed training (Sergeev and Del Balso 2018, '
'arxiv.org/abs/1802.05799). When using this option, run Sockeye with `horovodrun '
'-np X python3 -m sockeye.train` where X is the number of processes. Increasing '
'the number of processes multiplies the effective batch size (ex: batch_size 2560 '
'with `-np 4` gives effective batch size 10240).')
train_params.add_argument('--dist',
action='store_true',
help='Run in distributed training mode. When using this option, launch training with '
'`torchrun --nproc_per_node N -m sockeye.train`. Increasing the number of processes '
'multiplies the effective batch size (ex: batch_size 2560 with `--nproc_per_node 4` '
'gives effective batch size 10240).')
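# Example launch, following the help string above (4 processes; other training arguments omitted):
#   torchrun --nproc_per_node 4 -m sockeye.train --dist ...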
# TODO(migration): Remove after removing MXNet code
train_params.add_argument("--kvstore",
type=str,
default=C.KVSTORE_DEVICE,
choices=C.KVSTORE_TYPES,
help="The MXNet kvstore to use. 'device' is recommended for single process training. "
"Use any of 'dist_sync', 'dist_device_sync' and 'dist_async' for distributed "
"training. Default: %(default)s.")
# TODO(migration): Remove after removing MXNet code
train_params.add_argument('--weight-init',
type=str,
default=C.INIT_XAVIER,
choices=C.INIT_TYPES,
help='Type of base weight initialization. Default: %(default)s.')
# TODO(migration): Remove after removing MXNet code
train_params.add_argument('--weight-init-scale',
type=float,
default=3.0,
help='Weight initialization scale. Applies to uniform (scale) and xavier (magnitude). '
'Default: %(default)s.')
# TODO(migration): Remove after removing MXNet code
train_params.add_argument('--weight-init-xavier-factor-type',
type=str,
default=C.INIT_XAVIER_FACTOR_TYPE_AVG,
choices=C.INIT_XAVIER_FACTOR_TYPES,
help='Xavier factor type. Default: %(default)s.')
# TODO(migration): Remove after removing MXNet code
train_params.add_argument('--weight-init-xavier-rand-type',
type=str,
default=C.RAND_TYPE_UNIFORM,
choices=[C.RAND_TYPE_UNIFORM, C.RAND_TYPE_GAUSSIAN],
help='Xavier random number generator type. Default: %(default)s.')
train_params.add_argument('--initial-learning-rate',
type=float,
default=0.0002,
help='Initial learning rate. Default: %(default)s.')
train_params.add_argument('--weight-decay',
type=float,
default=0.0,
help='Weight decay constant. Default: %(default)s.')
train_params.add_argument('--momentum',
type=float,
default=0.0,
help='Momentum constant. Default: %(default)s.')
train_params.add_argument('--gradient-clipping-threshold',
type=float,
default=1.0,
help='Clip absolute gradient values greater than this value. '
'Set to negative to disable. Default: %(default)s.')
train_params.add_argument('--gradient-clipping-type',
choices=C.GRADIENT_CLIPPING_TYPES,
default=C.GRADIENT_CLIPPING_TYPE_NONE,
help='The type of gradient clipping. Default: %(default)s.')
train_params.add_argument('--learning-rate-scheduler-type',
default=C.LR_SCHEDULER_PLATEAU_REDUCE,
choices=C.LR_SCHEDULERS,
help='Learning rate scheduler type. Default: %(default)s.')
train_params.add_argument('--learning-rate-t-scale',
type=float,
default=1.0,
help="Step number is multiplied by this value when determining learning rate for the "
"current step. Default: %(default)s.")
train_params.add_argument('--learning-rate-reduce-factor',
type=float,
default=0.9,
help="Factor to multiply learning rate with "
"(for 'plateau-reduce' learning rate scheduler). Default: %(default)s.")
train_params.add_argument('--learning-rate-reduce-num-not-improved',
type=int,
default=8,
help="For 'plateau-reduce' learning rate scheduler. Adjust learning rate "
"if <optimized-metric> did not improve for x checkpoints. Default: %(default)s.")
train_params.add_argument('--learning-rate-warmup',
type=int,
default=0,
help="Number of warmup steps. If set to x, linearly increases learning rate from 10%% "
"to 100%% of the initial learning rate. Default: %(default)s.")
train_params.add_argument('--fixed-param-strategy',
default=None,
choices=C.FIXED_PARAM_STRATEGY_CHOICES,
help="Fix various parameters during training using a named strategy. The strategy "
"name indicates which parameters will be fixed (Wuebker et al., 2018). "
"Default: %(default)s.")
train_params.add_argument('--fixed-param-names',
default=[],
nargs='*',
help="Manually specify names of parameters to fix during training. Default: %(default)s.")
train_params.add_argument(C.TRAIN_ARGS_MONITOR_BLEU,
default=500,
type=int,
help='x>0: decode x sampled sentences from validation data and '
'compute evaluation metrics. x==-1: use full validation data. Default: %(default)s.')
# TODO(migration): Remove after removing MXNet code
train_params.add_argument('--decode-and-evaluate-device-id',
default=None,
type=int,
help='Separate device for decoding validation data. '
'Use a negative number to automatically acquire a GPU. '
'Use a positive number to acquire a specific GPU. Default: %(default)s.')
train_params.add_argument(C.TRAIN_ARGS_STOP_ON_DECODER_FAILURE,
action="store_true",
help='Stop training as soon as any checkpoint decoder fails (e.g. because there is not '
'enough GPU memory). Default: %(default)s.')
train_params.add_argument('--seed',
type=int,
default=1,
help='Random seed. Default: %(default)s.')
train_params.add_argument('--keep-last-params',
type=int,
default=-1,
help='Keep only the last n params files, use -1 to keep all files. Default: %(default)s.')
train_params.add_argument('--keep-initializations',
action="store_true",
help='In addition to keeping the last n params files, also keep params from checkpoint 0.')
train_params.add_argument('--cache-last-best-params',
required=False,
type=int,
default=0,
help='Cache the last n best params files, as distinct from the last n in sequence. '
'Use 0 or negative to disable. Default: %(default)s.')
train_params.add_argument('--cache-strategy',
required=False,
type=str,
default=C.AVERAGE_BEST,
choices=C.AVERAGE_CHOICES,
help='Strategy to use when deciding which are the "best" params files. '
'Default: %(default)s.')
train_params.add_argument('--cache-metric',
required=False,
type=str,
default=C.PERPLEXITY,
choices=C.METRICS,
help='Metric to use when deciding which are the "best" params files. '
'Default: %(default)s.')
train_params.add_argument('--dry-run',
action='store_true',
help="Do not perform any actual training, but print statistics about the model"
" and mode of operation.")