def add_training_args()

in sockeye/arguments.py


def add_training_args(params):
    train_params = params.add_argument_group("Training parameters")

    add_batch_args(train_params)

    # TODO(migration): Update after removing MXNet code
    train_params.add_argument('--loss',
                              default=C.CROSS_ENTROPY_WITOUT_SOFTMAX_OUTPUT,
                              choices=[C.CROSS_ENTROPY, C.CROSS_ENTROPY_WITOUT_SOFTMAX_OUTPUT],
                              help='Loss to optimize. Default: %(default)s.')
    train_params.add_argument('--label-smoothing',
                              default=0.1,
                              type=float,
                              help='Smoothing constant for label smoothing. Default: %(default)s.')
    train_params.add_argument('--label-smoothing-impl',
                              default='mxnet',
                              choices=['mxnet', 'fairseq', 'torch'],
                              help='Choose label smoothing implementation. Default: %(default)s. '
                                   '`torch` requires PyTorch 1.10.')
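    # Illustrative note (not in the original source): with smoothing constant eps, a common
    # label-smoothing formulation replaces the one-hot target with
    #     p(gold token)  = 1 - eps
    #     p(other token) = eps / (V - 1)    # V = target vocabulary size
    # e.g. eps=0.1 and V=32000 give 0.9 on the gold token and roughly 3.1e-06 elsewhere. The
    # mxnet/fairseq/torch implementations selected above differ in details such as normalization
    # and padding handling, so treat this as a sketch rather than the exact loss computed here.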

    train_params.add_argument('--length-task',
                              type=str,
                              default=None,
                              choices=[C.LENGTH_TASK_RATIO, C.LENGTH_TASK_LENGTH],
                              help='If specified, adds an auxiliary task during training to predict source/target '
                                   'length ratios (mean squared error loss) or absolute lengths (Poisson loss). '
                                   'Default: %(default)s.')
    train_params.add_argument('--length-task-weight',
                              type=float_greater_or_equal(0.0),
                              default=1.0,
                              help='The weight of the auxiliary --length-task loss. Default: %(default)s.')
    train_params.add_argument('--length-task-layers',
                              type=int_greater_or_equal(1),
                              default=1,
                              help='Number of fully-connected layers for predicting the length ratio. Default: %(default)s.')
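    # Illustrative CLI sketch (flag values are examples; assumes C.LENGTH_TASK_RATIO == 'ratio'):
    # train with a down-weighted auxiliary length-ratio loss and a 2-layer predictor:
    #     python3 -m sockeye.train ... --length-task ratio --length-task-weight 0.5 --length-task-layers 2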

    train_params.add_argument('--target-factors-weight',
                              type=float,
                              nargs='+',
                              default=[1.0],
                              help='Weights of target factor losses. If one value is given, it applies to all '
                                   'secondary target factors. For multiple values, the number of weights given has '
                                   'to match the number of target factors. Default: %(default)s.')
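    # Illustrative note: with two secondary target factors, `--target-factors-weight 1.0` applies
    # weight 1.0 to both factor losses, while `--target-factors-weight 0.5 0.2` assigns 0.5 to
    # the first factor and 0.2 to the second (the count must then match the number of factors).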

    train_params.add_argument('--optimized-metric',
                              default=C.PERPLEXITY,
                              choices=C.METRICS,
                              help='Metric to optimize with early stopping {%(choices)s}. Default: %(default)s.')

    train_params.add_argument(C.TRAIN_ARGS_CHECKPOINT_INTERVAL,
                              type=int_greater_or_equal(1),
                              default=4000,
                              help='Checkpoint and evaluate every x updates (update-interval * batches). '
                                   'Default: %(default)s.')
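    # Illustrative note: with the default of 4000, a checkpoint is written and validation metrics
    # are computed every 4000 optimizer updates; when gradient accumulation is used, one update
    # spans update-interval batches (per the help text above), i.e. 4000 * update-interval
    # batches per checkpoint.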

    train_params.add_argument('--min-samples',
                              type=int,
                              default=None,
                              help='Minimum number of samples before training can stop. Default: %(default)s.')
    train_params.add_argument('--max-samples',
                              type=int,
                              default=None,
                              help='Maximum number of samples. Default: %(default)s.')
    train_params.add_argument('--min-updates',
                              type=int,
                              default=None,
                              help='Minimum number of updates before training can stop. Default: %(default)s.')
    train_params.add_argument('--max-updates',
                              type=int,
                              default=None,
                              help='Maximum number of updates. Default: %(default)s.')
    train_params.add_argument('--max-seconds',
                              type=int,
                              default=None,
                              help='Training will stop on the next checkpoint after reaching the maximum seconds. '
                                   'Default: %(default)s.')

    train_params.add_argument('--max-checkpoints',
                              type=int,
                              default=None,
                              help='Maximum number of checkpoints to continue training the model '
                                   'before training is stopped. '
                                   'Default: %(default)s.')
    train_params.add_argument('--max-num-checkpoint-not-improved',
                              type=int,
                              default=None,
                              help='Maximum number of checkpoints the model is allowed to not improve in '
                                   '<optimized-metric> on validation data before training is stopped. '
                                   'Default: %(default)s.')
    train_params.add_argument('--checkpoint-improvement-threshold',
                              type=float,
                              default=0.,
                              help='Improvement in <optimized-metric> over the specified number of checkpoints must exceed '
                                   'this value to be considered actual improvement. Default: %(default)s.')
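    # Worked example (values are illustrative): with --optimized-metric perplexity,
    # --max-num-checkpoint-not-improved 10 and --checkpoint-improvement-threshold 0.01, a new
    # checkpoint only counts as an improvement if it beats the best perplexity seen so far by
    # more than 0.01; after 10 consecutive checkpoints without such an improvement, training
    # stops (subject to the --min-samples/--min-updates/--min-num-epochs constraints).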

    train_params.add_argument('--min-num-epochs',
                              type=int,
                              default=None,
                              help='Minimum number of epochs (passes through the training data) '
                                   'before training can stop. Default: %(default)s.')
    train_params.add_argument('--max-num-epochs',
                              type=int,
                              default=None,
                              help='Maximum number of epochs (passes through the training data). Default: %(default)s.')
    train_params.add_argument('--embed-dropout',
                              type=multiple_values(2, data_type=float),
                              default=(.0, .0),
                              help='Dropout probability for source & target embeddings. Use "x:x" to specify separate '
                                   'values. Default: %(default)s.')
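    # Illustrative note: the "x:x" syntax supplies separate values per position, e.g.
    # `--embed-dropout 0.1:0.2` gives dropout 0.1 for source embeddings and 0.2 for target
    # embeddings (ordering as stated in the help text); the same syntax applies to the
    # --transformer-dropout-* options below, where the pair is (encoder, decoder).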
    train_params.add_argument('--transformer-dropout-attention',
                              type=multiple_values(2, data_type=float),
                              default=(0.1, 0.1),
                              help='Dropout probability for multi-head attention. Use "x:x" to specify separate '
                                   'values for encoder & decoder. Default: %(default)s.')
    train_params.add_argument('--transformer-dropout-act',
                              type=multiple_values(2, data_type=float),
                              default=(0.1, 0.1),
                              help='Dropout probability before activation in feed-forward block. Use "x:x" to specify '
                                   'separate values for encoder & decoder. Default: %(default)s.')
    train_params.add_argument('--transformer-dropout-prepost',
                              type=multiple_values(2, data_type=float),
                              default=(0.1, 0.1),
                              help='Dropout probability for pre/postprocessing blocks. Use "x:x" to specify separate '
                                   'values for encoder & decoder. Default: %(default)s.')

    train_params.add_argument('--optimizer',
                              default=C.OPTIMIZER_ADAM,
                              choices=C.OPTIMIZERS,
                              help='SGD update rule. Default: %(default)s.')
    # TODO(migration): Remove after removing MXNet code
    train_params.add_argument('--optimizer-params',
                              type=simple_dict(),
                              default=None,
                              help='Additional optimizer params as dictionary. Format: key1:value1,key2:value2,...')
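    # Illustrative note (MXNet-era option, see TODO above): the expected format is
    # `key1:value1,key2:value2,...`, e.g. `--optimizer-params clip_gradient:1.0,wd:0.0001`;
    # which keys are valid depends on the optimizer selected with --optimizer.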
    train_params.add_argument('--optimizer-betas',
                              type=multiple_values(2, data_type=float),
                              default=(0.9, 0.999),
                              help='Beta1 and beta2 for Adam-like optimizers, specified "x:x". Default: %(default)s.')
    train_params.add_argument('--optimizer-eps',
                              type=float_greater_or_equal(0),
                              default=1e-08,
                              help='Optimizer epsilon. Default: %(default)s.')

    # TODO(migration): Remove after removing MXNet code
    train_params.add_argument('--horovod',
                              action='store_true',
                              help='Use Horovod/MPI for distributed training (Sergeev and Del Balso 2018, '
                                   'arxiv.org/abs/1802.05799). When using this option, run Sockeye with `horovodrun '
                                   '-np X python3 -m sockeye.train` where X is the number of processes. Increasing '
                                   'the number of processes multiplies the effective batch size (ex: batch_size 2560 '
                                   'with `-np 4` gives effective batch size 10240).')
    train_params.add_argument('--dist',
                              action='store_true',
                              help='Run in distributed training mode. When using this option, launch training with '
                                   '`torchrun --nproc_per_node N -m sockeye.train`. Increasing the number of processes '
                                   'multiplies the effective batch size (ex: batch_size 2560 with `--nproc_per_node 4` '
                                   'gives effective batch size 10240).')
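    # Illustrative launch sketch (sizes are examples): data-parallel training on 4 GPUs of one
    # host, which multiplies the effective batch size as described above (4 * 2560 = 10240):
    #     torchrun --nproc_per_node 4 -m sockeye.train --dist --batch-size 2560 ...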

    # TODO(migration): Remove after removing MXNet code
    train_params.add_argument("--kvstore",
                              type=str,
                              default=C.KVSTORE_DEVICE,
                              choices=C.KVSTORE_TYPES,
                              help="The MXNet kvstore to use. 'device' is recommended for single process training. "
                                   "Use any of 'dist_sync', 'dist_device_sync' and 'dist_async' for distributed "
                                   "training. Default: %(default)s.")

    # TODO(migration): Remove after removing MXNet code
    train_params.add_argument('--weight-init',
                              type=str,
                              default=C.INIT_XAVIER,
                              choices=C.INIT_TYPES,
                              help='Type of base weight initialization. Default: %(default)s.')
    # TODO(migration): Remove after removing MXNet code
    train_params.add_argument('--weight-init-scale',
                              type=float,
                              default=3.0,
                              help='Weight initialization scale. Applies to uniform (scale) and xavier (magnitude). '
                                   'Default: %(default)s.')
    # TODO(migration): Remove after removing MXNet code
    train_params.add_argument('--weight-init-xavier-factor-type',
                              type=str,
                              default=C.INIT_XAVIER_FACTOR_TYPE_AVG,
                              choices=C.INIT_XAVIER_FACTOR_TYPES,
                              help='Xavier factor type. Default: %(default)s.')
    # TODO(migration): Remove after removing MXNet code
    train_params.add_argument('--weight-init-xavier-rand-type',
                              type=str,
                              default=C.RAND_TYPE_UNIFORM,
                              choices=[C.RAND_TYPE_UNIFORM, C.RAND_TYPE_GAUSSIAN],
                              help='Xavier random number generator type. Default: %(default)s.')
    train_params.add_argument('--initial-learning-rate',
                              type=float,
                              default=0.0002,
                              help='Initial learning rate. Default: %(default)s.')
    train_params.add_argument('--weight-decay',
                              type=float,
                              default=0.0,
                              help='Weight decay constant. Default: %(default)s.')
    train_params.add_argument('--momentum',
                              type=float,
                              default=0.0,
                              help='Momentum constant. Default: %(default)s.')
    train_params.add_argument('--gradient-clipping-threshold',
                              type=float,
                              default=1.0,
                              help='Clip absolute gradient values greater than this value. '
                                   'Set to negative to disable. Default: %(default)s.')
    train_params.add_argument('--gradient-clipping-type',
                              choices=C.GRADIENT_CLIPPING_TYPES,
                              default=C.GRADIENT_CLIPPING_TYPE_NONE,
                              help='The type of gradient clipping. Default: %(default)s.')
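    # Illustrative note: the clipping threshold above only takes effect when a clipping type
    # other than the default (C.GRADIENT_CLIPPING_TYPE_NONE) is selected here; with the default
    # type, no gradient clipping is applied regardless of the threshold value.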

    train_params.add_argument('--learning-rate-scheduler-type',
                              default=C.LR_SCHEDULER_PLATEAU_REDUCE,
                              choices=C.LR_SCHEDULERS,
                              help='Learning rate scheduler type. Default: %(default)s.')
    train_params.add_argument('--learning-rate-t-scale',
                              type=float,
                              default=1.0,
                              help="Step number is multiplied by this value when determining learning rate for the "
                                   "current step. Default: %(default)s.")
    train_params.add_argument('--learning-rate-reduce-factor',
                              type=float,
                              default=0.9,
                              help="Factor to multiply learning rate with "
                                   "(for 'plateau-reduce' learning rate scheduler). Default: %(default)s.")
    train_params.add_argument('--learning-rate-reduce-num-not-improved',
                              type=int,
                              default=8,
                              help="For 'plateau-reduce' learning rate scheduler. Adjust learning rate "
                                   "if <optimized-metric> did not improve for x checkpoints. Default: %(default)s.")
    train_params.add_argument('--learning-rate-warmup',
                              type=int,
                              default=0,
                              help="Number of warmup steps. If set to x, linearly increases learning rate from 10%% "
                                   "to 100%% of the initial learning rate. Default: %(default)s.")

    train_params.add_argument('--fixed-param-strategy',
                               default=None,
                               choices=C.FIXED_PARAM_STRATEGY_CHOICES,
                               help="Fix various parameters during training using a named strategy. The strategy "
                                    "name indicates which parameters will be fixed (Wuebker et al., 2018). "
                                    "Default: %(default)s.")
    train_params.add_argument('--fixed-param-names',
                              default=[],
                              nargs='*',
                              help="Manually specify names of parameters to fix during training. Default: %(default)s.")

    train_params.add_argument(C.TRAIN_ARGS_MONITOR_BLEU,
                              default=500,
                              type=int,
                              help='x>0: decode x sampled sentences from validation data and '
                                   'compute evaluation metrics. x==-1: use full validation data. Default: %(default)s.')

    # TODO(migration): Remove after removing MXNet code
    train_params.add_argument('--decode-and-evaluate-device-id',
                              default=None,
                              type=int,
                              help='Separate device for decoding validation data. '
                                   'Use a negative number to automatically acquire a GPU. '
                                   'Use a positive number to acquire a specific GPU. Default: %(default)s.')

    train_params.add_argument(C.TRAIN_ARGS_STOP_ON_DECODER_FAILURE,
                              action="store_true",
                              help='Stop training as soon as any checkpoint decoder fails (e.g. because there is not '
                                   'enough GPU memory). Default: %(default)s.')

    train_params.add_argument('--seed',
                              type=int,
                              default=1,
                              help='Random seed. Default: %(default)s.')

    train_params.add_argument('--keep-last-params',
                              type=int,
                              default=-1,
                              help='Keep only the last n params files, use -1 to keep all files. Default: %(default)s.')

    train_params.add_argument('--keep-initializations',
                              action="store_true",
                              help='In addition to keeping the last n params files, also keep params from checkpoint 0.')

    train_params.add_argument('--cache-last-best-params',
                              required=False,
                              type=int,
                              default=0,
                              help='Cache the last n best params files, as distinct from the last n in sequence. '
                                   'Use 0 or negative to disable. Default: %(default)s.')

    train_params.add_argument('--cache-strategy',
                              required=False,
                              type=str,
                              default=C.AVERAGE_BEST,
                              choices=C.AVERAGE_CHOICES,
                              help='Strategy to use when deciding which are the "best" params files. '
                                   'Default: %(default)s.')

    train_params.add_argument('--cache-metric',
                              required=False,
                              type=str,
                              default=C.PERPLEXITY,
                              choices=C.METRICS,
                              help='Metric to use when deciding which are the "best" params files. '
                                   'Default: %(default)s.')
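    # Illustrative note: --keep-last-params prunes to the n most recent params files, while
    # --cache-last-best-params separately retains the n best checkpoints according to
    # --cache-strategy and --cache-metric; e.g. `--keep-last-params 3 --cache-last-best-params 2`
    # keeps the 3 newest files plus the 2 best ones under the chosen metric.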

    train_params.add_argument('--dry-run',
                              action='store_true',
                              help="Do not perform any actual training, but print statistics about the model"
                              " and mode of operation.")