in sockeye/arguments.py [0:0]
def add_model_parameters(params):
model_params = params.add_argument_group("ModelConfig")
model_params.add_argument('--params', '-p',
type=str,
default=None,
help='Initialize model parameters from file. Overrides random initializations.')
model_params.add_argument('--allow-missing-params',
action="store_true",
default=False,
help="Allow missing parameters when initializing model parameters from file. "
"Default: %(default)s.")
model_params.add_argument('--ignore-extra-params',
action="store_true",
default=False,
help="Allow extra parameters when initializing model parameters from file. "
"Default: %(default)s.")
model_params.add_argument('--encoder',
choices=C.ENCODERS,
default=C.TRANSFORMER_TYPE,
help="Type of encoder. Default: %(default)s.")
model_params.add_argument('--decoder',
choices=C.DECODERS,
default=C.TRANSFORMER_TYPE,
help="Type of decoder. Default: %(default)s. "
"'ssru_transformer' uses Simpler Simple Recurrent Units (Kim et al, 2019) "
"as replacement for self-attention layers.")
model_params.add_argument('--num-layers',
type=multiple_values(num_values=2, greater_or_equal=1),
default=(6, 6),
help='Number of layers for encoder & decoder. '
'Use "x:x" to specify separate values for encoder & decoder. Default: %(default)s.')
# transformer arguments
model_params.add_argument('--transformer-model-size',
type=multiple_values(num_values=2, greater_or_equal=1),
default=(512, 512),
help='Number of hidden units in transformer layers. '
'Use "x:x" to specify separate values for encoder & decoder. Default: %(default)s.')
model_params.add_argument('--transformer-attention-heads',
type=multiple_values(num_values=2, greater_or_equal=1),
default=(8, 8),
help='Number of attention heads for all transformer self-attention layers. '
'Use "x:x" to specify separate values for encoder & decoder. Default: %(default)s.')
model_params.add_argument('--transformer-feed-forward-num-hidden',
type=multiple_values(num_values=2, greater_or_equal=1),
default=(2048, 2048),
help='Number of hidden units in the transformer feed-forward layers. '
'Use "x:x" to specify separate values for encoder & decoder. Default: %(default)s.')
model_params.add_argument('--transformer-feed-forward-use-glu',
action='store_true',
default=False,
help='Use Gated Linear Units in transformer feed forward networks (Dauphin et al. 2016, '
'arxiv.org/abs/1612.08083; Shazeer 2020, arxiv.org/abs/2002.05202). Default: '
'%(default)s.')
model_params.add_argument('--transformer-activation-type',
type=multiple_values(num_values=2, greater_or_equal=None, data_type=str),
default=(C.RELU, C.RELU),
help='Type of activation to use for each feed forward layer. Use "x:x" to specify '
'different values for encoder & decoder. Supported: {}. Default: '
'%(default)s.'.format(' '.join(C.TRANSFORMER_ACTIVATION_TYPES)))
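# Example (hypothetical; activation names must be members of C.TRANSFORMER_ACTIVATION_TYPES):
#   --transformer-feed-forward-use-glu --transformer-activation-type relu
# A single activation value applies to both encoder and decoder; "x:x" sets them separately.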
model_params.add_argument('--transformer-positional-embedding-type',
choices=C.POSITIONAL_EMBEDDING_TYPES,
default=C.FIXED_POSITIONAL_EMBEDDING,
help='The type of positional embedding. Default: %(default)s.')
model_params.add_argument('--transformer-preprocess',
type=multiple_values(num_values=2, greater_or_equal=None, data_type=str),
default=('n', 'n'),
help='Transformer preprocess sequence for encoder and decoder. Supports three types of '
'operations: d=dropout, r=residual connection, n=layer normalization. They can be '
'combined in any order, for example: "ndr". '
'Leave empty to not use any of these operations. '
'You can specify separate sequences for encoder and decoder by separating them '
'with ":", for example: "n:drn". '
'Default: %(default)s.')
model_params.add_argument('--transformer-postprocess',
type=multiple_values(num_values=2, greater_or_equal=None, data_type=str),
default=('dr', 'dr'),
help='Transformer postprocess sequence for encoder and decoder. Supports three types of '
'operations: d=dropout, r=residual connection, n=layer normalization. They can be '
'combined in any order, for example: "ndr". '
'Leave empty to not use any of these operations. '
'You can specify separate sequences for encoder and decoder by separating them '
'with ":", for example: "n:drn". '
'Default: %(default)s.')
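# Example: the defaults layer-normalize before each sublayer ("n") and apply dropout followed
# by a residual connection after it ("dr"). Distinct per-side sequences use the ":" separator
# from the help text above, e.g.:
#   --transformer-preprocess n:n --transformer-postprocess dr:drn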
model_params.add_argument('--lhuc',
nargs="+",
default=None,
choices=C.LHUC_CHOICES,
metavar="COMPONENT",
help="Use LHUC (Vilar 2018). Include an amplitude parameter to hidden units for"
" domain adaptation. Needs a pre-trained model. Valid values: {values}."
" Default: %(default)s.".format(
values=", ".join(C.LHUC_CHOICES)))
# embedding arguments
model_params.add_argument('--num-embed',
type=multiple_values(num_values=2, greater_or_equal=1),
default=(None, None),
help='Embedding size for source and target tokens. '
'Use "x:x" to specify separate values for src&tgt. Default: %d.' % C.DEFAULT_NUM_EMBED)
model_params.add_argument('--source-factors-num-embed',
type=int,
nargs='+',
default=[],
help='Embedding size for additional source factors. '
'Provide one value per '
'(validation) source factor file. Default: %(default)s.')
model_params.add_argument('--target-factors-num-embed',
type=int,
nargs='+',
default=[],
help='Embedding size for additional target factors. '
'Provide one value per '
'(validation) target factor file. Default: %(default)s.')
model_params.add_argument('--source-factors-combine', '-sfc',
choices=C.FACTORS_COMBINE_CHOICES,
default=[C.FACTORS_COMBINE_SUM],
nargs='+',
help='How to combine source factors. Specify either a single value, applied to '
'all source factors, or one value per factor. Default: %(default)s.')
model_params.add_argument('--target-factors-combine', '-tfc',
choices=C.FACTORS_COMBINE_CHOICES,
default=[C.FACTORS_COMBINE_SUM],
nargs='+',
help='How to combine target factors. Specify either a single value, applied to '
'all target factors, or one value per factor. Default: %(default)s.')
model_params.add_argument('--source-factors-share-embedding',
type=bool_str(),
nargs='+',
default=[False],
help='Share the factor embeddings with the source language embedding. '
'Specify either a single value, applied '
'to all source factors, or one value per factor. Default: %(default)s.')
model_params.add_argument('--target-factors-share-embedding',
type=bool_str(),
nargs='+',
default=[False],
help='Share the factor embeddings with the target language embedding. '
'Specify either a single value, applied '
'to all target factors, or one value per factor. Default: %(default)s.')
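# Example (hypothetical; combine values must be members of C.FACTORS_COMBINE_CHOICES and the
# boolean strings must be accepted by `bool_str`): two additional source factors, each with an
# 8-dimensional embedding concatenated to the word embedding, neither sharing its weights:
#   --source-factors-num-embed 8 8 --source-factors-combine concat concat \
#   --source-factors-share-embedding false false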
model_params.add_argument('--weight-tying-type',
default=C.WEIGHT_TYING_SRC_TRG_SOFTMAX,
choices=C.WEIGHT_TYING_TYPES,
help='The type of weight tying: source embeddings=src, target embeddings=trg, '
'target softmax weight matrix=softmax. Default: %(default)s.')
model_params.add_argument('--dtype', default=C.DTYPE_FP32, choices=[C.DTYPE_FP32, C.DTYPE_FP16],
help="Data type.")
model_params.add_argument('--amp',
action='store_true',
help='Use PyTorch automatic mixed precision (AMP) to run compatible operations in '
'float16 mode instead of float32.')
model_params.add_argument('--apex-amp',
action='store_true',
help='Use NVIDIA Apex automatic mixed precision (AMP) to run the entire model in float16 '
'mode with float32 master weights and dynamic loss scaling. This is faster than '
'PyTorch AMP with some additional risk and requires installing Apex: '
'https://github.com/NVIDIA/apex')
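# Minimal usage sketch (hypothetical; assumes `multiple_values`, `bool_str`, and the constants
# module `C` are importable from this package):
#   import argparse
#   parser = argparse.ArgumentParser(description='Train Sockeye models')
#   add_model_parameters(parser)
#   args = parser.parse_args(['--num-layers', '20:6', '--amp'])
#   # args.num_layers == (20, 6); args.amp is True; args.dtype == C.DTYPE_FP32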