def add_args()

in pytorch_translate/rnn.py [0:0]


    def add_args(parser):
        parser.add_argument(
            "--language-model-only",
            default=False,
            action="store_true",
            help="whether to train a language model only where no encoder is used",
        )
        parser.add_argument(
            "--dropout",
            default=0.1,
            type=float,
            metavar="D",
            help="dropout probability",
        )
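
        # Encoder settings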
        parser.add_argument(
            "--encoder-embed-dim",
            default=0,
            type=int,
            metavar="N",
            help="encoder embedding dimension",
        )
        parser.add_argument(
            "--encoder-pretrained-embed",
            default=None,
            metavar="FILE",
            help="path to pre-trained encoder embedding",
        )
        parser.add_argument(
            "--encoder-freeze-embed",
            default=False,
            action="store_true",
            help=(
                "whether to freeze the encoder embedding or allow it to be "
                "updated during training"
            ),
        )
        parser.add_argument(
            "--encoder-normalize-embed",
            default=False,
            action="store_true",
            help=(
                "whether to normalize the encoder embeddings to have zero mean "
                "and unit variance (weighted by token frequency)"
            ),
        )
        parser.add_argument(
            "--encoder-hidden-dim", type=int, metavar="N", help="encoder cell num units"
        )
        parser.add_argument(
            "--encoder-layers", type=int, metavar="N", help="number of encoder layers"
        )
        parser.add_argument(
            "--encoder-bidirectional",
            action="store_true",
            help="whether the first layer is bidirectional or not",
        )
        parser.add_argument(
            "--averaging-encoder",
            default=False,
            action="store_true",
            help=(
                "whether use mean encoder hidden states as decoder initial "
                "states or not"
            ),
        )
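
        # Decoder settings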
        parser.add_argument(
            "--decoder-embed-dim",
            default=0,
            type=int,
            metavar="N",
            help="decoder embedding dimension",
        )
        parser.add_argument(
            "--decoder-pretrained-embed",
            default=None,
            metavar="FILE",
            help="path to pre-trained decoder embedding",
        )
        parser.add_argument(
            "--decoder-freeze-embed",
            default=False,
            action="store_true",
            help=(
                "whether to freeze the decoder embedding or allow it to be "
                "updated during training"
            ),
        )
        parser.add_argument(
            "--decoder-hidden-dim", type=int, metavar="N", help="decoder cell num units"
        )
        parser.add_argument(
            "--decoder-layers", type=int, metavar="N", help="number of decoder layers"
        )
        parser.add_argument(
            "--decoder-out-embed-dim",
            type=int,
            metavar="N",
            help="decoder output embedding dimension",
        )
        parser.add_argument(
            "--decoder-out-pretrained-embed",
            default=None,
            metavar="FILE",
            help="path to pre-trained decoder output embedding",
        )
        parser.add_argument(
            "--out-embed-norm",
            default=None,
            type=float,
            help="norm for output projection weights",
        )
        parser.add_argument(
            "--decoder-tie-embeddings",
            default=False,
            action="store_true",
            help="tie the decoder word embeddings with the output projection "
            "weights (requires that the embedding dims be of the same size)",
        )
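
        # Attention settings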
        parser.add_argument(
            "--attention-type",
            type=str,
            metavar="EXPR",
            help="decoder attention, defaults to dot",
        )
        parser.add_argument(
            "--attention-heads",
            default=8,
            type=int,
            metavar="N",
            help="number of encoder-decoder attention heads, used when attention"
            " type is multihead. Ignored unless attention type is multihead.",
        )
        parser.add_argument(
            "--first-layer-attention",
            default=False,
            action="store_true",
            help="calculates attention after decoder's first RNN layer and"
            "concatenates it in input of every subsequent layer",
        )
        parser.add_argument(
            "--residual-level",
            default=None,
            type=int,
            help=(
                "First layer where to apply a residual connection. "
                "The value should be greater than 0 and smaller than the number of "
                "layers."
            ),
        )
        parser.add_argument(
            "--cell-type",
            default="lstm",
            type=str,
            metavar="EXPR",
            help="cell type, defaults to lstm, values:lstm, milstm, layer_norm_lstm",
        )

        # Granular dropout settings (if not specified these default to --dropout)
        parser.add_argument(
            "--encoder-dropout-in",
            type=float,
            metavar="D",
            help="dropout probability for encoder input embedding",
        )
        parser.add_argument(
            "--encoder-dropout-out",
            type=float,
            metavar="D",
            help="dropout probability for encoder output",
        )
        parser.add_argument(
            "--decoder-dropout-in",
            type=float,
            metavar="D",
            help="dropout probability for decoder input embedding",
        )
        parser.add_argument(
            "--decoder-dropout-out",
            type=float,
            metavar="D",
            help="dropout probability for decoder output",
        )
        parser.add_argument(
            "--sequence-lstm",
            action="store_true",
            help="use nn.LSTM implementation for encoder",
        )
        parser.add_argument(
            "--ngram-decoder",
            default=None,
            type=int,
            nargs="+",
            help=(
                "A single integer, or a list of integers. If "
                "positive, the decoder is not recurrent but a feedforward "
                "network with target-side n-gram history as input. The decoder "
                "is still conditioned on the source side via attention. If "
                "this parameter is a list of integers, the n-th entry applies "
                "to the n-th decoder (for multilingual models and "
                "multi-decoders)"
            ),
        )
        parser.add_argument(
            "--ngram-activation-type",
            default="relu",
            type=str,
            metavar="EXPR",
            help=(
                "Activation in FF layers of the ngram decoder, defaults to "
                "relu, values: relu, tanh"
            ),
        )
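
        # Multi-encoder / multi-decoder (ensemble) settings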
        parser.add_argument(
            "--multi-encoder",
            default=None,
            type=int,
            help=(
                "If this is positive, train n encoder networks rather than "
                "only one. The outputs of the encoders are concatenated before "
                "passing them through to the decoder."
            ),
        )
        parser.add_argument(
            "--multi-decoder",
            default=None,
            type=int,
            help=(
                "If this is positive, train n decoder networks rather than "
                "only one. The predictions are combined via the method in "
                "--multi-decoder-combination-strategy."
            ),
        )
        parser.add_argument(
            "--multi-decoder-combination-strategy",
            default="bottleneck",
            type=str,
            metavar="EXPR",
            help=(
                "Only used if --multi-decoder is positive. Controls how the "
                "decoders are combined with each other.\n"
                "- uniform: Separate projection layers, average predictions\n"
                "- uniform-probspace: Separate projection layers, average "
                "in probability space.\n"
                "- uniform-logprobspace: Separate projection layers, average "
                "in log-probability space.\n"
                "- unprojected: Shared projection layer, unprojected "
                "decoder outputs are averaged.\n"
                "- deepfusion: cf. https://arxiv.org/pdf/1503.03535.pdf \n"
                "- coldfusion: cf. https://arxiv.org/pdf/1708.06426.pdf \n"
                "- weighted: Separate projection layers, weighted average "
                "of logits. Weights are learned from unprojected decoder "
                "outputs.\n"
                "- weighted-probspace: Like 'weighted', but average in "
                "probability space.\n"
                "- weighted-logprobspace: Like 'weighted', but average in "
                "log-probability space.\n"
                "- weighted-unprojected: Shared projection layer, weighted "
                "average of decoder outputs. Weights are learned from "
                "unprojected decoder outputs.\n"
                "- concat: Shared projection layer, decoder outputs are "
                "concatenated.\n"
                "- bottleneck: Like 'concat' but with an additional "
                "bottleneck layer to reduce the size of the output embedding "
                "matrix.\n"
                "- deep_bottleneck: Like 'bottleneck' but with an additional "
                "non-linear layer.\n"
                "- multiplicative-unprojected: Shared projection layer, element"
                "-wise product of decoder outputs after ReLU.\n"
                "- max-unprojected: Shared projection layer, element"
                "-wise max of decoder outputs.\n"
            ),
        )
        parser.add_argument(
            "--multi-model-fixed-weights",
            default=None,
            type=float,
            nargs="+",
            help=(
                "Used for weighted* combination strategies. If specified, use "
                "these fixed model weights rather than a gating network."
            ),
        )
        parser.add_argument(
            "--multi-model-training-schedule",
            default="complete",
            type=str,
            metavar="EXPR",
            help=(
                "Only used if --multi-decoder is positive.\n"
                "- 'complete': Jointly train entire network on all batches.\n"
                "- 'unfreeze_single': Freeze all submodels except one for each "
                "training batch.\n"
                "- 'unfreeze_single_encoder': Freeze all encoders except one "
                "for each training batch.\n"
                "- 'unfreeze_single_decoder': Freeze all decoders except one "
                "for each training batch.\n"
                "- 'unfreeze_enc_N': Freeze N-th encoder.\n"
                "- 'unfreeze_dec_N': Freeze N-th decoder.\n"
                "- 'unfreeze_encdec_N': Freeze N-th encoder and N-th decoder.\n"
                "- 'freeze_all': Freeze all submodels, only train combination "
                "strategy.\n"
                "- 'freeze_all_encoders': Freeze all encoders.\n"
                "- 'freeze_all_decoders': Freeze all decoders.\n"
                "- 'separate': Each training batch is used for only one of the "
                "following: Train the n-th submodel, or train combination "
                "strategy."
            ),
        )
        parser.add_argument(
            "--multi-decoder-is-lm",
            default=None,
            type=int,
            nargs="+",
            help=(
                "If specified, sets --attention-type=no and --encoder-hidden-dim=0"
                "for the n-th decoder in an adaptive ensemble."
            ),
        )
        parser.add_argument(
            "--att-weighted-source-embeds",
            default=False,
            action="store_true",
            help=(
                "whether use attention weighted src embeddings to improve rare "
                "words translation or not"
            ),
        )
        parser.add_argument(
            "--encoder-context-embed",
            default=False,
            help=(
                "whether to use context-dependent source embeddings in the encoder "
                "for word disambiguation"
            ),
            action="store_true",
        )
        parser.add_argument(
            "--att-weighted-activation-type",
            default="tanh",
            type=str,
            metavar="EXPR",
            help=(
                "Activation in FF layers of the attention weighted src embeddings, "
                "defaults to relu, values: relu, tanh"
            ),
        )
        parser.add_argument(
            "--adaptive-softmax-cutoff",
            metavar="EXPR",
            help="comma separated list of adaptive softmax cutoff points. "
            "Must be used with adaptive_loss criterion",
        )

        # Args for vocab reduction
        vocab_reduction.add_args(parser)
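
For context, here is a minimal sketch of how these flags could be registered on a standalone parser. The RNNModel class name and the sample flag values are illustrative assumptions; within pytorch_translate, add_args is normally invoked through the fairseq model registry during training setup rather than called directly.

    import argparse

    from pytorch_translate import rnn  # assumed import path for the module above

    # add_args is assumed to be exposed as a @staticmethod on the RNN model class.
    parser = argparse.ArgumentParser()
    rnn.RNNModel.add_args(parser)

    args = parser.parse_args(
        ["--encoder-layers", "2", "--decoder-layers", "2", "--cell-type", "milstm"]
    )
    # With the defaults above: args.encoder_layers == 2, args.cell_type == "milstm",
    # args.dropout == 0.1 (granular dropouts fall back to --dropout elsewhere).
    print(args.encoder_layers, args.cell_type, args.dropout)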