# pytorch_translate/rnn.py
def add_args(parser):
parser.add_argument(
"--language-model-only",
default=False,
action="store_true",
help="whether to train a language model only where no encoder is used",
)
parser.add_argument(
"--dropout",
default=0.1,
type=float,
metavar="D",
help="dropout probability",
)
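# Encoder settings (embeddings, hidden size, layers)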
parser.add_argument(
"--encoder-embed-dim",
default=0,
type=int,
metavar="N",
help="encoder embedding dimension",
)
parser.add_argument(
"--encoder-pretrained-embed",
default=None,
metavar="FILE",
help="path to pre-trained encoder embedding",
)
parser.add_argument(
"--encoder-freeze-embed",
default=False,
action="store_true",
help=(
"whether to freeze the encoder embedding or allow it to be "
"updated during training"
),
)
parser.add_argument(
"--encoder-normalize-embed",
default=False,
action="store_true",
help=(
"whether to normalize the encoder embeddings to have zero mean "
"and unit variance (weighted by token frequency)"
),
)
parser.add_argument(
"--encoder-hidden-dim", type=int, metavar="N", help="encoder cell num units"
)
parser.add_argument(
"--encoder-layers", type=int, metavar="N", help="number of encoder layers"
)
parser.add_argument(
"--encoder-bidirectional",
action="store_true",
help="whether the first layer is bidirectional or not",
)
parser.add_argument(
"--averaging-encoder",
default=False,
action="store_true",
help=(
"whether use mean encoder hidden states as decoder initial "
"states or not"
),
)
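# Decoder settings (embeddings, hidden size, layers, output projection)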
parser.add_argument(
"--decoder-embed-dim",
default=0,
type=int,
metavar="N",
help="decoder embedding dimension",
)
parser.add_argument(
"--decoder-pretrained-embed",
default=None,
metavar="FILE",
help="path to pre-trained decoder embedding",
)
parser.add_argument(
"--decoder-freeze-embed",
default=False,
action="store_true",
help=(
"whether to freeze the decoder embedding or allow it to be "
"updated during training"
),
)
parser.add_argument(
"--decoder-hidden-dim", type=int, metavar="N", help="decoder cell num units"
)
parser.add_argument(
"--decoder-layers", type=int, metavar="N", help="number of decoder layers"
)
parser.add_argument(
"--decoder-out-embed-dim",
type=int,
metavar="N",
help="decoder output embedding dimension",
)
parser.add_argument(
"--decoder-out-pretrained-embed",
default=None,
metavar="FILE",
help="path to pre-trained decoder output embedding",
)
parser.add_argument(
"--out-embed-norm",
default=None,
type=float,
help="norm for output projection weights",
)
parser.add_argument(
"--decoder-tie-embeddings",
default=False,
action="store_true",
help="tie the decoder word embeddings with the output projection "
"weights (requires that the embedding dims be of the same size)",
)
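# Attention settings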
parser.add_argument(
"--attention-type",
type=str,
metavar="EXPR",
help="decoder attention, defaults to dot",
)
parser.add_argument(
"--attention-heads",
default=8,
type=int,
metavar="N",
help="number of encoder-decoder attention heads, used when attention"
" type is multihead. Ignored unless attention type is multihead.",
)
parser.add_argument(
"--first-layer-attention",
default=False,
action="store_true",
help="calculates attention after decoder's first RNN layer and"
"concatenates it in input of every subsequent layer",
)
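# Recurrent cell and residual-connection settings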
parser.add_argument(
"--residual-level",
default=None,
type=int,
help=(
"First layer where to apply a residual connection. "
"The value should be greater than 0 and smaller than the number of "
"layers."
),
)
parser.add_argument(
"--cell-type",
default="lstm",
type=str,
metavar="EXPR",
help="cell type, defaults to lstm, values:lstm, milstm, layer_norm_lstm",
)
# Granular dropout settings (if not specified these default to --dropout)
parser.add_argument(
"--encoder-dropout-in",
type=float,
metavar="D",
help="dropout probability for encoder input embedding",
)
parser.add_argument(
"--encoder-dropout-out",
type=float,
metavar="D",
help="dropout probability for encoder output",
)
parser.add_argument(
"--decoder-dropout-in",
type=float,
metavar="D",
help="dropout probability for decoder input embedding",
)
parser.add_argument(
"--decoder-dropout-out",
type=float,
metavar="D",
help="dropout probability for decoder output",
)
parser.add_argument(
"--sequence-lstm",
action="store_true",
help="use nn.LSTM implementation for encoder",
)
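# Feedforward n-gram decoder settings (non-recurrent decoder)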
parser.add_argument(
"--ngram-decoder",
default=None,
type=int,
nargs="+",
help=(
"A single integer, or a list of integers. If "
"positive, the decoder is not recurrent but a feedforward "
"network with target-side n-gram history as input. The decoder "
"is still conditioned on the source side via attention. If "
"this parameter is a list of integers, the n-th entry applies "
"to the n-th decoder (for multilingual models and "
"multi-decoders)"
),
)
parser.add_argument(
"--ngram-activation-type",
default="relu",
type=str,
metavar="EXPR",
help=(
"Activation in FF layers of the ngram decoder, defaults to "
"relu, values: relu, tanh"
),
)
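# Multi-encoder / multi-decoder (adaptive ensemble) settings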
parser.add_argument(
"--multi-encoder",
default=None,
type=int,
help=(
"If this is positive, train n encoder networks rather than "
"only one. The outputs of the encoders are concatenated before "
"passing them through to the decoder."
),
)
parser.add_argument(
"--multi-decoder",
default=None,
type=int,
help=(
"If this is positive, train n decoder networks rather than "
"only one. The predictions are combined via the method in "
"--multi-decoder-combination-strategy."
),
)
parser.add_argument(
"--multi-decoder-combination-strategy",
default="bottleneck",
type=str,
metavar="EXPR",
help=(
"Only used if --multi-decoder is positive. Controls how the "
"decoders are combined with each other.\n"
"- uniform: Separate projection layers, average predictions\n"
"- uniform-probspace: Separate projection layers, average "
"in probability space.\n"
"- uniform-logprobspace: Separate projection layers, average "
"in log-probability space.\n"
"- unprojected: Shared projection layer, unprojected "
"decoder outputs are averaged.\n"
"- deepfusion: cf. https://arxiv.org/pdf/1503.03535.pdf \n"
"- coldfusion: cf. https://arxiv.org/pdf/1708.06426.pdf \n"
"- weighted: Separate projection layers, weighted average "
"of logits. Weights are learned from unprojected decoder "
"outputs.\n"
"- weighted-probspace: Like 'weighted', but average in "
"probability space.\n"
"- weighted-logprobspace: Like 'weighted', but average in "
"log-probability space.\n"
"- weighted-unprojected: Shared projection layer, weighted "
"average of decoder outputs. Weights are learned from "
"unprojected decoder outputs.\n"
"- concat: Shared projection layer, decoder outputs are "
"concatenated.\n"
"- bottleneck: Like 'concat' but with an additional "
"bottleneck layer to reduce the size of the output embedding "
"matrix.\n"
"- deep_bottleneck: Like 'bottleneck' but with an additional "
"non-linear layer.\n"
"- multiplicative-unprojected: Shared projection layer, element"
"-wise product of decoder outputs after ReLU.\n"
"- max-unprojected: Shared projection layer, element"
"-wise max of decoder outputs.\n"
),
)
parser.add_argument(
"--multi-model-fixed-weights",
default=None,
type=float,
nargs="+",
help=(
"Used for weighted* combination strategies. If specified, use "
"these fixed model weights rather than a gating network."
),
)
parser.add_argument(
"--multi-model-training-schedule",
default="complete",
type=str,
metavar="EXPR",
help=(
"Only used if --multi-decoder is positive.\n"
"- 'complete': Jointly train entire network on all batches.\n"
"- 'unfreeze_single': Freeze all submodels except one for each "
"training batch.\n"
"- 'unfreeze_single_encoder': Freeze all encoders except one "
"for each training batch.\n"
"- 'unfreeze_single_decoder': Freeze all decoders except one "
"for each training batch.\n"
"- 'unfreeze_enc_N': Freeze N-th encoder.\n"
"- 'unfreeze_dec_N': Freeze N-th decoder.\n"
"- 'unfreeze_encdec_N': Freeze N-th encoder and N-th decoder.\n"
"- 'freeze_all': Freeze all submodels, only train combination "
"strategy.\n"
"- 'freeze_all_encoders': Freeze all encoders.\n"
"- 'freeze_all_decoders': Freeze all decoders.\n"
"- 'separate': Each training batch is used for only one of the "
"following: Train the n-th submodel, or train combination "
"strategy."
),
)
parser.add_argument(
"--multi-decoder-is-lm",
default=None,
type=int,
nargs="+",
help=(
"If specified, sets --attention-type=no and --encoder-hidden-dim=0"
"for the n-th decoder in an adaptive ensemble."
),
)
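# Attention-weighted / context-dependent source embedding settings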
parser.add_argument(
"--att-weighted-source-embeds",
default=False,
action="store_true",
help=(
"whether use attention weighted src embeddings to improve rare "
"words translation or not"
),
)
parser.add_argument(
"--encoder-context-embed",
default=False,
action="store_true",
help=(
"whether to use context-dependent source embeddings in the encoder "
"for word disambiguation"
),
)
parser.add_argument(
"--att-weighted-activation-type",
default="tanh",
type=str,
metavar="EXPR",
help=(
"Activation in FF layers of the attention weighted src embeddings, "
"defaults to relu, values: relu, tanh"
),
)
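# Adaptive softmax settings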
parser.add_argument(
"--adaptive-softmax-cutoff",
metavar="EXPR",
help="comma separated list of adaptive softmax cutoff points. "
"Must be used with adaptive_loss criterion",
)
# Args for vocab reduction
vocab_reduction.add_args(parser)
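
# Illustrative sketch only: how these flags could be registered and parsed
# with a plain argparse.ArgumentParser. This is not how pytorch_translate
# builds its CLI, and the flag values below are arbitrary examples. Note
# that argparse exposes "--encoder-hidden-dim" as args.encoder_hidden_dim,
# and so on for the other dashed option names.
#
#   import argparse
#
#   parser = argparse.ArgumentParser()
#   add_args(parser)  # registers every option above, plus vocab reduction args
#   args = parser.parse_args(
#       [
#           "--encoder-layers", "2",
#           "--decoder-layers", "2",
#           "--encoder-hidden-dim", "512",
#           "--decoder-hidden-dim", "512",
#           "--attention-type", "dot",
#           "--dropout", "0.2",
#       ]
#   )
#   assert args.cell_type == "lstm"   # default from --cell-type
#   assert args.encoder_hidden_dim == 512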