in pytorch_translate/transformer.py [0:0]
def add_args(parser):
    """Add model-specific arguments to the parser.

    Registers the dropout, encoder, decoder, embedding-sharing and output
    layer options on ``parser`` and then lets ``vocab_reduction`` register
    its own options on the same parser.

    Args:
        parser: object exposing an argparse-style ``add_argument`` method
            (presumably an ``argparse.ArgumentParser`` or argument group).
            It is mutated in place.

    Returns:
        None.
    """
    # --- Dropout -----------------------------------------------------------
    parser.add_argument(
        "--dropout", type=float, metavar="D", help="dropout probability"
    )
    parser.add_argument(
        "--attention-dropout",
        type=float,
        metavar="D",
        help="dropout probability for attention weights",
    )
    parser.add_argument(
        "--relu-dropout",
        type=float,
        metavar="D",
        help="dropout probability after ReLU in FFN",
    )

    # --- Encoder -----------------------------------------------------------
    parser.add_argument(
        "--encoder-pretrained-embed",
        type=str,
        metavar="STR",
        help="path to pre-trained encoder embedding",
    )
    parser.add_argument(
        "--encoder-embed-dim",
        type=int,
        metavar="N",
        help="encoder embedding dimension",
    )
    parser.add_argument(
        "--encoder-ffn-embed-dim",
        type=int,
        metavar="N",
        help="encoder embedding dimension for FFN",
    )
    parser.add_argument(
        "--encoder-freeze-embed",
        default=False,
        action="store_true",
        help=(
            "whether to freeze the encoder embedding or allow it to be "
            "updated during training"
        ),
    )
    parser.add_argument(
        "--encoder-layers", type=int, metavar="N", help="num encoder layers"
    )
    parser.add_argument(
        "--encoder-attention-heads",
        type=int,
        metavar="N",
        help="num encoder attention heads",
    )
    parser.add_argument(
        "--encoder-normalize-before",
        default=False,
        action="store_true",
        help="apply layernorm before each encoder block",
    )
    parser.add_argument(
        "--encoder-learned-pos",
        default=False,
        action="store_true",
        help="use learned positional embeddings in the encoder",
    )

    # --- Decoder -----------------------------------------------------------
    parser.add_argument(
        "--decoder-pretrained-embed",
        type=str,
        metavar="STR",
        help="path to pre-trained decoder embedding",
    )
    parser.add_argument(
        "--decoder-embed-dim",
        type=int,
        metavar="N",
        help="decoder embedding dimension",
    )
    parser.add_argument(
        "--decoder-ffn-embed-dim",
        type=int,
        metavar="N",
        help="decoder embedding dimension for FFN",
    )
    parser.add_argument(
        "--decoder-freeze-embed",
        default=False,
        action="store_true",
        # This help text used to be a verbatim copy of the encoder option's
        # and wrongly described the encoder embedding.
        help=(
            "whether to freeze the decoder embedding or allow it to be "
            "updated during training"
        ),
    )
    parser.add_argument(
        "--decoder-layers", type=int, metavar="N", help="num decoder layers"
    )
    parser.add_argument(
        "--decoder-attention-heads",
        type=int,
        metavar="N",
        help="num decoder attention heads",
    )
    parser.add_argument(
        "--decoder-learned-pos",
        default=False,
        action="store_true",
        help="use learned positional embeddings in the decoder",
    )
    parser.add_argument(
        "--decoder-normalize-before",
        default=False,
        action="store_true",
        help="apply layernorm before each decoder block",
    )
    parser.add_argument(
        "--decoder-layerdrop",
        type=float,
        metavar="D",
        default=0,
        help="LayerDrop probability for decoder",
    )
    # No ``type`` is given, so a command-line value is kept as the raw string.
    parser.add_argument(
        "--decoder-layers-to-keep",
        default=None,
        help="which layers to *keep* when pruning as a comma-separated list",
    )

    # --- Embedding sharing and output layer ---------------------------------
    parser.add_argument(
        "--share-decoder-input-output-embed",
        default=False,
        action="store_true",
        help="share decoder input and output embeddings",
    )
    parser.add_argument(
        "--share-all-embeddings",
        default=False,
        action="store_true",
        help="share encoder, decoder and output embeddings"
        " (requires shared dictionary and embed dim)",
    )
    # No ``type`` is given, so a command-line value is kept as the raw string.
    parser.add_argument(
        "--adaptive-softmax-cutoff",
        default=None,
        metavar="EXPR",
        help="comma separated list of adaptive softmax cutoff points. "
        "Must be used with adaptive_loss criterion",
    )
    parser.add_argument(
        "--decoder-out-embed-dim",
        default=None,
        type=int,
        metavar="N",
        # The trailing space matters: adjacent literals are concatenated
        # verbatim, and without it the help read "beforeoutput layer".
        help="decoder output embedding dimension (bottleneck layer before "
        "output layer if specified.)",
    )
    parser.add_argument(
        "--aan",
        default=False,
        action="store_true",
        help="use average attention network (AAN) instead of decoder "
        "self-attention",
    )

    # Args for vocab reduction
    vocab_reduction.add_args(parser)