in fairseq/models/roberta/model.py [0:0]
def add_args(parser):
"""Add model-specific arguments to the parser."""
parser.add_argument(
"--encoder-layers", type=int, metavar="L", help="num encoder layers"
)
parser.add_argument(
"--encoder-embed-dim",
type=int,
metavar="H",
help="encoder embedding dimension",
)
parser.add_argument(
"--encoder-ffn-embed-dim",
type=int,
metavar="F",
help="encoder embedding dimension for FFN",
)
parser.add_argument(
"--encoder-attention-heads",
type=int,
metavar="A",
help="num encoder attention heads",
)
parser.add_argument(
"--activation-fn",
choices=utils.get_available_activation_fns(),
help="activation function to use",
)
parser.add_argument(
"--pooler-activation-fn",
choices=utils.get_available_activation_fns(),
help="activation function to use for pooler layer",
)
parser.add_argument(
"--encoder-normalize-before",
action="store_true",
help="apply layernorm before each encoder block",
)
parser.add_argument(
"--layernorm-embedding",
action="store_true",
help="add layernorm to embedding",
)
parser.add_argument(
"--dropout", type=float, metavar="D", help="dropout probability"
)
parser.add_argument(
"--attention-dropout",
type=float,
metavar="D",
help="dropout probability for attention weights",
)
parser.add_argument(
"--activation-dropout",
type=float,
metavar="D",
help="dropout probability after activation in FFN",
)
parser.add_argument(
"--pooler-dropout",
type=float,
metavar="D",
help="dropout probability in the masked_lm pooler layers",
)
parser.add_argument(
"--max-positions", type=int, help="number of positional embeddings to learn"
)
parser.add_argument(
"--load-checkpoint-heads",
action="store_true",
help="(re-)register and load heads when loading checkpoints",
)
parser.add_argument(
"--untie-weights-roberta",
action="store_true",
help="Untie weights between embeddings and classifiers in RoBERTa",
)
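# Example (hypothetical invocation): a roberta-base-sized encoder would typically pass
#   --encoder-layers 12 --encoder-embed-dim 768 --encoder-ffn-embed-dim 3072 --encoder-attention-heads 12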
# args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
parser.add_argument(
"--encoder-layerdrop",
type=float,
metavar="D",
default=0,
help="LayerDrop probability for encoder",
)
parser.add_argument(
"--encoder-layers-to-keep",
default=None,
help="which layers to *keep* when pruning as a comma-separated list",
)
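# Example (hypothetical): training with --encoder-layerdrop 0.2 makes the encoder robust to
# dropping layers, so a pruned model can later keep only a subset, e.g.
#   --encoder-layers-to-keep 0,2,4,6,8,10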
# args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
parser.add_argument(
"--quant-noise-pq",
type=float,
metavar="D",
default=0,
help="iterative PQ quantization noise at training time",
)
parser.add_argument(
"--quant-noise-pq-block-size",
type=int,
metavar="D",
default=8,
help="block size of quantization noise at training time",
)
parser.add_argument(
"--quant-noise-scalar",
type=float,
metavar="D",
default=0,
help="scalar quantization noise and scalar quantization at training time",
)
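# Example (hypothetical): --quant-noise-pq 0.1 --quant-noise-pq-block-size 8 applies
# product-quantization noise to a fraction (0.1) of the weight blocks during training.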
# args for "Better Fine-Tuning by Reducing Representational Collapse" (Aghajanyan et al. 2020)
parser.add_argument(
"--spectral-norm-classification-head",
action="store_true",
default=False,
help="Apply spectral normalization on the classification head",
)
# args for Fully Sharded Data Parallel (FSDP) training
parser.add_argument(
"--min-params-to-wrap",
type=int,
metavar="D",
default=DEFAULT_MIN_PARAMS_TO_WRAP,
help=(
"minimum number of params for a layer to be wrapped with FSDP() when "
"training with --ddp-backend=fully_sharded. Smaller values will "
"improve memory efficiency, but may make torch.distributed "
"communication less efficient due to smaller input sizes. This option "
"is set to 0 (i.e., always wrap) when --checkpoint-activations or "
"--offload-activations are passed."
),
)
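# Example (hypothetical): with --ddp-backend=fully_sharded, --min-params-to-wrap 100000000
# wraps only layers with at least 1e8 parameters in their own FSDP instance.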
# args for AdaPruning
# In short, it adds a regularization term for the multi-head attention modules and feed-forward networks
# For more details, please refer to the paper https://openreview.net/forum?id=_CMSV7FTzGI
parser.add_argument(
"--mha-reg-scale-factor",
type=float,
metavar="D",
default=0.0,
help="scaling factor for regularization term in adptive pruning, recommendation is 0.000375",
)
parser.add_argument(
"--ffn-reg-scale-factor",
type=float,
metavar="D",
default=0.0,
help="scaling factor for regularization term in adptive pruning, recommendation is 0.000375",
)
parser.add_argument(
"--mha-heads-to-keep",
type=int,
metavar="D",
default=-1,
help="number of heads to keep in each multi-head attention module, -1 means keeping all heads",
)
parser.add_argument(
"--ffn-blocks-to-remove",
type=int,
metavar="D",
default=-1,
help="number of feedforward blocks to remove in each transformer layer, -1 means keeping all ffn blocks",
)
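A minimal usage sketch (not from the source): it assumes the `add_args` above is callable as a plain function and stubs out the two module-level names it references, `utils.get_available_activation_fns` and `DEFAULT_MIN_PARAMS_TO_WRAP`, which in fairseq are defined elsewhere in the library. argparse turns each `--foo-bar` flag into an `args.foo_bar` attribute:

import argparse

# Hypothetical stand-in for fairseq.utils.get_available_activation_fns();
# the real helper returns the registered activation-function names.
class utils:
    @staticmethod
    def get_available_activation_fns():
        return ["relu", "gelu", "tanh"]

# Assumed placeholder for fairseq's module-level constant.
DEFAULT_MIN_PARAMS_TO_WRAP = int(1e8)

parser = argparse.ArgumentParser()
add_args(parser)  # the function shown above

# A roberta-base-sized encoder with LayerDrop and adaptive-pruning regularization.
args = parser.parse_args([
    "--encoder-layers", "12",
    "--encoder-embed-dim", "768",
    "--encoder-ffn-embed-dim", "3072",
    "--encoder-attention-heads", "12",
    "--activation-fn", "gelu",
    "--dropout", "0.1",
    "--encoder-layerdrop", "0.2",
    "--mha-reg-scale-factor", "0.000375",
])
print(args.encoder_embed_dim, args.encoder_layerdrop, args.mha_reg_scale_factor)
# -> 768 0.2 0.000375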