in parlai/agents/transformer/transformer.py
def add_common_cmdline_args(parser):
"""
Add common command line args.
"""
parser.add_argument(
'-esz',
'--embedding-size',
type=int,
default=300,
help='Size of all embedding layers. Must be a multiple of --n-heads.',
)
parser.add_argument(
'-nl', '--n-layers', type=int, default=2, help='Number of transformer layers.'
)
parser.add_argument(
'-hid',
'--ffn-size',
type=int,
default=300,
        help='Hidden size of the FFN layers.',
)
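    # Dropout is controlled separately for embeddings/residual connections
    # (--dropout), attention weights (--attention-dropout), and FFN
    # activations (--relu-dropout).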
parser.add_argument(
'--dropout',
type=float,
default=0.0,
        help='Dropout used around embeddings and before layer normalizations. '
        'This is used in Vaswani 2017 and works well on large datasets.',
)
parser.add_argument(
'--attention-dropout',
type=float,
default=0.0,
help='Dropout used after attention softmax. This is not used in Vaswani 2017.',
)
parser.add_argument(
'--relu-dropout',
type=float,
default=0.0,
help='Dropout used after the ReLU in the FFN. Not used in Vaswani 2017, '
'but used in Tensor2Tensor.',
)
parser.add_argument(
        '--n-heads', type=int, default=2, help='Number of multihead attention heads.'
)
parser.add_argument(
'--learn-positional-embeddings',
type='bool',
default=False,
help='If off, sinusoidal embeddings are used. If on, position embeddings are '
'learned from scratch.',
)
parser.add_argument('--embeddings-scale', type='bool', default=True)
parser.add_argument(
'--n-positions',
type=int,
default=None,
hidden=True,
        help='Number of positional embeddings to learn. Defaults '
        'to the truncation length (--truncate), or 1024 if that is not provided.',
)
parser.add_argument(
'--n-segments',
type=int,
default=0,
        help='The number of segments the model supports. '
        'If zero, no segment (langs_embedding) embeddings are used.',
)
parser.add_argument(
'--variant',
choices={'aiayn', 'xlm', 'prelayernorm', 'bart'},
default='aiayn',
help='Chooses locations of layer norms, etc. prelayernorm '
'is used to match some fairseq models',
recommended='xlm',
)
parser.add_argument(
'--activation',
choices={'relu', 'gelu'},
default='relu',
help='Nonlinear activation to use. AIAYN uses relu, but '
'more recent papers prefer gelu.',
recommended='gelu',
)
parser.add_argument(
'--output-scaling',
type=float,
default=1.0,
        help='Scale the output of every transformer by this quantity.',
)
parser.add_argument(
'--share-word-embeddings',
type='bool',
default=True,
        help='Share word embeddings table for candidate and context '
        'in the memory network.',
)
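    # Asymmetric encoder/decoder depth: values >= 0 override --n-layers for the
    # corresponding side; the default of -1 keeps --n-layers for both.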
parser.add_argument(
'-nel',
'--n-encoder-layers',
type=int,
default=-1,
        help='This will override --n-layers for asymmetrical transformers.',
)
parser.add_argument(
'-ndl',
'--n-decoder-layers',
type=int,
default=-1,
        help='This will override --n-layers for asymmetrical transformers.',
)
parser.add_argument(
'--model-parallel',
type='bool',
default=False,
help='Shard the layers across multiple GPUs.',
)
parser.add_argument(
'--checkpoint-activations',
type='bool',
default=False,
help='Recompute activations on backward pass to conserve memory.',
)
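

def _example_usage():
    """
    Illustrative sketch (not part of the upstream file) of how these flags are
    typically registered.

    Assumes ParlAI's ParlaiParser, which provides the custom type='bool' and
    the hidden=/recommended= add_argument extensions used above; stock
    argparse would reject those keywords.
    """
    from parlai.core.params import ParlaiParser

    parser = ParlaiParser(add_parlai_args=True, add_model_args=True)
    group = parser.add_argument_group('Transformer Arguments')
    add_common_cmdline_args(group)
    opt = parser.parse_args(['--n-heads', '4', '--embedding-size', '256'])
    # --embedding-size must be a multiple of --n-heads (see its help string).
    assert opt['embedding_size'] % opt['n_heads'] == 0
    return opt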