in parlai/agents/transformer/transformer.py
def add_common_cmdline_args(parser):
"""
Add common command line args.
"""
parser.add_argument(
'-esz',
'--embedding-size',
type=int,
default=300,
help='Size of all embedding layers. Must be a multiple of --n-heads.',
)
parser.add_argument(
'-nl', '--n-layers', type=int, default=2, help='Number of transformer layers.'
)
parser.add_argument(
'-hid',
'--ffn-size',
type=int,
default=300,
        help='Hidden size of the FFN layers.',
)
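    # Dropout is controlled separately for embeddings/residual connections
    # (--dropout), attention weights (--attention-dropout), and FFN
    # activations (--relu-dropout).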
parser.add_argument(
'--dropout',
type=float,
default=0.0,
        help='Dropout used around embeddings and before layer normalizations. '
        'This is used in Vaswani 2017 and works well on large datasets.',
)
parser.add_argument(
'--attention-dropout',
type=float,
default=0.0,
help='Dropout used after attention softmax. This is not used in Vaswani 2017.',
)
parser.add_argument(
'--relu-dropout',
type=float,
default=0.0,
help='Dropout used after the ReLU in the FFN. Not used in Vaswani 2017, '
'but used in Tensor2Tensor.',
)
parser.add_argument(
        '--n-heads', type=int, default=2, help='Number of multihead attention heads.'
)
parser.add_argument(
'--learn-positional-embeddings',
type='bool',
default=False,
help='If off, sinusoidal embeddings are used. If on, position embeddings are '
'learned from scratch.',
)
parser.add_argument('--embeddings-scale', type='bool', default=True)
parser.add_argument(
'--n-positions',
type=int,
default=None,
hidden=True,
        help='Number of positional embeddings to learn. Defaults '
        'to the truncation length (--truncate), or 1024 if that is not provided.',
)
parser.add_argument(
'--n-segments',
type=int,
default=0,
        help='The number of segments the model supports. '
        'If zero, no segment (langs_embedding) embeddings are used.',
)
parser.add_argument(
'--variant',
choices={'aiayn', 'xlm', 'prelayernorm', 'bart'},
default='aiayn',
help='Chooses locations of layer norms, etc. prelayernorm '
'is used to match some fairseq models',
recommended='xlm',
)
parser.add_argument(
'--activation',
choices={'relu', 'gelu'},
default='relu',
help='Nonlinear activation to use. AIAYN uses relu, but '
'more recent papers prefer gelu.',
recommended='gelu',
)
parser.add_argument(
'--output-scaling',
type=float,
default=1.0,
        help='Scale the output of every transformer by this quantity.',
)
parser.add_argument(
'--share-word-embeddings',
type='bool',
default=True,
        help='Share word embeddings table for candidate and context '
        'in the memory network.',
)
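    # Asymmetric encoder/decoder depth: values >= 0 override --n-layers for the
    # corresponding side; the default of -1 keeps --n-layers for both.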
parser.add_argument(
'-nel',
'--n-encoder-layers',
type=int,
default=-1,
        help='This will override --n-layers for asymmetrical transformers.',
)
parser.add_argument(
'-ndl',
'--n-decoder-layers',
type=int,
default=-1,
        help='This will override --n-layers for asymmetrical transformers.',
)
parser.add_argument(
'--model-parallel',
type='bool',
default=False,
help='Shard the layers across multiple GPUs.',
)
parser.add_argument(
'--checkpoint-activations',
type='bool',
default=False,
help='Recompute activations on backward pass to conserve memory.',
)
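

def _example_usage():
    """
    Illustrative sketch (not part of the upstream file) of how these flags are
    typically registered.

    Assumes ParlAI's ParlaiParser, which provides the custom type='bool' and
    the hidden=/recommended= add_argument extensions used above; stock
    argparse would reject those keywords.
    """
    from parlai.core.params import ParlaiParser

    parser = ParlaiParser(add_parlai_args=True, add_model_args=True)
    group = parser.add_argument_group('Transformer Arguments')
    add_common_cmdline_args(group)
    opt = parser.parse_args(['--n-heads', '4', '--embedding-size', '256'])
    # --embedding-size must be a multiple of --n-heads (see its help string).
    assert opt['embedding_size'] % opt['n_heads'] == 0
    return opt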