def get_parser()

in main.py [0:0]


def get_parser():
    """Build the command-line argument parser for this script.

    The options defined directly here fall into five groups: model,
    optimization, data, plotting and miscellaneous. Among them, only
    ``--data`` is required; every other option has a default.

    After the local options are registered, the same parser is passed to
    ``add_args`` of ``transformer_seq``, ``distributed``, ``adaptive_span``,
    ``compressive``, ``expire_span`` and ``feedback`` (in that order).

    Returns:
        The populated ``argparse.ArgumentParser``. Nothing is parsed here;
        the caller is expected to call ``parse_args()`` on the result.
    """
    parser = argparse.ArgumentParser()
    # model related
    parser.add_argument("--hid-sz", type=int, default=256, help="hidden size")
    parser.add_argument(
        "--inner-hid-sz", type=int, default=1024, help="inner hidden size of FF layer"
    )
    parser.add_argument("--nlayers", type=int, default=8, help="number of layers")
    parser.add_argument("--mem-sz", type=int, default=64, help="memory size")
    parser.add_argument(
        "--nheads", type=int, default=2, help="number of attention heads"
    )
    parser.add_argument(
        "--dropout", type=float, default=0.2, help="dropout rate of ReLU and attention"
    )
    parser.add_argument(
        "--compress",
        action="store_true",
        default=False,
        help="use the compressive transformer",
    )
    parser.add_argument(
        "--feedback",
        action="store_true",
        default=False,
        help="use the feedback transformer, computing one step at a time like RNNs",
    )
    parser.add_argument(
        "--expire-span",
        action="store_true",
        default=False,
        help="compute expiration span for each memory",
    )
    # optimization related
    parser.add_argument("--lr", type=float, default=0.03, help="learning rate")
    parser.add_argument("--momentum", type=float, default=0.9, help="SGD momentum")
    parser.add_argument("--batch-sz", type=int, default=64, help="batch size")
    parser.add_argument(
        "--test-batch-sz",
        type=int,
        default=0,
        help="set different batch size for test and val data if greater than 0",
    )
    parser.add_argument(
        "--nbatches", type=int, default=1000, help="number of batches in each epoch"
    )
    parser.add_argument(
        "--nepochs", type=int, default=1000, help="number of epochs to train"
    )
    parser.add_argument(
        "--optim",
        type=str,
        default="sgd",
        choices=("sgd", "adam"),
        help="optimization method",
    )
    parser.add_argument(
        "--lr-warmup",
        type=int,
        default=0,
        help="linearly increase LR from 0 during K updates",
    )
    parser.add_argument(
        "--lr-decay",
        action="store_true",
        default=False,
        help="decay learning rate with cosine scheduler",
    )
    parser.add_argument(
        "--grad-clip", type=float, default=0, help="clip gradient value",
    )
    parser.add_argument(
        "--split-batch",
        type=int,
        default=1,
        help="split batches into smaller pieces so it can fit in memory",
    )
    # data related
    parser.add_argument(
        "--data", type=str, help="data file location", required=True,
    )
    parser.add_argument(
        "--data-type",
        type=str,
        default="char",
        choices=["char", "word"],
        help="data type",
    )
    parser.add_argument(
        "--data-eos",
        action="store_true",
        default=False,
        help="include the end-of-line as a token",
    )
    parser.add_argument(
        "--data-omit-labels",
        nargs="+",
        type=str,
        default=[],
        help="do not train on those labels",
    )
    # plotting
    parser.add_argument(
        "--plot", action="store_true", default=False, help="plot in tensorboard"
    )
    parser.add_argument(
        "--plot-dir", type=str, default="tensorboard_runs", help="tensorboard log dir",
    )
    parser.add_argument(
        "--plot-name",
        type=str,
        default="",
        help="tensorboard log name (default: datetime)",
    )
    # misc
    parser.add_argument(
        "--no-cuda", action="store_true", default=False, help="disables CUDA training"
    )
    parser.add_argument(
        "--checkpoint", type=str, default="", help="path to save/load model"
    )
    parser.add_argument(
        "--checkpoint-freq", type=int, default=0, help="how often to keep a copy"
    )
    parser.add_argument(
        "--full-test",
        action="store_true",
        default=False,
        help="do testing on whole data",
    )
    parser.add_argument(
        "--full-valid",
        action="store_true",
        default=False,
        help="do validation on whole data (during training)",
    )
    parser.add_argument(
        "--lazy-load-data",
        action="store_true",
        default=False,
        help="moves data to GPU one sample at a time",
    )
    # Hand the same parser to each feature module's add_args hook
    # (presumably so each can register its own options on it).
    transformer_seq.add_args(parser)
    distributed.add_args(parser)
    adaptive_span.add_args(parser)
    compressive.add_args(parser)
    expire_span.add_args(parser)
    feedback.add_args(parser)
    return parser