in main.py [0:0]
def get_parser():
    """Build and return the argparse parser with all command-line options.

    Options are grouped by concern: model architecture, optimization, data,
    plotting, and miscellaneous runtime flags. Sub-modules (transformer_seq,
    distributed, adaptive_span, compressive, expire_span, feedback) register
    their own arguments on the same parser before it is returned.

    Returns:
        argparse.ArgumentParser: the fully configured parser.
    """
    parser = argparse.ArgumentParser()
    # model related
    parser.add_argument("--hid-sz", type=int, default=256, help="hidden size")
    parser.add_argument(
        "--inner-hid-sz", type=int, default=1024, help="inner hidden size of FF layer"
    )
    parser.add_argument("--nlayers", type=int, default=8, help="number of layers")
    parser.add_argument("--mem-sz", type=int, default=64, help="memory size")
    parser.add_argument(
        "--nheads", type=int, default=2, help="number of attention heads"
    )
    parser.add_argument(
        "--dropout", type=float, default=0.2, help="dropout rate of ReLU and attention"
    )
    parser.add_argument(
        "--compress",
        action="store_true",
        default=False,
        help="use the compressive transformer",
    )
    parser.add_argument(
        "--feedback",
        action="store_true",
        default=False,
        help="use the feedback transformer, computing one step at a time like RNNs",
    )
    parser.add_argument(
        "--expire-span",
        action="store_true",
        default=False,
        help="compute expiration span for each memory",
    )
    # optimization related
    parser.add_argument("--lr", type=float, default=0.03, help="learning rate")
    parser.add_argument("--momentum", type=float, default=0.9, help="SGD momentum")
    parser.add_argument("--batch-sz", type=int, default=64, help="batch size")
    parser.add_argument(
        "--test-batch-sz",
        type=int,
        default=0,
        help="set different batch size for test and val data if greater than 0",
    )
    parser.add_argument(
        "--nbatches", type=int, default=1000, help="number of batches in each epoch"
    )
    parser.add_argument(
        "--nepochs", type=int, default=1000, help="number of epochs to train"
    )
    parser.add_argument(
        "--optim",
        type=str,
        default="sgd",
        choices=("sgd", "adam"),
        help="optimization method",
    )
    parser.add_argument(
        "--lr-warmup",
        type=int,
        default=0,
        help="linearly increase LR from 0 during K updates",
    )
    parser.add_argument(
        "--lr-decay",
        action="store_true",
        default=False,
        help="decay learning rate with cosine scheduler",
    )
    parser.add_argument(
        "--grad-clip", type=float, default=0, help="clip gradient value",
    )
    parser.add_argument(
        "--split-batch",
        type=int,
        default=1,
        help="split batches into smaller pieces so it can fit in memory",
    )
    # data related
    parser.add_argument(
        "--data", type=str, help="data file location", required=True,
    )
    parser.add_argument(
        "--data-type",
        type=str,
        default="char",
        choices=["char", "word"],
        help="data type",
    )
    parser.add_argument(
        "--data-eos",
        action="store_true",
        default=False,
        # fixed typo: was "as as token"
        help="include the end-of-line as a token",
    )
    parser.add_argument(
        "--data-omit-labels",
        nargs="+",
        type=str,
        default=[],
        help="do not train on those labels",
    )
    # plotting
    parser.add_argument(
        "--plot", action="store_true", default=False, help="plot in tensorboard"
    )
    parser.add_argument(
        "--plot-dir", type=str, default="tensorboard_runs", help="tensorboard log dir",
    )
    parser.add_argument(
        "--plot-name",
        type=str,
        default="",
        help="tensorboard log name (default: datetime)",
    )
    # misc
    parser.add_argument(
        "--no-cuda", action="store_true", default=False, help="disables CUDA training"
    )
    parser.add_argument(
        "--checkpoint", type=str, default="", help="path to save/load model"
    )
    parser.add_argument(
        "--checkpoint-freq", type=int, default=0, help="how often to keep a copy"
    )
    parser.add_argument(
        "--full-test",
        action="store_true",
        default=False,
        help="do testing on whole data",
    )
    parser.add_argument(
        "--full-valid",
        action="store_true",
        default=False,
        help="do validation on whole data (during training)",
    )
    parser.add_argument(
        "--lazy-load-data",
        action="store_true",
        default=False,
        help="moves data to GPU one sample at a time",
    )
    # let each sub-module register its own command-line options
    transformer_seq.add_args(parser)
    distributed.add_args(parser)
    adaptive_span.add_args(parser)
    compressive.add_args(parser)
    expire_span.add_args(parser)
    feedback.add_args(parser)
    return parser