in training/distributed_training/pytorch/model_parallel/gpt2/train_gpt_simple.py [0:0]
def parse_args():
parser = argparse.ArgumentParser()
# Hyperparameters sent by the client are passed as command-line arguments to the script.
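# For example, an estimator hyperparameters dict such as
#   {"train_batch_size": 4, "max_steps": 5000, "tensor_parallel_degree": 8}
# reaches this script as
#   --train_batch_size 4 --max_steps 5000 --tensor_parallel_degree 8
# (illustrative values; any of the arguments defined below can be set this way).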
opt_grp = parser.add_argument_group(
title="optimization", description="arguments for optimization"
)
opt_grp.add_argument(
"--train_batch_size",
type=int,
default=4,
help="batch size per data-parallel (dp) rank; with tensor parallel degree 8 and pipeline parallel degree 1 this means 8 * this batch size per node",
)
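# Note: --train_batch_size is per data-parallel rank. As the help above states, with
# tensor parallel degree 8 and pipeline parallel degree 1 on an 8-GPU node this comes
# to 8 * train_batch_size samples per node per step (e.g. 8 * 4 = 32 with the default).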
opt_grp.add_argument("--val_batch_size", type=int, default=4)
opt_grp.add_argument("--max_steps", type=int, default=5000)
opt_grp.add_argument("--seed", type=int, default=12345)
opt_grp.add_argument("--same_seed", type=int, default=0)
opt_grp.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])
opt_grp.add_argument("--fp16", default=0, type=int, help="enable FP16 automatic mixed precision training")
opt_grp.add_argument(
"--fp32_grad_accumulation", default=0, type=int, help="enable FP32 gradient accumulation"
)
opt_grp.add_argument("--grad_clip", default=1.0, type=float, help="gradient clipping")
opt_grp.add_argument("--weight_decay", default=0.01, type=float, help="weight decay")
opt_grp.add_argument(
"--beta1", default=0.9, type=float, help="beta1 parameter for Adam optimizer"
)
opt_grp.add_argument(
"--beta2", default=0.95, type=float, help="beta2 parameter for Adam optimizer"
)
opt_grp.add_argument(
"--activation_checkpointing",
type=int,
default=1,
help="enable gradient checkpointing to reduce memory consumption",
)
parser.add_argument(
"--logging_freq", type=int, default=1, help="number of iterations between logging"
)
# I/O
io_grp = parser.add_argument_group(title="io", description="location for input and output")
io_grp.add_argument("--use_wiki_data", type=int, default=0, help="use wiki corpus data for training")
io_grp.add_argument("--zipped_data", type=int, default=1, help="input data is zipped files")
io_grp.add_argument(
"--epochs", type=int, default=3, help="number of passes over the training dataset"
)
io_grp.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
io_grp.add_argument(
"--checkpoint-dir",
type=str,
default="/opt/ml/checkpoints",
help="Saves partial checkpoints (model, optimizer) to this dir, and loads latest checkpoint from this if load_partial is specified.",
)
io_grp.add_argument(
"--model-dir",
type=str,
default=os.environ["SM_MODEL_DIR"],
help="Saves full model for inference to this dir. Also used if load_full is given to load the model. Note the lack of optimizer state here.",
)
io_grp.add_argument("--training-dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
io_grp.add_argument("--test-dir", type=str, default=os.environ["SM_CHANNEL_TEST"])
io_grp.add_argument(
"--parallel_proc_data_processing",
type=int,
default=0,
help="Load data in parallel with a separate process. At any point a process can have two files in memory. With tensor parallelism, each of the 8 processes on an instance will then have 2 files in memory; depending on file sizes this may or may not be feasible. With pipeline parallelism this was not a problem, as only one rank per instance loaded data.",
)
io_grp.add_argument(
"--save_final_full_model",
type=int,
default=0,
help="Enabling this will save a combined model only at the end",
)
io_grp.add_argument(
"--skip_full_optimizer",
type=int,
default=1,
help="Disabling this will also save the full optimizer state",
)
io_grp.add_argument("--load_partial", type=int, default=0, help="Load from partial checkpoints")
io_grp.add_argument("--load_full", type=int, default=0, help="Load from full checkpoints")
io_grp.add_argument(
"--logits_output", type=str, default="", help="Path to save logits and loss"
)
io_grp.add_argument("--prescaled_batch", type=int, default=1, help="use prescaled batch")
# configure model size
model_grp = parser.add_argument_group(
title="model", description="arguments to describe model configuration"
)
model_grp.add_argument("--max_context_width", type=int, default=1024)
model_grp.add_argument("--hidden_width", type=int, default=768)
model_grp.add_argument("--num_layers", type=int, default=12)
model_grp.add_argument("--num_heads", type=int, default=12)
model_grp.add_argument("--resid_pdrop", type=float, default=0.1)
model_grp.add_argument("--embd_pdrop", type=float, default=0.1)
model_grp.add_argument("--attn_pdrop", type=float, default=0.1)
model_grp.add_argument("--summary_first_pdrop", type=float, default=0.1)
model_grp.add_argument("--use_adamw", type=int, default=0, help="Use the AdamW optimizer")
smp_grp = parser.add_argument_group(title="smp", description="smp")
smp_grp.add_argument("--tensor_parallel_degree", type=int, default=8)
smp_grp.add_argument("--pipeline_parallel_degree", type=int, default=1)
smp_grp.add_argument("--microbatches", type=int, default=1)
smp_grp.add_argument("--active_microbatches", type=int, default=None)
smp_grp.add_argument("--optimize", type=str, default="speed")
smp_grp.add_argument("--activation_strategy", type=str, default="each")
smp_grp.add_argument("--shard_optimizer_state", type=int, default=0)
smp_grp.add_argument("--offload_activations", type=int, default=0)
smp_grp.add_argument("--fast_mode", type=int, default=0)
smp_grp.add_argument("--static_mode", type=int, default=0)
smp_grp.add_argument("--delayed_param", type=int, default=0)
smp_grp.add_argument("--same_partition_load", type=int, default=0)
smp_grp.add_argument("--attention_in_fp32", type=int, default=1)
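# A quick sanity check on the degrees above (assumed relationship, standard for combined
# tensor/pipeline/data parallelism, not spelled out in this script):
#   tensor_parallel_degree * pipeline_parallel_degree * data_parallel_degree == total GPUs
# With the defaults (tp=8, pp=1) on 8-GPU nodes, the data-parallel degree equals the
# number of nodes in the job.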
parser.add_argument(
"--num_kept_checkpoints",
type=int,
default=5,
help="how many checkpoints to keep before deleting",
)
parser.add_argument(
"--checkpoint_freq",
type=int,
default=10000,
help="number of iterations between checkpointing",
)
parser.add_argument(
"--validation_freq",
type=int,
default=None,
help="number of iterations between printing validation loss",
)
parser.add_argument(
"--validation_batches",
type=int,
default=10,
help="number of batches to estimate validation loss",
)
parser.add_argument(
"--manual_partition",
type=int,
default=0,
help="manually partition layers, distributing them evenly across the pipeline partitions",
)
parser.add_argument(
"--match_weights", type=int, default=0, help="Get weights from the original model"
)
parser.add_argument(
"--preserve_np_state",
type=int,
default=0,
help="Preserve the numpy random state between validation runs",
)
parser.add_argument(
"--fast_validation",
type=int,
default=1,
help="Run validation only on the last data file for faster validation",
)
# learning rate
lr_grp = parser.add_argument_group(
title="lr", description="arguments for learning rate schedule"
)
lr_grp.add_argument("--lr", type=float, default=None, help="Initial learning rate.")
lr_grp.add_argument(
"--lr_decay_style",
type=str,
default="linear",
choices=["constant", "linear", "cosine", "exponential", "plateau"],
help="Learning rate decay function.",
)
lr_grp.add_argument(
"--lr_decay_iters",
type=int,
default=None,
help="number of iterations to decay learning rate over; if None, defaults to the number of training iterations",
)
lr_grp.add_argument(
"--min_lr",
type=float,
default=0.0,
help="Minimum value for learning rate. The scheduler clips values below this threshold.",
)
lr_grp.add_argument(
"--warmup",
type=float,
default=0.01,
help="Fraction of total iterations to warm up over (0.01 = 1 percent of all training iters).",
)
lr_grp.add_argument(
"--plateau",
type=float,
default=0.4,
help="Fraction of total iterations to hold the learning rate at its maximum when using the plateau schedule",
)
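# A rough sketch of how these fractions translate into schedule lengths (assumed
# bookkeeping for illustration, not the script's exact implementation): with
# max_steps = 5000, warmup = 0.01 and plateau = 0.4,
#   warmup_iters  = int(args.warmup  * args.max_steps)     # 50 warmup iterations
#   plateau_iters = int(args.plateau * args.max_steps)     # 2000 iterations held at max lr
#   decay_iters   = args.lr_decay_iters or args.max_steps  # decay horizon
# and the learning rate is never scheduled below args.min_lr.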
ci_grp = parser.add_argument_group(title="ci", description="ci related settings")
ci_grp.add_argument("--ci", default=False, action="store_true", help="Whether to enable CI checks")
ci_grp.add_argument("--time_to_train", type=int, help="time-to-train threshold")
ci_grp.add_argument("--throughput", type=float, help="throughput threshold")
ci_grp.add_argument("--loss", type=float, help="loss threshold")
ci_grp.add_argument(
"--save_or_verify_ckptsum", default=False, action="store_true", help="Whether to save or verify the checkpoint checksum"
)
args, _ = parser.parse_known_args()
return args
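

# The helper below is a hypothetical illustration (not part of the original script) of
# how the lr-group arguments could drive a schedule: linear warmup over
# warmup * max_steps steps, then linear decay over lr_decay_iters (or max_steps),
# clipped at min_lr. The actual training loop builds its scheduler elsewhere.
def _example_linear_lr(step, args):
    """Illustrative learning rate at `step` for lr_decay_style == "linear"."""
    max_lr = args.lr if args.lr is not None else 1.0e-4  # assumed fallback for this sketch
    warmup_iters = max(1, int(args.warmup * args.max_steps))
    decay_iters = args.lr_decay_iters if args.lr_decay_iters is not None else args.max_steps
    if step < warmup_iters:
        # ramp linearly from 0 to max_lr during warmup
        return max_lr * step / warmup_iters
    # decay linearly to min_lr over the remaining iterations
    remaining = max(0.0, 1.0 - (step - warmup_iters) / max(1, decay_iters - warmup_iters))
    return max(args.min_lr, max_lr * remaining)
# e.g. with the defaults, _example_linear_lr(0, args) == 0.0 and
# _example_linear_lr(args.max_steps, args) == args.min_lr.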