# training/distributed_training/pytorch/model_parallel/bert/bert_example/sagemaker_smp_pretrain.py
import argparse
import os


def parse_arguments():
    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument(
        "--input_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain .hdf5 files for the task.",
    )
    parser.add_argument(
        "--config_file", default=None, type=str, required=True, help="The BERT model config."
    )
    parser.add_argument(
        "--bert_model",
        default="bert-large-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written.",
    )
    ## Other parameters
    parser.add_argument(
        "--init_checkpoint",
        default=None,
        type=str,
        help="The initial checkpoint to start training from.",
    )
    parser.add_argument(
        "--max_seq_length",
        default=512,
        type=int,
        help="The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.",
    )
    parser.add_argument(
        "--max_predictions_per_seq",
        default=80,
        type=int,
        help="The maximum total number of masked tokens per input sequence.",
    )
    parser.add_argument(
        "--train_batch_size", default=32, type=int, help="Total batch size for training."
    )
    parser.add_argument(
        "--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam."
    )
    parser.add_argument(
        "--num_train_epochs",
        default=3.0,
        type=float,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--max_steps", default=1000, type=int, help="Total number of training steps to perform."
    )
    parser.add_argument(
        "--warmup_proportion",
        default=0.01,
        type=float,
        help="Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.",
    )
    parser.add_argument(
        "--local_rank",
        type=int,
        default=os.getenv("LOCAL_RANK", -1),
        help="local_rank for distributed training on gpus",
    )
    parser.add_argument("--seed", type=int, default=4211, help="random seed for initialization")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of update steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--fp16", default=0, type=int, help="Whether to use mixed precision training."
    )
    parser.add_argument(
        "--amp", default=0, type=int, help="Mixed precision training (alias for --fp16)."
    )
    parser.add_argument(
        "--loss_scale",
        type=float,
        default=0.0,
        help="Loss scaling; positive power-of-2 values can improve fp16 convergence.",
    )
parser.add_argument("--log_freq", type=float, default=1.0, help="frequency of logging loss.")
parser.add_argument(
"--checkpoint_activations",
default=0,
type=int,
help="Whether to use gradient checkpointing",
)
parser.add_argument(
"--resume_from_checkpoint",
default=0,
type=int,
help="Whether to resume training from checkpoint.",
)
parser.add_argument("--s3_checkpoint_uri", default=None, type=str, help="S3 Checkpoint URI")
parser.add_argument("--resume_step", type=int, default=-1, help="Step to resume training from.")
parser.add_argument(
"--num_steps_per_checkpoint",
type=int,
default=100,
help="Number of update steps until a model checkpoint is saved to disk.",
)
parser.add_argument(
"--skip_checkpoint", default=0, type=int, help="Whether to save checkpoints"
)
parser.add_argument("--phase2", default=0, type=int, help="Whether to train with seq len 512")
parser.add_argument(
"--allreduce_post_accumulation",
default=0,
type=int,
help="Whether to do allreduces during gradient accumulation steps.",
)
parser.add_argument(
"--allreduce_post_accumulation_fp16",
default=0,
type=int,
help="Whether to do fp16 allreduce post accumulation.",
)
    parser.add_argument(
        "--phase1_end_step",
        type=int,
        default=7038,
        help="Number of training steps in Phase 1 (seq len 128).",
    )
    parser.add_argument(
        "--init_loss_scale", type=int, default=2 ** 20, help="Initial loss scale value."
    )
    parser.add_argument("--do_train", default=0, type=int, help="Whether to run training.")
    parser.add_argument(
        "--use_env",
        default=0,
        type=int,
        help="Whether to read local_rank from the LOCAL_RANK environment variable.",
    )
    parser.add_argument(
        "--disable_progress_bar", default=0, type=int, help="Disable the tqdm progress bar."
    )
    parser.add_argument(
        "--steps_this_run",
        type=int,
        default=1000,
        help="If provided, only run this many steps before exiting.",
    )
parser.add_argument("--ddp", type=int, default=0)
parser.add_argument("--smp", type=int, default=0)
parser.add_argument("--num_microbatches", type=int, default=1)
parser.add_argument("--pipeline", type=str, default="interleaved")
parser.add_argument("--apply_optimizer", type=int, default=1)
parser.add_argument("--use_sequential", type=int, default=0)
parser.add_argument("--param_weight", type=float, default=0.3)
parser.add_argument("--overlapping_allreduce", type=int, default=1)
parser.add_argument("--save-full", default=0, type=int, help="Save the full model")
parser.add_argument("--partial-checkpoint", type=bool, default=True)
parser.add_argument("--mp_parameters", type=str, default="")
    args = parser.parse_args()
    args.fp16 = args.fp16 or args.amp
    if args.resume_from_checkpoint:
        args.steps_this_run += 100
    if args.steps_this_run < 0:
        args.steps_this_run = args.max_steps
    return args
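

# Minimal usage sketch: simulate a typical invocation to smoke-test the parser.
# The argv values below (paths, flag settings) are illustrative assumptions,
# not canonical settings for this script; the real entry point elsewhere in
# the file wires parse_arguments() into the training loop.
if __name__ == "__main__":
    import sys

    sys.argv = [
        "sagemaker_smp_pretrain.py",
        "--input_dir", "/opt/ml/input/data/train",
        "--config_file", "bert_config.json",
        "--output_dir", "/opt/ml/checkpoints",
        "--do_train", "1",
        "--smp", "1",
        "--num_microbatches", "4",
        "--pipeline", "interleaved",
    ]
    args = parse_arguments()
    # With --fp16/--amp left at 0, args.fp16 stays 0, and steps_this_run keeps
    # its default of 1000 because --resume_from_checkpoint was not passed.
    print(args)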