# def parse_arguments()
#
# From: training/distributed_training/pytorch/model_parallel/bert/bert_example/sagemaker_smp_pretrain.py [0:0]


def parse_arguments():
    """Build and parse command-line arguments for BERT pre-training with
    SageMaker model parallelism.

    Returns:
        argparse.Namespace: parsed arguments, post-processed so that
        ``fp16`` is enabled when either ``--fp16`` or ``--amp`` is given,
        and ``steps_this_run`` is adjusted for checkpoint resumption or
        falls back to ``max_steps`` when negative.
    """

    def _str2bool(value):
        # argparse pitfall: ``type=bool`` treats ANY non-empty string as True,
        # so ``--partial-checkpoint False`` would silently parse as True.
        # Parse the common textual spellings explicitly instead.
        if isinstance(value, bool):
            return value
        return value.strip().lower() in ("true", "t", "yes", "y", "1")

    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--input_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain .hdf5 files  for the task.",
    )

    parser.add_argument(
        "--config_file", default=None, type=str, required=True, help="The BERT model config"
    )

    parser.add_argument(
        "--bert_model",
        default="bert-large-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )

    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written.",
    )

    ## Other parameters
    parser.add_argument(
        "--init_checkpoint",
        default=None,
        type=str,
        help="The initial checkpoint to start training from.",
    )

    parser.add_argument(
        "--max_seq_length",
        default=512,
        type=int,
        help="The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.",
    )
    parser.add_argument(
        "--max_predictions_per_seq",
        default=80,
        type=int,
        help="The maximum total of masked tokens in input sequence",
    )
    parser.add_argument(
        "--train_batch_size", default=32, type=int, help="Total batch size for training."
    )
    parser.add_argument(
        "--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam."
    )
    parser.add_argument(
        "--num_train_epochs",
        default=3.0,
        type=float,
        help="Total number of training epochs to perform.",
    )
    # NOTE(review): float (not int) matches the upstream NVIDIA BERT script;
    # steps_this_run may become a float via the fallback below — confirm
    # downstream step comparisons tolerate that.
    parser.add_argument(
        "--max_steps", default=1000, type=float, help="Total number of training steps to perform."
    )
    parser.add_argument(
        "--warmup_proportion",
        default=0.01,
        type=float,
        help="Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.",
    )
    # Falls back to the LOCAL_RANK env var (set by torch launchers) when given.
    parser.add_argument(
        "--local_rank",
        type=int,
        default=os.getenv("LOCAL_RANK", -1),
        help="local_rank for distributed training on gpus",
    )
    parser.add_argument("--seed", type=int, default=4211, help="random seed for initialization")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumualte before performing a backward/update pass.",
    )
    # int flags (0/1) are used throughout instead of store_true, presumably so
    # values can be passed through SageMaker hyperparameters.
    parser.add_argument("--fp16", default=0, type=int, help="Mixed precision training")
    parser.add_argument("--amp", default=0, type=int, help="Mixed precision training")
    parser.add_argument(
        "--loss_scale",
        type=float,
        default=0.0,
        help="Loss scaling, positive power of 2 values can improve fp16 convergence.",
    )
    parser.add_argument("--log_freq", type=float, default=1.0, help="frequency of logging loss.")
    parser.add_argument(
        "--checkpoint_activations",
        default=0,
        type=int,
        help="Whether to use gradient checkpointing",
    )

    ## Checkpointing
    parser.add_argument(
        "--resume_from_checkpoint",
        default=0,
        type=int,
        help="Whether to resume training from checkpoint.",
    )
    parser.add_argument("--s3_checkpoint_uri", default=None, type=str, help="S3 Checkpoint URI")
    parser.add_argument("--resume_step", type=int, default=-1, help="Step to resume training from.")
    parser.add_argument(
        "--num_steps_per_checkpoint",
        type=int,
        default=100,
        help="Number of update steps until a model checkpoint is saved to disk.",
    )
    parser.add_argument(
        "--skip_checkpoint", default=0, type=int, help="Whether to save checkpoints"
    )
    parser.add_argument("--phase2", default=0, type=int, help="Whether to train with seq len 512")
    parser.add_argument(
        "--allreduce_post_accumulation",
        default=0,
        type=int,
        help="Whether to do allreduces during gradient accumulation steps.",
    )
    parser.add_argument(
        "--allreduce_post_accumulation_fp16",
        default=0,
        type=int,
        help="Whether to do fp16 allreduce post accumulation.",
    )
    parser.add_argument(
        "--phase1_end_step",
        type=int,
        default=7038,
        help="Number of training steps in Phase1 - seq len 128",
    )
    parser.add_argument(
        "--init_loss_scale", type=int, default=2 ** 20, help="Initial loss scaler value"
    )
    parser.add_argument("--do_train", default=0, type=int, help="Whether to run training.")
    parser.add_argument(
        "--use_env", default=0, type=int, help="Whether to read local rank from ENVVAR"
    )
    parser.add_argument(
        "--disable_progress_bar", default=0, type=int, help="Disable tqdm progress bar"
    )
    parser.add_argument(
        "--steps_this_run",
        type=int,
        default=1000,
        help="If provided, only run this many steps before exiting",
    )

    ## SageMaker model-parallel (smp) / distributed knobs
    parser.add_argument("--ddp", type=int, default=0)
    parser.add_argument("--smp", type=int, default=0)
    parser.add_argument("--num_microbatches", type=int, default=1)
    parser.add_argument("--pipeline", type=str, default="interleaved")
    parser.add_argument("--apply_optimizer", type=int, default=1)
    parser.add_argument("--use_sequential", type=int, default=0)
    parser.add_argument("--param_weight", type=float, default=0.3)
    parser.add_argument("--overlapping_allreduce", type=int, default=1)
    parser.add_argument("--save-full", default=0, type=int, help="Save the full model")
    # BUGFIX: was ``type=bool`` — bool("False") is True, so the flag could
    # never be disabled from the command line. _str2bool parses it properly.
    parser.add_argument("--partial-checkpoint", type=_str2bool, default=True)
    parser.add_argument("--mp_parameters", type=str, default="")

    args = parser.parse_args()

    # Either --fp16 or --amp enables mixed precision.
    args.fp16 = args.fp16 or args.amp

    # When resuming, run a bit further past the previously planned step count.
    if args.resume_from_checkpoint:
        args.steps_this_run += 100

    # Negative sentinel means "run the full max_steps budget".
    if args.steps_this_run < 0:
        args.steps_this_run = args.max_steps

    return args