# assets/training/finetune_acft_hf_nlp/src/finetune/finetune.py
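# NOTE: this extract relies on `argparse` and a `str2bool` converter that the full module
# imports/defines elsewhere; `SaveStrategy` (used further below) is likewise imported elsewhere.
# The sketch below is an assumption of what `str2bool` could look like, included only so this
# extract is self-contained; it is not the module's actual implementation.
import argparse


def str2bool(value):
    """Convert common string spellings of a boolean ("true"/"false", "1"/"0") to bool (assumed helper)."""
    if isinstance(value, bool):
        return value
    if value.lower() in ("yes", "true", "t", "y", "1"):
        return True
    if value.lower() in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"Boolean value expected, got {value!r}")
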
def get_parser():
"""Get the parser object."""
parser = argparse.ArgumentParser(description="Sequence classification with Lora support")
# Model optimization settings
parser.add_argument(
"--apply_ort",
type=str2bool,
default="false",
help="If set to true, will use the ONNXRunTime training",
)
parser.add_argument(
"--apply_deepspeed",
type=str2bool,
default="false",
help="If set to true, will enable deepspeed for training",
)
parser.add_argument(
"--deepspeed_stage",
type=int,
default=2,
choices=[2, 3],
help=(
"This parameter configures which DEFAULT deepspeed config to be used - stage2 or stage3. The default "
"choice is stage2. Note that, this parameter is ONLY applicable when user doesn't pass any config "
"information via deepspeed port."
)
)
parser.add_argument(
"--deepspeed",
type=str,
default=None,
help="Deepspeed config to be used for finetuning",
)
parser.add_argument(
"--local_rank",
type=int,
default=-1,
help="Local rank passed by torch distributed launch",
)
# Lora settings
parser.add_argument("--apply_lora", type=str2bool, default="false", help="lora enabled")
parser.add_argument("--lora_alpha", type=int, default=128, help="lora attn alpha")
parser.add_argument("--lora_dropout", type=float, default=0.0, help="lora dropout value")
parser.add_argument("--lora_r", default=8, type=int, help="lora dimension")
# Training settings
parser.add_argument("--num_train_epochs", default=5, type=int, help="training epochs")
parser.add_argument(
"--max_steps",
default=-1,
type=int,
help=(
"If set to a positive number, the total number of training steps to perform. Overrides `epochs`."
"In case of using a finite iterable dataset the training may stop before reaching the set number of steps"
"when all data is exhausted."
),
)
parser.add_argument("--per_device_train_batch_size", default=4, type=int, help="Train batch size")
parser.add_argument("--per_device_eval_batch_size", default=4, type=int, help="Validation batch size")
parser.add_argument(
"--auto_find_batch_size",
default="false",
type=str2bool,
help=(
"Flag to enable auto finding of batch size. If the provided `train_batch_size` goes into Out Of Memory"
" (OOM) enabling auto_find_batch_size will find the correct batch size by iteratively reducing"
" `train_batch_size` by afactor of 2 till the OOM is fixed."
),
)
# Optimizer options: adamw_hf, adamw_torch, adamw_apex_fused, adafactor
parser.add_argument(
"--optim",
default="adamw_torch",
type=str,
help="Optimizer to be used while training",
)
parser.add_argument(
"--learning_rate",
default=2e-5,
type=float,
help="Start learning rate. Defaults to linear scheduler.",
)
parser.add_argument(
"--warmup_steps",
default=0,
type=int,
help="Number of steps used for a linear warmup from 0 to learning_rate",
)
parser.add_argument(
"--weight_decay",
default=0,
type=float,
help=(
"The weight decay to apply (if not zero) to all layers except all "
"bias and LayerNorm weights in AdamW optimizer"
),
)
parser.add_argument(
"--adam_beta1",
default=0.9,
type=float,
help="The beta1 hyperparameter for the AdamW optimizer",
)
parser.add_argument(
"--adam_beta2",
default=0.999,
type=float,
help="The beta2 hyperparameter for the AdamW optimizer",
)
parser.add_argument(
"--adam_epsilon",
default=1e-8,
type=float,
help="The epsilon hyperparameter for the AdamW optimizer"
)
parser.add_argument(
"--gradient_accumulation_steps",
default=1,
type=int,
help="Number of updates steps to accumulate the gradients for, before performing a backward/update pass",
)
parser.add_argument(
"--gradient_checkpointing",
default="false",
type=str2bool,
help="Enable / disable gradient checkpointing",
)
parser.add_argument(
"--fp16",
default="false",
type=str2bool,
help="Enable mixed precision training",
)
parser.add_argument(
"--bf16",
default="false",
type=str2bool,
help="Enable mixed precision training",
)
parser.add_argument(
"--lr_scheduler_type",
default="linear",
type=str,
help="The scheduler type to use"
)
parser.add_argument(
"--dataloader_num_workers",
default=0,
type=int,
help="Number of workers to use for loading the data"
)
parser.add_argument(
"--precision",
type=int,
default=32,
help=(
"Apply mixed precision training. "
"This can reduce memory footprint by performing operations in half-precision."
),
)
parser.add_argument(
"--seed",
type=int,
default=42,
help="Random seed that will be set at the beginning of training",
)
parser.add_argument(
"--enable_full_determinism",
type=str2bool,
default="false",
help="Ensure reproducible behavior during distributed training",
)
parser.add_argument(
"--ignore_mismatched_sizes",
type=str2bool,
default="true",
help=(
"Whether or not to raise an error if some of the weights from the "
"checkpoint do not have the same size as the weights of the model"
),
)
parser.add_argument(
"--ddp_timeout",
type=int,
default=3600,
help=(
"The timeout for `torch.distributed.init_process_group` calls, used to avoid GPU socket timeouts when "
"performing slow operations in distributed runnings. Please refer the [PyTorch documentation] "
"(https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group) for more "
"information."
),
)
parser.add_argument(
"--max_grad_norm",
type=float,
default=1.0,
help=(
"Maximum gradient norm (for gradient clipping)"
),
)
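# Evaluation and logging settings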
parser.add_argument(
"--eval_accumulation_steps",
default=None,
type=int,
help="Number of predictions steps to accumulate before moving the tensors to the CPU.",
)
parser.add_argument(
"--evaluation_strategy", type=str, default="epoch", help="The evaluation strategy to adopt during training",
)
parser.add_argument(
"--evaluation_steps_interval",
type=float,
default=0.0,
help=(
"The evaluation steps in fraction of an epoch steps to adopt during training. "
"Overwrites evaluation_steps if not 0."
),
)
parser.add_argument(
"--eval_steps",
type=int,
default=500,
help="Number of update steps between two evals if evaluation_strategy='steps'",
)
parser.add_argument(
"--logging_strategy", type=str, default="epoch", help="The logging strategy to adopt during training",
)
parser.add_argument(
"--logging_steps",
type=int,
default=500,
help="Number of update steps between two logs if logging_strategy='steps'",
)
parser.add_argument(
"--metric_for_best_model",
type=str,
default="loss",
help="Specify the metric to use to compare two different models"
)
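# Checkpointing and early stopping settings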
parser.add_argument(
"--resume_from_checkpoint",
type=str2bool,
default="false",
help="Loads Optimizer, Scheduler and Trainer state for finetuning if true",
)
parser.add_argument(
"--save_strategy",
type=str,
default=SaveStrategy.EVALUATION_STRATEGY,
help="The checkpoint save strategy to adopt during training.",
)
parser.add_argument(
"--save_steps",
type=int,
default=100,
help="Number of update steps between two checkpoint saves if save_strategy='steps'",
)
parser.add_argument(
"--save_total_limit",
type=int,
default=-1,
help=(
"If a value is passed, will limit the total amount of checkpoints. "
"Deletes the older checkpoints in output_dir. "
"If the value is -1 saves all checkpoints"
),
)
parser.add_argument(
"--apply_early_stopping", type=str2bool, default="false", help="Enable early stopping"
)
parser.add_argument(
"--early_stopping_patience",
type=int,
default=1,
help="Stop training when the specified metric worsens for early_stopping_patience evaluation calls",
)
parser.add_argument(
"--early_stopping_threshold",
type=float,
default=0.0,
help="Denotes how much the specified metric must improve to satisfy early stopping conditions"
)
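# Input and output folder settings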
parser.add_argument(
"--preprocess_output",
default=None,
type=str,
help="output folder of preprocessor containing the metadata of train, evaluation and test files",
)
parser.add_argument(
"--model_selector_output",
default=None,
type=str,
help=("output folder of model selector containing model configs, tokenizer, checkpoints."),
)
parser.add_argument(
"--pytorch_model_folder",
default="pytorch_model_folder",
type=str,
help="Output dir to save the finetune model and other metadata",
)
parser.add_argument(
"--mlflow_model_folder",
default="mlflow_model_folder",
type=str,
help="Output dir to save the finetune model as mlflow model",
)
return parser
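

# Hypothetical usage sketch (not part of the original module): builds the parser defined above and
# uses `parse_known_args` so that extra arguments injected by the job runtime do not cause a failure.
# The function name and behavior are illustrative assumptions, not the component's real entry point.
def _example_parse(argv=None):
    """Illustrative only: parse finetune arguments from `argv` (or sys.argv) and return the namespace."""
    parser = get_parser()
    args, _unknown = parser.parse_known_args(argv)
    return args


# e.g. _example_parse(["--apply_lora", "true", "--num_train_epochs", "3", "--learning_rate", "5e-5"])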