# assets/training/finetune_acft_hf_nlp/environments/acpt/context/finetune_run.py
def _initiate_run(completion_files_folder: str, model_selector_output: str,
                  preprocess_output: str, pytorch_model_folder: str, mlflow_model_folder: str):
    """Run the model selector, preprocess, finetune and registration script.

    Each stage is launched as a subprocess through ``_run_subprocess_cmd``;
    ``completion_files_folder`` is handed to every stage so the wrapper can
    record per-component completion markers.

    :param completion_files_folder: folder used to track component completion.
    :param model_selector_output: output dir written by the model_selector stage.
    :param preprocess_output: output dir written by the preprocess stage.
    :param pytorch_model_folder: folder receiving finetuned pytorch weights.
    :param mlflow_model_folder: folder receiving the mlflow-format model.
    """
    # task name and cluster topology, all decoded from environment parameters
    task_name = decode_param_from_env_var("task_name")
    num_nodes = parse_to_int(decode_param_from_env_var("Node_Count"))
    num_gpus = parse_to_int(decode_param_from_env_var("number_of_gpu_to_use_finetuning"))
    logger.info(f'Nodes are {num_nodes} , gpus are : {num_gpus}')

    # optional system properties (may be None)
    system_properties = parse_system_properties(decode_param_from_env_var("system_properties"))

    # propagate the debug-logging flag to child processes via the environment
    log_level_debug_enabled = \
        system_properties.get(SystemSettings.LOG_LEVEL_DEBUG, False) if system_properties else False
    os.environ[SystemSettings.LOG_LEVEL_DEBUG] = str(log_level_debug_enabled)

    # the dataset folder is referenced several times below; resolve it once
    dataset_dir = decode_input_from_env_var("dataset_input") or ""
    train_file_path = os.path.join(dataset_dir, "train_input.jsonl")

    # ---- model selector ----
    cmd = [
        "python", "-m", "azureml.acft.contrib.hf.nlp.entry_point.finetune.model_selector",
        "--task_name", task_name,
        "--output_dir", model_selector_output
    ]
    add_optional_input(cmd, "mlflow_model_path")
    add_optional_input(cmd, "pytorch_model_path")
    _run_subprocess_cmd(cmd, component_name="model_selector", completion_files_folder=completion_files_folder,
                        single_run=True, number_of_processes=num_gpus)

    # ---- preprocess ----
    # NOTE(review): --test_file_path is given the TRAIN file (train_input.jsonl).
    # This may be intentional (no test_input.jsonl is referenced anywhere in this
    # function) but looks like a copy-paste — confirm before changing.
    cmd = [
        "python", "-m", "azureml.acft.contrib.hf.nlp.entry_point.finetune.preprocess",
        "--task_name", task_name,
        "--batch_size", decode_param_from_env_var("batch_size"),
        "--pad_to_max_length", decode_param_from_env_var("pad_to_max_length"),
        "--max_seq_length", decode_param_from_env_var("max_seq_length"),
        "--train_file_path", train_file_path,
        "--test_file_path", train_file_path,
        "--num_train_epochs", decode_param_from_env_var('num_train_epochs'),
        "--model_selector_output", model_selector_output,
        "--output_dir", preprocess_output
    ]
    # add task_specific params
    add_task_specific_params(cmd, task_name, component_name="preprocess")
    # validation file is optional: only pass it when it actually exists on disk
    validation_file_path = os.path.join(dataset_dir, "validation_input.jsonl")
    if os.path.isfile(validation_file_path):
        cmd += ["--validation_file_path", validation_file_path]

    # preprocess is retried with backoff; retry count may be overridden
    # through system_properties["num_retries"]
    num_retries = system_properties.get("num_retries", 3) if system_properties else 3

    @retry_with_backoff(delay=2, retries=num_retries)
    def _run_preprocess_cmd_with_retries():
        _run_subprocess_cmd(cmd, component_name="preprocess", completion_files_folder=completion_files_folder,
                            single_run=True, number_of_processes=num_gpus)
    _run_preprocess_cmd_with_retries()

    # ---- finetune ----
    # single-node runs launch through torch.distributed.launch; in multi-node
    # mode the distributed launch is presumably handled externally, so a plain
    # `python -m` prefix is used
    if not _is_multi_node_enabled():
        cmd_base = ["python", "-m", "torch.distributed.launch", "--nproc_per_node",
                    decode_param_from_env_var('number_of_gpu_to_use_finetuning'), "-m"]
    else:
        cmd_base = ["python", "-m"]

    # every flag below takes its value verbatim from the same-named environment
    # parameter; the tuple keeps the original flag order stable
    passthrough_params = (
        'apply_lora', 'merge_lora_weights', 'lora_alpha', 'lora_r', 'lora_dropout',
        'num_train_epochs', 'max_steps', 'per_device_train_batch_size',
        'per_device_eval_batch_size', 'auto_find_batch_size', 'optim', 'learning_rate',
        'warmup_steps', 'weight_decay', 'adam_beta1', 'adam_beta2', 'adam_epsilon',
        'gradient_accumulation_steps', 'eval_accumulation_steps', 'lr_scheduler_type',
        'precision', 'seed', 'enable_full_determinism', 'dataloader_num_workers',
        'ignore_mismatched_sizes', 'max_grad_norm', 'evaluation_strategy',
        'evaluation_steps_interval', 'eval_steps', 'logging_strategy', 'logging_steps',
        'metric_for_best_model', 'resume_from_checkpoint', 'save_strategy', 'save_steps',
        'save_total_limit', 'apply_early_stopping', 'early_stopping_patience',
        'early_stopping_threshold', 'apply_ort', 'apply_deepspeed', 'deepspeed_stage',
    )
    cmd = ["azureml.acft.contrib.hf.nlp.entry_point.finetune.finetune"]
    for param in passthrough_params:
        cmd += [f"--{param}", decode_param_from_env_var(param)]
    cmd += [
        "--model_selector_output", model_selector_output,
        "--preprocess_output", preprocess_output,
        "--system_properties", decode_param_from_env_var("system_properties"),
        "--pytorch_model_folder", pytorch_model_folder,
        "--mlflow_model_folder", mlflow_model_folder,
        "--output_model", decode_output_from_env_var('output_model')
    ]
    cmd_base.extend(cmd)
    _run_subprocess_cmd(cmd_base, component_name="finetune", completion_files_folder=completion_files_folder,
                        single_run=False, number_of_processes=num_gpus)

    # ---- validate lora weights ----
    # the base model name was persisted by the model_selector stage
    model_selector_args_path = os.path.join(
        model_selector_output, SaveFileConstants.MODEL_SELECTOR_ARGS_SAVE_PATH)
    with open(model_selector_args_path, 'r') as rptr:
        model_name = json.load(rptr)['model_name']
    cmd = [
        "python", "-m", "azureml.acft.contrib.hf.nlp.entry_point.finetune.validate_lora_weights",
        "--task_name", task_name,
        "--base_pytorch_model_path", os.path.join(model_selector_output, model_name),
        "--lora_weights_path", os.path.join(pytorch_model_folder, PEFT_ADAPTER_WEIGHTS_DIR),
        "--train_file_path", train_file_path,
    ]
    add_task_specific_params(cmd, task_name, component_name="validate_lora_weights")
    _run_subprocess_cmd(cmd, component_name="validate_lora_weights", completion_files_folder=completion_files_folder,
                        single_run=True, number_of_processes=num_gpus)

    # ---- model registration ----
    cmd = [
        "python", "-m", "azureml.acft.contrib.hf.nlp.entry_point.finetune.register_model",
        "--task_name", task_name,
        "--model_asset_id", decode_param_from_env_var('model_asset_id'),
        "--registration_details_folder", decode_output_from_env_var('output_model'),
        "--model_path", os.path.join(
            pytorch_model_folder,
            PEFT_ADAPTER_WEIGHTS_DIR
        ),
        "--convert_to_safetensors", "true",
    ]
    add_optional_param(cmd=cmd, component_param_name="registered_model_name", argparse_param_name="model_name")
    add_optional_param(cmd=cmd, component_param_name="model_registration_tag", argparse_param_name="model_tag")
    _run_subprocess_cmd(cmd, component_name="register_model", completion_files_folder=completion_files_folder,
                        single_run=True, number_of_processes=num_gpus)