def _initiate_run()

in assets/training/finetune_acft_hf_nlp/environments/acpt/context/finetune_run.py


def _initiate_run(completion_files_folder: str, model_selector_output: str,
                  preprocess_output: str, pytorch_model_folder: str, mlflow_model_folder: str):
    """Run the model selector, preprocess, finetune and registration script."""
    # read the task name and distributed-compute settings from the environment
    task_name = decode_param_from_env_var("task_name")
    num_nodes = parse_to_int(decode_param_from_env_var("Node_Count"))
    num_gpus = parse_to_int(decode_param_from_env_var("number_of_gpu_to_use_finetuning"))
    logger.info(f"Nodes: {num_nodes}, GPUs: {num_gpus}")

    # get system properties
    system_properties = parse_system_properties(decode_param_from_env_var("system_properties"))

    # expose log_level_debug to the component subprocesses via an environment variable
    log_level_debug_enabled = \
        system_properties.get(SystemSettings.LOG_LEVEL_DEBUG, False) if system_properties else False
    os.environ[SystemSettings.LOG_LEVEL_DEBUG] = str(log_level_debug_enabled)

    # model selector
    cmd = [
        "python", "-m", "azureml.acft.contrib.hf.nlp.entry_point.finetune.model_selector",
        "--task_name", task_name,
        "--output_dir", model_selector_output
    ]
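    # forward whichever optional model input (MLflow or PyTorch format) was supplied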
    add_optional_input(cmd, "mlflow_model_path")
    add_optional_input(cmd, "pytorch_model_path")
    _run_subprocess_cmd(cmd, component_name="model_selector", completion_files_folder=completion_files_folder,
                        single_run=True, number_of_processes=num_gpus)
    # preprocess
    cmd = [
        "python", "-m", "azureml.acft.contrib.hf.nlp.entry_point.finetune.preprocess",
        "--task_name", task_name,
        "--batch_size", decode_param_from_env_var("batch_size"),
        "--pad_to_max_length", decode_param_from_env_var("pad_to_max_length"),
        "--max_seq_length", decode_param_from_env_var("max_seq_length"),
        "--train_file_path", os.path.join(decode_input_from_env_var("dataset_input") or "", "train_input.jsonl"),
        "--test_file_path", os.path.join(decode_input_from_env_var("dataset_input") or "", "train_input.jsonl"),
        "--num_train_epochs", decode_param_from_env_var('num_train_epochs'),
        "--model_selector_output", model_selector_output,
        "--output_dir", preprocess_output
    ]
    # add task_specific params
    add_task_specific_params(cmd, task_name, component_name="preprocess")
    # add optional input validation_file_path
    validation_file_path = os.path.join(decode_input_from_env_var("dataset_input") or "", "validation_input.jsonl")
    if os.path.isfile(validation_file_path):
        cmd += ["--validation_file_path", validation_file_path]

    num_retries = system_properties.get("num_retries", 3) if system_properties else 3

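    # retry the preprocess step with backoff; the retry count defaults to 3 and can be overridden via system_properties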
    @retry_with_backoff(delay=2, retries=num_retries)
    def _run_preprocess_cmd_with_retries():
        _run_subprocess_cmd(cmd, component_name="preprocess", completion_files_folder=completion_files_folder,
                            single_run=True, number_of_processes=num_gpus)

    _run_preprocess_cmd_with_retries()

    # finetune
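    # single-node runs launch one process per GPU via torch.distributed.launch; for multi-node runs the
    # job-level launcher is assumed to have already spawned the per-node processes, so a plain "python -m" suffices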
    if not _is_multi_node_enabled():
        cmd_base = ["python", "-m", "torch.distributed.launch", "--nproc_per_node",
                    decode_param_from_env_var('number_of_gpu_to_use_finetuning'), "-m"]
    else:
        cmd_base = ["python", "-m"]

    cmd = [
        "azureml.acft.contrib.hf.nlp.entry_point.finetune.finetune",
        "--apply_lora", decode_param_from_env_var('apply_lora'),
        "--merge_lora_weights", decode_param_from_env_var('merge_lora_weights'),
        "--lora_alpha", decode_param_from_env_var('lora_alpha'),
        "--lora_r", decode_param_from_env_var('lora_r'),
        "--lora_dropout", decode_param_from_env_var('lora_dropout'),
        "--num_train_epochs", decode_param_from_env_var('num_train_epochs'),
        "--max_steps", decode_param_from_env_var('max_steps'),
        "--per_device_train_batch_size", decode_param_from_env_var('per_device_train_batch_size'),
        "--per_device_eval_batch_size", decode_param_from_env_var('per_device_eval_batch_size'),
        "--auto_find_batch_size", decode_param_from_env_var('auto_find_batch_size'),
        "--optim", decode_param_from_env_var('optim'),
        "--learning_rate", decode_param_from_env_var('learning_rate'),
        "--warmup_steps", decode_param_from_env_var('warmup_steps'),
        "--weight_decay", decode_param_from_env_var('weight_decay'),
        "--adam_beta1", decode_param_from_env_var('adam_beta1'),
        "--adam_beta2", decode_param_from_env_var('adam_beta2'),
        "--adam_epsilon", decode_param_from_env_var('adam_epsilon'),
        "--gradient_accumulation_steps", decode_param_from_env_var('gradient_accumulation_steps'),
        "--eval_accumulation_steps", decode_param_from_env_var('eval_accumulation_steps'),
        "--lr_scheduler_type", decode_param_from_env_var('lr_scheduler_type'),
        "--precision", decode_param_from_env_var('precision'),
        "--seed", decode_param_from_env_var('seed'),
        "--enable_full_determinism", decode_param_from_env_var('enable_full_determinism'),
        "--dataloader_num_workers", decode_param_from_env_var('dataloader_num_workers'),
        "--ignore_mismatched_sizes", decode_param_from_env_var('ignore_mismatched_sizes'),
        "--max_grad_norm", decode_param_from_env_var('max_grad_norm'),
        "--evaluation_strategy", decode_param_from_env_var('evaluation_strategy'),
        "--evaluation_steps_interval", decode_param_from_env_var('evaluation_steps_interval'),
        "--eval_steps", decode_param_from_env_var('eval_steps'),
        "--logging_strategy", decode_param_from_env_var('logging_strategy'),
        "--logging_steps", decode_param_from_env_var('logging_steps'),
        "--metric_for_best_model", decode_param_from_env_var('metric_for_best_model'),
        "--resume_from_checkpoint", decode_param_from_env_var('resume_from_checkpoint'),
        "--save_strategy", decode_param_from_env_var('save_strategy'),
        "--save_steps", decode_param_from_env_var('save_steps'),
        "--save_total_limit", decode_param_from_env_var('save_total_limit'),
        "--apply_early_stopping", decode_param_from_env_var('apply_early_stopping'),
        "--early_stopping_patience", decode_param_from_env_var('early_stopping_patience'),
        "--early_stopping_threshold", decode_param_from_env_var('early_stopping_threshold'),
        "--apply_ort", decode_param_from_env_var('apply_ort'),
        "--apply_deepspeed", decode_param_from_env_var('apply_deepspeed'),
        "--deepspeed_stage", decode_param_from_env_var('deepspeed_stage'),
        "--model_selector_output", model_selector_output,
        "--preprocess_output", preprocess_output,
        "--system_properties", decode_param_from_env_var("system_properties"),
        "--pytorch_model_folder", pytorch_model_folder,
        "--mlflow_model_folder", mlflow_model_folder,
        "--output_model", decode_output_from_env_var('output_model')
    ]
    cmd_base.extend(cmd)
    _run_subprocess_cmd(cmd_base, component_name="finetune", completion_files_folder=completion_files_folder,
                        single_run=False, number_of_processes=num_gpus)

    # validate lora weights

    # identify the base model name recorded by the model selector
    model_selector_args_path = os.path.join(
        model_selector_output, SaveFileConstants.MODEL_SELECTOR_ARGS_SAVE_PATH)
    with open(model_selector_args_path, 'r') as rptr:
        model_name = json.load(rptr)['model_name']

    cmd = [
        "python", "-m", "azureml.acft.contrib.hf.nlp.entry_point.finetune.validate_lora_weights",
        "--task_name", task_name,
        "--base_pytorch_model_path", os.path.join(model_selector_output, model_name),
        "--lora_weights_path", os.path.join(pytorch_model_folder, PEFT_ADAPTER_WEIGHTS_DIR),
        "--train_file_path", os.path.join(decode_input_from_env_var("dataset_input") or "", "train_input.jsonl"),
    ]
    add_task_specific_params(cmd, task_name, component_name="validate_lora_weights")
    _run_subprocess_cmd(cmd, component_name="validate_lora_weights", completion_files_folder=completion_files_folder,
                        single_run=True, number_of_processes=num_gpus)

    # model registration
    cmd = [
        "python", "-m", "azureml.acft.contrib.hf.nlp.entry_point.finetune.register_model",
        "--task_name", task_name,
        "--model_asset_id", decode_param_from_env_var('model_asset_id'),
        "--registration_details_folder", decode_output_from_env_var('output_model'),
        "--model_path", os.path.join(
            pytorch_model_folder,
            PEFT_ADAPTER_WEIGHTS_DIR
        ),
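        # always convert the adapter weights to safetensors as part of registration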
        "--convert_to_safetensors", "true",
    ]
    add_optional_param(cmd=cmd, component_param_name="registered_model_name", argparse_param_name="model_name")
    add_optional_param(cmd=cmd, component_param_name="model_registration_tag", argparse_param_name="model_tag")
    _run_subprocess_cmd(cmd, component_name="register_model", completion_files_folder=completion_files_folder,
                        single_run=True, number_of_processes=num_gpus)
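
Note: retry_with_backoff is referenced above but not defined in this section. A minimal sketch of such a decorator, assuming delay is the initial wait in seconds and doubles after each failed attempt (the real helper's semantics may differ):

import functools
import logging
import time

logger = logging.getLogger(__name__)


def retry_with_backoff(delay: float = 2, retries: int = 3):
    """Sketch: retry the wrapped callable up to `retries` times, doubling the wait between failures."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            wait = delay
            for attempt in range(1, retries + 1):
                try:
                    return func(*args, **kwargs)
                except Exception as exc:
                    if attempt == retries:
                        raise  # out of attempts; surface the last error
                    logger.warning("Attempt %d failed (%r); retrying in %ss", attempt, exc, wait)
                    time.sleep(wait)
                    wait *= 2  # double the wait before the next attempt
        return wrapper
    return decorator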