in assets/training/model_management/src/run_model_preprocess.py [0:0]
def _ensure_trust_remote_code(arg_value, arg_name):
    """Return *arg_value* guaranteed to contain ``trust_remote_code=True``.

    When the model folder ships custom ``.py`` modeling code, Hugging Face
    loaders must be called with ``trust_remote_code=True``. If *arg_value* is
    ``None`` or does not already contain the trust key, the whole value is
    replaced by the trust key and a warning is logged.

    NOTE(review): replacing the entire string discards any other settings the
    caller may have passed in *arg_name* alongside a missing trust key —
    confirm the hf_*_args format before tightening this to an append.

    :param arg_value: raw hf_*_args string from the CLI (may be ``None``).
    :param arg_name: parameter name, used verbatim in the warning message.
    :return: a string that contains ``trust_remote_code=True``.
    """
    TRUST_CODE_KEY = "trust_remote_code=True"
    if arg_value is None or TRUST_CODE_KEY not in arg_value:
        logger.warning(f"{TRUST_CODE_KEY} is not provided for {arg_name}. Using {TRUST_CODE_KEY}.")
        return TRUST_CODE_KEY
    return arg_value


def run():
    """Run preprocess.

    Parses component arguments, resolves the task name (CLI value with a
    fallback to the model-download metadata), forces ``trust_remote_code``
    when the model folder contains custom Python files, then converts the
    model via :func:`run_preprocess` and copies the result (plus an optional
    custom license file) into the MLflow output directory.

    :raises Exception: if the model framework is unsupported.
    :raises AzureMLException: if no usable task name can be resolved.
    """
    parser = _get_parser()
    args, _ = parser.parse_known_args()

    model_id = args.model_id
    task_name = args.task_name
    model_flavor = args.model_flavor
    # Boolean flags arrive as strings; anything other than "false" enables them.
    vllm_enabled = args.vllm_enabled.lower() != "false"
    model_framework = args.model_framework
    hf_config_args = args.hf_config_args
    hf_tokenizer_args = args.hf_tokenizer_args
    hf_model_args = args.hf_model_args
    hf_pipeline_args = args.hf_pipeline_args
    hf_config_class = args.hf_config_class
    hf_model_class = args.hf_model_class
    hf_tokenizer_class = args.hf_tokenizer_class
    hf_use_experimental_features = args.hf_use_experimental_features.lower() != "false"
    extra_pip_requirements = args.extra_pip_requirements
    inference_base_image = args.inference_base_image
    model_download_metadata_path = args.model_download_metadata
    model_path = args.model_path
    mlflow_model_output_dir = args.mlflow_model_output_dir
    license_file_path = args.license_file_path

    if not ModelFramework.has_value(model_framework):
        raise Exception(f"Unsupported model framework {model_framework}")

    preprocess_args = {}
    if model_download_metadata_path:
        # Seed preprocess args from the download step's metadata (tags/properties/misc).
        with open(model_download_metadata_path) as f:
            download_details = json.load(f)
            preprocess_args.update(download_details.get("tags", {}))
            preprocess_args.update(download_details.get("properties", {}))
            preprocess_args["misc"] = download_details.get("misc", [])

    # Fall back to the task recorded in download metadata when the CLI task
    # is missing or not in either supported-task registry.
    if task_name is None or \
            (
                not TransformersSupportedTasks.has_value(task_name.lower()) and
                not PyFuncSupportedTasks.has_value(task_name.lower())
            ):
        task_name = preprocess_args.get("task")
        logger.warning("task_name is not provided or not supported. "
                       f"Using task_name={task_name} from model download metadata.")

    if task_name is None:
        # Neither the CLI nor the metadata produced a usable task — fail with
        # the full list of supported tasks for diagnosability.
        supported_tasks = set(TransformersSupportedTasks.list_values() + PyFuncSupportedTasks.list_values())
        raise AzureMLException._with_error(
            AzureMLError.create(UnsupportedTaskType, task_type=args.task_name,
                                supported_tasks=list(supported_tasks))
        )

    files = check_for_py_files(model_path)
    logger.info(f"check if model folder contains .py files or not: {files}")
    if files:
        # Custom modeling code requires trust_remote_code=True on every HF loader.
        hf_model_args = _ensure_trust_remote_code(hf_model_args, "hf_model_args")
        hf_tokenizer_args = _ensure_trust_remote_code(hf_tokenizer_args, "hf_tokenizer_args")
        hf_config_args = _ensure_trust_remote_code(hf_config_args, "hf_config_args")

    # CLI values take precedence over metadata-seeded values.
    preprocess_args["task"] = task_name.lower()
    preprocess_args["model_id"] = model_id if model_id else preprocess_args.get("model_id")
    preprocess_args["model_flavor"] = model_flavor if model_flavor else "HFtransformersV2"
    preprocess_args["vllm_enabled"] = vllm_enabled
    preprocess_args[HF_CONF.EXTRA_PIP_REQUIREMENTS.value] = extra_pip_requirements
    preprocess_args[HF_CONF.HF_CONFIG_ARGS.value] = hf_config_args
    preprocess_args[HF_CONF.HF_TOKENIZER_ARGS.value] = hf_tokenizer_args
    preprocess_args[HF_CONF.HF_MODEL_ARGS.value] = hf_model_args
    preprocess_args[HF_CONF.HF_PIPELINE_ARGS.value] = hf_pipeline_args
    preprocess_args[HF_CONF.HF_CONFIG_CLASS.value] = hf_config_class
    preprocess_args[HF_CONF.HF_PRETRAINED_CLASS.value] = hf_model_class
    preprocess_args[HF_CONF.HF_TOKENIZER_CLASS.value] = hf_tokenizer_class
    preprocess_args[HF_CONF.HF_USE_EXPERIMENTAL_FEATURES.value] = hf_use_experimental_features
    preprocess_args["inference_base_image"] = inference_base_image

    # update custom dimensions with input parameters
    custom_dimensions.update_custom_dimensions(preprocess_args)
    logger.info(f"Preprocess args : {preprocess_args}")

    # Convert into a scratch dir first, then copy so the output dir is never
    # left half-written by a failed conversion.
    with TemporaryDirectory(dir=mlflow_model_output_dir) as working_dir, TemporaryDirectory(
        dir=mlflow_model_output_dir
    ) as temp_dir:
        run_preprocess(model_framework, model_path, working_dir, temp_dir, **preprocess_args)
        shutil.copytree(working_dir, mlflow_model_output_dir, dirs_exist_ok=True)

    # Copy license file to output model path
    if license_file_path:
        # removing the default dumped license when user provides custom license file.
        transformers_license_path = os.path.join(mlflow_model_output_dir, "LICENSE.txt")
        if os.path.isfile(transformers_license_path):
            os.remove(transformers_license_path)
        shutil.copy(license_file_path, mlflow_model_output_dir)

    logger.info(f"listing output directory files: {mlflow_model_output_dir}:\n{os.listdir(mlflow_model_output_dir)}")