in launcher/nemo/stages.py [0:0]
def run(self) -> str:
    """
    Run the current stage: prepare the job folder, build launcher parameters and submit the job.

    The stage hydra config is saved to the job path unless a custom training
    config (``training_cfg``) is present in ``self.cfg``, in which case the job
    path's existing ``config_file`` is used instead.

    Returns:
        The job id returned by ``SMAutoLauncher.launch``.

    Raises:
        RuntimeError: If the cluster is ``sm_jobs`` and a custom training config is used.
    """
    import logging

    # Setup folders and datasets
    self.setup_folder_and_data()
    # Save stage hydra config
    job_path = self.get_job_path()
    # Identify if launching a trainium job
    is_trainium = self.__class__.__name__ == "SMTrainingTrainiumRecipe"
    is_custom = self.cfg.get("training_cfg") is not None

    if is_custom:
        stage_cfg_path = job_path.config_file
    else:
        stage_cfg_path = SMTraining.save_stage_hydra_config(self.stage_cfg, job_path, self.cfg)

    if self.cluster == "sm_jobs":
        if is_custom:
            raise RuntimeError("SM jobs launcher is not supported with custom training.")
        cluster_parameters = {"job_name": self.job_name}
        self.create_sm_jobs_script(job_path.folder)
        command_groups = self.make_sm_jobs_command()
    else:
        # Make cluster parameters
        cluster_parameters = self._make_cluster_parameters(self.cluster)
        cluster_parameters["train_script_text"] = self._make_train_script_text(stage_cfg_path)

        # Resolve the container type once; it drives both checks below.
        container_type = get_container_type(self.cfg.container)
        if container_type == "docker":
            cluster_parameters["launch_docker_container_text"] = self._make_launch_docker_container_text()
            cluster_parameters["docker_exec_script_text"] = self._make_docker_exec_script_text(stage_cfg_path)
        if container_type != "enroot":
            # Container mounts are only passed through for enroot containers.
            cluster_parameters.pop("container_mounts", None)
        cluster_parameters["hostfile"] = self._get_hostfile_location()

        if is_trainium and self.get_cluster_type() == "bcm":
            # Save temp training config file with string interpolations resolved so it can be
            # copied into Neuron's package by the compute node(s) eventually selected by Slurm.
            # NOTE: This file can't be removed. Multiple nodes may run the job asynchronously
            # so there aren't any order guarantees nor an ideal moment to remove the file.
            OmegaConf.save(self.cfg.training, self._temp_training_conf_file, True)

        # Make k8s config file if necessary
        if self.cluster == "k8s":
            # The following two methods are overrides from the Training class. They require
            # `template_root` but in our implementation we re-define it inside those methods.
            # Therefore, `template_root` is just a sentinel so parent behavior is not broken.
            sentinel_template_root = ""
            self._make_k8s_spec_file(sentinel_template_root, cluster_parameters, job_path, stage_cfg_path)
            self._copy_k8s_helm_chart(sentinel_template_root, job_path)
            # k8s does not need command groups
            command_groups = None
        else:
            command_groups = self.make_stage_command_groups(stage_cfg_path)

    launcher = SMAutoLauncher(
        folder=job_path.folder,
        cluster=self.cluster,
        **cluster_parameters,
    )
    job_id = launcher.launch(command_groups=command_groups)

    if self.cluster == "bcm":
        # Telemetry is best-effort: a failure here must never fail an already-submitted job.
        # Only `Exception` is caught so KeyboardInterrupt/SystemExit still propagate.
        try:
            self.telemetry.start(
                self.cluster,
                self.instance_type,
                get_num_nodes(self.stage_cfg),
                job_id=job_id,
                container=self.cfg.get("container", None),
            )
        except Exception:
            logging.getLogger(__name__).debug(
                "Telemetry start failed for job %s; continuing without telemetry.",
                job_id,
                exc_info=True,
            )
    return job_id