def run()

in launcher/nemo/stages.py [0:0]
51 lines of code
14 McCabe index (conditional complexity)

    def run(self) -> str:
        """
        Run current stage

        Prepares folders/data and the stage config, builds the cluster parameters
        and command groups for the configured cluster, launches the job through
        ``SMAutoLauncher`` and, on "bcm" clusters, starts telemetry (best effort).

        Returns:
            The job id returned by ``SMAutoLauncher.launch``.

        Raises:
            RuntimeError: if the cluster is "sm_jobs" and a custom training
                config (``training_cfg``) is present in the config.
        """
        import logging

        # Setup folders and datasets
        self.setup_folder_and_data()
        # Save stage hydra config
        job_path = self.get_job_path()
        # Identify if launching a trainium job
        is_trainium = self.__class__.__name__ == "SMTrainingTrainiumRecipe"

        # A custom run brings its own config file, so the stage hydra config is
        # only generated for non-custom runs.
        is_custom = self.cfg.get("training_cfg") is not None
        if not is_custom:
            stage_cfg_path = SMTraining.save_stage_hydra_config(self.stage_cfg, job_path, self.cfg)
        else:
            stage_cfg_path = job_path.config_file

        if self.cluster == "sm_jobs":
            if is_custom:
                raise RuntimeError("SM jobs launcher is not supported with custom training.")
            cluster_parameters = {"job_name": self.job_name}
            self.create_sm_jobs_script(job_path.folder)
            command_groups = self.make_sm_jobs_command()
        else:
            # Make cluster parameters
            cluster_parameters = self._make_cluster_parameters(self.cluster)

            cluster_parameters["train_script_text"] = self._make_train_script_text(stage_cfg_path)
            # Resolve the container type once; both checks below depend on it.
            container_type = get_container_type(self.cfg.container)
            if container_type == "docker":
                cluster_parameters["launch_docker_container_text"] = self._make_launch_docker_container_text()
                cluster_parameters["docker_exec_script_text"] = self._make_docker_exec_script_text(stage_cfg_path)
            if container_type != "enroot":
                # "container_mounts" is only kept for enroot containers.
                cluster_parameters.pop("container_mounts", None)
            # if self.cfg.get("slurm_create_submission_file_only", None) is not None:
            #     cluster_parameters["slurm_create_submission_file_only"] = self.cfg.slurm_create_submission_file_only
            cluster_parameters["hostfile"] = self._get_hostfile_location()

            if is_trainium and self.get_cluster_type() == "bcm":
                # Save temp training config file with string interpolations resolved so it can be
                # copied into Neuron's package by the compute node(s) eventually selected by Slurm.
                # NOTE: This file can't be removed. Multiple nodes may run the job asynchronously
                # so there aren't any order guarantees nor an ideal moment to remove the file.
                OmegaConf.save(self.cfg.training, self._temp_training_conf_file, resolve=True)

            # Make k8s config file if necessary
            if self.cluster == "k8s":
                # The following two methods are overrides from the Training class. They require
                # `template_root` but in our implementation we re-define it inside those methods.
                # Therefore, `template_root` is just a sentinel so parent behavior is not broken.
                sentinel_template_root = ""
                self._make_k8s_spec_file(sentinel_template_root, cluster_parameters, job_path, stage_cfg_path)
                self._copy_k8s_helm_chart(sentinel_template_root, job_path)

                # k8s does not need command groups
                command_groups = None
            else:
                command_groups = self.make_stage_command_groups(stage_cfg_path)

        launcher = SMAutoLauncher(
            folder=job_path.folder,
            cluster=self.cluster,
            **cluster_parameters,
        )
        job_id = launcher.launch(command_groups=command_groups)

        # NOTE(review): this checks `self.cluster` while the trainium branch above uses
        # `self.get_cluster_type()` -- confirm both are meant to agree for "bcm".
        if self.cluster == "bcm":
            try:
                self.telemetry.start(
                    self.cluster,
                    self.instance_type,
                    get_num_nodes(self.stage_cfg),
                    job_id=job_id,
                    container=self.cfg.get("container", None),
                )
            except Exception:
                # Telemetry is best effort: a failure here must not fail an already
                # launched job, but it should be visible. Catching `Exception` (not a
                # bare `except`) lets KeyboardInterrupt/SystemExit propagate.
                logging.getLogger(__name__).warning(
                    "Failed to start telemetry for job %s", job_id, exc_info=True
                )

        return job_id