in launcher/nemo/slurm_launcher.py [0:0]
def __init__(self, folder: Union[Path, str], job_name: str, **kwargs: Any) -> None:
    """Set up the launcher from ``folder``, ``job_name`` and launcher kwargs.

    Launcher-specific keys are removed from ``kwargs`` and stored on the
    instance; whatever is left is forwarded, together with ``job_name``,
    to ``self._update_parameters``.

    Keys consumed from ``kwargs``:
        ntasks_per_node: stored as ``self.ntasks_per_node`` (default 8).
        train_script_text: required; stored as ``self.train_script_text``.
        launch_docker_container_text: optional, defaults to ``None``.
        docker_exec_script_text: optional, defaults to ``None``.
        slurm_create_submission_file_only: optional, defaults to ``False``.
        hostfile: required; stored as ``self.hostfile``.
        slurm_docker_cfg: removed if present and otherwise ignored here.

    Raises:
        ValueError: if ``train_script_text`` or ``hostfile`` is absent
            from ``kwargs``.
        RuntimeError: if ``srun`` is not found on PATH, unless
            ``NEMO_LAUNCHER_DEBUG`` is truthy or
            ``slurm_create_submission_file_only`` was requested.
    """
    # ntasks_per_node gets special handling: custom jobs are launched with
    # torchrun, so it can not be used in the sbatch command and is
    # therefore pulled out of kwargs before they are forwarded.
    self.ntasks_per_node = kwargs.pop("ntasks_per_node", 8)

    # Required key: fail early, reporting the kwargs as they stand now.
    if "train_script_text" not in kwargs:
        raise ValueError(f"Missing train_script_text from launcher kwargs {kwargs}")
    self.train_script_text = kwargs.pop("train_script_text")

    # Optional docker / submission settings.
    self.launch_docker_container_text = kwargs.pop("launch_docker_container_text", None)
    self.docker_exec_script_text = kwargs.pop("docker_exec_script_text", None)
    self.slurm_create_submission_file_only = kwargs.pop("slurm_create_submission_file_only", False)

    # Required key.
    if "hostfile" not in kwargs:
        raise ValueError(f"Missing hostfile from launcher kwargs {kwargs}")
    self.hostfile = kwargs.pop("hostfile")

    # Remove slurm_docker_cfg so it is not forwarded with the rest.
    kwargs.pop("slurm_docker_cfg", None)

    super(SlurmLauncher, self).__init__(folder, job_name)
    self.parameters = {}
    self._update_parameters(job_name=job_name, **kwargs)

    # A missing srun is only fatal when we are neither in debug mode nor
    # merely generating the submission file.
    srun_missing = shutil.which("srun") is None
    if srun_missing and not (NEMO_LAUNCHER_DEBUG or self.slurm_create_submission_file_only):
        raise RuntimeError('Could not detect "srun", are you indeed on a slurm cluster?')