in launcher/nemo/slurm_launcher.py [0:0]
def _make_submission_file_text(self, command_groups: List[List[str]]) -> str:
    """
    Build the sbatch submission script, adding a host-file preparation step.

    The submission file will be responsible for the following
    - Handle sbatch config (implemented in upstream)
    - Handle env variables (implemented in upstream)
    - Handle storing distribution information which will be consumed by train_script.sh
    - Call train_script.sh with proper srun command

    The script produced by the parent class is split into lines, and a short
    section that writes the sorted host names to ``self.hostfile`` is inserted
    right before the first line starting with ``# command``.

    Args:
        command_groups: Groups of shell command lines. They are forwarded to
            the parent implementation. When
            ``self.launch_docker_container_text`` is not ``None``, the first
            group replaces everything from the ``# command`` line onwards.

    Returns:
        The complete submission script as a single newline-joined string.

    Raises:
        AssertionError: If the parent script does not start with
            ``#!/bin/bash`` or has no line starting with ``# command``.
    """
    script_lines = super()._make_submission_file_text(command_groups).split("\n")

    # Raise explicitly rather than using `assert`: assert statements are
    # stripped under `python -O`, and a missing command marker would then
    # leave `command_idx` as None, silently duplicating the whole script
    # through the `[:None]` / `[None:]` slices below. AssertionError is kept
    # so the exception type seen by callers does not change.
    if script_lines[0] != "#!/bin/bash":
        raise AssertionError(script_lines[0])

    command_idx = next(
        (idx for idx, line in enumerate(script_lines) if line.startswith("# command")),
        None,
    )
    if command_idx is None:
        raise AssertionError(f"Can not find command in the submission file str: {script_lines}")

    distributed_strs = [
        "",
        "# Prepare distributed files",
        f'srun -l bash -c "scontrol show hostnames | sort > {self.hostfile}"',
        "",
    ]

    header = script_lines[:command_idx]
    if self.launch_docker_container_text is None:
        # Keep the command section generated by the parent class.
        tail = script_lines[command_idx:]
    else:
        # Drop the parent's command section and emit only the first command
        # group instead.
        tail = command_groups[0]

    return "\n".join(header + distributed_strs + tail)