in submit_slurm_jobs.py [0:0]
def create_slurm_script(self, job: Job):
    """Render the Slurm batch script for ``job`` and write it to disk.

    Reads the JSON file at ``job.config``, derives the node count and the
    per-node process count from the product of the four parallelism sizes
    under its ``"distributed"`` key, renders ``template/base_job.slurm``
    (resolved against the current working directory) with those values and
    writes the result to ``<job.root_path>/job.slurm``.

    Args:
        job: Job description; this method reads ``job.config``,
            ``job.root_path`` and ``job.qos``.

    Raises:
        AssertionError: If the world size is smaller than 1, or is larger
            than one node (8 GPUs) without being a multiple of 8.
        KeyError: If the config lacks ``"distributed"`` or one of the
            ``tp_size`` / ``cp_size`` / ``pp_size`` / ``dp_size`` entries.
        OSError: If the config, the template or the output file cannot be
            opened.
    """
    # The job config is parsed as JSON (not YAML).
    with open(job.config, "r", encoding="utf-8") as config_file:
        config = json.load(config_file)

    max_gpu_per_node = 8

    # Total number of processes is the product of all parallelism degrees.
    distributed = config["distributed"]
    world_size = (
        distributed["tp_size"]
        * distributed["cp_size"]
        * distributed["pp_size"]
        * distributed["dp_size"]
    )

    # Validation is raised explicitly instead of via `assert` so that it is
    # not stripped under `python -O`; AssertionError is kept so the exception
    # type seen by callers does not change.
    if world_size < 1:
        raise AssertionError(
            f"world_size must be at least 1, got {world_size}"
        )
    if world_size > max_gpu_per_node and world_size % max_gpu_per_node != 0:
        raise AssertionError(
            f"world_size ({world_size}) must fit on a single node or be a "
            f"multiple of {max_gpu_per_node}"
        )

    # Either one partially/fully used node, or several fully used nodes.
    nodes = max(1, world_size // max_gpu_per_node)
    n_proc_per_node = min(max_gpu_per_node, world_size // nodes)
    if nodes * n_proc_per_node != world_size:
        raise AssertionError(
            f"nodes ({nodes}) * n_proc_per_node ({n_proc_per_node}) "
            f"does not match world_size ({world_size})"
        )

    context_bench = {
        'nodes': nodes,
        'n_proc_per_node': n_proc_per_node,
        'root_path': job.root_path,
        "config": job.config,
        "qos": job.qos,
    }

    # NOTE(review): the template path depends on the process working
    # directory, so the script presumably has to be launched from the
    # repository root — confirm.
    base_path = os.path.join(os.getcwd(), "template/base_job.slurm")
    with open(base_path, "r", encoding="utf-8") as template_file:
        base_job_file = template_file.read()

    base_job_template = Template(base_job_file)

    # Write the rendered script to a new file located at the job root_path
    output_file_path = os.path.join(job.root_path, "job.slurm")
    with open(output_file_path, "w", encoding="utf-8") as output_file:
        output_file.write(base_job_template.render(context_bench))

    print(f"Slurm script created at {output_file_path}")