def create_slurm_script()

in submit_slurm_jobs.py [0:0]


    def create_slurm_script(self, job: Job):
        """Render the Slurm batch script for ``job`` and write it to disk.

        The JSON config at ``job.config`` is read to compute the world size
        (``tp_size * cp_size * pp_size * dp_size`` from its ``"distributed"``
        section). From that, the number of nodes and processes per node are
        derived, the template ``template/base_job.slurm`` (resolved against
        the current working directory) is rendered, and the result is written
        to ``<job.root_path>/job.slurm``. This method only creates the script;
        it does not submit it.

        Args:
            job: Object exposing ``config`` (path to the JSON config file),
                ``root_path`` (directory receiving ``job.slurm``) and ``qos``
                (forwarded to the template as-is).

        Raises:
            ValueError: If the world size is not positive, or if it exceeds
                one node's GPU count without being a multiple of it.
        """
        # NOTE: despite the ".yaml" wording sometimes used for configs, this
        # file is parsed as JSON.
        with open(job.config, 'r', encoding="utf-8") as config_file:
            config = json.load(config_file)

        max_gpu_per_node = 8
        distributed = config["distributed"]
        world_size = (
            distributed["tp_size"]
            * distributed["cp_size"]
            * distributed["pp_size"]
            * distributed["dp_size"]
        )

        # Validate with real exceptions: `assert` is stripped under `python -O`,
        # which would let an impossible layout reach the rendered script.
        if world_size < 1:
            raise ValueError(
                f"world_size must be a positive integer, got {world_size}"
            )
        if world_size > max_gpu_per_node and world_size % max_gpu_per_node != 0:
            raise ValueError(
                f"world_size ({world_size}) must fit on a single node "
                f"(<= {max_gpu_per_node}) or be a multiple of {max_gpu_per_node}"
            )

        # Either everything fits on one node, or every node is fully packed.
        if world_size <= max_gpu_per_node:
            nodes, n_proc_per_node = 1, world_size
        else:
            nodes, n_proc_per_node = world_size // max_gpu_per_node, max_gpu_per_node

        context_bench = {
            'nodes': nodes,
            'n_proc_per_node': n_proc_per_node,
            'root_path': job.root_path,
            "config": job.config,
            "qos": job.qos,
        }

        # The template path is relative to the process's working directory,
        # so this must be run from the directory containing `template/`.
        base_path = os.path.join(os.getcwd(), "template/base_job.slurm")
        with open(base_path, 'r', encoding="utf-8") as template_file:
            base_job_file = template_file.read()

        # `Template` is presumably jinja2's Template (imported outside this
        # block) -- confirm against the file's imports.
        base_job_template = Template(base_job_file)

        # Write the rendered script to a new file located at the job root_path
        output_file_path = os.path.join(job.root_path, "job.slurm")
        with open(output_file_path, 'w', encoding="utf-8") as output_file:
            output_file.write(base_job_template.render(context_bench))

        print(f"Slurm script created at {output_file_path}")