def submit_jobs()

in bench_cluster/submit_jobs.py [0:0]


def submit_jobs(inp_dir, qos, hf_token, nb_slurm_array, cluster: str, only: str = None):
    scheduler = Scheduler(inp_dir, qos)

    # TODO: batch into job arrays
    env_vars = os.environ.copy()
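    # Forward the Hugging Face token to the submitted jobs through their environment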
    env_vars["HUGGINGFACE_TOKEN"] = hf_token
    total_jobs = len(scheduler.job_lists)

    if only == "fail":
        scheduler.job_lists = scheduler.keep_only_jobs(Status.FAIL)
    elif only == "pending":
        scheduler.job_lists = scheduler.keep_only_jobs(Status.PENDING)
    elif only == "timeout":
        scheduler.job_lists = scheduler.keep_only_jobs(Status.TIMEOUT)
    elif only == "running":
        scheduler.job_lists = scheduler.keep_only_jobs(Status.RUNNING)
    
    if only is not None:
        filtered_jobs = len(scheduler.job_lists)
        if filtered_jobs == 0:
            print(f"No '{only}' jobs to resubmit")
            return
        print(f"Only {filtered_jobs}/{total_jobs} jobs with status '{only}' will be resubmitted")
    
    # Jobs that already completed are never resubmitted
    scheduler.job_lists = scheduler.filter_out_jobs(Status.COMPLETED)

    if nb_slurm_array > 0:
        # Use job dependencies
        
        # Distribute the jobs into the arrays        
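        # e.g. 14 jobs across 4 arrays -> distribution = [4, 4, 3, 3]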
        base_jobs_per_array = len(scheduler.job_lists) // nb_slurm_array
        extra_jobs = len(scheduler.job_lists) % nb_slurm_array
        distribution = [base_jobs_per_array] * nb_slurm_array
        for i in range(extra_jobs):
            distribution[i] += 1
        
        start = 0
        
        for i, nb_jobs in enumerate(distribution):
            end = start + nb_jobs
            job_array = scheduler.job_lists[start:end]
            
            print(f"Launching job Dependency array {i+1} with {nb_jobs} jobs")
            
            for job in job_array:
                scheduler.create_slurm_script(job, cluster)

            scheduler.launch_dependency(job_array, env_vars)
            
            start = end
    else:
        # Don't use job dependencies
        for job in scheduler.job_lists:
            scheduler.create_slurm_script(job, cluster)
            subprocess.run(["sbatch", os.path.join(job.root_path, "bench.slurm")], env=env_vars)
            job.set_status(Status.PENDING)
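
The chaining itself happens inside scheduler.launch_dependency, which is not shown in this excerpt. Below is a minimal sketch of how such a helper could chain the jobs with Slurm; launch_dependency, env_vars, job.root_path and the bench.slurm filename are taken from the code above, everything else is an assumption. The --parsable and --dependency=afterany:<jobid> flags are standard sbatch options.

import os
import subprocess

def launch_dependency(job_array, env_vars):
    # Hypothetical sketch: each job is submitted so that it starts only after
    # the previous one has finished (whatever its exit state), by passing the
    # previously returned Slurm job ID to --dependency=afterany:<id>.
    previous_job_id = None
    for job in job_array:
        cmd = ["sbatch", "--parsable"]
        if previous_job_id is not None:
            cmd.append(f"--dependency=afterany:{previous_job_id}")
        cmd.append(os.path.join(job.root_path, "bench.slurm"))

        # --parsable makes sbatch print only the job ID (optionally followed
        # by ";<cluster>"), so it can be captured for the next submission.
        result = subprocess.run(cmd, env=env_vars, capture_output=True, text=True, check=True)
        previous_job_id = result.stdout.strip().split(";")[0]

The real helper presumably also marks each submitted job as pending, as the non-array branch above does with job.set_status(Status.PENDING).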