in submit_slurm_jobs.py [0:0]
def submit_jobs(inp_dir, qos, hf_token, nb_slurm_array, only: str = None):
    """Submit every pending scheduler job to SLURM, optionally filtered by status.

    Args:
        inp_dir: Root directory containing the job tree handed to ``Scheduler``.
        qos: SLURM quality-of-service the jobs are submitted under.
        hf_token: Hugging Face token, exported to each job via ``HUGGINGFACE_TOKEN``.
        nb_slurm_array: Number of dependency chains to spread the jobs over.
            With a value <= 0 each job is submitted independently via ``sbatch``.
        only: If set, resubmit only jobs whose status matches this keyword
            ("fail", "pending", "timeout" or "running").

    Raises:
        ValueError: If ``only`` is not ``None`` and not a recognized keyword.
    """
    scheduler = Scheduler(inp_dir, qos)
    # TODO: batch into job arrays
    env_vars = os.environ.copy()
    env_vars["HUGGINGFACE_TOKEN"] = hf_token

    total_jobs = len(scheduler.job_lists)
    # Map each CLI filter keyword to the scheduler status it selects.
    status_filters = {
        "fail": Status.FAIL,
        "pending": Status.PENDING,
        "timeout": Status.TIMEOUT,
        "running": Status.RUNNING,
    }
    if only is not None:
        if only not in status_filters:
            # Previously an unknown keyword silently skipped filtering and
            # resubmitted everything while printing a misleading message.
            raise ValueError(
                f"Unknown status filter '{only}'; expected one of {sorted(status_filters)}"
            )
        scheduler.job_lists = scheduler.keep_only_jobs(status_filters[only])
        filtered_jobs = len(scheduler.job_lists)
        if filtered_jobs == 0:
            print(f"No '{only}' jobs to resubmit")
            return
        print(f"Only {filtered_jobs}/{total_jobs} jobs with status '{only}' will be resubmitted")

    # Completed jobs never need to be resubmitted.
    scheduler.job_lists = scheduler.filter_out_jobs(Status.COMPLETED)

    if nb_slurm_array > 0:
        # Use job dependencies: distribute the jobs as evenly as possible
        # across nb_slurm_array chains; the first `extra_jobs` chains each
        # take one extra job to absorb the remainder.
        base_jobs_per_array = len(scheduler.job_lists) // nb_slurm_array
        extra_jobs = len(scheduler.job_lists) % nb_slurm_array
        distribution = [
            base_jobs_per_array + (1 if i < extra_jobs else 0)
            for i in range(nb_slurm_array)
        ]

        start = 0
        for i, nb_jobs in enumerate(distribution):
            end = start + nb_jobs
            job_array = scheduler.job_lists[start:end]
            print(f"Launching job Dependency array {i+1} with {nb_jobs} jobs")
            for job in job_array:
                scheduler.create_slurm_script(job)
            scheduler.launch_dependency(job_array, env_vars)
            start = end
    else:
        # Don't use job dependencies: submit each job on its own with sbatch.
        for job in scheduler.job_lists:
            scheduler.create_slurm_script(job)
            script_path = os.path.join(job.root_path, "job.slurm")
            print(script_path)
            subprocess.run(["sbatch", script_path], env=env_vars)
            job.set_status(Status.PENDING)