tools/sweep_launch_job.py
import argparse
import json
import os
import signal
import subprocess


def main():
    # Parse arguments
    desc = "Launch a job on SLURM cluster. Should only be called from sweep_launch.py"
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument("--conda-env", required=True)
    parser.add_argument("--script-path", required=True)
    parser.add_argument("--script-mode", required=True)
    parser.add_argument("--cfgs-dir", required=True)
    parser.add_argument("--pycls-dir", required=True)
    parser.add_argument("--logs-dir", required=True)
    parser.add_argument("--max-retry", required=True, type=int)
    args = parser.parse_args()
    prt("Called with args: {}".format(args))
    # Attach signal handlers for SIGUSR1 and SIGTERM
    signal.signal(signal.SIGUSR1, sigusr1_handler)
    signal.signal(signal.SIGTERM, sigterm_handler)
    # Print info about run
    job_id = os.environ["SLURM_ARRAY_JOB_ID"]
    task_id = os.environ["SLURM_ARRAY_TASK_ID"]
    prt("Job array main job ID: {}".format(job_id))
    prt("Job array task ID (index): {}".format(task_id))
    prt("Running job on: {}".format(str(os.uname())))
    # Load what we need
    run_os_cmd("module purge")
    run_os_cmd("module load anaconda3")
    run_os_cmd("source deactivate")
    run_os_cmd("source activate {}".format(args.conda_env))
    # Get cfg_file to use (the array task index selects one cfg from the sorted list)
    cfg_files = sorted(f for f in os.listdir(args.cfgs_dir) if f.endswith(".yaml"))
    cfg_file = os.path.join(args.cfgs_dir, cfg_files[int(task_id)])
    prt("Using cfg_file: {}".format(cfg_file))
    # Create out_dir
    out_dir = os.path.join(args.logs_dir, "{:06}".format(int(task_id)))
    os.makedirs(out_dir, exist_ok=True)
    prt("Using out_dir: {}".format(out_dir))
    # Create slurm_file with SLURM info
    slurm_file = os.path.join(out_dir, "SLURM.txt")
    with open(slurm_file, "a") as f:
        f.write("SLURM env variables for the job writing to this directory:\n")
        slurm_info = {k: os.environ[k] for k in os.environ if k.startswith("SLURM_")}
        f.write(json.dumps(slurm_info, indent=4))
    prt("Dumped SLURM job info to {}".format(slurm_file))
    # Set PYTHONPATH to pycls copy for sweep
    os.environ["PYTHONPATH"] = args.pycls_dir
    prt("Using PYTHONPATH={}".format(args.pycls_dir))
    # Generate srun command to launch
    cmd_to_run = (
        "srun"
        " --output {out_dir}/stdout.log"
        " --error {out_dir}/stderr.log"
        " python {script}"
        " --mode {mode}"
        " --cfg {cfg}"
        " OUT_DIR {out_dir}"
    ).format(
        out_dir=out_dir, script=args.script_path, mode=args.script_mode, cfg=cfg_file
    )
    prt("Running cmd:\n", cmd_to_run.replace(" ", "\n "))
    # Run command in background using subprocess and wait so that signals can be caught
    p = subprocess.Popen(cmd_to_run, shell=True)
    prt("Waiting for job to complete")
    p.wait()
    prt("Completed waiting. Return code for job: {}".format(p.returncode))
    if p.returncode != 0:
        # Record the failure and requeue the task if the retry budget allows it
        retry_file = os.path.join(out_dir, "RETRY.txt")
        with open(retry_file, "a") as f:
            f.write("Encountered non-zero exit code\n")
        with open(retry_file, "r") as f:
            retry_count = len(f.readlines()) - 1
        prt("Retry count for job: {}".format(retry_count))
        if retry_count < args.max_retry:
            requeue_job()
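

# ---------------------------------------------------------------------------
# The helpers referenced above (prt, run_os_cmd, sigusr1_handler,
# sigterm_handler, requeue_job) are defined elsewhere in this file and are not
# shown in this excerpt. The sketch below is a minimal, assumed version of
# what they could look like; in particular, requeueing via "scontrol requeue"
# and the exact log formatting are assumptions, not the repository's code.
# ---------------------------------------------------------------------------
import sys
from datetime import datetime


def prt(*args, **kwargs):
    # Print with a timestamp prefix so interleaved log lines stay readable.
    print("[{}]".format(datetime.now()), *args, **kwargs)


def run_os_cmd(cmd):
    # Run a shell command and raise if it exits with a non-zero status.
    subprocess.run(cmd, shell=True, check=True)


def requeue_job():
    # Ask SLURM to requeue this specific array task (format: <job_id>_<task_id>).
    job_id = os.environ["SLURM_ARRAY_JOB_ID"]
    task_id = os.environ["SLURM_ARRAY_TASK_ID"]
    run_os_cmd("scontrol requeue {}_{}".format(job_id, task_id))


def sigusr1_handler(signum, frame):
    # Assumed to be sent by SLURM shortly before the time limit: requeue and exit.
    prt("Caught SIGUSR1 with signum: {}".format(signum))
    requeue_job()
    sys.exit(0)


def sigterm_handler(signum, frame):
    # SIGTERM typically precedes preemption; log it and let the srun child shut down.
    prt("Caught SIGTERM with signum: {}".format(signum))


# Assumed standard entry point for the script.
if __name__ == "__main__":
    main()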