snakemake/profiles/slurm-csd3/status.py (77 lines of code) (raw):
#!/usr/bin/env python3
import logging
import re
import shlex
import subprocess as sp
import sys
import time
logger = logging.getLogger("__name__")
STATUS_ATTEMPTS = 20
jobid = sys.argv[1]
for i in range(STATUS_ATTEMPTS):
try:
sacct_res = sp.check_output(shlex.split(f"sacct -P -b -j {jobid} -n"))
res = {x.split("|")[0]: x.split("|")[1] for x in sacct_res.decode().strip().split("\n")}
# regular execution
if jobid in res:
status = res[jobid]
# job array
# example:
# 2379_1|COMPLETED|0:0
# 2379_1.batch|COMPLETED|0:0
# 2379_2|RUNNING|0:0
# 2379_2.batch|RUNNING|0:0
# 2379_[3-7%1]|PENDING|0:0
else:
all_steps = sorted(
[(k, v) for k, v in res.items() if not k.endswith("batch")], key=lambda x: x[0]
)
statuses = {v for _, v in all_steps}
if "COMPLETED" in statuses:
status = "COMPLETED"
elif "FAILED" in statuses:
status = "FAILED"
else:
status = all_steps[-1][1]
break
except sp.CalledProcessError as e:
logger.error("sacct process error")
logger.error(e)
except IndexError as e:
logger.error(e)
pass
# Try getting job with scontrol instead in case sacct is misconfigured
try:
sctrl_res = sp.check_output(shlex.split(f"scontrol -o show job {jobid}"))
statuses = [
re.search(r"JobState=(\w+)", line).group(1)
for line in sctrl_res.decode().split("\n")
if line != ""
]
if "COMPLETED" in statuses:
status = "COMPLETED"
elif "FAILED" in statuses:
status = "FAILED"
else:
status = statuses[0]
break
except sp.CalledProcessError as e:
logger.error("scontrol process error")
logger.error(e)
if i >= STATUS_ATTEMPTS - 1:
print("failed")
sys.exit(0)
else:
time.sleep(1)
if status == "BOOT_FAIL":
print("failed")
elif status == "OUT_OF_MEMORY":
print("failed")
elif status.startswith("CANCELLED"):
print("failed")
elif status == "COMPLETED":
print("success")
elif status == "DEADLINE":
print("failed")
elif status == "FAILED":
print("failed")
elif status == "NODE_FAIL":
print("failed")
elif status == "PREEMPTED":
print("failed")
elif status == "TIMEOUT":
print("failed")
elif status == "SUSPENDED":
print("running")
else:
print("running")