snakemake/profiles/slurm-moz/status.py (56 lines of code) (raw):

#!/usr/bin/env python3 import logging import re import shlex import subprocess as sp import sys import time logger = logging.getLogger("__name__") STATUS_ATTEMPTS = 20 jobid = sys.argv[1] for i in range(STATUS_ATTEMPTS): try: sacct_res = sp.check_output(shlex.split(f"sacct -P -b -j {jobid} -n")) res = {x.split("|")[0]: x.split("|")[1] for x in sacct_res.decode().strip().split("\n")} break except sp.CalledProcessError as e: logger.error("sacct process error") logger.error(e) except IndexError as e: logger.error(e) pass # Try getting job with scontrol instead in case sacct is misconfigured try: sctrl_res = sp.check_output(shlex.split(f"scontrol -o show job {jobid}")) m = re.search(r"JobState=(\w+)", sctrl_res.decode()) res = {jobid: m.group(1)} break except sp.CalledProcessError as e: logger.error("scontrol process error") logger.error(e) if i >= STATUS_ATTEMPTS - 1: print("failed") sys.exit(0) else: time.sleep(1) status = res[jobid] if status == "BOOT_FAIL": print("failed") elif status == "OUT_OF_MEMORY": print("failed") elif status.startswith("CANCELLED"): print("failed") elif status == "COMPLETED": print("success") elif status == "DEADLINE": print("failed") elif status == "FAILED": print("failed") elif status == "NODE_FAIL": print("failed") elif status == "PREEMPTED": print("failed") elif status == "TIMEOUT": print("failed") elif status == "SUSPENDED": print("running") else: print("running")