in azure-slurm/slurmcc/util.py [0:0]
def srun(self, hostlist: List[str], user_command: str, timeout: int, shell: bool, partition: str) -> SrunOutput:
    """Run ``user_command`` on the given nodes via ``srun`` and capture the result.

    srun is told (``--error``) to write the job's stderr to a temporary file;
    that file is read back only when srun exits with a non-zero status. The
    temporary file is removed when this method returns or raises.

    Args:
        hostlist: Node names, joined with commas and passed to ``srun -w``.
        user_command: Command to run on the nodes. When ``shell`` is true it
            is handed to ``bash -c`` as one quoted argument. Otherwise it is
            appended to the srun command line verbatim and word-split by the
            invoking shell, so the caller is responsible for its quoting.
        timeout: Job time limit in minutes (used for ``--time``). The srun
            deadline is one minute later and the local subprocess timeout is
            three minutes later, so srun's own limit fires first.
        shell: Whether to wrap ``user_command`` in ``bash -c``.
        partition: Partition passed with ``-p``; a falsy value omits the flag.

    Returns:
        SrunOutput carrying srun's return code and captured stdout, with
        ``stderr`` set to ``None``.

    Raises:
        SrunExitCodeException: srun exited with a non-zero status. Carries the
            return code, captured stdout/stderr and the contents of the
            ``--error`` file (``None`` when that file holds only newlines).
        subprocesslib.TimeoutExpired: the local subprocess timeout elapsed.
    """
    # Function-scope import: keeps this method self-contained without
    # requiring a change to the module's import block.
    import shlex

    with tempfile.NamedTemporaryFile(delete=True) as temp_file:
        temp_file_path = temp_file.name
        try:
            # The command line is interpreted by /bin/sh (shell=True), so every
            # interpolated value that must stay a single argument is quoted.
            # Previously a single quote in user_command terminated the
            # hand-written '...' wrapper and corrupted the command.
            command = f"bash -c {shlex.quote(user_command)}" if shell else user_command
            partition_flag = f"-p {shlex.quote(partition)} " if partition else ""
            nodelist = shlex.quote(",".join(hostlist))
            # Deadline is 1 minute past the srun time limit so the deadline
            # does not expire before srun can finish running.
            srun_command = (
                f"srun {partition_flag}-w {nodelist} --error {shlex.quote(temp_file_path)} "
                f"--deadline=now+{timeout+1}minute --time={timeout} {command}"
            )
            logging.debug(srun_command)
            # subprocess timeout is in seconds; add 3 minutes so it does not
            # fire before srun kills the job from its own time limit.
            subp_timeout = timeout * 60 + 180
            result = subprocesslib.run(
                srun_command,
                check=True,
                timeout=subp_timeout,
                shell=True,
                stdout=subprocesslib.PIPE,
                stderr=subprocesslib.PIPE,
                universal_newlines=True,
            )
            return SrunOutput(returncode=result.returncode, stdout=result.stdout, stderr=None)
        except subprocesslib.CalledProcessError as e:
            logging.error(f"Command: {srun_command} failed with return code {e.returncode}")
            # The job's stderr went to the --error file, not to our pipe.
            with open(temp_file_path, 'r') as f:
                stderr_content = f.read()
            if not stderr_content.strip("\n"):
                stderr_content = None
            raise SrunExitCodeException(
                returncode=e.returncode,
                stdout=e.stdout,
                stderr=e.stderr,
                stderr_content=stderr_content,
            ) from e
        except subprocesslib.TimeoutExpired:
            logging.error("Srun command timed out!")
            raise