def srun()

in azure-slurm/slurmcc/util.py [0:0]


    def srun(self, hostlist: List[str], user_command: str, timeout: int, shell: bool, partition: str) -> SrunOutput:
        """Run ``user_command`` on the given nodes through ``srun`` and capture its output.

        The srun command line is assembled as a single string and executed
        through the shell. srun's own ``--error`` stream is redirected to a
        temporary file that is read back only when the command fails.

        Args:
            hostlist: Node names; joined with commas and passed to ``srun -w``.
            user_command: Command to execute. When ``shell`` is true it is
                passed as one quoted argument to ``bash -c``; otherwise it is
                appended verbatim to the srun command line.
            timeout: Time limit in minutes. Used for ``--time``; the srun
                deadline is one minute later, and the subprocess timeout is
                ``timeout * 60 + 180`` seconds.
            shell: Whether to wrap ``user_command`` in ``bash -c``.
            partition: Partition passed via ``-p``; omitted when empty/falsy.

        Returns:
            SrunOutput with the return code and captured stdout (``stderr`` is
            always ``None`` on success).

        Raises:
            SrunExitCodeException: srun exited with a non-zero return code.
            subprocesslib.TimeoutExpired: the subprocess timeout elapsed.
        """
        # Local import: keeps this method self-contained without relying on a
        # module-level ``import shlex``.
        import shlex

        with tempfile.NamedTemporaryFile(delete=True) as temp_file:
            temp_file_path = temp_file.name

            try:
                # shlex.quote instead of hand-written single quotes: a user
                # command that itself contains a single quote would otherwise
                # terminate the quoting early and corrupt the command line.
                command = f"bash -c {shlex.quote(user_command)}" if shell else user_command
                partition_flag = f"-p {shlex.quote(partition)} " if partition else ""
                # Quote the node list so bracketed ranges (e.g. node[1-3]) are
                # not subject to shell globbing.
                nodelist = shlex.quote(",".join(hostlist))
                # The deadline is 1 minute later than the srun time limit so the
                # deadline does not fire before srun can finish running.
                srun_command = (
                    f"srun {partition_flag}-w {nodelist} --error {shlex.quote(temp_file_path)} "
                    f"--deadline=now+{timeout+1}minute --time={timeout} {command}"
                )
                logging.debug(srun_command)
                # The subprocess timeout is in seconds, so convert from minutes
                # and add 3 minutes so it does not expire before srun can kill
                # the job through its own time limit.
                subp_timeout = timeout * 60 + 180
                result = subprocesslib.run(
                    srun_command,
                    check=True,
                    timeout=subp_timeout,
                    shell=True,
                    stdout=subprocesslib.PIPE,
                    stderr=subprocesslib.PIPE,
                    universal_newlines=True,
                )
                return SrunOutput(returncode=result.returncode, stdout=result.stdout, stderr=None)
            except subprocesslib.CalledProcessError as e:
                logging.error(f"Command: {srun_command} failed with return code {e.returncode}")
                # srun wrote its error stream to the temp file (--error); pass
                # it along, normalising an empty file to None.
                with open(temp_file_path, 'r') as f:
                    stderr_content = f.read()
                    if not stderr_content.strip("\n"):
                        stderr_content = None
                # Chain the original error so the traceback keeps the cause.
                raise SrunExitCodeException(
                    returncode=e.returncode,
                    stdout=e.stdout,
                    stderr=e.stderr,
                    stderr_content=stderr_content,
                ) from e
            except subprocesslib.TimeoutExpired:
                logging.error("Srun command timed out!")
                raise