in services/worker/src/worker/executor.py [0:0]
def is_worker_alive(self, worker_loop_executor: OutputExecutor) -> bool:
    """Tell whether the worker loop is still running, reporting a crash if it is not.

    If the executor reports that it is running, return ``True`` right away.
    Otherwise the executor is stopped, and any sign of an abnormal exit is
    logged (with the current job id when ``self.get_state()`` provides one)
    and raised to the caller.

    Args:
        worker_loop_executor: executor wrapping the worker loop process. Only
            ``running()``, ``stop()`` and ``process`` are used here.

    Returns:
        ``True`` if the worker is running, ``False`` if it stopped without any
        sign of failure.

    Raises:
        ProcessExitedWithError: re-raised from ``stop()`` after logging.
        BaseException: any other error raised by ``stop()``, re-raised after
            logging.
        RuntimeError: ``stop()`` succeeded but the process has a non-zero
            return code. The message is the same text that is logged.
    """
    # NOTE(review): -9 is presumably the negative-signal convention for
    # SIGKILL (as with subprocess return codes) - confirm for this executor.
    sigkill_hint = " SIGKILL - surely an OOM"

    def crash_message(explanation: str) -> str:
        # Build the log/exception text, naming the running job when known.
        message = f"Worker crashed ({explanation})"
        state = self.get_state()
        if state and state["current_job_info"]:
            message += f" when running job_id={state['current_job_info']['job_id']}"
        return message

    if worker_loop_executor.running():
        return True

    try:
        worker_loop_executor.stop()  # raises an error if the worker returned unexpected exit code
    except ProcessExitedWithError as err:
        explanation = f"exit code {err.exit_code}"
        if err.exit_code == -9:
            explanation += sigkill_hint
        logging.error(crash_message(explanation))
        raise
    except BaseException as err:
        logging.error(crash_message(f"{type(err).__name__}: {err}"))
        raise

    # stop() did not complain; still check the return code of the process.
    process = worker_loop_executor.process
    if process:
        return_code = process.returncode
        if return_code is not None and return_code != 0:
            explanation = f"return code {return_code}"
            if return_code == -9:
                explanation += sigkill_hint
            error_msg = crash_message(explanation)
            logging.error(error_msg)
            # There is no active exception here, so a bare ``raise`` would only
            # produce "RuntimeError: No active exception to reraise". Raise the
            # same exception type explicitly, carrying the crash details.
            raise RuntimeError(error_msg)
    return False