in pbspro/src/pbspro/driver.py [0:0]
def handle_failed_nodes(self, nodes: List[Node]) -> List[Node]:
    """Close nodes that PBS reports as down and drain or delete them.

    Each node is examined in order:

    * Nodes with ``keep_alive`` set, or whose ``state`` is ``"Failed"``,
      are skipped without modification.
    * Nodes without a ``ccnodeid`` resource are skipped (logged at fine
      level).
    * Nodes whose ``pbs_state`` metadata does not contain ``"down"`` are
      skipped.
    * Every remaining node is marked ``closed``. It is then left alone if
      its state also contains ``"state-unknown"``, queued for deletion if
      it is ``"offline"`` or has no private IP, and otherwise queued for
      draining once ``self._down_long_enough`` approves it.

    Args:
        nodes: The nodes to inspect.

    Returns:
        The result of ``self.handle_post_delete`` for the nodes queued for
        deletion, or an empty list when no node was queued.
    """
    reference_time = datetime.datetime.now()
    removable = []
    drainable = []

    for candidate in nodes:
        # keep_alive nodes are never touched. "Failed" nodes are skipped as
        # well: the code that used to close them and delete them after a
        # boot timeout is currently disabled.
        if candidate.keep_alive or candidate.state == "Failed":
            continue

        if not candidate.resources.get("ccnodeid"):
            logging.fine(
                "Attempting to delete %s but ccnodeid is not set yet.", candidate
            )
            continue

        pbs_state = candidate.metadata.get("pbs_state", "")
        if "down" not in pbs_state:
            continue

        # The node is closed as soon as it is seen down, even if it ends up
        # neither drained nor deleted in this pass.
        candidate.closed = True

        if "state-unknown" in pbs_state:
            logging.warning(
                "Node is in state-unknown - skipping scale down - %s", candidate
            )
            continue

        # A node without a private IP has no DNS entry, so removing it
        # right away is considered safe.
        if "offline" in pbs_state or not candidate.private_ip:
            removable.append(candidate)
        elif self._down_long_enough(reference_time, candidate):
            drainable.append(candidate)

    if drainable:
        logging.info("Draining down nodes: %s", drainable)
        self.handle_draining(drainable)

    if not removable:
        return []

    logging.info("Deleting down,offline nodes: %s", removable)
    return self.handle_post_delete(removable)