def handle_failed_nodes()

in pbspro/src/pbspro/driver.py [0:0]


    def handle_failed_nodes(self, nodes: List[Node]) -> List[Node]:
        """
        Triage nodes whose ``pbs_state`` metadata reports them as down.

        Each node is examined in order and ends up in at most one bucket:

        * delete - the pbs_state also contains "offline", or the node has
          no private_ip.
        * drain  - the node is down but not offline, still has a private_ip
          and ``self._down_long_enough(now, node)`` is truthy.

        Nodes are skipped entirely when keep_alive is set, when their state
        is "Failed", when the "ccnodeid" resource is missing/falsy, or when
        "down" does not appear in pbs_state. Every node that reaches the
        "down" check and passes it gets ``closed = True``, including
        state-unknown nodes that are otherwise left alone.

        :param nodes: candidate nodes to inspect; ``closed`` may be mutated.
        :return: the result of ``self.handle_post_delete`` for the delete
                 bucket, or an empty list when nothing was selected for
                 deletion.
        """
        current_time = datetime.datetime.now()
        removable: List[Node] = []
        drainable: List[Node] = []

        for node in nodes:
            # keep_alive nodes and Failed nodes are skipped without any
            # action in this method.
            # NOTE: closing Failed nodes and deleting them after a boot
            # timeout was sketched here previously but is disabled.
            if node.keep_alive or node.state == "Failed":
                continue

            if not node.resources.get("ccnodeid"):
                logging.fine(
                    "Attempting to delete %s but ccnodeid is not set yet.", node
                )
                continue

            # This is the node's PBS state string as stored in its metadata;
            # it is searched by substring below.
            pbs_state = node.metadata.get("pbs_state", "")
            if "down" not in pbs_state:
                continue

            # Close the node before deciding what to do with it.
            node.closed = True

            if "state-unknown" in pbs_state:
                logging.warning(
                    "Node is in state-unknown - skipping scale down - %s", node
                )
                continue

            # no private_ip == no dns entry, so we can safely remove it
            if "offline" in pbs_state or not node.private_ip:
                removable.append(node)
            elif self._down_long_enough(current_time, node):
                drainable.append(node)

        if drainable:
            logging.info("Draining down nodes: %s", drainable)
            self.handle_draining(drainable)

        if not removable:
            return []

        logging.info("Deleting down,offline nodes: %s", removable)
        return self.handle_post_delete(removable)