def wait_pool_ready()

in src/hpcadvisor/batch_handler.py [0:0]


def wait_pool_ready(poolid):
    """Wait until a pool has finished resizing and its compute nodes are usable.

    The wait happens in two phases, both driven by ``utils.wait_until``:

    1. wait for the pool's ``allocation_state`` to become ``"steady"``;
    2. wait until the number of compute nodes in the ``idle`` or ``running``
       state equals the pool's ``target_dedicated_nodes``.

    Args:
        poolid: Identifier of the pool to wait for.

    Returns:
        ``True`` once the expected number of nodes is ready. ``None`` when
        ``batch_client`` is not set, or when either phase ends with
        ``utils.WAIT_UNTIL_RC.TIMEOUT`` or ``utils.WAIT_UNTIL_RC.FAILURE``.
    """
    if not batch_client:
        log.critical("batch_client is None")
        return None

    # Phase 1: measure how long the pool takes to reach the steady state.
    init_time = datetime.datetime.now()

    log.debug(f"awaiting pool to be ready: poolid={poolid}")
    rc = utils.wait_until(
        lambda: batch_client.pool.get(poolid).allocation_state == "steady"
    )
    # Stop here on timeout/failure: continuing would log a misleading
    # "resize is complete" message and start a second wait on a pool that
    # never became steady.
    if rc is utils.WAIT_UNTIL_RC.TIMEOUT:
        log.error("Timed out waiting for pool resize.")
        return None
    if rc is utils.WAIT_UNTIL_RC.FAILURE:
        log.error("Pool resize failed.")
        return None

    elapsed_time = datetime.datetime.now() - init_time
    log.debug(f"Pool ready: poolid={poolid} elapsed_time={elapsed_time}")
    log.info(f"Pool resize is complete: poolid={poolid}")

    # Read the target once; a missing (None) target is treated as zero nodes.
    expected_nodes = batch_client.pool.get(poolid).target_dedicated_nodes or 0

    ready_states = ("idle", "running")
    failed_states = ("unusable", "starttaskfailed", "offline", "unknown")

    def verify_pool():
        """Polling callback: report whether the pool's nodes are all ready."""
        pool_config = batch_client.pool.get(poolid)
        if pool_config.resize_errors:
            log.error(f"Pool {poolid} resize failed.")
            msg = "\n".join(
                f"  {e.code}: {e.message}" for e in pool_config.resize_errors
            )
            log.error(f"Resize errors: \n{msg}")
            # NOTE(review): the original returned
            # utils.WAIT_UNTIL_CALLBACK_RC.CANCEL_WAIT here while every other
            # return in this callback used WAIT_UNTIL_FUNCTION_RC; unified on
            # the latter -- confirm against utils.
            return utils.WAIT_UNTIL_FUNCTION_RC.CANCEL_WAIT  # abort

        count = 0
        for cn in batch_client.compute_node.list(poolid):
            lc_state = cn.state.lower()
            if lc_state in ready_states:
                count += 1
            elif lc_state in failed_states:
                log.error(f"Compute node {cn.id} is in {cn.state} state.")
                return utils.WAIT_UNTIL_FUNCTION_RC.CANCEL_WAIT  # abort

        if count == expected_nodes:
            return utils.WAIT_UNTIL_FUNCTION_RC.SUCCESS  # done
        return utils.WAIT_UNTIL_FUNCTION_RC.CONTINUE_WAIT  # continue waiting

    # Phase 2: the pool can be in steady state but with nodes that are
    # unusable, so the state of each compute node has to be checked as well.
    init_time = datetime.datetime.now()
    log.info(f"awaiting compute nodes startup: poolid={poolid}")
    rc = utils.wait_until(verify_pool)
    if rc is utils.WAIT_UNTIL_RC.TIMEOUT:
        log.error("Timed out waiting for compute nodes to be ready.")
        return None
    if rc is utils.WAIT_UNTIL_RC.FAILURE:
        log.error("Compute nodes failed to start.")
        return None

    elapsed_time = datetime.datetime.now() - init_time
    log.debug(f"Compute nodes ready: poolid={poolid} elapsed_time={elapsed_time}")
    log.info(f"Compute nodes are ready. poolid={poolid}")
    pool_info = batch_client.pool.get(poolid)
    log.info(f" pool_info.current_dedicated_nodes={pool_info.current_dedicated_nodes}")

    return True