in src/hpcadvisor/batch_handler.py [0:0]
def wait_pool_ready(poolid):
    """Wait until the pool has finished resizing and its nodes are usable.

    The wait happens in two phases, each driven by ``utils.wait_until``:

    1. Wait for the pool's ``allocation_state`` to become ``"steady"``.
    2. Poll the pool's compute nodes until the number of nodes in the
       ``idle`` or ``running`` state equals the pool's
       ``target_dedicated_nodes``.

    Args:
        poolid: Identifier of the pool to wait on.

    Returns:
        True once the targeted number of nodes is ready. None if the batch
        client is not initialised, or if either wait phase times out or
        fails.
    """
    if not batch_client:
        log.critical("batch_client is None")
        return None

    # Phase 1: wait for the resize operation itself to settle.
    init_time = datetime.datetime.now()
    log.debug(f"awaiting pool to be ready: poolid={poolid}")
    rc = utils.wait_until(
        lambda: batch_client.pool.get(poolid).allocation_state == "steady"
    )
    # Bail out on error, mirroring the handling of the compute-node wait
    # below; otherwise "resize is complete" would be logged after a failure.
    if rc is utils.WAIT_UNTIL_RC.TIMEOUT:
        log.error("Timed out waiting for pool resize.")
        return None
    if rc is utils.WAIT_UNTIL_RC.FAILURE:
        log.error("Pool resize failed.")
        return None

    elapsed_time = datetime.datetime.now() - init_time
    log.debug(f"Pool ready: poolid={poolid} elapsed_time={elapsed_time}")
    log.info(f"Pool resize is complete: poolid={poolid}")

    # Read once; verify_pool compares the ready-node count against this value.
    target_dedicated_nodes = batch_client.pool.get(poolid).target_dedicated_nodes

    ready_states = ("idle", "running")
    failed_states = ("unusable", "starttaskfailed", "offline", "unknown")

    def verify_pool():
        """Report whether the wait should stop, abort, or keep polling."""
        pool_config = batch_client.pool.get(poolid)
        if pool_config.resize_errors:
            log.error(f"Pool {poolid} resize failed.")
            msg = "\n".join(
                [f"  {e.code}: {e.message}" for e in pool_config.resize_errors]
            )
            log.error(f"Resize errors: \n{msg}")
            # NOTE(review): uses the same enum as the other return paths of
            # this callback; confirm against utils.
            return utils.WAIT_UNTIL_FUNCTION_RC.CANCEL_WAIT  # abort

        count = 0
        for cn in batch_client.compute_node.list(poolid):
            lc_state = cn.state.lower()
            if lc_state in ready_states:
                count += 1
            elif lc_state in failed_states:
                log.error(f"Compute node {cn.id} is in {cn.state} state.")
                return utils.WAIT_UNTIL_FUNCTION_RC.CANCEL_WAIT  # abort

        if count == (target_dedicated_nodes or 0):
            return utils.WAIT_UNTIL_FUNCTION_RC.SUCCESS  # done
        return utils.WAIT_UNTIL_FUNCTION_RC.CONTINUE_WAIT  # continue waiting

    # Phase 2: pool can be in steady state but with nodes that are unusable,
    # so the state of the compute nodes has to be checked as well.
    init_time = datetime.datetime.now()
    log.info(f"awaiting compute nodes startup: poolid={poolid}")
    rc = utils.wait_until(verify_pool)
    if rc is utils.WAIT_UNTIL_RC.TIMEOUT:
        log.error("Timed out waiting for compute nodes to be ready.")
        return None
    if rc is utils.WAIT_UNTIL_RC.FAILURE:
        log.error("Compute nodes failed to start.")
        return None

    elapsed_time = datetime.datetime.now() - init_time
    log.debug(f"Compute nodes ready: poolid={poolid} elapsed_time={elapsed_time}")
    log.info(f"Compute nodes are ready. poolid={poolid}")
    pool_info = batch_client.pool.get(poolid)
    log.info(f" pool_info.current_dedicated_nodes={pool_info.current_dedicated_nodes}")
    return True