in cookbooks/aws-parallelcluster-slurm/files/default/head_node_checks/check_cluster_ready.py [0:0]
def check_deployed_config_version(cluster_name: str, table_name: str, expected_config_version: str, region: str):
    """
    Check that the running compute/login nodes of the cluster deployed the expected config version.

    Running nodes are listed batch by batch. For every non-empty batch, the config records reported
    by the nodes are read from the cluster DDB table and compared with the expected config version.
    The first batch with missing, incomplete or wrong records aborts the check.

    The function is meant to be retried (the retry itself is not part of this body); the overall wait
    time is expected to be in the interval (cfn_hup_time, 2*cfn_hup_time), where cfn_hup_time is the
    wait time for the cfn-hup daemon (120 seconds at the time of writing).

    :param cluster_name: name of the cluster.
    :param table_name: DDB table to read the deployed config version from.
    :param expected_config_version: expected config version.
    :param region: AWS region name (eg: us-east-1).
    :raises CheckFailedError: if any record in a batch is missing, incomplete or wrong.
    :return: None
    """
    logger.info(
        "Checking that cluster configuration deployed on cluster nodes for cluster %s is %s",
        cluster_name,
        expected_config_version,
    )

    # Only running compute and login nodes are expected to report a config version.
    node_batches = list_cluster_instance_ids_iterator(
        cluster_name=cluster_name,
        node_type=["Compute", "LoginNode"],
        instance_state=["running"],
        region=region,
    )

    for batch in node_batches:
        batch_size = len(batch)
        if batch_size == 0:
            # An empty batch is not an error: there is simply nothing to verify in it.
            logger.warning("Found empty batch of cluster nodes: nothing to check")
            continue

        logger.info("Found batch of %s cluster node(s): %s", batch_size, batch)

        records = get_cluster_config_records(table_name, batch, region)
        record_count = len(records)
        rendered_records = "\n\t".join(map(str, records))
        logger.info("Retrieved %s DDB item(s):\n\t%s", record_count, rendered_records)

        missing, incomplete, wrong = _check_cluster_config_items(batch, records, expected_config_version)

        # Success path first: a clean batch is logged and the next one is examined.
        if not (missing or incomplete or wrong):
            logger.info("Verified cluster configuration for cluster node(s) %s", batch)
            continue

        # Fail fast on the first batch containing at least one erroneous record.
        raise CheckFailedError(
            f"Check failed due to the following erroneous records:\n"
            f"  * missing records ({len(missing)}): {missing}\n"
            f"  * incomplete records ({len(incomplete)}): {incomplete}\n"
            f"  * wrong records ({len(wrong)}): {wrong}"
        )