def check_deployed_config_version()

in cookbooks/aws-parallelcluster-slurm/files/default/head_node_checks/check_cluster_ready.py [0:0]


def check_deployed_config_version(cluster_name: str, table_name: str, expected_config_version: str, region: str):
    """
    Ensure that all running compute/login nodes of the cluster report the expected config version.

    Running Compute and LoginNode instances are listed in batches. For every non-empty batch the config
    records are read from the cluster DDB table and validated against the expected config version.
    The check stops at the first batch for which missing, incomplete or wrong records are reported.

    The function is meant to be retried: the overall wait time is expected to be in the interval
    (cfn_hup_time, 2*cfn_hup_time), where cfn_hup_time is the wait time for the cfn-hup daemon
    (120 seconds at the time of writing).
    NOTE(review): the retry policy is configured outside of this function body -- confirm where it is applied.

    :param cluster_name: name of the cluster.
    :param table_name: DDB table to read the deployed config version from.
    :param expected_config_version: expected config version.
    :param region: AWS region name (eg: us-east-1).
    :return: None
    :raises CheckFailedError: if a batch of nodes has missing, incomplete or wrong config records.
    """
    logger.info(
        "Checking that cluster configuration deployed on cluster nodes for cluster %s is %s",
        cluster_name,
        expected_config_version,
    )

    # Only running compute and login nodes are subject to the check.
    node_types = ["Compute", "LoginNode"]
    instance_states = ["running"]

    batches = list_cluster_instance_ids_iterator(
        cluster_name=cluster_name,
        node_type=node_types,
        instance_state=instance_states,
        region=region,
    )

    for batch in batches:
        batch_size = len(batch)

        # An empty batch carries no instance to verify: move on to the next one.
        if batch_size == 0:
            logger.warning("Found empty batch of cluster nodes: nothing to check")
            continue

        logger.info("Found batch of %s cluster node(s): %s", batch_size, batch)

        records = get_cluster_config_records(table_name, batch, region)
        record_count = len(records)
        rendered_records = "\n\t".join(map(str, records))
        logger.info("Retrieved %s DDB item(s):\n\t%s", record_count, rendered_records)

        missing, incomplete, wrong = _check_cluster_config_items(batch, records, expected_config_version)

        # Happy path first: nothing erroneous was reported for this batch.
        if not (missing or incomplete or wrong):
            logger.info("Verified cluster configuration for cluster node(s) %s", batch)
            continue

        # Fail fast on the first erroneous batch, reporting every category of erroneous record.
        failure_report = (
            f"Check failed due to the following erroneous records:\n"
            f"  * missing records ({len(missing)}): {missing}\n"
            f"  * incomplete records ({len(incomplete)}): {incomplete}\n"
            f"  * wrong records ({len(wrong)}): {wrong}"
        )
        raise CheckFailedError(failure_report)