def do_diagnose_cluster()

in mysqloperator/controller/diagnose.py [0:0]


def _select_cluster_type_source(primary_pod, online_pod_statuses, last_status):
    """Return the instance status whose ClusterSet type should be reported.

    Preference order:
      1. the status of the primary, when it was diagnosed as online,
      2. the status of any online/recovering member,
      3. the last instance status that was diagnosed (may be None when no
         pod was diagnosed at all).
    """
    primary_endpoint = getattr(primary_pod, "endpoint", None)
    if primary_endpoint in online_pod_statuses:
        return online_pod_statuses[primary_endpoint]
    if online_pod_statuses:
        return next(iter(online_pod_statuses.values()))
    return last_status


def do_diagnose_cluster(cluster: InnoDBCluster, logger) -> ClusterStatus:
    """Diagnose every pod of `cluster` and derive an overall cluster status.

    The cluster object is reloaded first unless it is being deleted. Each pod
    is passed to `diagnose_instance()` and sorted into online, offline or
    unsure groups; pods reported as NOT_MANAGED are ignored. If at least one
    pod is online, `find_group_partitions()` is used to tell apart the
    no-quorum, single-partition (online) and split-brain cases.

    Args:
        cluster: the cluster to diagnose.
        logger: logger used for diagnostics output.

    Returns:
        A ClusterStatus with `status`, `gtid_executed` (keyed by pod index)
        and `type` set, plus `online_members`, `primary` and
        `quorum_candidates` where the diagnosis provides them. For a cluster
        that has no create time and is not being deleted, a status of
        INITIALIZING is returned before any pod is diagnosed.

    Raises:
        AssertionError: if `diagnose_instance()` reports a status this
            function does not know how to classify.
    """
    if not cluster.deleting:
        cluster.reload()

    all_pods = set(cluster.get_pods())

    last_known_quorum = cluster.get_last_known_quorum()

    # Check if the cluster has already been initialized
    create_time = cluster.get_create_time()

    # TODO last known quorum tracking
    # The message is collected line by line and only emitted once the outcome
    # is known (info level for problematic states, see the end of the function).
    log_lines = [
        f"Diagnosing cluster {cluster.name}  deleting={cluster.deleting}  last_known_quorum={last_known_quorum}..."
        f"create_time={create_time}  deleting={cluster.deleting}"
    ]

    if not create_time and not cluster.deleting:
        cluster_status = ClusterStatus()
        cluster_status.status = ClusterDiagStatus.INITIALIZING
        log_lines.append(
            f"Cluster {cluster.name}  status={cluster_status.status}")
        logger.debug("\n".join(log_lines))
        return cluster_status

    all_member_pods = set()
    online_pods = set()
    offline_pods = set()
    unsure_pods = set()
    gtid_executed = {}

    online_pod_statuses = {}
    # Stays None when there are no pods, so the code after the loop never
    # touches an unbound name.
    last_status = None
    for pod in all_pods:
        # Diagnose the instance even if deleting - so we can remove it from
        # the cluster and later re-add it
        status = diagnose_instance(pod, logger)
        last_status = status
        log_lines.append(
            f"diag instance {pod} --> {status.status} quorum={status.in_quorum} gtid_executed={status.gtid_executed}")

        gtid_executed[pod.index] = status.gtid_executed

        if status.status == InstanceDiagStatus.UNKNOWN:
            unsure_pods.add(pod)
            all_member_pods.add(pod)
        elif status.status in (InstanceDiagStatus.OFFLINE, InstanceDiagStatus.ERROR, InstanceDiagStatus.UNMANAGED):
            offline_pods.add(pod)
            all_member_pods.add(pod)
        elif status.status in (InstanceDiagStatus.ONLINE, InstanceDiagStatus.RECOVERING):
            online_pod_statuses[pod.endpoint] = status
            online_pods.add(pod)
            all_member_pods.add(pod)
        elif status.status == InstanceDiagStatus.NOT_MANAGED:
            # Not counted as a member of the cluster
            pass
        else:
            logger.error(f"Internal error processing pod {pod}")
            # Raised explicitly so the check is not stripped under `python -O`
            raise AssertionError(
                f"Unexpected instance status {status.status} for pod {pod}")

    log_lines.append(
        f"{cluster.name}: all={all_pods}  members={all_member_pods}  online={online_pods}  offline={offline_pods}  unsure={unsure_pods}")

    # Internal invariant: every member pod landed in exactly one group
    assert online_pods.union(offline_pods, unsure_pods) == all_member_pods

    cluster_status = ClusterStatus()

    cluster_status.gtid_executed = gtid_executed

    primary_pod = None

    if online_pods:
        active_partitions, blocked_partitions = find_group_partitions(
            online_pod_statuses, all_member_pods, logger)
        log_lines.append(
            f"active_partitions={active_partitions}  blocked_partitions={blocked_partitions}")

        if not active_partitions:
            # no quorum
            if unsure_pods:
                cluster_status.status = ClusterDiagStatus.NO_QUORUM_UNCERTAIN
            else:
                cluster_status.status = ClusterDiagStatus.NO_QUORUM
            if blocked_partitions:
                cluster_status.quorum_candidates = list(blocked_partitions[0])
        elif len(active_partitions) == 1:
            # ok
            if unsure_pods:
                cluster_status.status = ClusterDiagStatus.ONLINE_UNCERTAIN
            elif offline_pods:
                cluster_status.status = ClusterDiagStatus.ONLINE_PARTIAL
            else:
                cluster_status.status = ClusterDiagStatus.ONLINE
            partition = active_partitions[0]
            cluster_status.online_members = [
                member.pod for member in partition if member.pod]
            primary_member = next(
                (member for member in partition if member.is_primary), None)
            if primary_member is not None:
                primary_pod = primary_member.pod
                cluster_status.primary = primary_pod
        else:
            # split-brain
            if unsure_pods:
                cluster_status.status = ClusterDiagStatus.SPLIT_BRAIN_UNCERTAIN
            else:
                cluster_status.status = ClusterDiagStatus.SPLIT_BRAIN
            cluster_status.online_members = [
                member.pod
                for partition in active_partitions
                for member in partition
                if member.pod]
    elif cluster.deleting:
        cluster_status.status = ClusterDiagStatus.FINALIZING
    elif offline_pods:
        if unsure_pods:
            cluster_status.status = ClusterDiagStatus.OFFLINE_UNCERTAIN
        else:
            cluster_status.status = ClusterDiagStatus.OFFLINE
    else:
        cluster_status.status = ClusterDiagStatus.UNKNOWN

    if cluster_status.status in (ClusterDiagStatus.UNKNOWN,
                                 ClusterDiagStatus.OFFLINE,
                                 ClusterDiagStatus.OFFLINE_UNCERTAIN,
                                 ClusterDiagStatus.SPLIT_BRAIN,
                                 ClusterDiagStatus.SPLIT_BRAIN_UNCERTAIN,
                                 ClusterDiagStatus.ONLINE_UNCERTAIN,
                                 ClusterDiagStatus.NO_QUORUM,
                                 ClusterDiagStatus.NO_QUORUM_UNCERTAIN):
        # Problematic states get the full diagnostics trail at info level
        logger.info("\n".join(log_lines))
        cluster_status.type = ClusterInClusterSetType.UNKNOWN
    else:
        # Take the type from a member that is actually online rather than
        # from whichever pod the (unordered) set happened to yield last.
        type_source = _select_cluster_type_source(
            primary_pod, online_pod_statuses, last_status)
        if type_source is None:
            # No pod was diagnosed at all (e.g. deleting cluster without pods)
            cluster_status.type = ClusterInClusterSetType.UNKNOWN
        else:
            cluster_status.type = type_source.cluster_in_cluster_set_type

    logger.debug(f"Cluster {cluster.name}  status={cluster_status.status}")

    return cluster_status