def find_group_partitions()

in mysqloperator/controller/diagnose.py [0:0]


def find_group_partitions(online_pod_info: Dict[str, InstanceStatus],
                          pods: Set[MySQLPod], logger) -> Tuple[List[List[InstanceStatus]], List[Set[MySQLPod]]]:
    """Split the reachable instances into active and blocked group partitions.

    Args:
        online_pod_info: Status of every instance that could be queried,
            keyed by instance endpoint. Each status is read for
            ``in_quorum``, ``is_primary`` and ``peers`` (a mapping of peer
            endpoint to the member state that instance reports for it).
        pods: All pods under consideration; they are indexed by their
            ``endpoint`` to translate peer endpoints back into pods.
        logger: Logger used to report an inconsistent group view.

    Returns:
        A tuple ``(active_partitions, blocked_partitions)``:

        * ``active_partitions`` holds one list of instance statuses per
          in-quorum PRIMARY, made of the peers that PRIMARY sees as ONLINE
          or RECOVERING. More than one entry means split-brain; no entry
          means no availability.
        * ``blocked_partitions`` holds the distinct sets of pods seen by
          instances without quorum (peers reported as "(MISSING)" or
          "UNREACHABLE" are left out), largest partition first.

    Raises:
        kopf.TemporaryError: If an in-quorum member lists an ONLINE or
            RECOVERING peer that is absent from ``online_pod_info``, or if
            some member has quorum but no in-quorum PRIMARY was found.
        AssertionError: If an instance that reports no quorum is a member
            of an active partition.
        KeyError: If an instance without quorum reports a peer endpoint
            that does not belong to any pod in ``pods``.
    """
    # NOTE(review): the original author left an open question on whether
    # unmanaged members should also count as live peers - not handled here.
    live_states = ("ONLINE", "RECOVERING")
    absent_states = ("(MISSING)", "UNREACHABLE")

    # List of group partitions that have quorum and can execute transactions.
    # If there's more than 1, then there's a split-brain. If there's none, then
    # we have no availability.
    active_partitions: List[List[InstanceStatus]] = []
    # List of group partitions that have no quorum and can't execute transactions.
    blocked_partitions: List[Set[MySQLPod]] = []

    all_pods = {pod.endpoint: pod for pod in pods}

    # Set when a member has quorum but is not the PRIMARY; only matters if
    # no in-quorum PRIMARY shows up at all.
    quorum_without_primary = False

    for ep, p in online_pod_info.items():
        if not p.in_quorum:
            continue

        online_peers = [peer for peer, state in p.peers.items()
                        if state in live_states]
        missing = {peer for peer in online_peers
                   if peer not in online_pod_info}
        if missing:
            logger.info(
                f"Group view of {ep} has {p.peers.keys()} but these are not ONLINE: {missing}")
            raise kopf.TemporaryError(
                "Cluster status results inconsistent", delay=5)

        if p.is_primary:
            active_partitions.append(
                [online_pod_info[peer] for peer in online_peers])
        else:
            quorum_without_primary = True

    if not active_partitions and quorum_without_primary:
        # it's possible for a group with quorum to not have a PRIMARY
        # for a short time if the PRIMARY is removed from the group
        raise kopf.TemporaryError(
            "Cluster has quorum but no PRIMARY", delay=10)

    for p in online_pod_info.values():
        if p.in_quorum:
            continue

        # Active partitions are built from the very objects stored in
        # online_pod_info, so membership is checked by identity. Raised
        # explicitly (not via `assert`) so the check survives `python -O`.
        for part in active_partitions:
            if any(member is p for member in part):
                raise AssertionError(
                    f"Inconsistent group view, {p} not expected to be in {part}")

        blocked = {all_pods[peer] for peer, state in p.peers.items()
                   if state not in absent_states}
        # Several members of one blocked partition report the same view;
        # record each distinct partition only once.
        if blocked not in blocked_partitions:
            blocked_partitions.append(blocked)

    # sort by partition size
    blocked_partitions.sort(key=len, reverse=True)

    return active_partitions, blocked_partitions