in mysqloperator/controller/diagnose.py [0:0]
def do_diagnose_cluster(cluster: InnoDBCluster, logger) -> ClusterStatus:
    """Diagnose every pod of `cluster` and derive the overall cluster status.

    Each pod returned by ``cluster.get_pods()`` is diagnosed with
    ``diagnose_instance`` and sorted into online / offline / unsure buckets
    (pods reported as NOT_MANAGED are left out of the member set). When at
    least one pod is online, ``find_group_partitions`` decides between
    no-quorum, online and split-brain; otherwise the result is FINALIZING
    (cluster is deleting), OFFLINE(_UNCERTAIN) or UNKNOWN.

    Args:
        cluster: the cluster to diagnose. It is reloaded first unless it is
            being deleted.
        logger: receives the accumulated diagnostic trace at info level for
            statuses that need attention, and the final status at debug level.

    Returns:
        A ClusterStatus. If the cluster has no create time and is not being
        deleted, only ``status`` (INITIALIZING) is set. Otherwise ``status``,
        ``gtid_executed`` (keyed by pod index) and ``type`` are set, plus
        ``online_members``, ``primary`` or ``quorum_candidates`` depending on
        the outcome.

    Raises:
        AssertionError: if ``diagnose_instance`` reports a status this
            function does not know how to classify, or if the pod buckets
            end up inconsistent.
    """
    if not cluster.deleting:
        cluster.reload()

    all_pods = set(cluster.get_pods())

    last_known_quorum = cluster.get_last_known_quorum()
    # TODO last known quorum tracking

    # The trace is accumulated here and only emitted at the end, and only for
    # statuses that need attention.
    log_msg = f"Diagnosing cluster {cluster.name} deleting={cluster.deleting} last_known_quorum={last_known_quorum}..."

    # Check if the cluster has already been initialized
    create_time = cluster.get_create_time()
    log_msg += f"create_time={create_time} deleting={cluster.deleting}"
    if not create_time and not cluster.deleting:
        cluster_status = ClusterStatus()
        cluster_status.status = ClusterDiagStatus.INITIALIZING
        # NOTE(review): the trace is not logged on this early-return path.
        log_msg += f"\nCluster {cluster.name} status={cluster_status.status}"
        return cluster_status

    all_member_pods = set()
    online_pods = set()
    offline_pods = set()
    unsure_pods = set()
    gtid_executed = {}
    # Keyed by endpoint: this is the shape passed to find_group_partitions().
    online_pod_statuses = {}
    # The same diagnoses keyed by pod, so the primary's own diagnosis can be
    # looked up once the primary is known.
    online_status_by_pod = {}
    # Diagnosis of the pod visited last; None when the cluster has no pods.
    last_status = None

    for pod in all_pods:
        # Diagnose the instance even if deleting - so we can remove it from
        # the cluster and later re-add it
        status = diagnose_instance(pod, logger)
        last_status = status
        log_msg += f"\ndiag instance {pod} --> {status.status} quorum={status.in_quorum} gtid_executed={status.gtid_executed}"
        gtid_executed[pod.index] = status.gtid_executed

        if status.status == InstanceDiagStatus.UNKNOWN:
            unsure_pods.add(pod)
            all_member_pods.add(pod)
        elif status.status in (InstanceDiagStatus.OFFLINE,
                               InstanceDiagStatus.ERROR,
                               InstanceDiagStatus.UNMANAGED):
            offline_pods.add(pod)
            all_member_pods.add(pod)
        elif status.status in (InstanceDiagStatus.ONLINE,
                               InstanceDiagStatus.RECOVERING):
            online_pod_statuses[pod.endpoint] = status
            online_status_by_pod[pod] = status
            online_pods.add(pod)
            all_member_pods.add(pod)
        elif status.status == InstanceDiagStatus.NOT_MANAGED:
            # Not counted as a member of the cluster.
            pass
        else:
            all_member_pods.add(pod)
            logger.error(f"Internal error processing pod {pod}")
            assert False, f"unhandled instance status {status.status} for pod {pod}"

    log_msg += f"\n{cluster.name}: all={all_pods} members={all_member_pods} online={online_pods} offline={offline_pods} unsure={unsure_pods}"
    # Every member pod must have landed in exactly the three buckets above.
    assert online_pods.union(offline_pods, unsure_pods) == all_member_pods

    cluster_status = ClusterStatus()
    cluster_status.gtid_executed = gtid_executed

    primary_pod = None
    if online_pods:
        active_partitions, blocked_partitions = find_group_partitions(
            online_pod_statuses, all_member_pods, logger)
        log_msg += f"\nactive_partitions={active_partitions} blocked_partitions={blocked_partitions}"

        if not active_partitions:
            # no quorum
            if unsure_pods:
                cluster_status.status = ClusterDiagStatus.NO_QUORUM_UNCERTAIN
            else:
                cluster_status.status = ClusterDiagStatus.NO_QUORUM
            if blocked_partitions:
                cluster_status.quorum_candidates = list(blocked_partitions[0])
        elif len(active_partitions) == 1:
            # ok
            if unsure_pods:
                cluster_status.status = ClusterDiagStatus.ONLINE_UNCERTAIN
            elif offline_pods:
                cluster_status.status = ClusterDiagStatus.ONLINE_PARTIAL
            else:
                cluster_status.status = ClusterDiagStatus.ONLINE
            the_partition = active_partitions[0]
            cluster_status.online_members = [
                member.pod for member in the_partition if member.pod]
            for member in the_partition:
                if member.is_primary:
                    primary_pod = member.pod
                    cluster_status.primary = primary_pod
                    break
        else:
            # split-brain
            if unsure_pods:
                cluster_status.status = ClusterDiagStatus.SPLIT_BRAIN_UNCERTAIN
            else:
                cluster_status.status = ClusterDiagStatus.SPLIT_BRAIN
            cluster_status.online_members = [
                member.pod
                for part in active_partitions
                for member in part
                if member.pod]
    elif cluster.deleting:
        cluster_status.status = ClusterDiagStatus.FINALIZING
    elif offline_pods:
        if unsure_pods:
            cluster_status.status = ClusterDiagStatus.OFFLINE_UNCERTAIN
        else:
            cluster_status.status = ClusterDiagStatus.OFFLINE
    else:
        cluster_status.status = ClusterDiagStatus.UNKNOWN

    if cluster_status.status in (ClusterDiagStatus.UNKNOWN,
                                 ClusterDiagStatus.OFFLINE,
                                 ClusterDiagStatus.OFFLINE_UNCERTAIN,
                                 ClusterDiagStatus.SPLIT_BRAIN,
                                 ClusterDiagStatus.SPLIT_BRAIN_UNCERTAIN,
                                 ClusterDiagStatus.ONLINE_UNCERTAIN,
                                 ClusterDiagStatus.NO_QUORUM,
                                 ClusterDiagStatus.NO_QUORUM_UNCERTAIN):
        logger.info(log_msg)
        cluster_status.type = ClusterInClusterSetType.UNKNOWN
    else:
        # Read the ClusterSet role from a pod that is actually serving in the
        # group: the primary when it is known, else any online/recovering
        # member. Relying on the loop variable instead would pick whichever
        # pod the set iteration happened to visit last (possibly an offline
        # or not-managed one) and would raise UnboundLocalError for a
        # deleting cluster that has no pods left.
        type_source = None
        if primary_pod is not None:
            type_source = online_status_by_pod.get(primary_pod)
        if type_source is None and online_status_by_pod:
            type_source = next(iter(online_status_by_pod.values()))
        if type_source is None:
            # No online member (e.g. FINALIZING): fall back to the last
            # diagnosed pod, if there was one.
            type_source = last_status

        if type_source is None:
            cluster_status.type = ClusterInClusterSetType.UNKNOWN
        else:
            cluster_status.type = type_source.cluster_in_cluster_set_type

    logger.debug(f"Cluster {cluster.name} status={cluster_status.status}")

    return cluster_status