in mysqloperator/controller/innodbcluster/cluster_controller.py [0:0]
def repair_cluster(self, pod: MySQLPod, diagnostic: diagnose.ClusterStatus, logger: Logger) -> None:
    """Restore the cluster to an ONLINE state."""
    # TODO check statuses where the router has to be put down
    if diagnostic.status == diagnose.ClusterDiagStatus.ONLINE:
        # Nothing to do
        return
    elif diagnostic.status == diagnose.ClusterDiagStatus.ONLINE_PARTIAL:
        # Nothing to do, rejoins are handled on pod events
        return
    elif diagnostic.status == diagnose.ClusterDiagStatus.ONLINE_UNCERTAIN:
        # Nothing to do
        # TODO maybe delete unreachable pods if enabled?
        return
    elif diagnostic.status == diagnose.ClusterDiagStatus.OFFLINE:
        # Reboot the cluster if all pods are reachable; a None entry in
        # gtid_executed means that pod could not be queried
        if len([g for g in diagnostic.gtid_executed.values() if g is not None]) == len(self.cluster.get_pods()):
            # Use the pod with the most executed transactions as the reboot
            # seed (see the illustrative sketch after this method)
            seed_pod = select_pod_with_most_gtids(diagnostic.gtid_executed)
            self.cluster.info(action="RestoreCluster", reason="Rebooting",
                              message=f"Restoring OFFLINE cluster through pod {seed_pod}")
            shellutils.RetryLoop(logger).call(self.reboot_cluster, seed_pod, logger)
        else:
            logger.debug("Cannot reboot cluster because not all pods are reachable")
            raise kopf.TemporaryError(
                "Cluster cannot be restored because there are unreachable pods", delay=5)
    elif diagnostic.status == diagnose.ClusterDiagStatus.OFFLINE_UNCERTAIN:
        # TODO delete unconnectable pods after timeout, if enabled
        raise kopf.TemporaryError(
            f"Unreachable members found while in state {diagnostic.status}, waiting...")
    elif diagnostic.status == diagnose.ClusterDiagStatus.NO_QUORUM:
        # Restore quorum through the first reachable candidate
        # (see the AdminAPI sketch after this method)
        self.cluster.info(action="RestoreCluster", reason="RestoreQuorum",
                          message="Restoring quorum of cluster")
        shellutils.RetryLoop(logger).call(
            self.force_quorum, diagnostic.quorum_candidates[0], logger)
    elif diagnostic.status == diagnose.ClusterDiagStatus.NO_QUORUM_UNCERTAIN:
        # Quorum cannot be restored yet, wait for the unreachable members first
        # TODO delete unconnectable pods after timeout, if enabled
        raise kopf.TemporaryError(
            f"Unreachable members found while in state {diagnostic.status}, waiting...")
    elif diagnostic.status == diagnose.ClusterDiagStatus.SPLIT_BRAIN:
        self.cluster.error(action="UnrecoverableState", reason="SplitBrain",
                           message="Cluster is in a SPLIT-BRAIN state and cannot be restored automatically.")
        # TODO check if this is a recoverable case
        # Fatal error, user intervention required
        raise kopf.PermanentError(
            f"Unable to recover from current cluster state. User action required. state={diagnostic.status}")
    elif diagnostic.status == diagnose.ClusterDiagStatus.SPLIT_BRAIN_UNCERTAIN:
        # TODO check whether this is a recoverable case; if it is, wait for the
        # unreachable members (kopf.TemporaryError) and delete unconnectable
        # pods after a timeout, if enabled, instead of failing permanently
        self.cluster.error(action="UnrecoverableState", reason="SplitBrain",
                           message="Cluster is in state SPLIT-BRAIN with unreachable instances and cannot be restored automatically.")
        raise kopf.PermanentError(
            f"Unable to recover from current cluster state. User action required. state={diagnostic.status}")
    elif diagnostic.status == diagnose.ClusterDiagStatus.UNKNOWN:
        # Nothing to do, but we can try again later and hope something comes back
        raise kopf.TemporaryError(
            f"No members of the cluster could be reached. state={diagnostic.status}")
    elif diagnostic.status == diagnose.ClusterDiagStatus.INVALID:
        self.cluster.error(action="UnrecoverableState", reason="Invalid",
                           message="Cluster state is invalid and cannot be restored automatically.")
        raise kopf.PermanentError(
            f"Unable to recover from current cluster state. User action required. state={diagnostic.status}")
    elif diagnostic.status == diagnose.ClusterDiagStatus.FINALIZING:
        # Nothing to do
        return
    else:
        # Diagnostic status not handled above, treat it as a programming error
        raise kopf.PermanentError(
            f"Invalid cluster state {diagnostic.status}")