def repair_cluster()

in mysqloperator/controller/innodbcluster/cluster_controller.py [0:0]


    def repair_cluster(self, pod: MySQLPod, diagnostic: diagnose.ClusterStatus, logger: Logger) -> None:
        """Try to restore the cluster to an ONLINE state from a diagnostic.

        The action taken depends solely on ``diagnostic.status``:

        - ONLINE, ONLINE_PARTIAL, ONLINE_UNCERTAIN, FINALIZING: return
          without doing anything.
        - OFFLINE: when ``diagnostic.gtid_executed`` holds a non-None entry
          for as many pods as ``self.cluster.get_pods()`` returns, call
          ``self.reboot_cluster`` (through a ``shellutils.RetryLoop``) on the
          pod picked by ``select_pod_with_most_gtids``; otherwise raise a
          temporary error.
        - NO_QUORUM: call ``self.force_quorum`` (through a
          ``shellutils.RetryLoop``) on ``diagnostic.quorum_candidates[0]``.
        - OFFLINE_UNCERTAIN, NO_QUORUM_UNCERTAIN, UNKNOWN: raise a temporary
          error.
        - SPLIT_BRAIN, SPLIT_BRAIN_UNCERTAIN, INVALID: post an error event on
          the cluster and raise a permanent error.
        - Any other value: raise a permanent error.

        Args:
            pod: Not used by this method.
            diagnostic: Diagnostic result; ``status``, ``gtid_executed`` and
                ``quorum_candidates`` are read from it.
            logger: Logger handed to the retry loops and to the repair
                helpers, and used for debug output.

        Raises:
            kopf.TemporaryError: The cluster cannot be repaired right now
                (unreachable members); per the inline notes the intent is to
                try again later.
            kopf.PermanentError: The state is considered unrecoverable
                without user action, or the status value is not recognised.
        """
        # TODO check statuses where router has to be put down

        status = diagnostic.status
        states = diagnose.ClusterDiagStatus

        # States that need no repair here:
        #  - ONLINE / FINALIZING: nothing to do
        #  - ONLINE_PARTIAL: rejoins are handled on pod events
        #  - ONLINE_UNCERTAIN: TODO maybe delete unreachable pods if enabled?
        if status in (states.ONLINE, states.ONLINE_PARTIAL,
                      states.ONLINE_UNCERTAIN, states.FINALIZING):
            return

        if status == states.OFFLINE:
            # Reboot cluster only if all pods are reachable, i.e. a GTID set
            # was obtained for every pod of the cluster.
            reachable = sum(
                1 for gtids in diagnostic.gtid_executed.values()
                if gtids is not None)
            if reachable != len(self.cluster.get_pods()):
                logger.debug(
                    "Cannot reboot cluster because not all pods are reachable")
                raise kopf.TemporaryError(
                    "Cluster cannot be restored because there are unreachable pods", delay=5)

            seed_pod = select_pod_with_most_gtids(diagnostic.gtid_executed)

            self.cluster.info(action="RestoreCluster", reason="Rebooting",
                              message=f"Restoring OFFLINE cluster through pod {seed_pod}")

            shellutils.RetryLoop(logger).call(
                self.reboot_cluster, seed_pod, logger)

        elif status == states.OFFLINE_UNCERTAIN:
            # TODO delete unconnectable pods after timeout, if enabled
            raise kopf.TemporaryError(
                f"Unreachable members found while in state {status}, waiting...")

        elif status == states.NO_QUORUM:
            # Restore cluster
            self.cluster.info(action="RestoreCluster", reason="RestoreQuorum",
                              message="Restoring quorum of cluster")

            shellutils.RetryLoop(logger).call(
                self.force_quorum, diagnostic.quorum_candidates[0], logger)

        elif status == states.NO_QUORUM_UNCERTAIN:
            # TODO delete unconnectable pods after timeout, if enabled
            raise kopf.TemporaryError(
                f"Unreachable members found while in state {status}, waiting...")

        elif status == states.SPLIT_BRAIN:
            self.cluster.error(action="UnrecoverableState", reason="SplitBrain",
                               message="Cluster is in a SPLIT-BRAIN state and cannot be restored automatically.")

            # TODO check if recoverable case
            # Fatal error, user intervention required
            raise kopf.PermanentError(
                f"Unable to recover from current cluster state. User action required. state={status}")

        elif status == states.SPLIT_BRAIN_UNCERTAIN:
            # TODO check if recoverable case and if NOT, then throw a permanent error
            # TODO delete unconnectable pods after timeout, if enabled (this
            # would presumably become a kopf.TemporaryError while waiting; the
            # previous implementation had such a raise here, but it was
            # unreachable behind the PermanentError below).
            self.cluster.error(action="UnrecoverableState", reason="SplitBrain",
                               message="Cluster is in state SPLIT-BRAIN with unreachable instances and cannot be restored automatically.")

            raise kopf.PermanentError(
                f"Unable to recover from current cluster state. User action required. state={status}")

        elif status == states.UNKNOWN:
            # Nothing to do, but we can try again later and hope something comes back
            raise kopf.TemporaryError(
                f"No members of the cluster could be reached. state={status}")

        elif status == states.INVALID:
            self.cluster.error(action="UnrecoverableState", reason="Invalid",
                               message="Cluster state is invalid and cannot be restored automatically.")

            raise kopf.PermanentError(
                f"Unable to recover from current cluster state. User action required. state={status}")

        else:
            raise kopf.PermanentError(
                f"Invalid cluster state {status}")