def on_innodbcluster_delete()

in mysqloperator/controller/innodbcluster/operator_cluster.py [0:0]


def on_innodbcluster_delete(name: str, namespace: str, body: Body,
                            logger: Logger, **kwargs):
    cluster = InnoDBCluster(body)

    logger.info(f"Deleting cluster {name}")

    g_group_monitor.remove_cluster(cluster)

    # Notify the routers of the deletion. It would be too late to do it in on_router_pod_delete().
    # Do this first, before scaling the STS to 0, which kills the servers, after which the
    # metadata can no longer be updated. Only then scale down the cluster. Scaling the Router
    # Deployment to 0 would have the same effect, but it happens asynchronously, and we need it
    # to be finished before scaling the STS to 0 / removing the cluster from the clusterset.
    # So, practically, we duplicate on_router_pod_delete() here.
    routers = cluster.get_routers()
    if routers:
        logger.info(f"Time to notify router(s) {routers} for IC deletion")
        controller = ClusterController(cluster)
        try:
            controller.on_router_pod_delete(routers, logger)
        except Exception as exc:
            # Ignore errors, there isn't much we could do
            # and there is no point in retrying forever
            logger.warning(f"on_innodbcluster_delete: Failed to remove metadata for {routers}: {exc}")
            #print(traceback.format_exc())
            logger.warning("on_innodbcluster_delete: Exception ignored, there might be stale metadata left")

    # Scale down the cluster to 0
    sts = cluster.get_stateful_set()
    if sts:
        pods = cluster.get_pods()
        # First we need to check whether there is only one pod left and whether it is being deleted.
        # If it is being deleted, on_pod_delete() won't be called when we scale down the STS to 0.
        # In that case the code that removes the cluster finalizer won't be called either and the
        # cluster finalizer will be left hanging.
        # If we checked after scaling down to 0 and there is only one pod, it would be moved to the
        # Terminating state and we wouldn't know whether it was Terminating beforehand. If it wasn't,
        # on_pod_delete() will be called and we will try to remove the finalizer again.
        # Should maxUnavailable ever be handled, then len(pods) == maxUnavailable and all pods
        # should be inspected as to whether they are terminating.
        if len(pods) == 1 and pods[0].deleting:
            # if there is only one pod and it is already being deleted, then on_pod_delete() won't be called;
            # in that case the IC finalizer won't be removed and the IC will hang
            logger.info("on_innodbcluster_delete: The cluster's only one pod is already deleting. Removing cluster finalizer here")
            cluster.remove_cluster_finalizer()

        if len(pods):
            # TODO: this should be moved to controller or elsewhere
            # TODO: try more pods, if one fails and more are available
            # TODO: if this is the PRIMARY we have to do something ... maybe force a failover?
            # TODO: this shouldn't block decommissioning (catch and log/ignore errors)
            # TODO: remove admin/backup/metrics/router/... accounts as far as they are replicated to the primary
            with shellutils.DbaWrap(shellutils.connect_to_pod_dba(pods[0], logger)) as dba:
                try:
                    cluster_status = dba.get_cluster().status({"extended": 1})
                    if "clusterRole" in cluster_status:
                        logger.info("9.3.0+ cluster, ClusterSet enabled")
                        my_name = dba.get_cluster().name
                        cs = dba.get_cluster_set()
                        cs_status = cs.status(extended=1)
                        logger.info(f"CSet={json.dumps(cs_status, indent=4)}")
                        if cs_status["clusters"][my_name]["clusterRole"] == "PRIMARY" and len(cs_status["clusters"]) > 1:
                            #raise kopf.TemporaryError(f"Cluster {my_name} is PRIMARY. Can not remove, trigger a failover first!")
                            # Check if all REPLICAS are still there, if not there / stale, remove them
                            invalidated = 0
                            ok = {}
                            for cluster_name, cluster_data in cs_status["clusters"].items():
                                if cluster_data["clusterRole"] == "REPLICA":
                                    if cluster_data["globalStatus"] == "INVALIDATED" and cluster_data["status"] == "UNREACHABLE":
                                        invalidated = invalidated + 1
                                    else:
                                        # we could also raise here directly on the first occurrence, but let's collect some data for the exception message
                                        ok[cluster_name] = cluster_data

                            # Without the primary
                            if (len(cs_status["clusters"]) - 1) != invalidated:
                                raise kopf.TemporaryError(f"Cluster {my_name} is PRIMARY. Can not remove, trigger a failover first! The following replicas seem to be ok {json.dumps(ok, indent=4)}")
                                # else this is the only cluster in the clusterset and we are fine

                            for cluster_name in cs_status["clusters"].keys():
                                if cluster_name == my_name:
                                    continue
                                logger.info(f"Removing INVALIDATED and UNREACHABLE cluster {cluster_name} from the clusterset")
                                cs.remove_cluster(cluster_name, {"force": True})

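                        # Finally, unregister this cluster itself from the ClusterSet
                        # (for a REPLICA this is the only removal step needed)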
                        cs.remove_cluster(my_name)
                    else:
                        logger.info("pre 9.3.0 cluster, not ClusterSet enabled")

                except mysqlsh.Error as exc:
                    # For whatever reason we fail: this shouldn't stop us from
                    # decommissioning our pods, even if they weren't unregistered.
                    # TODO: maybe the only reason might be if this were the
                    #       primary cluster, while other clusters exist ...
                    #       but a) that is a user error and b) there shouldn't
                    #       be an exception .. but let's keep an eye on it
                    logger.error(f"Error while trying to check ClusterSet status for unregistering: {exc}")

        logger.info(f"Updating InnoDB Cluster StatefulSet.instances to 0")
        cluster_objects.update_stateful_set_spec(sts, {"spec": {"replicas": 0}})

    # Scale down routers to 0
    logger.info(f"Updating Router Deployment.replicas to 0")
    router_objects.update_size(cluster, 0, False, logger)
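
How this handler gets registered is not shown in this excerpt. As a rough orientation only, kopf deletion handlers are hooked up with the @kopf.on.delete decorator; the sketch below assumes the InnoDBCluster CRD's group/version/plural ("mysql.oracle.com", "v2", "innodbclusters"), which are assumptions rather than values taken from this file.

import kopf
from kopf import Body
from logging import Logger

# Hypothetical registration sketch -- the real decorator arguments live
# elsewhere in operator_cluster.py and may differ.
@kopf.on.delete("mysql.oracle.com", "v2", "innodbclusters")
def on_innodbcluster_delete(name: str, namespace: str, body: Body,
                            logger: Logger, **kwargs):
    ...  # handler body as listed above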
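
The PRIMARY branch above walks the dictionary returned by cs.status(extended=1). The snippet below is illustrative only: the key names ("clusters", "clusterRole", "globalStatus", "status") are the ones the handler reads, while the cluster names and values are invented for the example.

# A trimmed, made-up example of the status shape the handler inspects.
sample_cs_status = {
    "clusters": {
        "mycluster": {"clusterRole": "PRIMARY", "globalStatus": "OK"},
        "replica-a": {"clusterRole": "REPLICA", "globalStatus": "INVALIDATED",
                      "status": "UNREACHABLE"},
    }
}

def all_replicas_invalidated(cs_status: dict) -> bool:
    # Mirrors the check in on_innodbcluster_delete(): a PRIMARY may only be
    # decommissioned once every REPLICA is both INVALIDATED and UNREACHABLE.
    replicas = [c for c in cs_status["clusters"].values()
                if c["clusterRole"] == "REPLICA"]
    return all(c["globalStatus"] == "INVALIDATED" and c.get("status") == "UNREACHABLE"
               for c in replicas)

assert all_replicas_invalidated(sample_cs_status)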
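
cluster_objects.update_stateful_set_spec() and router_objects.update_size() are project helpers whose implementations are not part of this excerpt. Assuming they end up issuing a patch through the official Kubernetes Python client, the STS scale-down would look roughly like this hypothetical sketch:

from kubernetes import client

# Hypothetical equivalent of update_stateful_set_spec(sts, {"spec": {"replicas": 0}}),
# assuming the operator patches the StatefulSet via the official Kubernetes client.
def scale_statefulset_to_zero(name: str, namespace: str) -> None:
    apps = client.AppsV1Api()
    apps.patch_namespaced_stateful_set(name, namespace, {"spec": {"replicas": 0}})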