in mysqloperator/controller/innodbcluster/operator_cluster.py [0:0]
def on_innodbcluster_delete(name: str, namespace: str, body: Body,
                            logger: Logger, **kwargs):
    cluster = InnoDBCluster(body)
    logger.info(f"Deleting cluster {name}")
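    # Stop the group monitor from watching this cluster; it is going away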
    g_group_monitor.remove_cluster(cluster)
    # Notify the routers about the deletion. It would be too late in on_router_pod_delete().
    # This has to happen before scaling the STS down to 0, which kills the servers, after which
    # the metadata can no longer be updated. Only then scale down the cluster itself.
    # Scaling the Router Deployment down to 0 would have the same effect, but that is asynchronous,
    # and we need it to have finished before scaling the STS down to 0 / removing the cluster
    # from the ClusterSet. So, practically, we duplicate on_router_pod_delete() here.
    routers = cluster.get_routers()
    if routers:
        logger.info(f"Time to notify router(s) {routers} for IC deletion")
        controller = ClusterController(cluster)
        try:
            controller.on_router_pod_delete(routers, logger)
        except Exception as exc:
            # Ignore errors, there isn't much we could do
            # and there is no point in retrying forever
            logger.warning(f"on_innodbcluster_delete: Failed to remove metadata for {routers}: {exc}")
            #print(traceback.format_exc())
            logger.warning("on_innodbcluster_delete: Exception ignored, there might be stale metadata left")
    # Scale down the cluster to 0
    sts = cluster.get_stateful_set()
    if sts:
        pods = cluster.get_pods()
        # First we need to check whether there is only one pod left and whether it is already being deleted.
        # If it is being deleted, on_pod_delete() won't be called when we scale the STS down to 0,
        # so the code that removes the cluster finalizer won't run either and the finalizer will
        # be left hanging.
        # If we checked after scaling down to 0 instead, the single remaining pod would already be in
        # Terminating state and we couldn't tell whether it had been Terminating beforehand. If it hadn't,
        # on_pod_delete() would be called and would try to remove the finalizer again;
        # then len(pods) == maxUnavailable and all pods should be inspected whether they are terminating
        if len(pods) == 1 and pods[0].deleting:
            # If there is only one pod and it is already deleting, on_pod_delete() won't be called.
            # In that case the IC finalizer would never be removed and the IC would hang.
            logger.info("on_innodbcluster_delete: The cluster's only pod is already being deleted. Removing the cluster finalizer here")
            cluster.remove_cluster_finalizer()

        if len(pods):
            # TODO: this should be moved to the controller or elsewhere
            # TODO: try more pods if one fails and more are available
            # TODO: if this is the PRIMARY we have to do something ... maybe force a failover?
            # TODO: this shouldn't block decommissioning (catch and log/ignore errors)
            # TODO: remove admin/backup/metrics/router/... accounts as far as they are replicated to the primary
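            # Connect to one of the remaining pods with MySQL Shell so the cluster can be
            # deregistered from its ClusterSet (if it belongs to one) before the servers go away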
            with shellutils.DbaWrap(shellutils.connect_to_pod_dba(pods[0], logger)) as dba:
                try:
                    cluster_status = dba.get_cluster().status({"extended": 1})
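                    # 'clusterRole' shows up in the status only when the cluster is part of a ClusterSet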
                    if "clusterRole" in cluster_status:
                        logger.info("9.3.0+ cluster, ClusterSet enabled")
                        my_name = dba.get_cluster().name
                        cs = dba.get_cluster_set()
                        cs_status = cs.status(extended=1)
                        logger.info(f"CSet={json.dumps(cs_status, indent=4)}")
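                        # A PRIMARY may only be deregistered here if it is effectively the last cluster left:
                        # every REPLICA must be INVALIDATED and UNREACHABLE, otherwise a failover is required first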
                        if cs_status["clusters"][my_name]["clusterRole"] == "PRIMARY" and len(cs_status["clusters"]) > 1:
                            #raise kopf.TemporaryError(f"Cluster {my_name} is PRIMARY. Can not remove, trigger a failover first!")
                            # Check whether all REPLICAs are still there; if they are gone / stale, remove them
                            invalidated = 0
                            ok = {}
                            for cluster_name, cluster_data in cs_status["clusters"].items():
                                if cluster_data["clusterRole"] == "REPLICA":
                                    if cluster_data["globalStatus"] == "INVALIDATED" and cluster_data["status"] == "UNREACHABLE":
                                        invalidated = invalidated + 1
                                    else:
                                        # we could also throw here directly on the first occurrence, but let's collect some data for the exception message
                                        ok[cluster_name] = cluster_data
                            # Without the primary
                            if (len(cs_status["clusters"]) - 1) != invalidated:
                                raise kopf.TemporaryError(f"Cluster {my_name} is PRIMARY. Can not remove, trigger a failover first! The following replicas seem to be ok {json.dumps(ok, indent=4)}")
                            # else this is the only functional cluster in the clusterset and we are fine
                            for cluster_name in cs_status["clusters"].keys():
                                if cluster_name == my_name:
                                    # the primary (this cluster) is removed below, after the stale replicas
                                    continue
                                logger.info(f"Removing INVALIDATED and UNREACHABLE cluster {cluster_name} from the ClusterSet")
                                cs.remove_cluster(cluster_name, {"force": True})
                        cs.remove_cluster(my_name)
                    else:
                        logger.info("pre 9.3.0 cluster, not ClusterSet enabled")
                except mysqlsh.Error as exc:
                    # For whatever reason we fail: this shouldn't stop us from
                    # decommissioning our pods, even if the cluster stays registered.
                    # TODO: maybe the only reason might be if this were the
                    # primary cluster, while other clusters exist ...
                    # but a) that is a user error and b) there shouldn't
                    # be an exception ... but let's keep an eye on it
                    logger.error(f"Error while trying to check ClusterSet status for unregistering: {exc}")
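
        # Scaling the STS down to 0 terminates the server pods; their on_pod_delete() handlers do the
        # per-pod cleanup and, for the last pod, remove the cluster finalizer (unless it was already
        # removed above because the last pod was already terminating)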
        logger.info("Updating InnoDB Cluster StatefulSet.instances to 0")
        cluster_objects.update_stateful_set_spec(sts, {"spec": {"replicas": 0}})

    # Scale down routers to 0
    logger.info("Updating Router Deployment.replicas to 0")
    router_objects.update_size(cluster, 0, False, logger)