in mysqloperator/controller/diagnose.py [0:0]
def diagnose_instance(pod: MySQLPod, logger, dba: 'Dba' = None) -> InstanceStatus:
    """
    Check state of an instance in the given pod.

    State is checked in isolation. That is, using its own view of the group
    and its own local copy of the metadata (if there is one). Thus, it can be
    incorrect, if for example, the pod was deleted and didn't have its copy of
    the metadata updated or if there's a split-brain.

    Args:
        pod: Pod hosting the instance to diagnose.
        logger: Logger that receives the diagnostic messages.
        dba: Optional Dba handle for the instance. When omitted (or falsy) a
            new one is opened with ``mysqlsh.connect_dba(pod.endpoint_co)``.

    Returns:
        An InstanceStatus whose ``pod`` is always set. Depending on how far
        the diagnosis gets, ``connect_error``, ``gtid_executed``,
        ``cluster_in_cluster_set_type``, ``view_id``, ``in_quorum``,
        ``is_primary``, ``peers`` and ``status`` are filled in as well.

    Raises:
        mysqlsh.Error: Re-raised when ``shellutils.check_fatal_connect()`` or
            ``shellutils.check_fatal()`` reports the error as fatal.
        Exception: If ``pod.instance_type`` is neither ``"group-member"`` nor
            ``"read-replica"``, or if the pod cannot be found in the
            ``cluster.status()`` output.
        AssertionError: If the instance reports a member state this function
            does not know about.
    """
    status = InstanceStatus()
    status.pod = pod

    if not dba:
        try:
            dba = mysqlsh.connect_dba(pod.endpoint_co)
        except mysqlsh.Error as e:
            logger.info(f"Could not connect to {pod.endpoint}: error={e}")
            status.connect_error = e.code

            if mysql.ErrorCode.CR_MAX_ERROR >= e.code >= mysql.ErrorCode.CR_MIN_ERROR:
                # client side errors mean we can't connect to the server, but the
                # problem could be in the client or network and not the server

                # Check status of the pod
                pod.reload()
                logger.debug(f"{pod.endpoint}: pod.phase={pod.phase} deleting={pod.deleting}")
                if pod.phase != "Running" or not pod.check_containers_ready() or pod.deleting:
                    # not ONLINE for sure if the Pod is not running
                    status.status = InstanceDiagStatus.OFFLINE
            else:
                if shellutils.check_fatal_connect(e, pod.endpoint_url_safe, logger):
                    raise

            # Without a connection nothing else can be inspected.
            return status

    cluster = None
    if dba:
        status.gtid_executed = dba.session.run_sql("select @@gtid_executed").fetch_one()[0]

        try:
            # TODO: we want to check from individual Pod's/Server's perspective
            # it will now check based from primary N times
            cluster = dba.get_cluster()
        except mysqlsh.Error as e:
            logger.info(f"get_cluster() error for {pod.endpoint}: error={e}")

            # TODO check for invalid metadata errors

            # Note: get_cluster() on a member that was previously removed
            # can fail as OFFLINE instead of NOT_MANAGED if its copy of the
            # metadata lacks the trx where it was removed
            if e.code == errors.SHERR_DBA_BADARG_INSTANCE_NOT_ONLINE:
                status.status = InstanceDiagStatus.OFFLINE
            elif e.code in (errors.SHERR_DBA_BADARG_INSTANCE_NOT_MANAGED, errors.SHERR_DBA_METADATA_NOT_FOUND):
                status.status = InstanceDiagStatus.NOT_MANAGED
            else:
                if shellutils.check_fatal(
                        e, pod.endpoint_url_safe, "get_cluster()", logger):
                    raise
                status.status = InstanceDiagStatus.UNKNOWN
        except RuntimeError as e:
            # Membership test on purpose: str.find() returns -1 (truthy) when
            # the text is absent and 0 (falsy) when it is at the very start,
            # so its result must not be used as a boolean.
            if "unmanaged replication group" in str(e):
                status.status = InstanceDiagStatus.UNMANAGED
            else:
                logger.info(f"diagnose_instance: 2 Runtime Error [{e}]")
                status.status = InstanceDiagStatus.UNKNOWN

    if cluster:
        try:
            mstatus = cluster.status({"extended": 1})

            # A cluster that does not report a clusterRole is treated as
            # PRIMARY; otherwise the reported role decides.
            status.cluster_in_cluster_set_type = ClusterInClusterSetType.PRIMARY
            if "clusterRole" in mstatus:
                logger.info("9.3.0+ cluster, ClusterSet enabled")
                cluster_role = mstatus["clusterRole"]
                if cluster_role == "PRIMARY":
                    status.cluster_in_cluster_set_type = ClusterInClusterSetType.PRIMARY
                elif cluster_role == "REPLICA":
                    status.cluster_in_cluster_set_type = ClusterInClusterSetType.REPLICA
                else:
                    status.cluster_in_cluster_set_type = ClusterInClusterSetType.UNKNOWN
            else:
                logger.info("pre 9.3.0 cluster, not ClusterSet enabled")

            replica_set = mstatus["defaultReplicaSet"]
            cluster_status = replica_set["status"]
            status.view_id = replica_set["groupViewId"]
            topology = replica_set["topology"]

            if cluster_status.startswith("OK"):
                status.in_quorum = True
            else:
                topology_summary = ";".join(
                    [f'{m},{i["status"]}' for m, i in topology.items()])
                logger.info(
                    f"No quorum visible from {pod.endpoint}: status={cluster_status} topology={topology_summary}")
                status.in_quorum = False

            members = {}
            mystate = None
            for member, info in topology.items():
                if pod.instance_type == "group-member":
                    state = info["status"]
                    members[member] = state
                    if member == pod.endpoint:
                        mystate = state
                        # memberRole is only consulted for an ONLINE member
                        if state == "ONLINE":
                            status.is_primary = info["memberRole"] == "PRIMARY"
                elif pod.instance_type == "read-replica":
                    # Read replicas are listed under the group member they
                    # are attached to, not as topology entries of their own.
                    if "readReplicas" in info:
                        for rr_member, rr_info in info["readReplicas"].items():
                            if rr_member == pod.endpoint:
                                mystate = rr_info["status"]
                else:
                    raise Exception(f"Unknown instance type for {pod.name}: {pod.instance_type}")

            if not mystate:
                # TODO
                raise Exception(
                    f"Could not find {pod} in local cluster.status() output")

            status.peers = members

            diag_status_by_state = {
                "ONLINE": InstanceDiagStatus.ONLINE,
                "RECOVERING": InstanceDiagStatus.RECOVERING,
                "ERROR": InstanceDiagStatus.ERROR,
                "OFFLINE": InstanceDiagStatus.OFFLINE,
                "UNREACHABLE": InstanceDiagStatus.UNREACHABLE,
            }
            if mystate not in diag_status_by_state:
                logger.error(f"{pod.endpoint}: bad state {mystate}")
                # Raised explicitly rather than via `assert`, which is removed
                # under `python -O` and would let an unset status be returned.
                raise AssertionError(f"{pod.endpoint}: bad state {mystate}")
            status.status = diag_status_by_state[mystate]
        except mysqlsh.Error as e:
            if shellutils.check_fatal(
                    e, pod.endpoint_url_safe, "status()", logger):
                raise
            logger.info(f"status() failed at {pod.endpoint}: error={e}")
            status.status = InstanceDiagStatus.UNKNOWN

    return status