def diagnose_instance()

in mysqloperator/controller/diagnose.py [0:0]


def diagnose_instance(pod: MySQLPod, logger, dba: 'Dba' = None) -> InstanceStatus:
    """
    Check state of an instance in the given pod.

    State is checked in isolation. That is, using its own view of the group
    and its own local copy of the metadata (if there is one). Thus, it can be
    incorrect, if for example, the pod was deleted and didn't have its copy of
    the metadata updated or if there's a split-brain.

    Args:
        pod: pod hosting the MySQL instance to diagnose.
        logger: logger that receives progress and error messages.
        dba: optional Dba handle to use for the instance. When it is not
            given, a new one is opened with ``pod.endpoint_co``.

    Returns:
        An InstanceStatus for ``pod``. Depending on how far the diagnosis
        got, it carries ``connect_error``, ``gtid_executed``, ``view_id``,
        ``in_quorum``, ``is_primary``, ``peers``,
        ``cluster_in_cluster_set_type`` and ``status``. Fields that were not
        reached keep whatever value InstanceStatus() initialised them with.

    Raises:
        mysqlsh.Error: re-raised when shellutils.check_fatal_connect() or
            shellutils.check_fatal() returns true for the error.
        AssertionError: the instance reports a member state that is not
            handled here.
        Exception: ``pod.instance_type`` is not recognised, or the pod is
            missing from its own cluster.status() output.
    """
    status = InstanceStatus()
    status.pod = pod

    if not dba:
        try:
            dba = mysqlsh.connect_dba(pod.endpoint_co)
        except mysqlsh.Error as e:
            logger.info(f"Could not connect to {pod.endpoint}: error={e}")
            status.connect_error = e.code

            if mysql.ErrorCode.CR_MAX_ERROR >= e.code >= mysql.ErrorCode.CR_MIN_ERROR:
                # client side errors mean we can't connect to the server, but the
                # problem could be in the client or network and not the server

                # Check status of the pod
                pod.reload()
                logger.debug(f"{pod.endpoint}: pod.phase={pod.phase}  deleting={pod.deleting}")
                if pod.phase != "Running" or not pod.check_containers_ready() or pod.deleting:
                    # not ONLINE for sure if the Pod is not running
                    status.status = InstanceDiagStatus.OFFLINE
            else:
                if shellutils.check_fatal_connect(e, pod.endpoint_url_safe, logger):
                    raise

            # Without a connection nothing else can be checked.
            return status

    cluster = None
    if dba:
        status.gtid_executed = dba.session.run_sql("select @@gtid_executed").fetch_one()[0]

        try:
            # TODO: we want to check from individual Pod's/Server's perspective
            #       it will now check based from primary N times
            cluster = dba.get_cluster()
        except mysqlsh.Error as e:
            logger.info(f"get_cluster() error for {pod.endpoint}: error={e}")

            # TODO check for invalid metadata errors
            # Note: get_cluster() on a member that was previously removed
            # can fail as OFFLINE instead of NOT_MANAGED if its copy of the
            # metadata lacks the trx where it was removed
            if e.code == errors.SHERR_DBA_BADARG_INSTANCE_NOT_ONLINE:
                status.status = InstanceDiagStatus.OFFLINE
            elif e.code in (errors.SHERR_DBA_BADARG_INSTANCE_NOT_MANAGED, errors.SHERR_DBA_METADATA_NOT_FOUND):
                status.status = InstanceDiagStatus.NOT_MANAGED
            else:
                if shellutils.check_fatal(
                        e, pod.endpoint_url_safe, "get_cluster()", logger):
                    raise
                status.status = InstanceDiagStatus.UNKNOWN
        except RuntimeError as e:
            # Membership test rather than str.find(): find() returns -1
            # (truthy) on a miss and 0 (falsy) on a match at the start.
            if "unmanaged replication group" in str(e):
                status.status = InstanceDiagStatus.UNMANAGED
            else:
                logger.info(f"diagnose_instance: 2 Runtime Error [{e}]")
                status.status = InstanceDiagStatus.UNKNOWN

    if cluster:
        try:
            mstatus = cluster.status({"extended": 1})

            # Without a "clusterRole" key the cluster is treated as a primary.
            status.cluster_in_cluster_set_type = ClusterInClusterSetType.PRIMARY
            if "clusterRole" in mstatus:
                logger.info("9.3.0+ cluster, ClusterSet enabled")
                cluster_role = mstatus["clusterRole"]
                if cluster_role == "PRIMARY":
                    status.cluster_in_cluster_set_type = ClusterInClusterSetType.PRIMARY
                elif cluster_role == "REPLICA":
                    status.cluster_in_cluster_set_type = ClusterInClusterSetType.REPLICA
                else:
                    # Only roles that are neither PRIMARY nor REPLICA are unknown.
                    status.cluster_in_cluster_set_type = ClusterInClusterSetType.UNKNOWN
            else:
                logger.info("pre 9.3.0 cluster, not ClusterSet enabled")

            replica_set = mstatus["defaultReplicaSet"]
            cluster_status = replica_set["status"]
            status.view_id = replica_set["groupViewId"]

            if cluster_status.startswith("OK"):
                status.in_quorum = True
            else:
                topology_summary = ";".join(
                    f'{m},{i["status"]}'
                    for m, i in replica_set["topology"].items())
                logger.info(
                    f"No quorum visible from {pod.endpoint}: status={cluster_status}  topology={topology_summary}")
                status.in_quorum = False

            members = {}
            mystate = None
            for member, info in replica_set["topology"].items():
                if pod.instance_type == "group-member":
                    state = info["status"]
                    members[member] = state
                    if member == pod.endpoint:
                        mystate = state
                        if state == "ONLINE":
                            status.is_primary = info["memberRole"] == "PRIMARY"
                elif pod.instance_type == "read-replica":
                    # Read replicas are listed under the group member they
                    # are attached to, not as topology entries of their own.
                    if "readReplicas" in info:
                        for rr_member, rr_info in info["readReplicas"].items():
                            if rr_member == pod.endpoint:
                                mystate = rr_info["status"]
                else:
                    raise Exception(f"Unknown instance type for {pod.name}: {pod.instance_type}")

            if not mystate:
                # TODO
                raise Exception(
                    f"Could not find {pod} in local cluster.status() output")

            status.peers = members

            if mystate == "ONLINE":
                status.status = InstanceDiagStatus.ONLINE
            elif mystate == "RECOVERING":
                status.status = InstanceDiagStatus.RECOVERING
            elif mystate == "ERROR":
                status.status = InstanceDiagStatus.ERROR
            elif mystate == "OFFLINE":
                status.status = InstanceDiagStatus.OFFLINE
            elif mystate == "UNREACHABLE":
                status.status = InstanceDiagStatus.UNREACHABLE
            else:
                logger.error(f"{pod.endpoint}: bad state {mystate}")
                # Raised explicitly: an `assert` statement is removed under
                # `python -O` and the bad state would then go unnoticed.
                raise AssertionError(f"{pod.endpoint}: bad state {mystate}")
        except mysqlsh.Error as e:
            if shellutils.check_fatal(
                    e, pod.endpoint_url_safe, "status()", logger):
                raise

            logger.info(f"status() failed at {pod.endpoint}: error={e}")
            status.status = InstanceDiagStatus.UNKNOWN

    return status