protected void checkMaster()

in core/src/main/java/org/apache/brooklyn/core/mgmt/ha/HighAvailabilityManagerImpl.java [777:885]


    protected void checkMaster(boolean initializing) {
        ManagementPlaneSyncRecord memento = loadManagementPlaneSyncRecord(false);
        
        if (getNodeState()==ManagementNodeState.FAILED || getNodeState()==ManagementNodeState.HOT_BACKUP) {
            // if failed or hot backup then we can't promote ourselves, so no point in checking who is master
            return;
        }

        ManagementNodeState ourPrevState = getNodeState();
        updateLocalPlaneId(memento);
        
        String currMasterNodeId = memento.getMasterNodeId();
        ManagementNodeSyncRecord currMasterNodeRecord = memento.getManagementNodes().get(currMasterNodeId);
        ManagementNodeSyncRecord ownNodeRecord = memento.getManagementNodes().get(ownNodeId);
        
        ManagementNodeSyncRecord newMasterNodeRecord = null;
        boolean demotingSelfInFavourOfOtherMaster = false;
        
        if (currMasterNodeRecord != null && currMasterNodeRecord.getStatus() == ManagementNodeState.MASTER && isHeartbeatOk(currMasterNodeRecord, ownNodeRecord)) {
            // master seems healthy
            if (ownNodeId.equals(currMasterNodeId)) {
                if (LOG.isTraceEnabled()) LOG.trace("Existing master healthy (us): master={}", currMasterNodeRecord.toVerboseString());
                return;
            } else {
                if (ownNodeRecord!=null && ownNodeRecord.getStatus() == ManagementNodeState.MASTER) {
                    LOG.error("Management node "+ownNodeId+" detected master change, stolen from us, deferring to "+currMasterNodeId);
                    newMasterNodeRecord = currMasterNodeRecord;
                    demotingSelfInFavourOfOtherMaster = true;
                } else {
                    if (LOG.isTraceEnabled()) LOG.trace("Existing master healthy (remote): master={}", currMasterNodeRecord.toVerboseString());
                    return;
                }
            }
        } else if (ownNodeRecord == null || !isHeartbeatOk(ownNodeRecord, ownNodeRecord)) {
            // our heartbeats are also out-of-date! perhaps something wrong with persistence? just log, and don't over-react!
            if (ownNodeRecord == null) {
                LOG.error("No management node memento for self ("+ownNodeId+"); perhaps persister unwritable? "
                        + "Master ("+currMasterNodeId+") reported failed but no-op as cannot tell conclusively");
            } else {
                LOG.error("This management node ("+ownNodeId+") memento heartbeats out-of-date; perhaps perister unwritable? "
                        + "Master ("+currMasterNodeId+") reported failed but no-op as cannot tell conclusively"
                        + ": self="+ownNodeRecord.toVerboseString());
            }
            return;
        } else if (ownNodeId.equals(currMasterNodeId)) {
            // we are supposed to be the master, but seem to be unhealthy!
            LOG.warn("This management node ("+ownNodeId+") supposed to be master but reportedly unhealthy? "
                    + "no-op as expect other node to fix: self="+ownNodeRecord.toVerboseString());
            return;
        }
        
        if (demotingSelfInFavourOfOtherMaster) {
            LOG.debug("Master-change for this node only, demoting "+ownNodeRecord.toVerboseString()+" in favour of official master "+newMasterNodeRecord.toVerboseString());
            demoteTo(
                BrooklynFeatureEnablement.isEnabled(BrooklynFeatureEnablement.FEATURE_DEFAULT_STANDBY_IS_HOT_PROPERTY) ?
                    ManagementNodeState.HOT_STANDBY : ManagementNodeState.STANDBY);
            return;
        } else {
            LOG.debug("Detected master heartbeat timeout. Initiating a new master election. Master was " + currMasterNodeRecord);
        }
        
        // Need to choose a new master
        newMasterNodeRecord = masterChooser.choose(memento, getHeartbeatTimeout(), ownNodeId);
        
        String newMasterNodeId = (newMasterNodeRecord == null) ? null : newMasterNodeRecord.getNodeId();
        URI newMasterNodeUri = (newMasterNodeRecord == null) ? null : newMasterNodeRecord.getUri();
        boolean weAreNewMaster = ownNodeId.equals(newMasterNodeId);
        
        if (LOG.isDebugEnabled()) {
            LOG.debug("Management node master-change required: newMaster={}; oldMaster={}; plane={}, self={}; heartbeatTimeout={}", 
                new Object[] {
                    (newMasterNodeRecord == null ? "<none>" : newMasterNodeRecord.toVerboseString()),
                    (currMasterNodeRecord == null ? currMasterNodeId+" (no memento)": currMasterNodeRecord.toVerboseString()),
                    memento,
                    ownNodeRecord.toVerboseString(), 
                    getHeartbeatTimeout()
                });
        }
        String message = "Management node "+ownNodeId+" detected ";
        String currMasterSummary =
                (Strings.isNonBlank(currMasterNodeId) ? currMasterNodeId+" " : "") + "(" + (currMasterNodeRecord==null ? "<none>" : timestampString(currMasterNodeRecord.getRemoteTimestamp())) + ")";
        if (weAreNewMaster && (ownNodeRecord.getStatus() == ManagementNodeState.MASTER)) {
            LOG.warn(message + "we must reassert master status, as we believe we should be master and other master "+
                (currMasterNodeRecord==null ? "(a node which has gone away)" : currMasterSummary)+" has failed");
            publishPromotionToMaster();
            publishHealth();
            return;
        }
        
        if (!initializing) {
            if (weAreNewMaster) {
                message += "we should be master, promoting from "+ourPrevState+"; master changing from ";
            }
            else if (currMasterNodeRecord==null && newMasterNodeId==null) message += "master change attempted but no candidates ";
            else message += "master change, from ";
            message += currMasterSummary + " to "
                + (newMasterNodeId == null ? "<none>" :
                    (weAreNewMaster ? "us " : "")
                    + newMasterNodeId + " (" + timestampString(newMasterNodeRecord.getRemoteTimestamp()) + ")" 
                    + (newMasterNodeUri!=null ? " "+newMasterNodeUri : "")  );
            // always log, if you're looking at a standby node it's useful to see the new master's URL
            LOG.info(message);
        }

        // New master is ourself: promote
        if (weAreNewMaster) {
            promoteToMaster();
        }
    }