in brooklyn-server/core/src/main/java/org/apache/brooklyn/core/mgmt/ha/HighAvailabilityManagerImpl.java [708:812]
/**
 * Inspects the management-plane record to decide whether the recorded master is still
 * healthy and, if it is not, runs a master election via {@code masterChooser}.
 * <p>
 * Decision flow, in order:
 * <ol>
 * <li>The plane record is loaded; if this node's state is {@code FAILED} or {@code HOT_BACKUP}
 *     nothing further is done.</li>
 * <li>If the recorded master has status {@code MASTER} and passes {@code isHeartbeatOk}:
 *     nothing is done when that master is this node, or when this node's own record does not
 *     claim {@code MASTER}. If this node's record still claims {@code MASTER} while the plane
 *     names another node, this node demotes itself (to {@code HOT_STANDBY} or {@code STANDBY},
 *     depending on {@code FEATURE_DEFAULT_STANDBY_IS_HOT_PROPERTY}).</li>
 * <li>If this node has no record of its own, or its own heartbeat is not OK, an error is
 *     logged and nothing else is done, as the master's failure cannot be judged reliably.</li>
 * <li>If this node is the recorded master (but did not pass the health check above), a warning
 *     is logged and nothing else is done.</li>
 * <li>Otherwise a new master is chosen. If the choice is this node and its record already says
 *     {@code MASTER}, the master status is re-published; otherwise the change is logged and,
 *     if this node was chosen, it is promoted.</li>
 * </ol>
 *
 * @param initializing when {@code true}, the INFO-level summary of the master change is not
 *        logged; the election and any promotion still take place
 */
protected void checkMaster(boolean initializing) {
    ManagementPlaneSyncRecord memento = loadManagementPlaneSyncRecord(false);

    // Read the state once so both comparisons are made against the same value.
    ManagementNodeState nodeState = getNodeState();
    if (nodeState == ManagementNodeState.FAILED || nodeState == ManagementNodeState.HOT_BACKUP) {
        // if failed or hot backup then we can't promote ourselves, so no point in checking who is master
        return;
    }

    String currMasterNodeId = memento.getMasterNodeId();
    ManagementNodeSyncRecord currMasterNodeRecord = memento.getManagementNodes().get(currMasterNodeId);
    ManagementNodeSyncRecord ownNodeRecord = memento.getManagementNodes().get(ownNodeId);

    if (currMasterNodeRecord != null && currMasterNodeRecord.getStatus() == ManagementNodeState.MASTER && isHeartbeatOk(currMasterNodeRecord, ownNodeRecord)) {
        // master seems healthy
        if (ownNodeId.equals(currMasterNodeId)) {
            if (LOG.isTraceEnabled()) LOG.trace("Existing master healthy (us): master={}", currMasterNodeRecord.toVerboseString());
            return;
        }
        if (ownNodeRecord == null || ownNodeRecord.getStatus() != ManagementNodeState.MASTER) {
            if (LOG.isTraceEnabled()) LOG.trace("Existing master healthy (remote): master={}", currMasterNodeRecord.toVerboseString());
            return;
        }

        // Our own record still claims MASTER but the plane names a different, healthy master: step down.
        LOG.error("Management node "+ownNodeId+" detected master change, stolen from us, deferring to "+currMasterNodeId);
        if (LOG.isDebugEnabled()) {
            LOG.debug("Master-change for this node only, demoting {} in favour of official master {}",
                    ownNodeRecord.toVerboseString(), currMasterNodeRecord.toVerboseString());
        }
        demoteTo(
                BrooklynFeatureEnablement.isEnabled(BrooklynFeatureEnablement.FEATURE_DEFAULT_STANDBY_IS_HOT_PROPERTY) ?
                        ManagementNodeState.HOT_STANDBY : ManagementNodeState.STANDBY);
        return;
    }

    if (ownNodeRecord == null || !isHeartbeatOk(ownNodeRecord, ownNodeRecord)) {
        // our heartbeats are also out-of-date! perhaps something wrong with persistence? just log, and don't over-react!
        if (ownNodeRecord == null) {
            LOG.error("No management node memento for self ("+ownNodeId+"); perhaps persister unwritable? "
                    + "Master ("+currMasterNodeId+") reported failed but no-op as cannot tell conclusively");
        } else {
            LOG.error("This management node ("+ownNodeId+") memento heartbeats out-of-date; perhaps persister unwritable? "
                    + "Master ("+currMasterNodeId+") reported failed but no-op as cannot tell conclusively"
                    + ": self="+ownNodeRecord.toVerboseString());
        }
        return;
    }

    if (ownNodeId.equals(currMasterNodeId)) {
        // we are supposed to be the master, but seem to be unhealthy!
        LOG.warn("This management node ("+ownNodeId+") supposed to be master but reportedly unhealthy? "
                + "no-op as expect other node to fix: self="+ownNodeRecord.toVerboseString());
        return;
    }

    // From here on ownNodeRecord is known to be non-null (the null case returned above).
    LOG.debug("Detected master heartbeat timeout. Initiating a new master election. Master was {}", currMasterNodeRecord);

    // Need to choose a new master
    ManagementNodeSyncRecord newMasterNodeRecord = masterChooser.choose(memento, getHeartbeatTimeout(), ownNodeId);

    String newMasterNodeId = (newMasterNodeRecord == null) ? null : newMasterNodeRecord.getNodeId();
    URI newMasterNodeUri = (newMasterNodeRecord == null) ? null : newMasterNodeRecord.getUri();
    boolean weAreNewMaster = ownNodeId.equals(newMasterNodeId);

    if (LOG.isDebugEnabled()) {
        LOG.debug("Management node master-change required: newMaster={}; oldMaster={}; plane={}, self={}; heartbeatTimeout={}",
                new Object[] {
                        (newMasterNodeRecord == null ? "<none>" : newMasterNodeRecord.toVerboseString()),
                        (currMasterNodeRecord == null ? currMasterNodeId+" (no memento)": currMasterNodeRecord.toVerboseString()),
                        memento,
                        ownNodeRecord.toVerboseString(),
                        getHeartbeatTimeout()
                });
    }

    String message = "Management node "+ownNodeId+" detected ";
    String currMasterSummary = currMasterNodeId + "(" + (currMasterNodeRecord==null ? "<none>" : timestampString(currMasterNodeRecord.getRemoteTimestamp())) + ")";

    if (weAreNewMaster && (ownNodeRecord.getStatus() == ManagementNodeState.MASTER)) {
        // Our record already says MASTER, so only re-publish rather than running a full promotion.
        LOG.warn(message + "we must reassert master status, as was stolen and then failed at "+
                (currMasterNodeRecord==null ? "a node which has gone away" : currMasterSummary));
        publishPromotionToMaster();
        publishHealth();
        return;
    }

    if (!initializing) {
        if (weAreNewMaster) {
            message += "we should be master, changing from ";
        } else if (currMasterNodeRecord==null && newMasterNodeId==null) {
            message += "master change attempted but no candidates ";
        } else {
            message += "master change, from ";
        }
        message += currMasterSummary + " to "
                + (newMasterNodeId == null ? "<none>" :
                    (weAreNewMaster ? "us " : "")
                    + newMasterNodeId + " (" + timestampString(newMasterNodeRecord.getRemoteTimestamp()) + ")"
                    + (newMasterNodeUri!=null ? " "+newMasterNodeUri : "") );
        // always log, if you're looking at a standby node it's useful to see the new master's URL
        LOG.info(message);
    }

    // New master is ourself: promote
    if (weAreNewMaster) {
        promoteToMaster();
    }
}