in manager/dm-server/src/main/java/org/apache/doris/stack/service/control/ResourceClusterNodeService.java [276:316]
/**
 * Checks the last heartbeat of every agent node and reconciles the node's stored state.
 *
 * <p>For each node returned by {@code nodeRepository.findAll()}:
 * <ul>
 *   <li>no heartbeat timestamp recorded: the state is set to {@code INIT};</li>
 *   <li>last heartbeat older than 60 seconds: the state is set to {@code UNKNOWN} and the
 *       same state is passed to {@code updateInstancesState} for that node;</li>
 *   <li>otherwise: the state is set to {@code RUNNING}.</li>
 * </ul>
 * A node is only modified, logged as updated, and saved when its state actually changes.
 */
public void agentNodeStateCheck() {
    log.info("start to check agent nodes state");
    List<ResourceNodeEntity> nodes = nodeRepository.findAll();
    if (nodes.isEmpty()) {
        log.info("no any agent nodes");
        return;
    }

    // A heartbeat older than this many milliseconds is treated as a timeout.
    final long heartbeatTimeoutMillis = 60 * 1000L;
    // SimpleDateFormat is not thread-safe, so it stays local; one instance per run is enough.
    final SimpleDateFormat timeFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

    for (ResourceNodeEntity node : nodes) {
        Timestamp lastTime = node.getLastHeartBeatTimestamp();
        if (lastTime == null) {
            // The agent has never reported in: keep (or put) the node in INIT.
            log.warn("not receive heartbeat yet from node {} {}", node.getId(), node.getHost());
            if (node.getCurrentState() != ModelControlState.INIT.getValue()) {
                node.setCurrentState(ModelControlState.INIT.getValue());
                nodeRepository.save(node);
            }
            continue;
        }

        log.info("node {} {} last heartbeat time {}", node.getId(), node.getHost(),
                timeFormat.format(lastTime));

        if (System.currentTimeMillis() - lastTime.getTime() > heartbeatTimeoutMillis) {
            log.warn("node {} heartbeat timeout", node.getId());
            if (node.getCurrentState() != ModelControlState.UNKNOWN.getValue()) {
                // Log the transition before mutating so the old state is still readable.
                log.warn("update node {} from {} to {}", node.getId(), node.getCurrentState(),
                        ModelControlState.UNKNOWN.getValue());
                node.setCurrentState(ModelControlState.UNKNOWN.getValue());
                log.info("update all instance of node {} to UNKNOWN", node.getId());
                updateInstancesState(node, ModelControlState.UNKNOWN.getValue());
                nodeRepository.save(node);
            }
        } else {
            log.info("node {} heartbeat state normal", node.getId());
            if (node.getCurrentState() != ModelControlState.RUNNING.getValue()) {
                // Recovery is an expected transition, so it is logged at INFO rather than WARN.
                log.info("update node {} from {} to {}", node.getId(), node.getCurrentState(),
                        ModelControlState.RUNNING.getValue());
                node.setCurrentState(ModelControlState.RUNNING.getValue());
                nodeRepository.save(node);
            }
        }
    }
}