in controller/cluster.go [125:164]
// increaseFailureCount records one more failed probe for node and returns the
// node's updated failure count.
//
// Non-master nodes are only counted. For a master node, each time the count
// reaches a multiple of c.options.maxFailureCount the cluster is loaded from
// the cluster store, cluster.PromoteNewMaster is called for shardIndex, the
// failure count of the returned new master is reset, and the cluster is
// written back with UpdateCluster. Errors from these steps are logged rather
// than returned, so the count is returned in every case.
func (c *ClusterChecker) increaseFailureCount(shardIndex int, node store.Node) int64 {
	id := node.ID()

	// Only the counter update runs under failureMu. The lock is released
	// explicitly (not deferred) so it is not held across the store calls below.
	c.failureMu.Lock()
	// A missing key reads as the zero value, so no explicit initialisation is
	// needed before incrementing.
	c.failureCounts[id]++
	count := c.failureCounts[id]
	c.failureMu.Unlock()

	// don't add the node into the failover candidates if it's not a master node
	if !node.IsMaster() {
		return count
	}
	// A failover is attempted only on every maxFailureCount-th failure; bail out
	// before building the logger, which is needed on the failover path only.
	if count%c.options.maxFailureCount != 0 {
		return count
	}

	log := logger.Get().With(
		zap.String("id", id),
		zap.Bool("is_master", node.IsMaster()),
		zap.String("addr", node.Addr()))

	cluster, err := c.clusterStore.GetCluster(c.ctx, c.namespace, c.clusterName)
	if err != nil {
		log.Error("Failed to get the cluster info", zap.Error(err))
		return count
	}

	newMasterID, err := cluster.PromoteNewMaster(c.ctx, shardIndex, id, "")
	if err != nil {
		log.Error("Failed to promote the new master", zap.Error(err))
		return count
	}

	// the node is normal if it can be elected as the new master,
	// because it requires the node is healthy.
	c.resetFailureCount(newMasterID)

	log = log.With(zap.String("new_master_id", newMasterID))
	// Report a persistence failure separately from a promotion failure so the
	// two cases can be told apart in the logs.
	if err := c.clusterStore.UpdateCluster(c.ctx, c.namespace, cluster); err != nil {
		log.Error("Failed to update the cluster after promoting the new master", zap.Error(err))
		return count
	}
	log.Info("Promote the new master")
	return count
}