in src/kudu/consensus/quorum_util.cc [542:867]
bool ShouldEvictReplica(const RaftConfigPB& config,
const string& leader_uuid,
int replication_factor,
MajorityHealthPolicy policy,
string* uuid_to_evict) {
if (leader_uuid.empty()) {
// If there is no leader, we can't evict anybody.
return false;
}
typedef pair<string, int> Elem;
static const auto kCmp = [](const Elem& lhs, const Elem& rhs) {
// Elements of higher priority should pop up to the top of the queue.
return lhs.second < rhs.second;
};
typedef priority_queue<Elem, vector<Elem>, decltype(kCmp)> PeerPriorityQueue;
PeerPriorityQueue pq_non_voters(kCmp);
PeerPriorityQueue pq_voters(kCmp);
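// With kCmp, both queues behave as max-heaps keyed on the priority value
// assigned below, so top() always yields the most eviction-worthy replica of
// the corresponding member type.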
const auto peer_to_elem = [](const RaftPeerPB& peer) {
const string& peer_uuid = peer.permanent_uuid();
const auto overall_health = peer.health_report().overall_health();
// Non-voter candidates for eviction (in decreasing priority):
// * failed unrecoverably
// * failed
// * in unknown health state
// * any other
//
// Voter candidates for eviction (in decreasing priority):
// * failed unrecoverably and having the attribute REPLACE set
// * failed unrecoverably
// * failed and having the attribute REPLACE set
// * failed
// * having the attribute REPLACE set
// * in unknown health state
// * any other
int priority = 0;
switch (overall_health) {
case HealthReportPB::FAILED_UNRECOVERABLE:
priority = 8;
break;
case HealthReportPB::FAILED:
priority = 4;
break;
case HealthReportPB::HEALTHY:
priority = 0;
break;
case HealthReportPB::UNKNOWN: FALLTHROUGH_INTENDED;
default:
priority = 1;
break;
}
if (peer.member_type() == RaftPeerPB::VOTER && peer.attrs().replace()) {
priority += 2;
}
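// The resulting composite priorities for voters are: FAILED_UNRECOVERABLE
// with 'replace' -> 10, FAILED_UNRECOVERABLE -> 8, FAILED with 'replace' -> 6,
// FAILED -> 4, UNKNOWN with 'replace' -> 3, healthy with 'replace' -> 2,
// UNKNOWN -> 1, HEALTHY -> 0. Non-voters never receive the 'replace' bonus,
// so their priorities are 8, 4, 1, and 0 respectively.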
return Elem(peer_uuid, priority);
};
int num_non_voters_total = 0;
int num_voters_healthy = 0;
int num_voters_total = 0;
int num_voters_with_replace = 0;
int num_voters_viable = 0;
bool leader_with_replace = false;
bool has_non_voter_failed = false;
bool has_non_voter_failed_unrecoverable = false;
bool has_voter_failed = false;
bool has_voter_failed_unrecoverable = false;
bool has_voter_unknown_health = false;
// When working with the optional fields related to per-replica health status
// and attributes, the has_<field>()-like accessors are generally not called:
// the default values of those fields are appropriate for this logic.
VLOG(2) << "config to evaluate: " << SecureDebugString(config);
for (const RaftPeerPB& peer : config.peers()) {
DCHECK(peer.has_permanent_uuid() && !peer.permanent_uuid().empty());
const string& peer_uuid = peer.permanent_uuid();
const auto overall_health = peer.health_report().overall_health();
const bool failed = overall_health == HealthReportPB::FAILED;
const bool failed_unrecoverable = overall_health == HealthReportPB::FAILED_UNRECOVERABLE;
const bool healthy = overall_health == HealthReportPB::HEALTHY;
const bool unknown = !peer.has_health_report() ||
!peer.health_report().has_overall_health() ||
overall_health == HealthReportPB::UNKNOWN;
const bool has_replace = peer.attrs().replace();
switch (peer.member_type()) {
case RaftPeerPB::VOTER:
// A leader should always report itself as being healthy.
if (PREDICT_FALSE(peer_uuid == leader_uuid && !healthy)) {
LOG(WARNING) << Substitute("leader peer $0 reported health as $1; config: $2",
peer_uuid,
HealthReportPB_HealthStatus_Name(
peer.health_report().overall_health()),
SecureShortDebugString(config));
DCHECK(false) << "Found non-HEALTHY LEADER"; // Crash in DEBUG builds.
// TODO(KUDU-2335): We have seen this assertion in rare circumstances
// in pre-commit builds, so until we fix this lifecycle issue we
// simply do not evict any nodes when the leader is not HEALTHY.
return false;
}
++num_voters_total;
if (healthy) {
++num_voters_healthy;
if (!has_replace) {
++num_voters_viable;
}
}
if (has_replace) {
++num_voters_with_replace;
if (peer_uuid == leader_uuid) {
leader_with_replace = true;
}
}
if (peer_uuid == leader_uuid) {
// Everything below is to keep track of replicas to evict; the leader
// replica is not to be evicted.
break;
}
pq_voters.emplace(peer_to_elem(peer));
has_voter_failed |= failed;
has_voter_failed_unrecoverable |= failed_unrecoverable;
has_voter_unknown_health |= unknown;
break;
case RaftPeerPB::NON_VOTER:
DCHECK_NE(peer_uuid, leader_uuid) << peer_uuid
<< ": non-voter as a leader; " << SecureShortDebugString(config);
pq_non_voters.emplace(peer_to_elem(peer));
++num_non_voters_total;
has_non_voter_failed |= failed;
has_non_voter_failed_unrecoverable |= failed_unrecoverable;
break;
default:
LOG(DFATAL) << peer.member_type() << ": unsupported member type";
break;
}
}
// Sanity check: the leader replica UUID should not be among those to evict.
DCHECK(pq_voters.empty() || pq_voters.top().first != leader_uuid);
DCHECK(pq_non_voters.empty() || pq_non_voters.top().first != leader_uuid);
// A conservative approach is used when evicting replicas. In short, the
// removal of replicas from the tablet without exact knowledge of their health
// status could lead to removing the healthy ones and keeping the failed
// ones, or attempting a config change operation that cannot be committed.
// On the other hand, if the number of voter replicas in good health is
// greater than or equal to the required replication factor, a replica with
// any health status can be safely evicted without compromising the
// availability of the tablet. Also, the eviction policy is more liberal when
// dealing with failed replicas: if the total number of voter replicas is
// greater than or equal to the required replication factor, the failed
// replicas are evicted aggressively. The latter is to avoid polluting tablet
// servers with failed replicas, which would reduce the number of possible
// locations for new non-voter replicas created to replace the failed ones.
// See below for more details.
//
// * A non-voter replica may be evicted regardless of its health status
// if the number of voter replicas in good health without the 'replace'
// attribute is greater than or equal to the required replication factor.
// The idea is to not evict non-voter replicas that might be needed to reach
// the required replication factor, while a present non-voter replica could
// be a good fit to replace a voter replica, if needed.
//
// * A non-voter replica with FAILED or FAILED_UNRECOVERABLE health status
// may be evicted if the number of voter replicas in good health without
// the 'replace' attribute is greater than or equal to a strict majority
// of voter replicas. The idea is to avoid polluting available tablet
// servers with failed non-voter replicas, while replacing failed non-voters
// with healthy non-voters as aggressively as possible. Also, we want to be
// sure that an eviction can succeed before initiating it.
//
// * A voter replica may be evicted regardless of its health status
// if, after the eviction, the number of voter replicas in good health is
// greater than or equal to the required replication factor and the leader
// replica itself is not marked with the 'replace' attribute. The latter
// part of the condition emerges from the following observations:
// ** By definition, a voter replica marked with the 'replace' attribute
// should eventually be evicted from the Raft group.
// ** If all voter replicas are in good health, their total count is
// greater than the target replication factor, and only a single one is
// marked with the 'replace' attribute, that's the replica to be evicted.
// ** The Kudu Raft implementation does not support evicting the leader of
// a Raft group.
// So, the removal of a leader replica marked with the 'replace' attribute
// is postponed until the leader replica steps down and becomes a follower.
//
// * A voter replica with FAILED health may be evicted only if the total
// number of voter replicas is greater than the required replication factor
// and the number of *other* voter replicas in good health without the
// 'replace' attribute is greater than or equal to a strict majority of
// voter replicas.
//
// * A voter replica with FAILED_UNRECOVERABLE health may be evicted when
// the number of *other* voter replicas in good health without the 'replace'
// attribute is greater than or equal to a strict majority of voter replicas.
//
// * A voter replica in good health marked with the 'replace' attribute may be
// evicted when the number of replicas in good health after the eviction
// is greater than or equal to the required replication factor.
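//
// Illustrative example (hypothetical peers): with replication_factor = 3 and
// four healthy voters A (leader), B, C, D where only B carries the 'replace'
// attribute, the last 'replace'-related sub-case below applies
// (num_voters_with_replace > 0 and num_voters_healthy = 4 > 3), a majority of
// voters is healthy, and B tops the voter priority queue (priority 2 vs. 0),
// so B is the replica selected for eviction.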
bool need_to_evict_non_voter = false;
// Check if there is any excess non-voter replica. We add non-voter replicas
// to replace non-viable (i.e. failed or explicitly marked for eviction) ones.
need_to_evict_non_voter |=
num_voters_viable >= replication_factor &&
num_non_voters_total > 0;
// Some non-voter replica has failed: we want to remove those aggressively.
// This is to avoid polluting tablet servers with failed replicas. Otherwise,
// we may end up in a situation where it's impossible to add a new non-voter
// replica to replace the failed ones.
need_to_evict_non_voter |=
has_non_voter_failed ||
has_non_voter_failed_unrecoverable;
// All the non-voter-related sub-cases are applicable only when there is at
// least one non-voter replica and a majority of voter replicas are on-line
// to commit the Raft configuration change.
const bool should_evict_non_voter = need_to_evict_non_voter &&
(num_voters_healthy >= MajoritySize(num_voters_total) ||
policy == MajorityHealthPolicy::IGNORE);
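// For example (hypothetical peers): with replication_factor = 3, three
// healthy voters, and one healthy non-voter, num_voters_viable >= 3 and the
// non-voter count is non-zero, so the excess non-voter becomes eligible for
// eviction; the majority check (3 healthy voters >= MajoritySize(3) = 2)
// holds as well, so should_evict_non_voter is true.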
bool need_to_evict_voter = false;
// The abundant case: can evict any voter replica. The code below will select
// the most appropriate candidate.
need_to_evict_voter |= num_voters_viable > replication_factor;
// Some voter replica has failed: we want to remove those aggressively.
// This is to avoid polluting tablet servers with failed replicas. Otherwise,
// we may end up in a situation where it's impossible to add a new non-voter
// replica to replace the failed ones.
need_to_evict_voter |= (has_voter_failed || has_voter_failed_unrecoverable);
// If we already have enough viable voter replicas running, it's safe to
// get rid of replicas in unknown health state.
need_to_evict_voter |=
num_voters_viable >= replication_factor &&
has_voter_unknown_health;
// Working with the replicas marked with the 'replace' attribute:
// the case when too many replicas are marked with the 'replace' attribute
// while all required replicas are healthy.
//
// In the special case when the leader replica is the only one marked with the
// 'replace' attribute, the leader replica cannot be evicted.
need_to_evict_voter |= (num_voters_healthy >= replication_factor) &&
!(num_voters_with_replace == 1 && leader_with_replace) &&
((num_voters_with_replace > replication_factor) ||
(num_voters_with_replace >= replication_factor && num_voters_viable > 0));
// Working with the replicas marked with the 'replace' attribute:
// the case where at least one replica is marked with the 'replace' attribute
// while the number of healthy voter replicas exceeds the required
// replication factor.
//
// In the special case when the leader replica is the only one marked with the
// 'replace' attribute, the leader replica cannot be evicted.
need_to_evict_voter |=
!(num_voters_with_replace == 1 && leader_with_replace) &&
(num_voters_with_replace > 0 && num_voters_healthy > replication_factor);
// The voter-related sub-cases are applicable only when the total number of
// voter replicas is greater than the target replication factor or some voter
// replica has failed irrecoverably; meanwhile, a majority of voter replicas
// should be on-line to commit the Raft configuration change.
const bool should_evict_voter = need_to_evict_voter &&
(num_voters_total > replication_factor ||
has_voter_failed_unrecoverable) &&
(num_voters_healthy >= MajoritySize(num_voters_total - 1) ||
policy == MajorityHealthPolicy::IGNORE);
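// For example (hypothetical peers): with replication_factor = 3 and voters
// A (leader, HEALTHY), B (HEALTHY), C (FAILED_UNRECOVERABLE), the total voter
// count does not exceed the replication factor, but the unrecoverable failure
// makes the eviction eligible; 2 healthy voters >= MajoritySize(3 - 1) = 2,
// so should_evict_voter is true and C, the highest-priority voter in the
// queue, is the candidate. Had C been only FAILED, the eviction would be
// deferred until the configuration grows beyond the replication factor.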
const bool should_evict = should_evict_non_voter || should_evict_voter;
// When voters and non-voters exhibit the same type of failure, we evict
// non-voters first. However, if there is an irreversibly failed voter and no
// irreversibly failed non-voters, we evict that voter first. That's because
// a transiently failed non-voter might come back and be in good shape a few
// moments later. Also, getting rid of an irreversibly failed voter may be
// beneficial for configurations with an even number of voters: the remaining
// majority has a better chance of staying actionable if another replica
// fails.
//
// So, the eviction priority order is:
// (1) unrecoverable non_voters
// (2) unrecoverable voters
// (3) evictable non_voters
// (4) evictable voters
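// For instance, if both a FAILED_UNRECOVERABLE non-voter and a
// FAILED_UNRECOVERABLE voter are present and both eviction types are allowed,
// the branch ordering below selects the non-voter; the failed voter can be
// evicted on a subsequent evaluation.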
string to_evict;
if (should_evict_non_voter && has_non_voter_failed_unrecoverable) {
CHECK(!pq_non_voters.empty());
to_evict = pq_non_voters.top().first;
} else if (should_evict_voter && has_voter_failed_unrecoverable) {
CHECK(!pq_voters.empty());
to_evict = pq_voters.top().first;
} else if (should_evict_non_voter) {
CHECK(!pq_non_voters.empty());
to_evict = pq_non_voters.top().first;
} else if (should_evict_voter) {
CHECK(!pq_voters.empty());
to_evict = pq_voters.top().first;
}
DCHECK((!should_evict && to_evict.empty()) ||
(should_evict && !to_evict.empty()));
if (should_evict) {
DCHECK(!to_evict.empty());
DCHECK_NE(leader_uuid, to_evict);
if (uuid_to_evict) {
*uuid_to_evict = to_evict;
}
}
VLOG(2) << "decision: should"
<< (should_evict ? "" : " not") << " evict replica "
<< (should_evict ? to_evict : "");
return should_evict;
}
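
// A minimal usage sketch (hypothetical caller, not part of this file's logic):
// given a RaftConfigPB populated with per-replica health reports and the
// current leader's UUID, a caller might drive the eviction decision like so:
//
//   string uuid_to_evict;
//   if (ShouldEvictReplica(config, leader_uuid, /*replication_factor=*/3,
//                          policy, &uuid_to_evict)) {
//     // Initiate a config change to remove 'uuid_to_evict' from the group.
//   }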