bool ShouldEvictReplica()

in src/kudu/consensus/quorum_util.cc [542:867]


bool ShouldEvictReplica(const RaftConfigPB& config,
                        const string& leader_uuid,
                        int replication_factor,
                        MajorityHealthPolicy policy,
                        string* uuid_to_evict) {
  if (leader_uuid.empty()) {
    // If there is no leader, we can't evict anybody.
    return false;
  }

  typedef pair<string, int> Elem;
  static const auto kCmp = [](const Elem& lhs, const Elem& rhs) {
    // Elements of higher priority should surface at the top of the queue.
    return lhs.second < rhs.second;
  };
  typedef priority_queue<Elem, vector<Elem>, decltype(kCmp)> PeerPriorityQueue;

  PeerPriorityQueue pq_non_voters(kCmp);
  PeerPriorityQueue pq_voters(kCmp);
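
  // An illustrative note (hypothetical elements): with kCmp defined this way,
  // std::priority_queue behaves as a max-heap on the priority value, so
  // pushing ("A", 1), ("B", 8), ("C", 4) in any order leaves ("B", 8) at
  // top(). The highest-priority eviction candidate is thus always popped
  // first.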

  const auto peer_to_elem = [](const RaftPeerPB& peer) {
    const string& peer_uuid = peer.permanent_uuid();
    const auto overall_health = peer.health_report().overall_health();

    // Non-voter candidates for eviction (in decreasing priority):
    //   * failed unrecoverably
    //   * failed
    //   * in unknown health state
    //   * any other
    //
    // Voter candidates for eviction (in decreasing priority):
    //   * failed unrecoverably and having the attribute REPLACE set
    //   * failed unrecoverably
    //   * failed and having the attribute REPLACE set
    //   * failed
    //   * having the attribute REPLACE set
    //   * in unknown health state
    //   * any other

    int priority = 0;
    switch (overall_health) {
      case HealthReportPB::FAILED_UNRECOVERABLE:
        priority = 8;
        break;
      case HealthReportPB::FAILED:
        priority = 4;
        break;
      case HealthReportPB::HEALTHY:
        priority = 0;
        break;
      case HealthReportPB::UNKNOWN:   FALLTHROUGH_INTENDED;
      default:
        priority = 1;
        break;
    }
    if (peer.member_type() == RaftPeerPB::VOTER && peer.attrs().replace()) {
      priority += 2;
    }
    return Elem(peer_uuid, priority);
  };
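
  // Illustration of the priority arithmetic above (hypothetical peers):
  //   * a VOTER with FAILED_UNRECOVERABLE health and attrs().replace() set
  //     maps to 8 + 2 = 10, the top voter eviction candidate;
  //   * a VOTER with FAILED health and no 'replace' attribute maps to 4;
  //   * a peer with UNKNOWN (or unreported) health maps to 1;
  //   * a HEALTHY peer maps to 0, or 2 for a voter with 'replace' set.
  // Note the 'replace' bump applies only to voters, matching the candidate
  // ordering described at the top of this lambda.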

  int num_non_voters_total = 0;

  int num_voters_healthy = 0;
  int num_voters_total = 0;
  int num_voters_with_replace = 0;
  int num_voters_viable = 0;

  bool leader_with_replace = false;

  bool has_non_voter_failed = false;
  bool has_non_voter_failed_unrecoverable = false;
  bool has_voter_failed = false;
  bool has_voter_failed_unrecoverable = false;
  bool has_voter_unknown_health = false;

  // When reading the optional per-replica health and attribute fields below,
  // the has_...()-like accessors are mostly not called: the default values of
  // those fields are appropriate for this evaluation.
  VLOG(2) << "config to evaluate: " << SecureDebugString(config);
  for (const RaftPeerPB& peer : config.peers()) {
    DCHECK(peer.has_permanent_uuid() && !peer.permanent_uuid().empty());
    const string& peer_uuid = peer.permanent_uuid();
    const auto overall_health = peer.health_report().overall_health();
    const bool failed = overall_health == HealthReportPB::FAILED;
    const bool failed_unrecoverable = overall_health == HealthReportPB::FAILED_UNRECOVERABLE;
    const bool healthy = overall_health == HealthReportPB::HEALTHY;
    const bool unknown = !peer.has_health_report() ||
        !peer.health_report().has_overall_health() ||
        overall_health == HealthReportPB::UNKNOWN;
    const bool has_replace = peer.attrs().replace();

    switch (peer.member_type()) {
      case RaftPeerPB::VOTER:
        // A leader should always report itself as being healthy.
        if (PREDICT_FALSE(peer_uuid == leader_uuid && !healthy)) {
          LOG(WARNING) << Substitute("leader peer $0 reported health as $1; config: $2",
                                     peer_uuid,
                                     HealthReportPB_HealthStatus_Name(
                                        peer.health_report().overall_health()),
                                     SecureShortDebugString(config));
          DCHECK(false) << "Found non-HEALTHY LEADER"; // Crash in DEBUG builds.
          // TODO(KUDU-2335): We have seen this assertion in rare circumstances
          // in pre-commit builds, so until we fix this lifecycle issue we
          // simply do not evict any nodes when the leader is not HEALTHY.
          return false;
        }

        ++num_voters_total;
        if (healthy) {
          ++num_voters_healthy;
          if (!has_replace) {
            ++num_voters_viable;
          }
        }
        if (has_replace) {
          ++num_voters_with_replace;
          if (peer_uuid == leader_uuid) {
            leader_with_replace = true;
          }
        }
        if (peer_uuid == leader_uuid) {
          // Everything below is to keep track of replicas to evict; the leader
          // replica is not to be evicted.
          break;
        }

        pq_voters.emplace(peer_to_elem(peer));
        has_voter_failed |= failed;
        has_voter_failed_unrecoverable |= failed_unrecoverable;
        has_voter_unknown_health |= unknown;
        break;

      case RaftPeerPB::NON_VOTER:
        DCHECK_NE(peer_uuid, leader_uuid) << peer_uuid
            << ": non-voter as a leader; " << SecureShortDebugString(config);
        pq_non_voters.emplace(peer_to_elem(peer));
        ++num_non_voters_total;
        has_non_voter_failed |= failed;
        has_non_voter_failed_unrecoverable |= failed_unrecoverable;
        break;

      default:
        LOG(DFATAL) << peer.member_type() << ": unsupported member type";
        break;
    }
  }

  // Sanity check: the leader replica UUID should not be among those to evict.
  DCHECK(pq_voters.empty() || pq_voters.top().first != leader_uuid);
  DCHECK(pq_non_voters.empty() || pq_non_voters.top().first != leader_uuid);

  // A conservative approach is used when evicting replicas. In short, the
  // removal of replicas from the tablet without exact knowledge of their health
  // status could lead to removing the healthy ones and keeping the failed
  // ones, or attempting a config change operation that cannot be committed.
  // On the other hand, if the number of voter replicas in good health is
  // greater than or equal to the required replication factor, a replica of any
  // health status can be safely evicted without compromising the availability
  // of the tablet. Also, the eviction policy is more liberal when dealing with
  // failed replicas: if the total number of voter replicas is greater than or
  // equal to the required replication factor, the failed replicas are evicted
  // aggressively. The latter is to avoid polluting tablet servers with failed
  // replicas, reducing the number of possible locations for new non-voter
  // replicas created to replace the failed ones. See below for more details.
  //
  // * A non-voter replica may be evicted regardless of its health status
  //   if the number of voter replicas in good health without the 'replace'
  //   attribute is greater than or equal to the required replication factor.
  //   The idea is to not evict non-voter replicas that might be needed to reach
  //   the required replication factor, while a present non-voter replica could
  //   be a good fit to replace a voter replica, if needed.
  //
  // * A non-voter replica with FAILED or FAILED_UNRECOVERABLE health status
  //   may be evicted if the number of voter replicas in good health without
  //   the 'replace' attribute is greater than or equal to a strict majority
  //   of voter replicas. The idea is to avoid polluting available tablet
  //   servers with failed non-voter replicas, while replacing failed non-voters
  //   with healthy non-voters as aggressively as possible. Also, we want to be
  //   sure that an eviction can succeed before initiating it.
  //
  // * A voter replica may be evicted regardless of its health status
  //   if after the eviction the number of voter replicas in good health will be
  //   greater than or equal to the required replication factor and the leader
  //   replica itself is not marked with the 'replace' attribute. The latter
  //   part of the condition emerges from the following observations:
  //     ** By definition, a voter replica marked with the 'replace' attribute
  //        should be eventually evicted from the Raft group.
  //     ** If all voter replicas are in good health and their total count is
  //        greater than the target replication and only a single one is marked
  //        with the 'replace' attribute, that's the replica to be evicted.
  //     ** Kudu Raft implementation does not support evicting the leader of
  //        a Raft group.
  //    So, the removal of a leader replica marked with the 'replace' attribute
  //    is postponed until the leader replica steps down and becomes a follower.
  //
  // * A voter replica with FAILED health may be evicted only if the total
  //   number of voter replicas is greater than the required replication factor
  //   and the number of *other* voter replicas in good health without the
  //   'replace' attribute is greater than or equal to a strict majority of
  //   voter replicas.
  //
  // * A voter replica with FAILED_UNRECOVERABLE health may be evicted when
  //   the number of *other* voter replicas in good health without the 'replace'
  //   attribute is greater than or equal to a strict majority of voter replicas.
  //
  // * A voter replica in good health marked with the 'replace' attribute may be
  //   evicted when the number of replicas in good health after the eviction
  //   is greater than or equal to the required replication factor.
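  //
  // A worked example of the FAILED_UNRECOVERABLE rule (hypothetical
  // configuration; this assumes MajoritySize(n) returns a strict majority,
  // i.e. n / 2 + 1): with replication_factor=3 and voters
  // A (leader, HEALTHY), B (HEALTHY), C (FAILED_UNRECOVERABLE), the number of
  // *other* healthy voters without the 'replace' attribute is 2, which is
  // greater than or equal to MajoritySize(2) == 2, so C may be evicted even
  // though the total voter count does not exceed the replication factor.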

  bool need_to_evict_non_voter = false;

  // Check if there is any excess non-voter replica. We add non-voter replicas
  // to replace non-viable (i.e. failed or explicitly marked for eviction) ones.
  need_to_evict_non_voter |=
      num_voters_viable >= replication_factor &&
      num_non_voters_total > 0;

  // Some non-voter replica has failed: we want to remove such replicas
  // aggressively. This is to avoid polluting tablet servers with failed
  // replicas; otherwise, it may become impossible to add a new non-voter
  // replica to replace the failed ones.
  need_to_evict_non_voter |=
      has_non_voter_failed ||
      has_non_voter_failed_unrecoverable;

  // All the non-voter-related sub-cases are applicable only when there is at
  // least one non-voter replica and a majority of voter replicas are on-line
  // to commit the Raft configuration change.
  const bool should_evict_non_voter = need_to_evict_non_voter &&
      (num_voters_healthy >= MajoritySize(num_voters_total) ||
       policy == MajorityHealthPolicy::IGNORE);
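
  // For example (hypothetical config): with replication_factor=3, three
  // HEALTHY viable voters, and non-voters D (HEALTHY) and E (FAILED), both
  // conditions above hold: num_voters_viable (3) >= 3 with non-voters present,
  // and a non-voter has failed. Since num_voters_healthy (3) is at least
  // MajoritySize(3), should_evict_non_voter is true, and E (priority 4)
  // outranks D (priority 0) in pq_non_voters.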

  bool need_to_evict_voter = false;

  // The abundant case: can evict any voter replica. The code below will select
  // the most appropriate candidate.
  need_to_evict_voter |= num_voters_viable > replication_factor;

  // Some voter replica has failed: we want to remove such replicas
  // aggressively. This is to avoid polluting tablet servers with failed
  // replicas; otherwise, it may become impossible to add a new non-voter
  // replica to replace the failed ones.
  need_to_evict_voter |= (has_voter_failed || has_voter_failed_unrecoverable);

  // If we already have enough healthy replicas running, it's safe to get rid
  // of replicas in unknown health state.
  need_to_evict_voter |=
      num_voters_viable >= replication_factor &&
      has_voter_unknown_health;

  // Working with the replicas marked with the 'replace' attribute:
  // the case when too many replicas are marked with the 'replace' attribute
  // while all required replicas are healthy.
  //
  // In the special case when the leader replica is the only one marked with the
  // 'replace' attribute, the leader replica cannot be evicted.
  need_to_evict_voter |= (num_voters_healthy >= replication_factor) &&
      !(num_voters_with_replace == 1 && leader_with_replace) &&
      ((num_voters_with_replace > replication_factor) ||
       (num_voters_with_replace >= replication_factor && num_voters_viable > 0));

  // Working with the replicas marked with the 'replace' attribute:
  // the case where a few replicas are marked with the 'replace' attribute
  // while there are more healthy voter replicas than required.
  //
  // In the special case when the leader replica is the only one marked with the
  // 'replace' attribute, the leader replica cannot be evicted.
  need_to_evict_voter |=
      !(num_voters_with_replace == 1 && leader_with_replace) &&
      (num_voters_with_replace > 0 && num_voters_healthy > replication_factor);
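
  // For example (hypothetical config): with replication_factor=3 and four
  // HEALTHY voters where only follower B carries the 'replace' attribute, the
  // second condition fires: num_voters_with_replace (1) > 0 and
  // num_voters_healthy (4) > 3. B sorts above the other followers in pq_voters
  // (priority 2 vs. 0) and becomes the eviction candidate. Had the leader been
  // the only replica marked with 'replace', the
  // (num_voters_with_replace == 1 && leader_with_replace) guard would have
  // suppressed both conditions.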

  // The voter-related sub-cases are applicable only when the total number of
  // voter replicas is greater than the target replication factor or some voter
  // replica has failed non-recoverably; meanwhile, a majority of voter
  // replicas should be on-line to commit the Raft configuration change.
  const bool should_evict_voter = need_to_evict_voter &&
      (num_voters_total > replication_factor ||
       has_voter_failed_unrecoverable) &&
      (num_voters_healthy >= MajoritySize(num_voters_total - 1) ||
       policy == MajorityHealthPolicy::IGNORE);
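
  // For example (hypothetical config): with replication_factor=3 and four
  // HEALTHY viable voters, need_to_evict_voter is set by the over-replication
  // check (4 > 3), and the gate passes: num_voters_total (4) > 3 and
  // num_voters_healthy (4) >= MajoritySize(4 - 1). The '- 1' presumably
  // reflects that the post-eviction configuration has one fewer voter, and a
  // majority of that smaller configuration must be healthy to commit the
  // change.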

  const bool should_evict = should_evict_non_voter || should_evict_voter;
  // When voters and non-voters exhibit the same type of failure, we evict
  // non-voters first; however, if there is an irreversibly failed voter and no
  // irreversibly failed non-voters, we evict that voter first. That's because
  // a transiently failed non-voter might come back in good shape a few moments
  // later. Also, getting rid of an irreversibly failed voter may be beneficial
  // in even-number-of-voters configurations: the remaining replicas have a
  // better chance of forming a majority if another replica fails.
  //
  // So, the eviction priority order is:
  //   (1) unrecoverable non_voters
  //   (2) unrecoverable voters
  //   (3) evictable non_voters
  //   (4) evictable voters
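  //
  // For example (hypothetical config): if non-voter E and voter C have both
  // failed unrecoverably and both should_evict_* flags are set, the first
  // branch below wins and E is evicted by this invocation; C remains the top
  // candidate for a subsequent evaluation.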
  string to_evict;
  if (should_evict_non_voter && has_non_voter_failed_unrecoverable) {
    CHECK(!pq_non_voters.empty());
    to_evict = pq_non_voters.top().first;
  } else if (should_evict_voter && has_voter_failed_unrecoverable) {
    CHECK(!pq_voters.empty());
    to_evict = pq_voters.top().first;
  } else if (should_evict_non_voter) {
    CHECK(!pq_non_voters.empty());
    to_evict = pq_non_voters.top().first;
  } else if (should_evict_voter) {
    CHECK(!pq_voters.empty());
    to_evict = pq_voters.top().first;
  }

  DCHECK((!should_evict && to_evict.empty()) ||
         (should_evict && !to_evict.empty()));
  if (should_evict) {
    DCHECK(!to_evict.empty());
    DCHECK_NE(leader_uuid, to_evict);
    if (uuid_to_evict) {
      *uuid_to_evict = to_evict;
    }
  }
  VLOG(2) << "decision: should"
          << (should_evict ? "" : "not") << " evict replica "
          << (should_evict ? to_evict : "");

  return should_evict;
}