Status RaftConsensus::RequestVote()

in src/kudu/consensus/raft_consensus.cc [1992:2151]


Status RaftConsensus::RequestVote(const VoteRequestPB* request,
                                  TabletVotingState tablet_voting_state,
                                  VoteResponsePB* response) {
  TRACE_EVENT2("consensus", "RaftConsensus::RequestVote",
               "peer", peer_uuid(),
               "tablet", options_.tablet_id);
  response->set_responder_uuid(peer_uuid());

  // We must acquire the update lock to ensure that this vote action does not
  // interleave with the processing of an update request.
  // Lock ordering: update_lock_ must be acquired before lock_.
  std::unique_lock<simple_spinlock> update_guard(update_lock_, std::defer_lock);
  if (FLAGS_enable_leader_failure_detection && !request->ignore_live_leader()) {
    update_guard.try_lock();
  } else {
    // If failure detection is not enabled, then we can't just reject the vote,
    // because there will be no automatic retry later. So, block for the lock.
    update_guard.lock();
  }
  if (!update_guard.owns_lock()) {
    // Another vote or update is already in progress. That request is likely
    // to reset the failure timer, so we would end up voting "NO" after
    // waiting anyway. To avoid starving RPC handlers and causing cascading
    // timeouts, vote a quick NO immediately.
    //
    // We still need to take the state lock in order to respond with term info, etc.
    ThreadRestrictions::AssertWaitAllowed();
    LockGuard l(lock_);
    return RequestVoteRespondIsBusy(request, response);
  }

  // Acquire the replica state lock so we can read / modify the consensus state.
  ThreadRestrictions::AssertWaitAllowed();
  LockGuard l(lock_);

  // Ensure our lifecycle state is compatible with voting.
  // If RaftConsensus is running, we use the latest OpId from the WAL to vote.
  // Otherwise, we must be voting while tombstoned.
  OpId local_last_logged_opid;
  switch (state_) {
    case kShutdown:
      return Status::IllegalState("cannot vote while shut down");
    case kRunning:
      // Note: it is (theoretically) possible for 'tombstone_last_logged_opid'
      // to be passed in even though, by the time we reach here, the state is
      // kRunning.
      // That may occur when a vote request comes in at the end of a tablet
      // copy and then tablet bootstrap completes quickly. In that case, we
      // ignore the passed-in value and use the latest OpId from our queue.
      local_last_logged_opid = queue_->GetLastOpIdInLog();
      break;
    default:
      if (!tablet_voting_state.tombstone_last_logged_opid_) {
        return Status::IllegalState("must be running to vote when last-logged opid is not known");
      }
      if (!FLAGS_raft_enable_tombstoned_voting) {
        return Status::IllegalState("must be running to vote when tombstoned voting is disabled");
      }
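      // Voting while tombstoned, based on the last-logged OpId preserved when
      // the tablet's data was deleted, lets this replica still contribute to
      // a majority and preserves availability while it holds no data.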
      local_last_logged_opid = *(tablet_voting_state.tombstone_last_logged_opid_);
#ifdef FB_DO_NOT_REMOVE
      if (tablet_voting_state.data_state_ == tablet::TABLET_DATA_COPYING) {
        LOG_WITH_PREFIX_UNLOCKED(INFO) << "voting while copying based on last-logged opid "
                                       << local_last_logged_opid;
      } else if (tablet_voting_state.data_state_ == tablet::TABLET_DATA_TOMBSTONED) {
        LOG_WITH_PREFIX_UNLOCKED(INFO) << "voting while tombstoned based on last-logged opid "
                                       << local_last_logged_opid;
      }
#endif
      break;
  }
  DCHECK(local_last_logged_opid.IsInitialized());

  // If the candidate is not in the active configuration, allow the vote (this
  // is required by Raft) but log an informational message anyway.
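  // (A candidate may legitimately be absent from our active config while a
  // membership change is in flight, so absence alone is never grounds to
  // reject the vote.)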
  std::string hostname_port("[NOT-IN-CONFIG]");
  response->mutable_voter_context()->set_is_candidate_removed(false);

  if (!cmeta_->IsMemberInConfigWithDetail(
          request->candidate_uuid(), ACTIVE_CONFIG, &hostname_port)) {
    LOG_WITH_PREFIX_UNLOCKED(INFO) << "Handling vote request from an unknown peer "
                                   << request->candidate_uuid();
    if (cmeta_->IsPeerRemoved(request->candidate_uuid())) {
      response->mutable_voter_context()->set_is_candidate_removed(true);
    }
  }

  // If we've heard recently from the leader, then we should ignore the request.
  // It might be from a "disruptive" server. This could happen in a few cases:
  //
  // 1) Network partitions
  // If the leader can talk to a majority of the nodes, but is partitioned from a
  // bad node, the bad node's failure detector will trigger. If the bad node is
  // able to reach other nodes in the cluster, it will continuously trigger elections.
  //
  // 2) An abandoned node
  // It's possible that a node has fallen behind the log GC mark of the leader. In that
  // case, the leader will stop sending it requests. Eventually, the configuration
  // will change to eject the abandoned node, but until that point, we don't want the
  // abandoned follower to disturb the other nodes.
  //
  // See also https://ramcloud.stanford.edu/~ongaro/thesis.pdf
  // section 4.2.3.

  if (withhold_votes_) {
    LOG_WITH_PREFIX_UNLOCKED(INFO) << "Rejecting vote request from peer "
                                   << request->candidate_uuid()
                                   << " " << hostname_port
                                   << " for testing.";
    return RequestVoteRespondVoteWitheld(request, hostname_port, response);
  }

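  // 'withhold_votes_until_' is pushed forward each time we hear from a valid
  // leader, implementing the "disruptive server" mitigation described above.
  // Within that window we assume the leader is alive and withhold our vote,
  // unless the candidate explicitly asked us to ignore the live leader (as a
  // manually requested election does).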
  if (!request->ignore_live_leader() && MonoTime::Now() < withhold_votes_until_) {
    return RequestVoteRespondLeaderIsAlive(request, hostname_port, response);
  }

  // Candidate is running behind.
  if (request->candidate_term() < CurrentTermUnlocked()) {
    return RequestVoteRespondInvalidTerm(request, hostname_port, response);
  }

  // We already voted this term.
  if (request->candidate_term() == CurrentTermUnlocked() &&
      HasVotedCurrentTermUnlocked()) {

    // Already voted for the same candidate in the current term.
    if (GetVotedForCurrentTermUnlocked() == request->candidate_uuid()) {
      return RequestVoteRespondVoteAlreadyGranted(request, hostname_port, response);
    }

    // Voted for someone else in current term.
    return RequestVoteRespondAlreadyVotedForOther(request, hostname_port, response);
  }

  // Candidate must have last-logged OpId at least as large as our own to get
  // our vote.
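  // OpIds are ordered by term first and index second, which is exactly the
  // "up-to-date" comparison from section 5.4.1 of the Raft paper.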
  bool vote_yes = !OpIdLessThan(request->candidate_status().last_received(),
                                local_last_logged_opid);

  // Record the term advancement if necessary. We don't do so in the case of
  // pre-elections because it's possible that the node who called the pre-election
  // has actually now successfully become leader of the prior term, in which case
  // bumping our term here would disrupt it.
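  // (A granted pre-election vote is advisory only and is never persisted, so
  // a pre-election cannot change our durable state.)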
  if (!request->is_pre_election() &&
      request->candidate_term() > CurrentTermUnlocked()) {
    // If we are going to vote for this peer, then we will flush the consensus metadata
    // to disk below when we record the vote, and we can skip flushing the term advancement
    // to disk here.
    auto flush = vote_yes ? SKIP_FLUSH_TO_DISK : FLUSH_TO_DISK;
    RETURN_NOT_OK_PREPEND(HandleTermAdvanceUnlocked(request->candidate_term(), flush),
        Substitute("Could not step down in RequestVote. Current term: $0, candidate term: $1",
                   CurrentTermUnlocked(), request->candidate_term()));
  }

  if (!vote_yes) {
    return RequestVoteRespondLastOpIdTooOld(
        local_last_logged_opid, request, hostname_port, response);
  }

  // Passed all our checks. Vote granted.
  return RequestVoteRespondVoteGranted(request, hostname_port, response);
}
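
The log "up-to-date" rule that OpIdLessThan() applies above orders OpIds by
term first and index second (section 5.4.1 of the Raft paper). The following
self-contained sketch illustrates that comparison and the grant/deny decision
it drives. The OpId struct and helpers here are simplified stand-ins for
illustration, not Kudu's actual types.

#include <cstdint>
#include <iostream>

// Simplified stand-in for consensus::OpId: a (term, index) pair.
struct OpId {
  int64_t term;
  int64_t index;
};

// Order OpIds by term first, then by index, mirroring the semantics of the
// OpIdLessThan() call in RequestVote() above.
bool OpIdLessThan(const OpId& a, const OpId& b) {
  if (a.term != b.term) return a.term < b.term;
  return a.index < b.index;
}

// Grant the vote only if the candidate's last-logged OpId is at least as
// up-to-date as our own.
bool ShouldGrantVote(const OpId& candidate_last, const OpId& local_last) {
  return !OpIdLessThan(candidate_last, local_last);
}

int main() {
  const OpId local = {5, 100};  // our last-logged OpId: term 5, index 100
  std::cout << ShouldGrantVote({5, 99}, local)   // 0: same term, shorter log
            << ShouldGrantVote({5, 100}, local)  // 1: identical last OpId
            << ShouldGrantVote({4, 500}, local)  // 0: older term loses despite the longer log
            << ShouldGrantVote({6, 1}, local)    // 1: newer term wins despite the shorter log
            << std::endl;                        // prints "0101"
  return 0;
}

A candidate whose last-logged term is older can never win this replica's vote
regardless of its log length; this restriction is what guarantees that an
elected leader's log contains every committed entry.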