in src/kudu/consensus/raft_consensus.cc [1992:2151]
// Handles a vote request from a candidate peer and fills in 'response'.
//
// The checks below run in a fixed order; the first one that applies decides
// the outcome:
//   1. update_lock_ could not be taken without blocking
//        -> RequestVoteRespondIsBusy.
//   2. state_ is kShutdown, or state_ is not kRunning and either no tombstone
//      last-logged OpId was supplied or FLAGS_raft_enable_tombstoned_voting is
//      off -> Status::IllegalState.
//   3. withhold_votes_ is set -> RequestVoteRespondVoteWitheld.
//   4. The request does not set ignore_live_leader and the current time is
//      still before withhold_votes_until_ -> RequestVoteRespondLeaderIsAlive.
//   5. The candidate's term is lower than ours -> RequestVoteRespondInvalidTerm.
//   6. Same term and we have already voted -> RequestVoteRespondVoteAlreadyGranted
//      (same candidate) or RequestVoteRespondAlreadyVotedForOther.
//   7. The candidate's last-received OpId is less than our last-logged OpId
//        -> RequestVoteRespondLastOpIdTooOld.
//   8. Otherwise -> RequestVoteRespondVoteGranted.
// For a non-pre-election request carrying a higher term, our term is advanced
// via HandleTermAdvanceUnlocked() before step 7/8 responds.
//
// Parameters:
//   request              - the candidate's request; this function reads
//                          ignore_live_leader, candidate_uuid, candidate_term,
//                          candidate_status().last_received and is_pre_election.
//   tablet_voting_state  - consulted only when state_ is neither kShutdown nor
//                          kRunning; supplies tombstone_last_logged_opid_ (and
//                          data_state_ for logging under FB_DO_NOT_REMOVE).
//   response             - output; responder_uuid and
//                          voter_context.is_candidate_removed are set here, the
//                          rest is left to the RequestVoteRespond* helpers.
//
// Returns Status::IllegalState for the lifecycle failures in step 2, a failure
// from HandleTermAdvanceUnlocked() propagated through RETURN_NOT_OK_PREPEND,
// and otherwise whatever the selected RequestVoteRespond* helper returns.
Status RaftConsensus::RequestVote(const VoteRequestPB* request,
TabletVotingState tablet_voting_state,
VoteResponsePB* response) {
TRACE_EVENT2("consensus", "RaftConsensus::RequestVote",
"peer", peer_uuid(),
"tablet", options_.tablet_id);
response->set_responder_uuid(peer_uuid());
// We must acquire the update lock in order to ensure that this vote action
// takes place between requests.
// Lock ordering: update_lock_ must be acquired before lock_.
std::unique_lock<simple_spinlock> update_guard(update_lock_, std::defer_lock);
if (FLAGS_enable_leader_failure_detection && !request->ignore_live_leader()) {
// Non-blocking attempt; whether it succeeded is checked through
// owns_lock() below.
update_guard.try_lock();
} else {
// If failure detection is not enabled, then we can't just reject the vote,
// because there will be no automatic retry later. So, block for the lock.
// This blocking path is also taken when the request sets ignore_live_leader.
update_guard.lock();
}
if (!update_guard.owns_lock()) {
// There is another vote or update concurrent with the vote. In that case, that
// other request is likely to reset the timer, and we'll end up just voting
// "NO" after waiting. To avoid starving RPC handlers and causing cascading
// timeouts, just vote a quick NO.
//
// We still need to take the state lock in order to respond with term info, etc.
ThreadRestrictions::AssertWaitAllowed();
LockGuard l(lock_);
return RequestVoteRespondIsBusy(request, response);
}
// Acquire the replica state lock so we can read / modify the consensus state.
// From here on both update_lock_ and lock_ are held until the function returns.
ThreadRestrictions::AssertWaitAllowed();
LockGuard l(lock_);
// Ensure our lifecycle state is compatible with voting.
// If RaftConsensus is running, we use the latest OpId from the WAL to vote.
// Otherwise, we must be voting while tombstoned.
OpId local_last_logged_opid;
switch (state_) {
case kShutdown:
return Status::IllegalState("cannot vote while shut down");
case kRunning:
// Note: it is (theoretically) possible for 'tombstone_last_logged_opid'
// to be passed in and by the time we reach here the state is kRunning.
// That may occur when a vote request comes in at the end of a tablet
// copy and then tablet bootstrap completes quickly. In that case, we
// ignore the passed-in value and use the latest OpId from our queue.
local_last_logged_opid = queue_->GetLastOpIdInLog();
break;
default:
// Every other lifecycle state: a vote is only possible when the caller
// supplied a last-logged OpId and tombstoned voting is enabled by flag.
if (!tablet_voting_state.tombstone_last_logged_opid_) {
return Status::IllegalState("must be running to vote when last-logged opid is not known");
}
if (!FLAGS_raft_enable_tombstoned_voting) {
return Status::IllegalState("must be running to vote when tombstoned voting is disabled");
}
local_last_logged_opid = *(tablet_voting_state.tombstone_last_logged_opid_);
// The logging below is compiled in only when FB_DO_NOT_REMOVE is defined.
#ifdef FB_DO_NOT_REMOVE
if (tablet_voting_state.data_state_ == tablet::TABLET_DATA_COPYING) {
LOG_WITH_PREFIX_UNLOCKED(INFO) << "voting while copying based on last-logged opid "
<< local_last_logged_opid;
} else if (tablet_voting_state.data_state_ == tablet::TABLET_DATA_TOMBSTONED) {
LOG_WITH_PREFIX_UNLOCKED(INFO) << "voting while tombstoned based on last-logged opid "
<< local_last_logged_opid;
}
#endif
break;
}
// Sanity-check that the branch taken above produced an initialized OpId.
DCHECK(local_last_logged_opid.IsInitialized());
// If the node is not in the configuration, allow the vote (this is required by Raft)
// but log an informational message anyway.
// 'hostname_port' keeps this placeholder unless the membership lookup below
// fills it in (presumably with the candidate's address on success -- confirm
// against IsMemberInConfigWithDetail).
std::string hostname_port("[NOT-IN-CONFIG]");
// Default to "not removed"; flipped to true below only when the candidate is
// missing from the active config and cmeta_ reports it as a removed peer.
response->mutable_voter_context()->set_is_candidate_removed(false);
if (!cmeta_->IsMemberInConfigWithDetail(request->candidate_uuid(), ACTIVE_CONFIG, &hostname_port)) {
LOG_WITH_PREFIX_UNLOCKED(INFO) << "Handling vote request from an unknown peer "
<< request->candidate_uuid();
if (cmeta_->IsPeerRemoved(request->candidate_uuid())) {
response->mutable_voter_context()->set_is_candidate_removed(true);
}
}
// Testing hook (per the log message below): while withhold_votes_ is set,
// every vote request that reaches this point is rejected, regardless of
// term or log position.
if (withhold_votes_) {
LOG_WITH_PREFIX_UNLOCKED(INFO) << "Rejecting vote request from peer "
<< request->candidate_uuid()
<< " " << hostname_port
<< " for testing.";
return RequestVoteRespondVoteWitheld(request, hostname_port, response);
}
// If we've heard recently from the leader, then we should ignore the request.
// It might be from a "disruptive" server. This could happen in a few cases:
//
// 1) Network partitions
// If the leader can talk to a majority of the nodes, but is partitioned from a
// bad node, the bad node's failure detector will trigger. If the bad node is
// able to reach other nodes in the cluster, it will continuously trigger elections.
//
// 2) An abandoned node
// It's possible that a node has fallen behind the log GC mark of the leader. In that
// case, the leader will stop sending it requests. Eventually, the configuration
// will change to eject the abandoned node, but until that point, we don't want the
// abandoned follower to disturb the other nodes.
//
// See also https://ramcloud.stanford.edu/~ongaro/thesis.pdf
// section 4.2.3.
//
// A request that sets ignore_live_leader bypasses this check.
if (!request->ignore_live_leader() && MonoTime::Now() < withhold_votes_until_) {
return RequestVoteRespondLeaderIsAlive(request, hostname_port, response);
}
// Candidate is running behind.
if (request->candidate_term() < CurrentTermUnlocked()) {
return RequestVoteRespondInvalidTerm(request, hostname_port, response);
}
// We already voted this term.
if (request->candidate_term() == CurrentTermUnlocked() &&
HasVotedCurrentTermUnlocked()) {
// Already voted for the same candidate in the current term.
if (GetVotedForCurrentTermUnlocked() == request->candidate_uuid()) {
return RequestVoteRespondVoteAlreadyGranted(request, hostname_port, response);
}
// Voted for someone else in current term.
return RequestVoteRespondAlreadyVotedForOther(request, hostname_port, response);
}
// Candidate must have last-logged OpId at least as large as our own to get
// our vote.
// The decision is computed here, before the term advance below, because the
// flush mode chosen there depends on it.
bool vote_yes = !OpIdLessThan(request->candidate_status().last_received(),
local_last_logged_opid);
// Record the term advancement if necessary. We don't do so in the case of
// pre-elections because it's possible that the node who called the pre-election
// has actually now successfully become leader of the prior term, in which case
// bumping our term here would disrupt it.
if (!request->is_pre_election() &&
request->candidate_term() > CurrentTermUnlocked()) {
// If we are going to vote for this peer, then we will flush the consensus metadata
// to disk below when we record the vote, and we can skip flushing the term advancement
// to disk here.
auto flush = vote_yes ? SKIP_FLUSH_TO_DISK : FLUSH_TO_DISK;
RETURN_NOT_OK_PREPEND(HandleTermAdvanceUnlocked(request->candidate_term(), flush),
Substitute("Could not step down in RequestVote. Current term: $0, candidate term: $1",
CurrentTermUnlocked(), request->candidate_term()));
}
// The candidate's log is behind ours: deny the vote. Note that any term
// advance above has already been applied by this point.
if (!vote_yes) {
return RequestVoteRespondLastOpIdTooOld(local_last_logged_opid, request, hostname_port, response);
}
// Passed all our checks. Vote granted.
return RequestVoteRespondVoteGranted(request, hostname_port, response);
}