in src/kudu/consensus/raft_consensus.cc [586:743]
// Kicks off a leader election (or pre-election) of the requested 'mode'.
//
// All validation and setup is performed while holding 'lock_'; the election
// itself is only run after the lock has been released.
//
// Returns OK without starting anything when 'persistent_vars_' reports that
// starting elections is not allowed, or when this peer's active role is
// already LEADER. Returns IllegalState when the active role is not a voter
// role, or, with FLAGS_enable_flexi_raft set, when the active config has no
// voter distribution entry for this peer's region. A non-OK status from
// CheckRunningUnlocked(), the term advance, persisting the self-vote, or
// registering the self-vote with the counter is returned unchanged.
//
// 'context' is taken by value: its leader/source/chained fields are filled in
// here, and it is then moved into the callback bound to the election.
Status RaftConsensus::StartElection(ElectionMode mode, ElectionContext context) {
  const char* const mode_name = ModeString(mode);
  const bool is_pre_election = (mode == PRE_ELECTION);
  TRACE_EVENT2("consensus", "RaftConsensus::StartElection",
               "peer", LogPrefixThreadSafe(),
               "mode", mode_name);

  scoped_refptr<LeaderElection> leader_election;
  {
    ThreadRestrictions::AssertWaitAllowed();
    LockGuard guard(lock_);
    RETURN_NOT_OK(CheckRunningUnlocked());

    // Elections may be switched off; warn (rate-limited) and do nothing.
    if (!persistent_vars_->is_start_election_allowed()) {
      KLOG_EVERY_N_SECS(WARNING, 300) << LogPrefixUnlocked() <<
          Substitute("allow_start_election is set to false, not starting $0 [EVERY 300 seconds]", mode_name);
      return Status::OK();
    }

    // Record which leader we are electing away from. A context without an
    // origin adopts that leader as its origin; a context whose origin is a
    // different peer must be part of a chain of elections.
    context.current_leader_uuid_ = GetLeaderUuidUnlocked();
    if (!context.source_uuid_.empty()) {
      if (context.source_uuid_ != context.current_leader_uuid_) {
        context.is_chained_election_ = true;
      }
    } else {
      context.source_uuid_ = context.current_leader_uuid_;
    }

    const RaftPeerPB::Role role = cmeta_->active_role();
    if (role == RaftPeerPB::LEADER) {
      LOG_WITH_PREFIX_UNLOCKED(INFO) << Substitute(
          "Not starting $0 -- already a leader", mode_name);
      return Status::OK();
    }
    if (PREDICT_FALSE(!consensus::IsVoterRole(role))) {
      // Non-voters must not start leader elections. The leader failure
      // detector should be re-enabled once this replica is promoted to a
      // voter.
      return Status::IllegalState("only voting members can start elections",
                                  SecureShortDebugString(cmeta_->ActiveConfig()));
    }

    // With flexi-raft enabled, only regions that appear in the active
    // config's voter distribution are allowed to start elections.
    if (FLAGS_enable_flexi_raft) {
      const auto& voter_distribution = cmeta_->ActiveConfig().voter_distribution();
      if (PREDICT_FALSE(voter_distribution.find(peer_region()) ==
                        voter_distribution.end())) {
        return Status::IllegalState(strings::Substitute(
            "in flexi-raft only regions with valid voter distribution can start election: $0",
            peer_region()));
      }
    }

    LOG_WITH_PREFIX_UNLOCKED(INFO)
        << "Starting " << mode_name
        << " (" << ReasonString(context.reason_, GetLeaderUuidUnlocked()) << ")";

    // Push the election timer out as far as we reasonably can. The timer is
    // deliberately left enabled during the election so that a timed-out
    // election is retried.
    MonoDelta election_timeout = LeaderElectionExpBackoffDeltaUnlocked();
    SnoozeFailureDetector(string("starting election"), election_timeout);

    // A real election bumps the term and votes for ourselves; a pre-election
    // leaves both untouched.
    if (!is_pre_election) {
      // TODO(mpercy): Consider using a separate Mutex for voting, which must sync to disk.
      // The term advance skips its own flush because recording the vote right
      // after it flushes to disk as well, and a second fsync buys nothing.
      RETURN_NOT_OK(HandleTermAdvanceUnlocked(CurrentTermUnlocked() + 1,
                                              SKIP_FLUSH_TO_DISK));
      RETURN_NOT_OK(SetVotedForCurrentTermUnlocked(peer_uuid()));
    }

    RaftConfigPB config_snapshot = cmeta_->ActiveConfig();
    LOG_WITH_PREFIX_UNLOCKED(INFO) << "Starting " << mode_name << " with config: "
                                   << SecureShortDebugString(config_snapshot);

    // A pre-election has not bumped our term, so it solicits votes for the
    // term that would come next.
    const int64_t candidate_term = CurrentTermUnlocked() + (is_pre_election ? 1 : 0);

    // Build the vote counter along with our own granted vote.
    gscoped_ptr<VoteCounter> vote_counter;
    VoteInfo self_vote;
    self_vote.vote = VOTE_GRANTED;
    if (FLAGS_enable_flexi_raft) {
      vote_counter.reset(new FlexibleVoteCounter(
          peer_uuid(),
          candidate_term,
          cmeta_->last_known_leader(),
          config_snapshot,
          adjust_voter_distribution_));
      // Attach our own previous vote history. Not strictly needed, but it
      // keeps the code simpler.
      const std::map<int64_t, PreviousVotePB>& vote_history =
          cmeta_->previous_vote_history();
      for (const auto& term_and_vote : vote_history) {
        self_vote.previous_vote_history.push_back(term_and_vote.second);
      }
    } else {
      const int num_voters = CountVoters(config_snapshot);
      vote_counter.reset(new VoteCounter(num_voters, MajoritySize(num_voters)));
    }

    // Cast our own vote.
    bool is_duplicate;
    RETURN_NOT_OK(vote_counter->RegisterVote(peer_uuid(), self_vote, &is_duplicate));
    LOG_WITH_PREFIX_UNLOCKED(INFO) << "Self-Voted " << mode_name;
    CHECK(!is_duplicate) << LogPrefixUnlocked()
                         << "Inexplicable duplicate self-vote for term "
                         << CurrentTermUnlocked();

    // Template request from which the per-peer VoteRequestPBs are created.
    // NB: dest_uuid is intentionally left unpopulated here.
    VoteRequestPB vote_request;
    vote_request.set_tablet_id(options_.tablet_id);
    vote_request.set_candidate_uuid(peer_uuid());
    vote_request.set_candidate_term(candidate_term);
    vote_request.set_ignore_live_leader(mode == ELECT_EVEN_IF_LEADER_IS_ALIVE);
    if (is_pre_election) {
      vote_request.set_is_pre_election(true);
    }
    *vote_request.mutable_candidate_context()->mutable_candidate_peer_pb() =
        local_peer_pb_;
    *vote_request.mutable_candidate_status()->mutable_last_received() =
        queue_->GetLastOpIdInLog();

    // The LeaderElection works on the config snapshot handed to it here;
    // config changes made while the election is in flight are not reacted
    // to, which keeps LeaderElection simpler to reason about.
    leader_election.reset(new LeaderElection(
        std::move(config_snapshot),
        // The RaftConsensus ref bound into the callback below keeps this raw
        // pointer safe to use for the whole lifetime of the LeaderElection.
        peer_proxy_factory_.get(),
        std::move(vote_request), std::move(vote_counter), election_timeout,
        std::bind(&RaftConsensus::ElectionCallback,
                  shared_from_this(),
                  std::move(context),
                  std::placeholders::_1)));
  }

  // Run the election only once the lock has been dropped.
  leader_election->Run();
  return Status::OK();
}