in bistro/remote/RemoteWorkerState.h [89:137]
std::pair<State, bool> computeState(
int64_t cur_time,
int32_t max_healthcheck_gap,
int32_t max_heartbeat_gap,
int32_t lose_unhealthy_worker_after,
// Not part of the state since it MUST be ephemeral -- we only want the
// "consensus allows a worker to become healthy" flag to be used if it
// makes the worker healthy *immediately*.
bool allowed_to_become_healthy
) const {
bool disallowed = false;
if (state_ == State::MUST_DIE) { // Can never leave this state
return std::make_pair(State::MUST_DIE, disallowed);
}
State new_state = State::HEALTHY;
// The ways to leave the NEW state are: (i) go to MUST_DIE after
// lose_unhealthy_worker_after seconds, or (ii) via
// RemoteWorker::initializeRunningTasks or BistroWorkerHandler::heartbeat
if (state_ == State::NEW) {
new_state = State::NEW;
} else if (
(cur_time > timeLastGoodHealthcheckSent_ + max_healthcheck_gap)
|| (cur_time > timeLastHeartbeatReceived_ + max_heartbeat_gap)
) {
new_state = State::UNHEALTHY;
} else if (!allowed_to_become_healthy && !hasBeenHealthy_) {
new_state = State::UNHEALTHY;
disallowed = true;
}
if (
// This is ONLY true when the worker is otherwise healthy, but is
// blocked by consensus. Don't lose such workers, since that behavior
// is actively harmful when we are having trouble achieving consensus
// due to high worker turnover (see README.worker_set_consensus).
!disallowed &&
lose_unhealthy_worker_after > 0 &&
// Without this check, we'd use a stale timeBecameUnhealthy_ when
// changing from HEALTHY to UNHEALTHY. Using != matches NEW.
new_state != State::HEALTHY && state_ != State::HEALTHY &&
// For NEW workers, the timeout begins at initialization time.
cur_time > timeBecameUnhealthy_ + lose_unhealthy_worker_after
// Don't need to add FLAGS_worker_check_interval because a worker
// always takes at least that long to go from UNHEALTHY to MUST_DIE.
) {
return std::make_pair(State::MUST_DIE, disallowed);
}
return std::make_pair(new_state, disallowed);
}