in src/kudu/transactions/coordinator_rpc.cc [129:244]
// Classifies the outcome of one attempt of this RPC into a RetriableRpcStatus.
//
// 'rpc_cb_status' is the status passed to the RPC callback. When it is OK,
// the RPC controller's own status is examined instead; otherwise it is used
// as-is. The returned object carries both the status that was finally settled
// on ('status') and the classification derived from it ('result').
//
// The checks run in a fixed order and the first one that matches wins:
//   1. application-level error embedded in the response's op result,
//   2. RPC-level errors (remote error codes, service unavailable,
//      invalid authentication token, network error),
//   3. the error code of the response's error field,
//   4. aborted / illegal-state statuses,
//   5. connection negotiation failure,
//   6. fallback: OK if the status is OK, non-retriable error otherwise.
RetriableRpcStatus CoordinatorRpc::AnalyzeResponse(const Status& rpc_cb_status) {
  RetriableRpcStatus result;

  // Pick the status to start from: a non-OK callback status is taken
  // verbatim, while an OK one defers to what the RPC controller reports.
  if (rpc_cb_status.ok()) {
    result.status = retrier().controller().status();
  } else {
    result.status = rpc_cb_status;
  }

  // If everything looks fine so far, surface the application-level error
  // (AppStatusPB) carried in the op result, if any, as a Status so that the
  // checks below can act on it.
  if (result.status.ok() && resp_.has_op_result()) {
    const auto& op_result = resp_.op_result();
    if (op_result.has_op_error()) {
      result.status = StatusFromPB(op_result.op_error());
    }
  }

  // Remote errors: only the "too busy" and "unavailable" codes get special
  // treatment here; any other code falls through to the checks below.
  if (result.status.IsRemoteError()) {
    const ErrorStatusPB* rpc_err =
        mutable_retrier()->controller().error_response();
    if (rpc_err != nullptr && rpc_err->has_code()) {
      const auto rpc_code = rpc_err->code();
      if (rpc_code == ErrorStatusPB::ERROR_SERVER_TOO_BUSY ||
          rpc_code == ErrorStatusPB::ERROR_UNAVAILABLE) {
        result.result = RetriableRpcStatus::SERVICE_UNAVAILABLE;
        return result;
      }
    }
  }

  // TODO(awong): it might be easier to understand if the resulting expected
  // action were encoded in these status enums, e.g. RETRY_SAME_SERVER.
  if (result.status.IsServiceUnavailable()) {
    result.result = RetriableRpcStatus::SERVICE_UNAVAILABLE;
    return result;
  }

  // A not-authorized status accompanied by the "invalid authentication token"
  // error code means a new authentication token is needed.
  if (result.status.IsNotAuthorized()) {
    const ErrorStatusPB* rpc_err =
        mutable_retrier()->controller().error_response();
    if (rpc_err != nullptr &&
        rpc_err->has_code() &&
        rpc_err->code() == ErrorStatusPB::FATAL_INVALID_AUTHENTICATION_TOKEN) {
      result.result = RetriableRpcStatus::INVALID_AUTHENTICATION_TOKEN;
      return result;
    }
  }

  // The server could not be reached (e.g. it was down): report it as not
  // accessible so that a different replica can be tried.
  if (result.status.IsNetworkError()) {
    result.result = RetriableRpcStatus::SERVER_NOT_ACCESSIBLE;
    return result;
  }

  // RPC controller errors have been dealt with at this point. Now look at the
  // error reported in the response itself.
  if (resp_.has_error()) {
    // If nothing has gone wrong so far, the response error becomes the result
    // status from here on out.
    if (result.status.ok()) {
      result.status = StatusFromPB(resp_.error().status());
      DCHECK(!result.status.ok());
    }

    const auto ts_code = resp_.error().code();
    // TABLET_NOT_FOUND: the replica we thought was leader has been deleted.
    if (ts_code == TabletServerErrorPB::TABLET_NOT_FOUND ||
        ts_code == TabletServerErrorPB::TABLET_FAILED) {
      result.result = RetriableRpcStatus::RESOURCE_NOT_FOUND;
      return result;
    }
    if (ts_code == TabletServerErrorPB::TABLET_NOT_RUNNING ||
        ts_code == TabletServerErrorPB::THROTTLED) {
      result.result = RetriableRpcStatus::SERVICE_UNAVAILABLE;
      return result;
    }
    if (ts_code == TabletServerErrorPB::NOT_THE_LEADER) {
      result.result = RetriableRpcStatus::REPLICA_NOT_LEADER;
      return result;
    }
    if (ts_code == TabletServerErrorPB::TXN_ILLEGAL_STATE) {
      result.result = RetriableRpcStatus::NON_RETRIABLE_ERROR;
      return result;
    }
    // UNKNOWN_ERROR and every other code are handled by the checks below.
  }

  // Covers "Op aborted by new leader" Raft replication errors as well as
  // not-a-Raft-leader errors.
  if (result.status.IsAborted() || result.status.IsIllegalState()) {
    result.result = RetriableRpcStatus::REPLICA_NOT_LEADER;
    return result;
  }

  // Connection negotiation failure while the callback status is not a
  // timeout: mark the server as not accessible and leave it to the
  // RetriableRpc logic to switch to an alternative replica.
  //
  // NOTE: security-related negotiation errors are handled above: see the
  // IsNotAuthorized() and IsRemoteError() branches.
  if (!(rpc_cb_status.IsTimedOut() || result.status.ok()) &&
      mutable_retrier()->controller().negotiation_failed()) {
    result.result = RetriableRpcStatus::SERVER_NOT_ACCESSIBLE;
    return result;
  }

  // Nothing above matched: the attempt either succeeded or failed in a way
  // that is not retriable.
  result.result = result.status.ok() ? RetriableRpcStatus::OK
                                     : RetriableRpcStatus::NON_RETRIABLE_ERROR;
  return result;
}