in src/kudu/client/batcher.cc [446:568]
// Classifies the outcome of a write RPC attempt.
//
// 'rpc_cb_status' is the status handed to the RPC callback. When it is not OK,
// the failure happened during tablet lookup/proxy initialization; otherwise the
// RPC controller's status is examined to see whether the write itself failed.
//
// Returns a RetriableRpcStatus whose 'status' field holds the error that was
// analyzed (a controller failure is preferred over an error reported in the
// response) and whose 'result' field holds the category derived from it.
//
// The checks below are order-sensitive: the first one that matches decides
// the outcome.
RetriableRpcStatus WriteRpc::AnalyzeResponse(const Status& rpc_cb_status) {
  // True iff the response carries an error with the given tablet server
  // error code.
  const auto resp_error_is = [this](auto code) {
    return resp_.has_error() && resp_.error().code() == code;
  };

  RetriableRpcStatus result;
  if (rpc_cb_status.ok()) {
    // Tablet lookup/proxy initialization went fine, so look at how the write
    // itself fared.
    result.status = mutable_retrier()->controller().status();
  } else {
    result.status = rpc_cb_status;
  }

  // Remote errors: a few specific RPC error codes have dedicated handling.
  if (result.status.IsRemoteError()) {
    const ErrorStatusPB* rpc_err = mutable_retrier()->controller().error_response();
    if (rpc_err != nullptr && rpc_err->has_code()) {
      const auto rpc_code = rpc_err->code();
      if (rpc_code == ErrorStatusPB::ERROR_SERVER_TOO_BUSY ||
          rpc_code == ErrorStatusPB::ERROR_UNAVAILABLE) {
        result.result = RetriableRpcStatus::SERVICE_UNAVAILABLE;
        return result;
      }
      if (rpc_code == ErrorStatusPB::ERROR_INVALID_AUTHORIZATION_TOKEN) {
        result.result = RetriableRpcStatus::INVALID_AUTHORIZATION_TOKEN;
        return result;
      }
      // Any other RPC error code falls through to the generic checks below.
    }
  }

  // Both a ServiceUnavailable status and a TXN_LOCKED_RETRY_OP response error
  // are reported as SERVICE_UNAVAILABLE.
  if (result.status.IsServiceUnavailable() ||
      resp_error_is(tserver::TabletServerErrorPB::TXN_LOCKED_RETRY_OP)) {
    result.result = RetriableRpcStatus::SERVICE_UNAVAILABLE;
    return result;
  }

  // An invalid authn token: that is the error code the server sends back when
  // the authn token has expired.
  if (result.status.IsNotAuthorized()) {
    const ErrorStatusPB* rpc_err = mutable_retrier()->controller().error_response();
    const bool invalid_authn_token =
        rpc_err != nullptr && rpc_err->has_code() &&
        rpc_err->code() == ErrorStatusPB::FATAL_INVALID_AUTHENTICATION_TOKEN;
    if (invalid_authn_token) {
      result.result = RetriableRpcStatus::INVALID_AUTHENTICATION_TOKEN;
      return result;
    }
  }

  // Any network failure or DNS resolution problem triggers a failover to
  // another replica.
  //
  // TODO(adar): This is probably too harsh; some network failures should be
  // retried on the current replica.
  if (result.status.IsNetworkError()) {
    result.result = RetriableRpcStatus::SERVER_NOT_ACCESSIBLE;
    return result;
  }

  // The controller reported no failure: fall back to the error carried by the
  // response, if any. Controller failures are preferred over response failures.
  if (result.status.ok() && resp_.has_error()) {
    result.status = StatusFromPB(resp_.error().status());
  }

  // In a multi-client usage scenario, where one client is used for DDL ops
  // and another is used for DML ops on the same range partition, the tablet
  // entry in the metacache may become invalid, rendering it useless for
  // subsequent operations. Reporting a non-retriable error lets the client
  // know about that.
  if (result.status.IsInvalidArgument()) {
    result.result = RetriableRpcStatus::NON_RETRIABLE_ERROR;
    return result;
  }

  // TABLET_NOT_FOUND means the replica we thought was the leader has been
  // deleted.
  if (resp_error_is(tserver::TabletServerErrorPB::TABLET_NOT_FOUND)) {
    result.result = RetriableRpcStatus::RESOURCE_NOT_FOUND;
    return result;
  }

  // Transaction-related response errors that are not retried. This check has
  // to come before the broad IllegalState/Aborted handling below.
  if (resp_error_is(tserver::TabletServerErrorPB::TXN_ILLEGAL_STATE) ||
      resp_error_is(tserver::TabletServerErrorPB::TXN_LOCKED_ABORT)) {
    result.result = RetriableRpcStatus::NON_RETRIABLE_ERROR;
    return result;
  }

  // Otherwise, a status code of IllegalState or Aborted is assumed to mean
  // that the replica we attempted to write to is not the current leader
  // (maybe it got partitioned or slow and another node took over).
  //
  // TODO: This error handling block should really be rewritten to handle
  //       specific error codes exclusively instead of Status codes (this may
  //       require some server-side changes). For example, IllegalState is
  //       obviously way too broad an error category for this case.
  //
  // TODO(aserbin): this very broad transformation of Status::IllegalState()
  //                and Status::Aborted() is a hazard for write operations in
  //                the context of multi-row transactions. Responses carrying
  //                the TXN_ILLEGAL_STATE and TXN_LOCKED_ABORT codes are
  //                intercepted by the check above, so they do not reach this
  //                point.
  if (result.status.IsIllegalState() || result.status.IsAborted()) {
    result.result = RetriableRpcStatus::REPLICA_NOT_LEADER;
    return result;
  }

  // Connection negotiation failure while the overall RPC timeout has not
  // expired yet: mark the server as not accessible and rely on the
  // RetriableRpc's logic to switch to an alternative tablet replica.
  //
  // NOTE: Connection negotiation errors related to security are handled in the
  //       code above: see the handlers for IsNotAuthorized(), IsRemoteError().
  if (!rpc_cb_status.IsTimedOut() && !result.status.ok()) {
    if (mutable_retrier()->controller().negotiation_failed()) {
      result.result = RetriableRpcStatus::SERVER_NOT_ACCESSIBLE;
      return result;
    }
  }

  // Nothing special matched: either success or a non-retriable error.
  result.result = result.status.ok() ? RetriableRpcStatus::OK
                                     : RetriableRpcStatus::NON_RETRIABLE_ERROR;
  return result;
}