RetriableRpcStatus WriteRpc::AnalyzeResponse()

in src/kudu/client/batcher.cc [446:568]


// Classifies the outcome of a single write RPC attempt.
//
// 'rpc_cb_status' is the status passed to the RPC callback. If it is OK, the
// RPC controller's status is used as the starting point instead; if that is OK
// too, an error carried in the write response (if any) becomes the reported
// status.
//
// Returns a RetriableRpcStatus whose 'status' field holds the effective Status
// and whose 'result' field holds the classification. The checks below are
// evaluated strictly in order and the first one that matches decides the
// outcome.
RetriableRpcStatus WriteRpc::AnalyzeResponse(const Status& rpc_cb_status) {
  const auto& controller = mutable_retrier()->controller();

  // True iff the write response carries a tablet server error with 'code'.
  const auto resp_error_is = [this](auto code) {
    return resp_.has_error() && resp_.error().code() == code;
  };

  RetriableRpcStatus analysis;

  // A non-OK callback status (failure on tablet lookup/proxy initialization)
  // is taken as is. Otherwise, look at whether actually performing the write
  // has failed.
  if (rpc_cb_status.ok()) {
    analysis.status = controller.status();
  } else {
    analysis.status = rpc_cb_status;
  }

  // Remote errors: inspect the RPC-level error code, if one is present.
  if (analysis.status.IsRemoteError()) {
    const ErrorStatusPB* rpc_err = controller.error_response();
    if (rpc_err != nullptr && rpc_err->has_code()) {
      const auto rpc_code = rpc_err->code();
      if (rpc_code == ErrorStatusPB::ERROR_SERVER_TOO_BUSY ||
          rpc_code == ErrorStatusPB::ERROR_UNAVAILABLE) {
        analysis.result = RetriableRpcStatus::SERVICE_UNAVAILABLE;
        return analysis;
      }
      if (rpc_code == ErrorStatusPB::ERROR_INVALID_AUTHORIZATION_TOKEN) {
        analysis.result = RetriableRpcStatus::INVALID_AUTHORIZATION_TOKEN;
        return analysis;
      }
      // Any other RPC error code falls through to the generic checks below.
    }
  }

  // Both a ServiceUnavailable status and a TXN_LOCKED_RETRY_OP response error
  // are reported as SERVICE_UNAVAILABLE.
  if (analysis.status.IsServiceUnavailable() ||
      resp_error_is(tserver::TabletServerErrorPB::TXN_LOCKED_RETRY_OP)) {
    analysis.result = RetriableRpcStatus::SERVICE_UNAVAILABLE;
    return analysis;
  }

  // Detect an invalid authn token: that's the error code the server sends back
  // if the authn token is expired.
  if (analysis.status.IsNotAuthorized()) {
    const ErrorStatusPB* rpc_err = controller.error_response();
    const bool invalid_authn_token =
        rpc_err != nullptr && rpc_err->has_code() &&
        rpc_err->code() == ErrorStatusPB::FATAL_INVALID_AUTHENTICATION_TOKEN;
    if (invalid_authn_token) {
      analysis.result = RetriableRpcStatus::INVALID_AUTHENTICATION_TOKEN;
      return analysis;
    }
  }

  // Any network failure or DNS resolution problem results in a failover to
  // another replica.
  //
  // TODO(adar): This is probably too harsh; some network failures should be
  // retried on the current replica.
  if (analysis.status.IsNetworkError()) {
    analysis.result = RetriableRpcStatus::SERVER_NOT_ACCESSIBLE;
    return analysis;
  }

  // Controller failures take precedence over response failures: the error in
  // the response is surfaced only when the status is OK so far.
  if (analysis.status.ok() && resp_.has_error()) {
    analysis.status = StatusFromPB(resp_.error().status());
  }

  // In a multi-client usage scenario, where one client is used for DDL ops
  // and another is used for DML ops on the same range partition, the tablet
  // entry in the metacache may become invalid, rendering it useless for
  // subsequent operations. Report that to the caller as a non-retriable error.
  if (analysis.status.IsInvalidArgument()) {
    analysis.result = RetriableRpcStatus::NON_RETRIABLE_ERROR;
    return analysis;
  }

  // TABLET_NOT_FOUND: the replica we thought was leader has been deleted.
  if (resp_error_is(tserver::TabletServerErrorPB::TABLET_NOT_FOUND)) {
    analysis.result = RetriableRpcStatus::RESOURCE_NOT_FOUND;
    return analysis;
  }

  // These transaction-related response codes are classified as non-retriable.
  // This check must stay ahead of the generic IllegalState/Aborted handling
  // below, which would otherwise claim them.
  if (resp_error_is(tserver::TabletServerErrorPB::TXN_ILLEGAL_STATE) ||
      resp_error_is(tserver::TabletServerErrorPB::TXN_LOCKED_ABORT)) {
    analysis.result = RetriableRpcStatus::NON_RETRIABLE_ERROR;
    return analysis;
  }

  // Any remaining IllegalState or Aborted status is assumed to mean that the
  // replica we attempted to write to is not the current leader (maybe it got
  // partitioned or slow and another node took over).
  //
  // TODO: This error handling block should really be rewritten to handle
  // specific error codes exclusively instead of Status codes (this may
  // require some server-side changes). For example, IllegalState is
  // obviously way too broad an error category for this case.
  //
  // TODO(aserbin): such a broad transformation of IllegalState/Aborted is a
  //                concern for write operations in the context of multi-row
  //                transactions. The TXN_ILLEGAL_STATE and TXN_LOCKED_ABORT
  //                response codes are already filtered out by the check right
  //                above, so they do not reach this branch.
  if (analysis.status.IsIllegalState() || analysis.status.IsAborted()) {
    analysis.result = RetriableRpcStatus::REPLICA_NOT_LEADER;
    return analysis;
  }

  // Connection negotiation failure while the callback status is not a
  // timeout: mark the server as not accessible and rely on the RetriableRpc's
  // logic to switch to an alternative tablet replica.
  //
  // NOTE: Connection negotiation errors related to security are handled in the
  //       code above: see the handlers for IsNotAuthorized(), IsRemoteError().
  const bool negotiation_error = !rpc_cb_status.IsTimedOut() &&
                                 !analysis.status.ok() &&
                                 controller.negotiation_failed();
  if (negotiation_error) {
    analysis.result = RetriableRpcStatus::SERVER_NOT_ACCESSIBLE;
    return analysis;
  }

  // Nothing above matched: success if the status is OK, otherwise an error
  // that is not retried.
  analysis.result = analysis.status.ok() ? RetriableRpcStatus::OK
                                         : RetriableRpcStatus::NON_RETRIABLE_ERROR;
  return analysis;
}