in solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java [783:993]
/**
 * Sends {@code request} and re-issues it (recursively) when the failure suggests that the
 * client's cached collection state is out of date.
 *
 * <p>For requests that name at least one collection and are neither admin requests nor
 * per-collection V2 requests, a {@code STATE_VERSION} parameter of the form {@code
 * name:version|name:version} is built from the cached {@link DocCollection}s. It is only attached
 * when the request's params are {@link ModifiableSolrParams}; otherwise any way to pass it is
 * currently missing.
 *
 * <p>On success, if the last entry of the response is a {@code STATE_VERSION} map, that entry is
 * removed from the response and {@code getDocCollection} is called for every collection it lists.
 *
 * <p>On failure, no retry is attempted when {@code inputCollections} is empty, the request is an
 * admin request, or it is a V2 request whose method is not GET. Otherwise:
 *
 * <ul>
 *   <li>a communication error, or a {@code RouteException} whose root cause has code 503, marks
 *       the cached state of the requested collections as possibly stale and retries while {@code
 *       retryCount < MAX_STALE_RETRIES};
 *   <li>an {@code INVALID_STATE} or 404 error evicts the requested collections from the cache and
 *       retries while {@code retryCount < MAX_STALE_RETRIES};
 *   <li>anything else is rethrown ({@code SolrException}, {@code SolrServerException} and {@code
 *       IOException} as-is, everything else wrapped in a {@code SolrServerException} around the
 *       root cause).
 * </ul>
 *
 * @param request the request to send
 * @param retryCount number of retries already performed for this request; each recursive retry
 *     passes {@code retryCount + 1}
 * @param inputCollections collection names or aliases the request targets (aliases are resolved
 *     through {@code resolveAliases}); may be empty but must not be null
 * @return the server response, without the trailing {@code STATE_VERSION} entry if one was present
 * @throws SolrServerException if the request fails and is not (or no longer) retried
 * @throws IOException if the request fails with an I/O error and is not (or no longer) retried
 * @throws SolrException with {@code BAD_REQUEST} if a requested collection cannot be found
 */
protected NamedList<Object> requestWithRetryOnStaleState(
    SolrRequest<?> request, int retryCount, List<String> inputCollections)
    throws SolrServerException, IOException {
  connect(); // important to call this before you start working with the ZkStateReader

  // build up a _stateVer_ param to pass to the server containing all the
  // external collection state versions involved in this request, which allows
  // the server to notify us that our cached state for one or more of the external
  // collections is stale and needs to be refreshed ... this code has no impact on internal
  // collections
  String stateVerParam = null;
  List<DocCollection> requestedCollections = null;
  boolean isCollectionRequestOfV2 =
      request instanceof V2Request v2Request && v2Request.isPerCollectionRequest();
  boolean isAdmin =
      request.getRequestType() == SolrRequestType.ADMIN && !request.requiresCollection();

  // don't do _stateVer_ checking for admin, v2 api requests
  if (!inputCollections.isEmpty() && !isAdmin && !isCollectionRequestOfV2) {
    Set<String> requestedCollectionNames = resolveAliases(inputCollections);
    StringBuilder stateVerParamBuilder = null;
    for (String requestedCollection : requestedCollectionNames) {
      // track the version of state we're using on the client side using the _stateVer_ param
      DocCollection coll = getDocCollection(requestedCollection, null);
      if (coll == null) {
        throw new SolrException(
            SolrException.ErrorCode.BAD_REQUEST, "Collection not found: " + requestedCollection);
      }
      int collVer = coll.getZNodeVersion();
      if (requestedCollections == null) {
        requestedCollections = new ArrayList<>(requestedCollectionNames.size());
      }
      requestedCollections.add(coll);
      if (stateVerParamBuilder == null) {
        stateVerParamBuilder = new StringBuilder();
      } else {
        // hopefully pipe is not an allowed char in a collection name
        stateVerParamBuilder.append("|");
      }
      stateVerParamBuilder.append(coll.getName()).append(":").append(collVer);
    }
    if (stateVerParamBuilder != null) {
      stateVerParam = stateVerParamBuilder.toString();
    }
  }

  if (request.getParams() instanceof ModifiableSolrParams params) {
    if (stateVerParam != null) {
      params.set(STATE_VERSION, stateVerParam);
    } else {
      // make sure a value left over from an earlier attempt is not sent again
      params.remove(STATE_VERSION);
    }
  } // else: ??? how to set this ???

  NamedList<Object> resp = null;
  try {
    resp = sendRequest(request, inputCollections);
    // to avoid an O(n) operation we always add STATE_VERSION to the last and try to read it from
    // there
    Object o = resp == null || resp.size() == 0 ? null : resp.get(STATE_VERSION, resp.size() - 1);
    // instanceof is false for null, so no separate null check is needed
    if (o instanceof Map<?, ?> invalidStates) {
      // remove this because no one else needs this and tests would fail if they are comparing
      // responses
      resp.remove(resp.size() - 1);
      for (Map.Entry<?, ?> e : invalidStates.entrySet()) {
        getDocCollection((String) e.getKey(), (Integer) e.getValue());
      }
    }
  } catch (Exception exc) {
    Throwable rootCause = SolrException.getRootCause(exc);

    // don't do retry support for admin requests
    // or if the request doesn't have a collection specified
    // or request is v2 api and its method is not GET
    if (inputCollections.isEmpty()
        || isAdmin
        || (request.getApiVersion() == SolrRequest.ApiVersion.V2
            && request.getMethod() != SolrRequest.METHOD.GET)) {
      if (exc instanceof SolrServerException solrServerExc) {
        throw solrServerExc;
      } else if (exc instanceof IOException ioExc) {
        throw ioExc;
      } else if (exc instanceof RuntimeException runtimeExc) {
        throw runtimeExc;
      } else {
        throw new SolrServerException(rootCause);
      }
    }

    int errorCode =
        rootCause instanceof SolrException solrExc
            ? solrExc.code()
            : SolrException.ErrorCode.UNKNOWN.code;

    boolean commError =
        rootCause instanceof ConnectException
            || rootCause instanceof SocketException
            || wasCommError(rootCause);

    // a 503 (service unavailable) surfaced through a RouteException is treated like a
    // communication error; 404 is handled further down as a stale-state indicator
    if (commError || (exc instanceof RouteException && errorCode == 503)) {
      // it was a communication error. it is likely that
      // the node to which the request to be sent is down . So , expire the state
      // so that the next attempt would fetch the fresh state
      // just re-read state for all of them, if it has not been retried
      // in retryExpiryTime time
      if (requestedCollections != null) {
        for (DocCollection ext : requestedCollections) {
          ExpiringCachedDocCollection cacheEntry = collectionStateCache.get(ext.getName());
          if (cacheEntry == null) {
            continue;
          }
          cacheEntry.maybeStale = true;
        }
      }
      if (retryCount < MAX_STALE_RETRIES) { // if it is a communication error , we must try again
        // may be, we have a stale version of the collection state,
        // and we could not get any information from the server
        // it is probably not worth trying again and again because
        // the state would not have been updated
        log.info(
            "Request to collection {} failed due to ({}) {}, retry={} maxRetries={} commError={} errorCode={} - retrying",
            inputCollections,
            errorCode,
            rootCause,
            retryCount,
            MAX_STALE_RETRIES,
            commError,
            errorCode);
        return requestWithRetryOnStaleState(request, retryCount + 1, inputCollections);
      }
    } else {
      log.info("request was not communication error it seems");
    }

    // Invariant from here on: a communication error (or routed 503) with retries remaining has
    // already returned above, so either the failure is of a different kind or retries are
    // exhausted. A further "communication error with retries left" re-check can therefore never
    // apply at this point.
    log.info(
        "Request to collection {} failed due to ({}) {}, retry={} maxRetries={} commError={} errorCode={} ",
        inputCollections,
        errorCode,
        rootCause,
        retryCount,
        MAX_STALE_RETRIES,
        commError,
        errorCode);

    boolean stateWasStale = false;
    // TODO there are other reasons for 404. We need to change the solr response format from HTML
    // to structured data to know that
    if (retryCount < MAX_STALE_RETRIES
        && requestedCollections != null
        && !requestedCollections.isEmpty()
        && (SolrException.ErrorCode.getErrorCode(errorCode)
                == SolrException.ErrorCode.INVALID_STATE
            || errorCode == 404)) {
      // cached state for one or more external collections was stale
      // re-issue request using updated state
      stateWasStale = true;

      // just re-read state for all of them, which is a little heavy-handed but hopefully a rare
      // occurrence
      for (DocCollection ext : requestedCollections) {
        collectionStateCache.remove(ext.getName());
      }
    }

    if (requestedCollections != null) {
      requestedCollections.clear(); // done with this
    }

    // if the state was stale, then we retry the request once with new state pulled from Zk
    if (stateWasStale) {
      log.warn(
          "Re-trying request to collection(s) {} after stale state error from server.",
          inputCollections);
      resp = requestWithRetryOnStaleState(request, retryCount + 1, inputCollections);
    } else {
      if (exc instanceof SolrException
          || exc instanceof SolrServerException
          || exc instanceof IOException) {
        throw exc;
      } else {
        throw new SolrServerException(rootCause);
      }
    }
  }

  return resp;
}