in core/src/main/java/org/apache/stormcrawler/protocol/HttpRobotRulesParser.java [106:243]
    public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
        String cacheKey = getCacheKey(url);
        // check in the error cache first
        BaseRobotRules robotRules = ERRORCACHE.getIfPresent(cacheKey);
        if (robotRules != null) {
            return robotRules;
        }
        // now try the proper cache
        robotRules = CACHE.getIfPresent(cacheKey);
        if (robotRules != null) {
            return robotRules;
        }
        boolean cacheRule = true;
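        // cache keys of redirect targets whose rules can also be cached, filled while
        // following redirects below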
        Set<String> redirectCacheKeys = new HashSet<>();
        URL robotsUrl = null, redir = null;
        LOG.debug("Cache miss {} for {}", cacheKey, url);
        List<Integer> bytesFetched = new LinkedList<>();
        try {
            robotsUrl = new URL(url, "/robots.txt");
            ProtocolResponse response =
                    http.getProtocolOutput(robotsUrl.toString(), fetchRobotsMd);
            int code = response.getStatusCode();
            bytesFetched.add(response.getContent() != null ? response.getContent().length : 0);
            // According to RFC 9309, the crawler should follow at least 5 consecutive redirects
            // to get the robots.txt file.
            int numRedirects = 0;
            // The base URL used to resolve relative redirect locations is set initially to the
            // default URL path ("/robots.txt") and updated as redirects are followed.
            redir = robotsUrl;
            while ((code == 301 || code == 302 || code == 303 || code == 307 || code == 308)
                    && numRedirects < MAX_NUM_REDIRECTS) {
                numRedirects++;
                String redirection = response.getMetadata().getFirstValue(HttpHeaders.LOCATION);
                LOG.debug("Redirected from {} to {}", redir, redirection);
                if (StringUtils.isNotBlank(redirection)) {
                    redir = new URL(redir, redirection);
                    if (redir.getPath().equals("/robots.txt") && redir.getQuery() == null) {
                        // only if the path of the redirect target is `/robots.txt` and there is
                        // no query part can we get/put the rules from/to the cache under the
                        // host key of the redirect target
                        String keyredir = getCacheKey(redir);
                        RobotRules cachedRediRobotRules = CACHE.getIfPresent(keyredir);
                        if (cachedRediRobotRules != null) {
                            // cache also for the source host
                            LOG.debug(
                                    "Found robots for {} (redirected) under key {} in cache",
                                    redir,
                                    keyredir);
                            LOG.debug(
                                    "Caching redirected robots from key {} under key {}",
                                    keyredir,
                                    cacheKey);
                            CACHE.put(cacheKey, cachedRediRobotRules);
                            return cachedRediRobotRules;
                        } else {
                            // Remember the target host/authority so that we can cache the
                            // rules for it, too.
                            redirectCacheKeys.add(keyredir);
                        }
                    } else {
                        LOG.debug(
                                "Robots for {} redirected to {} (not cached for target host because not at root)",
                                url,
                                redir);
                    }
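                    // fetch the robots.txt from the redirect target; the status code is
                    // re-evaluated by the loop condition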
                    response = http.getProtocolOutput(redir.toString(), Metadata.empty);
                    code = response.getStatusCode();
                    bytesFetched.add(
                            response.getContent() != null ? response.getContent().length : 0);
                } else {
                    LOG.debug("Got redirect response {} for robots {} without location", code, url);
                    break;
                }
            }
            // Handle the response according to RFC 9309
            if (code == 200) {
                // The rules are parsed only if status code 200 is returned
                String ct = response.getMetadata().getFirstValue(HttpHeaders.CONTENT_TYPE);
                robotRules = parseRules(url.toString(), response.getContent(), ct, agentNames);
            } else if (code == 403 && !allowForbidden) {
                // If fetching the robots.txt file is forbidden, then also forbid fetching the
                // other pages of this host
                robotRules = FORBID_ALL_RULES;
            } else if (code == 429) {
                // Handle "Too Many Requests" similar to a server error
                // https://support.google.com/webmasters/answer/9679690#robots_details
                cacheRule = false;
                robotRules = FORBID_ALL_RULES;
            } else if (code >= 500 && code <= 599) {
                // If the robots.txt file cannot be fetched because of a server error, it is
                // better not to crawl the remaining pages within this domain
                cacheRule = false;
                robotRules = FORBID_ALL_RULES;
                if (allow5xx) {
                    robotRules = EMPTY_RULES; // allow all
                }
            } else {
                robotRules = EMPTY_RULES; // allow all
            }
        } catch (Throwable t) {
            LOG.info("Couldn't get robots.txt for {} : {}", url, t.toString());
            cacheRule = false;
            robotRules = EMPTY_RULES;
        }
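        // rules obtained from definitive responses go into the regular cache, transient
        // failures (429, 5xx, exceptions) into the error cache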
        Cache<String, RobotRules> cacheToUse = CACHE;
        String cacheName = "success";
        if (!cacheRule) {
            cacheToUse = ERRORCACHE;
            cacheName = "error";
        }
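        // the copy stored in the cache does not carry the per-fetch byte counts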
        RobotRules cached = new RobotRules(robotRules);
        LOG.debug("Caching robots for {} under key {} in cache {}", url, cacheKey, cacheName);
        cacheToUse.put(cacheKey, cached);
        // also cache the robots rules under the keys of the redirect targets;
        // we only get here if the target was not already found in the cache
        for (String keyredir : redirectCacheKeys) {
            // redirectCacheKeys only holds keys of redirect targets whose robots.txt is
            // located at the root path
            LOG.debug("Caching robots for {} under key {} in cache {}", redir, keyredir, cacheName);
            cacheToUse.put(keyredir, cached);
        }
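        // the returned instance additionally records the number of bytes fetched per request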
        RobotRules live = new RobotRules(robotRules);
        live.setContentLengthFetched(Ints.toArray(bytesFetched));
        return live;
    }
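
    // A minimal usage sketch (not part of the original class), assuming a configured HTTP
    // protocol implementation; the variable names and the constructor call shown here are
    // illustrative assumptions, not the library's documented API:
    //
    //   HttpRobotRulesParser parser = new HttpRobotRulesParser(conf);
    //   BaseRobotRules rules =
    //           parser.getRobotRulesSet(httpProtocol, new URL("https://example.org/page"));
    //   boolean allowed = rules.isAllowed("https://example.org/page");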