in src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java [114:272]
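/*
 * Fetch, parse, and cache the robots.txt rules that apply to the given URL:
 * rules are cached per authority, redirects are followed up to the configured
 * maximum, and allowlisted hosts bypass robots.txt entirely.
 */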
public BaseRobotRules getRobotRulesSet(Protocol http, URL url,
List<Content> robotsTxtContent) {
if (LOG.isTraceEnabled() && isAllowListed(url)) {
LOG.trace("Ignoring robots.txt (host is allowlisted) for URL: {}", url);
}
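// Look up previously cached rules for this URL's authority.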
String cacheKey = getCacheKey(url);
BaseRobotRules robotRules = CACHE.get(cacheKey);
if (robotRules != null) {
return robotRules; // cached rule
} else if (LOG.isTraceEnabled()) {
LOG.trace("Robots.txt cache miss {}", url);
}
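// cacheRule is cleared on transient failures (5xx, 429, exceptions) so that
// robots.txt is re-fetched later; redirectCacheKeys collects additional cache
// keys for redirect targets whose path is exactly /robots.txt.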
boolean cacheRule = true;
Set<String> redirectCacheKeys = new HashSet<>();
if (isAllowListed(url)) {
// check in advance whether a host is allowlisted
// (we do not need to fetch robots.txt)
robotRules = EMPTY_RULES;
LOG.info("Allowlisted host found for: {}", url);
LOG.info("Ignoring robots.txt for all URLs from allowlisted host: {}",
url.getHost());
} else {
URL robotsUrl = null, robotsUrlRedir = null;
try {
robotsUrl = new URL(url, "/robots.txt");
/*
* Redirect counter - following redirects up to the configured maximum
* ("five consecutive redirects" as per RFC 9309).
*/
int numRedirects = 0;
/*
* The base URL used to resolve relative redirect locations; set initially
* to the default robots.txt URL ("/robots.txt") and updated as redirects
* are followed.
*/
robotsUrlRedir = robotsUrl;
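// Fetch robots.txt from the default location; if the caller passed a
// robotsTxtContent list, the raw response is recorded there as well.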
Response response = ((HttpBase) http).getResponse(robotsUrl,
new CrawlDatum(), true);
int code = response.getCode();
if (robotsTxtContent != null) {
addRobotsContent(robotsTxtContent, robotsUrl, response);
}
while (isRedirect(code) && numRedirects < maxNumRedirects) {
numRedirects++;
String redirectionLocation = response.getHeader("Location");
if (StringUtils.isNotBlank(redirectionLocation)) {
LOG.debug("Following robots.txt redirect: {} -> {}", robotsUrlRedir,
redirectionLocation);
try {
robotsUrlRedir = new URL(robotsUrlRedir, redirectionLocation);
} catch (MalformedURLException e) {
LOG.info(
"Failed to resolve redirect location for robots.txt: {} -> {} ({})",
robotsUrlRedir, redirectionLocation, e.getMessage());
break;
}
response = ((HttpBase) http).getResponse(robotsUrlRedir,
new CrawlDatum(), true);
code = response.getCode();
if (robotsTxtContent != null) {
addRobotsContent(robotsTxtContent, robotsUrlRedir, response);
}
} else {
LOG.info(
"No HTTP redirect Location header for robots.txt: {} (status code: {})",
robotsUrlRedir, code);
break;
}
if ("/robots.txt".equals(robotsUrlRedir.getFile())) {
/*
* If a redirect points to the path /robots.txt on a different host
* (or, more generally, a different authority scheme://host:port/), we
* can look up the cache for rules already fetched from the target host.
*/
String redirectCacheKey = getCacheKey(robotsUrlRedir);
robotRules = CACHE.get(redirectCacheKey);
if (robotRules != null) {
/*
* Found cached rules for the redirect target: log the hit, cache the
* rules for the source host as well, and return them.
*/
LOG.debug(
"Found cached robots.txt rules for {} (redirected to {}) under target key {}",
url, robotsUrlRedir, redirectCacheKey);
CACHE.put(cacheKey, robotRules);
return robotRules;
} else {
/*
* Remember the target host/authority so that the rules can be
* cached for it, too.
*/
redirectCacheKeys.add(redirectCacheKey);
}
}
if (numRedirects == maxNumRedirects && isRedirect(code)) {
LOG.info(
"Reached maximum number of robots.txt redirects for {} (assuming no robots.txt, allow all)",
url);
}
}
LOG.debug("Fetched robots.txt for {} with status code {}", url, code);
if (code == 200) { // found rules: parse them
robotRules = parseRules(url.toString(), response.getContent(),
response.getHeader("Content-Type"), agentNames);
} else if (code == 403 && !allowForbidden) {
robotRules = FORBID_ALL_RULES; // use forbid all
} else if (code >= 500 || code == 429) {
// 5xx server errors or 429 Too Many Requests
cacheRule = false; // try again later to fetch robots.txt
if (deferVisits503) {
// signal fetcher to suspend crawling for this host
robotRules = DEFER_VISIT_RULES;
} else {
robotRules = EMPTY_RULES;
}
} else {
robotRules = EMPTY_RULES; // use default rules
}
} catch (Throwable t) {
if (robotsUrl == null || robotsUrlRedir == null) {
LOG.info("Couldn't get robots.txt for {}", url, t);
} else if (robotsUrl.equals(robotsUrlRedir)) {
LOG.info("Couldn't get robots.txt for {} ({}): {}", url, robotsUrl,
t);
} else {
LOG.info(
"Couldn't get redirected robots.txt for {} (redirected to {}): {}",
url, robotsUrlRedir, t);
}
cacheRule = false; // try again later to fetch robots.txt
robotRules = EMPTY_RULES;
}
}
if (cacheRule) {
CACHE.put(cacheKey, robotRules); // cache rules for host
for (String redirectCacheKey : redirectCacheKeys) {
/*
* and also for redirect target hosts where URL path and query were
* found to be "/robots.txt"
*/
CACHE.put(redirectCacheKey, robotRules);
}
}
return robotRules;
}
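For orientation, here is a minimal sketch of how a caller might consult the rules this method returns. The RobotsCheckSketch class and mayFetch method are hypothetical illustrations (they are not part of Nutch); they rely only on the public getRobotRulesSet signature above and on the crawler-commons BaseRobotRules API (isDeferVisits, isAllowed) that this method's rule constants are built on. It is meant as a standalone snippet, not as code to paste into HttpRobotRulesParser.java.

import java.net.URL;

import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.http.api.HttpRobotRulesParser;

import crawlercommons.robots.BaseRobotRules;

/** Hypothetical usage sketch, not part of Nutch. */
class RobotsCheckSketch {

  /** Decide whether a URL may be fetched according to its host's robots.txt. */
  static boolean mayFetch(HttpRobotRulesParser parser, Protocol http, URL url) {
    // Rules are cached per authority, so repeated calls for the same host are cheap.
    BaseRobotRules rules = parser.getRobotRulesSet(http, url, null);
    if (rules.isDeferVisits()) {
      // DEFER_VISIT_RULES was returned (5xx/429 with deferVisits503 enabled): back off.
      return false;
    }
    return rules.isAllowed(url.toString());
  }
}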