public BaseRobotRules getRobotRulesSet(Protocol http, URL url)

in core/src/main/java/org/apache/stormcrawler/protocol/HttpRobotRulesParser.java [106:243]


    public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {

        String cacheKey = getCacheKey(url);
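        // note: the cache key identifies the authority of the URL, presumably
        // derived from its protocol, host and port (see getCacheKey, defined
        // outside this excerpt)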

        // check in the error cache first
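        // (an entry there means a recent attempt failed or hit a non-cacheable
        // status such as 429 or 5xx, see below; the stored fallback rules are
        // reused instead of immediately re-fetching)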
        BaseRobotRules robotRules = ERRORCACHE.getIfPresent(cacheKey);
        if (robotRules != null) {
            return robotRules;
        }

        // now try the proper cache
        robotRules = CACHE.getIfPresent(cacheKey);
        if (robotRules != null) {
            return robotRules;
        }

        boolean cacheRule = true;
        Set<String> redirectCacheKeys = new HashSet<>();

        URL robotsUrl = null, redir = null;

        LOG.debug("Cache miss {} for {}", cacheKey, url);
        List<Integer> bytesFetched = new LinkedList<>(); // size of each robots.txt response fetched
        try {
            robotsUrl = new URL(url, "/robots.txt");
            ProtocolResponse response = http.getProtocolOutput(robotsUrl.toString(), fetchRobotsMd);
            int code = response.getStatusCode();
            bytesFetched.add(response.getContent() != null ? response.getContent().length : 0);

            // According to RFC 9309, the crawler should follow at least 5 consecutive
            // redirects to reach the robots.txt file.
            int numRedirects = 0;
            // The base URL used to resolve relative redirect locations: initially the
            // default robots.txt URL ("/robots.txt"), updated as redirects are followed.
            redir = robotsUrl;

            while ((code == 301 || code == 302 || code == 303 || code == 307 || code == 308)
                    && numRedirects < MAX_NUM_REDIRECTS) {
                numRedirects++;
                String redirection = response.getMetadata().getFirstValue(HttpHeaders.LOCATION);
                LOG.debug("Redirected from {} to {}", redir, redirection);
                if (StringUtils.isNotBlank(redirection)) {
                    // resolve a possibly relative Location header against the previous URL
                    redir = new URL(redir, redirection);
                    if (redir.getPath().equals("/robots.txt") && redir.getQuery() == null) {
                        // Only if the path of the redirect target is `/robots.txt` and
                        // there is no query part can the rules be read from or written
                        // to the cache under the host key of the redirect target.
                        String keyredir = getCacheKey(redir);
                        RobotRules cachedRediRobotRules = CACHE.getIfPresent(keyredir);
                        if (cachedRediRobotRules != null) {
                            // cache also for the source host
                            LOG.debug(
                                    "Found robots for {} (redirected) under key {} in cache",
                                    redir,
                                    keyredir);
                            LOG.debug(
                                    "Caching redirected robots from key {} under key {}",
                                    keyredir,
                                    cacheKey);
                            CACHE.put(cacheKey, cachedRediRobotRules);
                            return cachedRediRobotRules;
                        } else {
                            // remember the cache key of the redirect target so that
                            // the rules can be cached for that host/authority, too
                            redirectCacheKeys.add(keyredir);
                        }
                    } else {
                        LOG.debug(
                                "Robots for {} redirected to {} (not cached for target host because not at root)",
                                url,
                                redir);
                    }

                    response = http.getProtocolOutput(redir.toString(), Metadata.empty);
                    code = response.getStatusCode();
                    bytesFetched.add(
                            response.getContent() != null ? response.getContent().length : 0);
                } else {
                    LOG.debug("Got redirect response {} for robots {} without location", code, url);
                    break;
                }
            }
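
            // At this point `code` holds the status of the final response and
            // `redir` the URL it was fetched from. Handling below:
            //   200           -> parse the rules
            //   403           -> forbid all (unless allowForbidden)
            //   429           -> forbid all, don't cache the decision
            //   500-599       -> forbid all, don't cache (allow all if allow5xx)
            //   anything else -> allow all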

            // evaluate the final status code according to RFC 9309
            if (code == 200) {
                // the rules are parsed only if the status code 200 was returned
                String ct = response.getMetadata().getFirstValue(HttpHeaders.CONTENT_TYPE);
                robotRules = parseRules(url.toString(), response.getContent(), ct, agentNames);
            } else if (code == 403 && !allowForbidden) {
                // if fetching the robots.txt file is forbidden, forbid fetching
                // all other pages of this host as well
                robotRules = FORBID_ALL_RULES;
            } else if (code == 429) {
                // handle "429 Too Many Requests" like a server error
                // https://support.google.com/webmasters/answer/9679690#robots_details
                cacheRule = false;
                robotRules = FORBID_ALL_RULES;
            } else if (code >= 500 && code <= 599) { // 5xx server errors
                // if the robots.txt file cannot be fetched because of a server error,
                // better not crawl the remaining pages of this host for now
                cacheRule = false;
                robotRules = FORBID_ALL_RULES;
                if (allow5xx) {
                    robotRules = EMPTY_RULES; // allow all
                }
            } else {
                // any other status (e.g. 404 Not Found): no restrictions apply
                robotRules = EMPTY_RULES; // allow all
            }
        } catch (Throwable t) {
            // on any failure (network error, malformed URL, ...) fall back to
            // allowing everything, but keep the decision out of the long-lived cache
            LOG.info("Couldn't get robots.txt for {} : {}", url, t.toString());
            cacheRule = false;
            robotRules = EMPTY_RULES;
        }

        Cache<String, RobotRules> cacheToUse = CACHE;
        String cacheName = "success";
        if (!cacheRule) {
            cacheToUse = ERRORCACHE;
            cacheName = "error";
        }

        // wrap the rules for the cache; a separate copy (see `live` below) carries
        // the per-request fetch statistics, so they don't end up in the shared
        // cache entry
        RobotRules cached = new RobotRules(robotRules);

        LOG.debug("Caching robots for {} under key {} in cache {}", url, cacheKey, cacheName);
        cacheToUse.put(cacheKey, cached);

        // cache the robot rules under the cache keys collected for redirect
        // targets; we only get here if the rules were not already in the cache
        for (String keyredir : redirectCacheKeys) {
            // redirectCacheKeys only contains keys of redirect targets whose
            // robots.txt is located at the root path (see above)
            LOG.debug("Caching robots for {} under key {} in cache {}", redir, keyredir, cacheName);
            cacheToUse.put(keyredir, cached);
        }

        // return a separate instance enriched with the size of every robots.txt
        // response fetched for this request
        RobotRules live = new RobotRules(robotRules);
        live.setContentLengthFetched(Ints.toArray(bytesFetched));
        return live;
    }
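
Usage sketch (not part of the excerpt): a minimal, hedged illustration of how the
returned rules are typically consulted. `parser` and `httpProtocol` stand for
hypothetical, pre-configured instances (an HttpRobotRulesParser and a Protocol
implementation); isAllowed(), getCrawlDelay() and UNSET_CRAWL_DELAY belong to
crawler-commons' BaseRobotRules.

    // hypothetical instances, assumed to be configured elsewhere:
    //   HttpRobotRulesParser parser;  Protocol httpProtocol;
    // (checked exceptions omitted for brevity)
    URL target = new URL("https://example.com/some/page.html");
    BaseRobotRules rules = parser.getRobotRulesSet(httpProtocol, target);
    if (rules.isAllowed(target.toString())) {
        // the URL may be fetched
    }
    // crawl delay declared in robots.txt, or BaseRobotRules.UNSET_CRAWL_DELAY if absent
    long crawlDelay = rules.getCrawlDelay();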