public BaseRobotRules getRobotRulesSet()

in src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java [114:272]


  public BaseRobotRules getRobotRulesSet(Protocol http, URL url,
      List<Content> robotsTxtContent) {

    if (LOG.isTraceEnabled() && isAllowListed(url)) {
      LOG.trace("Ignoring robots.txt (host is allowlisted) for URL: {}", url);
    }

    String cacheKey = getCacheKey(url);
    BaseRobotRules robotRules = CACHE.get(cacheKey);

    if (robotRules != null) {
      return robotRules; // cached rule
    } else if (LOG.isTraceEnabled()) {
      LOG.trace("Robots.txt cache miss {}", url);
    }

    boolean cacheRule = true;
    Set<String> redirectCacheKeys = new HashSet<>();

    if (isAllowListed(url)) {
      // check in advance whether a host is allowlisted
      // (we do not need to fetch robots.txt)
      robotRules = EMPTY_RULES;
      LOG.info("Allowlisted host found for: {}", url);
      LOG.info("Ignoring robots.txt for all URLs from allowlisted host: {}",
          url.getHost());

    } else {
      URL robotsUrl = null, robotsUrlRedir = null;
      try {
        robotsUrl = new URL(url, "/robots.txt");

        /*
         * Redirect counter - following redirects up to the configured maximum
         * ("five consecutive redirects" as per RFC 9309).
         */
        int numRedirects = 0;
        /*
         * The base URL to resolve relative redirect locations is set initially
         * to the default URL path ("/robots.txt") and updated when redirects
         * were followed.
         */
        robotsUrlRedir = robotsUrl;

        Response response = ((HttpBase) http).getResponse(robotsUrl,
            new CrawlDatum(), true);
        int code = response.getCode();
        if (robotsTxtContent != null) {
          addRobotsContent(robotsTxtContent, robotsUrl, response);
        }

        while (isRedirect(code) && numRedirects < maxNumRedirects) {
          numRedirects++;

          String redirectionLocation = response.getHeader("Location");
          if (StringUtils.isNotBlank(redirectionLocation)) {
            LOG.debug("Following robots.txt redirect: {} -> {}", robotsUrlRedir,
                redirectionLocation);
            try {
              robotsUrlRedir = new URL(robotsUrlRedir, redirectionLocation);
            } catch (MalformedURLException e) {
              LOG.info(
                  "Failed to resolve redirect location for robots.txt: {} -> {} ({})",
                  robotsUrlRedir, redirectionLocation, e.getMessage());
              break;
            }
            response = ((HttpBase) http).getResponse(robotsUrlRedir,
                new CrawlDatum(), true);
            code = response.getCode();
            if (robotsTxtContent != null) {
              addRobotsContent(robotsTxtContent, robotsUrlRedir, response);
            }
          } else {
            LOG.info(
                "No HTTP redirect Location header for robots.txt: {} (status code: {})",
                robotsUrlRedir, code);
            break;
          }

          if ("/robots.txt".equals(robotsUrlRedir.getFile())) {
            /*
             * If a redirect points to a path /robots.txt on a different host
             * (or a different authority scheme://host:port/, in general), we
             * can lookup the cache for cached rules from the target host.
             */
            String redirectCacheKey = getCacheKey(robotsUrlRedir);
            robotRules = CACHE.get(redirectCacheKey);
            LOG.debug(
                "Found cached robots.txt rules for {} (redirected to {}) under target key {}",
                url, robotsUrlRedir, redirectCacheKey);
            if (robotRules != null) {
              /* If found, cache and return the rules for the source host. */
              CACHE.put(cacheKey, robotRules);
              return robotRules;
            } else {
              /*
               * Remember the target host/authority, we can cache the rules,
               * too.
               */
              redirectCacheKeys.add(redirectCacheKey);
            }
          }

          if (numRedirects == maxNumRedirects && isRedirect(code)) {
            LOG.info(
                "Reached maximum number of robots.txt redirects for {} (assuming no robots.txt, allow all)",
                url);
          }
        }

        LOG.debug("Fetched robots.txt for {} with status code {}", url, code);
        if (code == 200) // found rules: parse them
          robotRules = parseRules(url.toString(), response.getContent(),
              response.getHeader("Content-Type"), agentNames);

        else if ((code == 403) && (!allowForbidden))
          robotRules = FORBID_ALL_RULES; // use forbid all

        else if (code >= 500 || code == 429) {
          // 5xx server errors or 429 Too Many Requests
          cacheRule = false; // try again later to fetch robots.txt
          if (deferVisits503) {
            // signal fetcher to suspend crawling for this host
            robotRules = DEFER_VISIT_RULES;
          } else {
            robotRules = EMPTY_RULES;
          }
        } else {
          robotRules = EMPTY_RULES; // use default rules
        }
      } catch (Throwable t) {
        if (robotsUrl == null || robotsUrlRedir == null) {
          LOG.info("Couldn't get robots.txt for {}", url, t);
        } else if (robotsUrl.equals(robotsUrlRedir)) {
          LOG.info("Couldn't get robots.txt for {} ({}): {}", url, robotsUrl,
              t);
        } else {
          LOG.info(
              "Couldn't get redirected robots.txt for {} (redirected to {}): {}",
              url, robotsUrlRedir, t);
        }
        cacheRule = false; // try again later to fetch robots.txt
        robotRules = EMPTY_RULES;
      }
    }

    if (cacheRule) {
      CACHE.put(cacheKey, robotRules); // cache rules for host
      for (String redirectCacheKey : redirectCacheKeys) {
        /*
         * and also for redirect target hosts where URL path and query were
         * found to be "/robots.txt"
         */
        CACHE.put(redirectCacheKey, robotRules);
      }
    }

    return robotRules;
  }