private void generateSitemapUrlDatum()

in src/java/org/apache/nutch/util/SitemapProcessor.java [224:309]


    /**
     * Fetches the sitemap at {@code url} and writes one {@link CrawlDatum} per
     * URL it lists. If the document is a sitemap index, every sitemap it
     * references is fetched and processed recursively.
     *
     * <p>Redirects are followed at most {@code this.maxRedir} times and every
     * redirect target is passed through {@code filterNormalize}. If the final
     * status is not {@code ProtocolStatus.SUCCESS}, the
     * {@code Sitemap/failed_fetches} counter is incremented, an error is logged
     * and nothing is written.
     *
     * @param protocol protocol implementation used to fetch the sitemap
     * @param url URL of the sitemap or sitemap index
     * @param context context used for counters and for writing output records
     * @throws Exception propagated from fetching, parsing or writing
     */
    private void generateSitemapUrlDatum(Protocol protocol, String url, Context context) throws Exception {
      // Fully qualified names keep this block independent of the file's imports.
      java.util.Set<String> ancestors = new java.util.HashSet<String>();
      ancestors.add(url);
      generateSitemapUrlDatum(protocol, url, context, ancestors);
    }

    /**
     * Does the work of
     * {@code generateSitemapUrlDatum(Protocol, String, Context)} while tracking
     * the sitemap URLs currently being expanded on this call chain.
     *
     * @param protocol protocol implementation used to fetch the sitemap
     * @param url URL of the sitemap or sitemap index
     * @param context context used for counters and for writing output records
     * @param ancestors URLs already being processed further up the recursion;
     *          a referenced sitemap found in this set is skipped so that
     *          self- or mutually-referencing sitemap indexes cannot recurse
     *          forever
     * @throws Exception propagated from fetching, parsing or writing
     */
    private void generateSitemapUrlDatum(Protocol protocol, String url, Context context,
        java.util.Set<String> ancestors) throws Exception {
      ProtocolOutput output = protocol.getProtocolOutput(new Text(url), datum);
      ProtocolStatus status = output.getStatus();
      Content content = output.getContent();

      // Following redirects http > https and what else
      int maxRedir = this.maxRedir;
      while (!status.isSuccess() && status.isRedirect() && maxRedir > 0) {
        String[] args = status.getArgs();
        if (args == null || args.length == 0 || args[0] == null) {
          // Redirect status without a target: nothing to follow
          break;
        }

        String target = filterNormalize(args[0]);
        if (target == null) {
          // No usable target; url is left untouched so the error log below
          // still names the sitemap that could not be fetched
          break;
        }

        url = target;
        output = protocol.getProtocolOutput(new Text(url), datum);
        status = output.getStatus();
        content = output.getContent();

        maxRedir--;
      }

      if (status.getCode() != ProtocolStatus.SUCCESS) {
        // The fetch failed, or a redirect could not be followed (redirect limit
        // reached or no usable target). Count it, log it and give up on this
        // sitemap.
        context.getCounter("Sitemap", "failed_fetches").increment(1);
        LOG.error("Error while fetching the sitemap. Status code: {} for {}", status.getCode(), url);
        return;
      }

      AbstractSiteMap asm = parser.parseSiteMap(content.getContentType(), content.getContent(), new URL(url));

      if (asm instanceof SiteMap) {
        LOG.info("Parsing sitemap file: {}", asm.getUrl().toString());
        SiteMap sm = (SiteMap) asm;
        Collection<SiteMapURL> sitemapUrls = sm.getSiteMapUrls();
        for (SiteMapURL sitemapUrl : sitemapUrls) {
          // If 'strict' is ON, only allow valid urls. Else allow all urls
          if (strict && !sitemapUrl.isValid()) {
            continue;
          }

          String key = filterNormalize(sitemapUrl.getUrl().toString());
          if (key == null) {
            continue;
          }

          CrawlDatum sitemapUrlDatum = new CrawlDatum();
          sitemapUrlDatum.setStatus(CrawlDatum.STATUS_INJECTED);
          sitemapUrlDatum.setScore((float) sitemapUrl.getPriority());

          if (sitemapUrl.getChangeFrequency() != null) {
            // Map the declared change frequency to a fetch interval in seconds
            int fetchInterval = -1;
            switch (sitemapUrl.getChangeFrequency()) {
              case ALWAYS:  fetchInterval = 1;        break;
              case HOURLY:  fetchInterval = 3600;     break; // 60*60
              case DAILY:   fetchInterval = 86400;    break; // 60*60*24
              case WEEKLY:  fetchInterval = 604800;   break; // 60*60*24*7
              case MONTHLY: fetchInterval = 2592000;  break; // 60*60*24*30
              case YEARLY:  fetchInterval = 31536000; break; // 60*60*24*365
              case NEVER:   fetchInterval = Integer.MAX_VALUE; break; // Loose "NEVER" contract
            }
            sitemapUrlDatum.setFetchInterval(fetchInterval);
          }

          if (sitemapUrl.getLastModified() != null) {
            sitemapUrlDatum.setModifiedTime(sitemapUrl.getLastModified().getTime());
          }

          context.write(new Text(key), sitemapUrlDatum);
        }
      }
      else if (asm instanceof SiteMapIndex) {
        SiteMapIndex index = (SiteMapIndex) asm;
        Collection<AbstractSiteMap> sitemapUrls = index.getSitemaps(true);

        if (sitemapUrls.isEmpty()) {
          return;
        }

        LOG.info("Parsing sitemap index file: {}", index.getUrl().toString());
        for (AbstractSiteMap sitemap : sitemapUrls) {
          String sitemapUrl = filterNormalize(sitemap.getUrl().toString());
          if (sitemapUrl == null) {
            continue;
          }

          // A sitemap already on the current call chain would recurse forever
          if (!ancestors.add(sitemapUrl)) {
            LOG.warn("Skipping sitemap {} referenced from {}: it is already being processed", sitemapUrl, url);
            continue;
          }

          try {
            generateSitemapUrlDatum(protocol, sitemapUrl, context, ancestors);
          } finally {
            // Only the current chain is tracked, so the same sitemap may still
            // be processed again when listed by an unrelated index
            ancestors.remove(sitemapUrl);
          }
        }
      }
    }