in src/java/org/apache/nutch/util/SitemapProcessor.java [224:309]
/**
 * Fetches the sitemap at {@code url}, following protocol-level redirects, parses it and
 * writes one {@link CrawlDatum} per accepted URL to {@code context}. When the fetched
 * document is a sitemap index, every sitemap it lists is fetched and processed the same way.
 *
 * <p>Fetch failures and parse failures are counted in the "Sitemap" counter group, logged,
 * and then skipped, so that one broken sitemap does not prevent the sitemaps listed next to
 * it from being processed.
 *
 * @param protocol protocol implementation used for every fetch made here, including
 *                 redirect targets and the sitemaps listed by an index
 * @param url      URL of the sitemap or sitemap index to fetch
 * @param context  job context that receives the counters and the emitted (URL, datum) pairs
 * @throws Exception anything raised while fetching, filtering/normalizing or writing output;
 *                   parse errors are handled locally and not propagated
 */
private void generateSitemapUrlDatum(Protocol protocol, String url, Context context) throws Exception {
  // URLs currently being expanded on the recursion path. Sitemaps are remote, untrusted
  // input: an index that (directly or indirectly) lists itself must not recurse forever.
  // Fully qualified because this block cannot see/extend the file's import list.
  java.util.Set<String> inProgress = new java.util.HashSet<String>();
  inProgress.add(url);
  fetchAndEmitSitemap(protocol, url, context, inProgress);
}

/**
 * Does the work of {@link #generateSitemapUrlDatum(Protocol, String, Context)} for one
 * sitemap URL; {@code inProgress} holds the URLs of the sitemap indexes currently being
 * expanded and is used only to break cycles.
 */
private void fetchAndEmitSitemap(Protocol protocol, String url, Context context,
    java.util.Set<String> inProgress) throws Exception {
  ProtocolOutput output = protocol.getProtocolOutput(new Text(url), datum);
  ProtocolStatus status = output.getStatus();
  Content content = output.getContent();

  // Following redirects http > https and what else
  int redirectsLeft = this.maxRedir;
  while (!status.isSuccess() && status.isRedirect() && redirectsLeft > 0) {
    String[] redirectArgs = status.getArgs();
    if (redirectArgs == null || redirectArgs.length == 0) {
      // Redirect status without a target: nothing to follow.
      break;
    }
    String target = filterNormalize(redirectArgs[0]);
    if (target == null) {
      // Target rejected by filterNormalize. 'url' deliberately keeps the last URL that was
      // actually fetched so the error below names it instead of printing "null".
      break;
    }
    url = target;
    output = protocol.getProtocolOutput(new Text(url), datum);
    status = output.getStatus();
    content = output.getContent();
    redirectsLeft--;
  }

  if (status.getCode() != ProtocolStatus.SUCCESS) {
    // The fetch failed, or the redirect chain could not be followed to a successful
    // response (too many hops, missing or rejected target). Count it, log it, move on.
    context.getCounter("Sitemap", "failed_fetches").increment(1);
    LOG.error("Error while fetching the sitemap. Status code: {} for {}", status.getCode(), url);
    return;
  }

  AbstractSiteMap asm;
  try {
    asm = parser.parseSiteMap(content.getContentType(), content.getContent(), new URL(url));
  } catch (Exception e) {
    // Deliberately broad: any failure to turn the fetched bytes into a sitemap is treated
    // like a failed fetch, so the remaining sitemaps of an enclosing index still get
    // processed. Only the parse call is guarded; output/IO errors below still propagate.
    context.getCounter("Sitemap", "sitemaps_failed_to_parse").increment(1);
    LOG.warn("Error while parsing the sitemap {}", url, e);
    return;
  }

  if (asm instanceof SiteMap) {
    LOG.info("Parsing sitemap file: {}", asm.getUrl().toString());
    SiteMap sm = (SiteMap) asm;
    Collection<SiteMapURL> sitemapUrls = sm.getSiteMapUrls();
    for (SiteMapURL sitemapUrl : sitemapUrls) {
      // If 'strict' is ON, only allow valid urls. Else allow all urls
      if (strict && !sitemapUrl.isValid()) {
        continue;
      }
      String key = filterNormalize(sitemapUrl.getUrl().toString());
      if (key == null) {
        continue;
      }

      CrawlDatum sitemapUrlDatum = new CrawlDatum();
      sitemapUrlDatum.setStatus(CrawlDatum.STATUS_INJECTED);
      sitemapUrlDatum.setScore((float) sitemapUrl.getPriority());

      if (sitemapUrl.getChangeFrequency() != null) {
        // Map the declared change frequency to a fetch interval (values below are seconds
        // per the 60*60... arithmetic).
        int fetchInterval = -1;
        switch (sitemapUrl.getChangeFrequency()) {
          case ALWAYS:  fetchInterval = 1;                 break;
          case HOURLY:  fetchInterval = 3600;              break; // 60*60
          case DAILY:   fetchInterval = 86400;             break; // 60*60*24
          case WEEKLY:  fetchInterval = 604800;            break; // 60*60*24*7
          case MONTHLY: fetchInterval = 2592000;           break; // 60*60*24*30
          case YEARLY:  fetchInterval = 31536000;          break; // 60*60*24*365
          case NEVER:   fetchInterval = Integer.MAX_VALUE; break; // Loose "NEVER" contract
          default:      break; // unrecognised frequency: leave the datum's interval untouched
        }
        // Never store the -1 placeholder as an interval.
        if (fetchInterval > 0) {
          sitemapUrlDatum.setFetchInterval(fetchInterval);
        }
      }

      if (sitemapUrl.getLastModified() != null) {
        sitemapUrlDatum.setModifiedTime(sitemapUrl.getLastModified().getTime());
      }

      context.write(new Text(key), sitemapUrlDatum);
    }
  }
  else if (asm instanceof SiteMapIndex) {
    SiteMapIndex index = (SiteMapIndex) asm;
    Collection<AbstractSiteMap> sitemapUrls = index.getSitemaps(true);
    if (sitemapUrls.isEmpty()) {
      return;
    }

    LOG.info("Parsing sitemap index file: {}", index.getUrl().toString());
    for (AbstractSiteMap sitemap : sitemapUrls) {
      String sitemapUrl = filterNormalize(sitemap.getUrl().toString());
      if (sitemapUrl == null) {
        continue;
      }
      // add() returns false when the URL is already on the current recursion path,
      // i.e. following it would re-enter a sitemap index that is still being expanded.
      if (!inProgress.add(sitemapUrl)) {
        LOG.warn("Skipping sitemap {} listed by {}: cyclic sitemap index reference", sitemapUrl, url);
        continue;
      }
      try {
        fetchAndEmitSitemap(protocol, sitemapUrl, context, inProgress);
      } finally {
        // Only the active path is tracked, so a sitemap listed more than once (without a
        // cycle) is still processed each time, as before.
        inProgress.remove(sitemapUrl);
      }
    }
  }
}