in core/src/main/java/org/apache/stormcrawler/bolt/SiteMapParserBolt.java [182:315]
/**
 * Parses the given content as a sitemap and converts its entries into outlinks.
 *
 * <p>For a sitemap index, every sub-sitemap becomes an outlink flagged with {@code isSitemapKey
 * = "true"} so that it gets fetched and parsed in a later step. For a regular sitemap, every
 * listed URL becomes an outlink flagged with {@code isSitemapKey = "false"}. In both cases the
 * last modified date of the entry, when available, is stored under {@code
 * sitemap.lastModified} (empty string otherwise).
 *
 * <p>Entries are skipped when
 *
 * <ul>
 *   <li>{@code filterHoursSinceModified} is not -1 and the entry has a last modified date
 *       older than that many hours, or
 *   <li>{@code filterOutlink} returns {@code null} for them.
 * </ul>
 *
 * <p>The time spent in the sitemap parser (in milliseconds) is reported to {@code
 * averagedMetrics}.
 *
 * @param url URL of the sitemap being parsed
 * @param content raw bytes of the sitemap
 * @param contentType content type reported for the document; when blank or containing {@code
 *     octet-stream} the parser is left to guess the type
 * @param parentMetadata metadata of the sitemap document, passed on to {@code filterOutlink}
 * @return the outlinks kept after filtering, never {@code null}
 * @throws UnknownFormatException propagated from the sitemap parser
 * @throws IOException if {@code url} is malformed, or propagated from the sitemap parser
 */
private List<Outlink> parseSiteMap(
        String url, byte[] content, String contentType, Metadata parentMetadata)
        throws UnknownFormatException, IOException {

    URL sURL = new URL(url);

    long start = System.currentTimeMillis();
    AbstractSiteMap siteMap;
    // let the parser guess what the mimetype is
    if (StringUtils.isBlank(contentType) || contentType.contains("octet-stream")) {
        siteMap = parser.parseSiteMap(content, sURL);
    } else {
        siteMap = parser.parseSiteMap(contentType, content, sURL);
    }
    long end = System.currentTimeMillis();
    averagedMetrics.update(end - start);

    // The cutoff is identical for every entry of this sitemap: compute it once
    // instead of once per entry. It is null when the filter is disabled.
    Date oldestAllowed = oldestAllowedModification();

    List<Outlink> links = new ArrayList<>();

    if (siteMap.isIndex()) {
        SiteMapIndex smi = (SiteMapIndex) siteMap;

        // delay to apply to the next sub-sitemap kept; the first one gets none
        int delay = 0;

        // keep the subsitemaps as outlinks
        // they will be fetched and parsed in the following steps
        for (AbstractSiteMap asm : smi.getSitemaps()) {
            String target = asm.getUrl().toExternalForm();
            Date lastModified = asm.getLastModified();

            // filter based on the published date
            if (isModifiedTooLongAgo(target, lastModified, oldestAllowed)) {
                continue;
            }

            String lastModifiedValue = lastModified == null ? "" : lastModified.toString();

            Outlink ol =
                    filterOutlink(
                            sURL,
                            target,
                            parentMetadata,
                            isSitemapKey,
                            "true",
                            "sitemap.lastModified",
                            lastModifiedValue);
            if (ol == null) {
                continue;
            }

            // add a delay, growing with each sub-sitemap kept
            if (this.scheduleSitemapsWithDelay > 0) {
                if (delay > 0) {
                    ol.getMetadata()
                            .setValue(DefaultScheduler.DELAY_METADATA, Integer.toString(delay));
                }
                delay += this.scheduleSitemapsWithDelay;
            }

            links.add(ol);
            LOG.debug("{} : [sitemap] {}", url, target);
        }
    }
    // sitemap files
    else {
        SiteMap sm = (SiteMap) siteMap;
        // TODO see what we can do with the LastModified info
        // TODO handle priority (smurl.getPriority()) in metadata
        // TODO convert the frequency (smurl.getChangeFrequency()) into a
        // numerical value and handle it in metadata
        for (SiteMapURL smurl : sm.getSiteMapUrls()) {
            String target = smurl.getUrl().toExternalForm();
            Date lastModified = smurl.getLastModified();

            // filter based on the published date
            if (isModifiedTooLongAgo(target, lastModified, oldestAllowed)) {
                continue;
            }

            String lastModifiedValue = lastModified == null ? "" : lastModified.toString();

            Outlink ol =
                    filterOutlink(
                            sURL,
                            target,
                            parentMetadata,
                            isSitemapKey,
                            "false",
                            "sitemap.lastModified",
                            lastModifiedValue);
            if (ol == null) {
                continue;
            }

            parseExtensionAttributes(smurl, ol.getMetadata());
            links.add(ol);
            LOG.debug("{} : [sitemap] {}", url, target);
        }
    }

    return links;
}

/**
 * Returns the oldest last modified date an entry may have to be kept, i.e. the current time
 * minus {@code filterHoursSinceModified} hours, or {@code null} when the filter is disabled
 * ({@code filterHoursSinceModified == -1}).
 */
private Date oldestAllowedModification() {
    if (filterHoursSinceModified == -1) {
        return null;
    }
    Calendar cutoff = Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT);
    cutoff.add(Calendar.HOUR, -filterHoursSinceModified);
    return cutoff.getTime();
}

/**
 * Tells whether an entry must be dropped because it was last modified before the cutoff, and
 * logs it if so. Entries without a last modified date are never dropped, and nothing is
 * dropped when {@code oldestAllowed} is {@code null} (filter disabled).
 */
private boolean isModifiedTooLongAgo(String target, Date lastModified, Date oldestAllowed) {
    if (lastModified == null || oldestAllowed == null) {
        return false;
    }
    if (!lastModified.before(oldestAllowed)) {
        return false;
    }
    LOG.info(
            "{} has a modified date {} which is more than {} hours old",
            target,
            lastModified,
            filterHoursSinceModified);
    return true;
}