in core/src/main/java/org/apache/stormcrawler/bolt/FeedParserBolt.java [166:225]
/**
 * Parses the raw bytes of a feed and converts its entries into outlinks.
 *
 * <p>For every entry the link is used as the target URL, falling back to the entry URI when
 * the link is blank. An entry is skipped when it has neither, when {@code filterOutlink}
 * returns {@code null} for it, or when {@code filterHoursSincePub} is not {@code -1} and the
 * entry's published date is more than that many hours in the past. For the entries that are
 * kept, the title, published date and description (when present) are stored in the outlink
 * metadata under {@code feed.title}, {@code feed.publishedDate} and {@code feed.description}.
 *
 * @param url URL of the feed; passed to {@code filterOutlink} as the source URL
 * @param content raw bytes of the feed document
 * @param parentMetadata metadata of the feed document, handed to {@code filterOutlink}
 * @return the outlinks that were kept, in the order of the feed entries; never {@code null}
 * @throws Exception if {@code url} cannot be turned into a {@link URL} or the feed cannot be
 *     built from {@code content}
 */
private List<Outlink> parseFeed(String url, byte[] content, Metadata parentMetadata)
        throws Exception {
    List<Outlink> links = new ArrayList<>();

    SyndFeed feed;
    try (ByteArrayInputStream is = new ByteArrayInputStream(content)) {
        SyndFeedInput input = new SyndFeedInput();
        feed = input.build(new InputSource(is));
    }

    URL sURL = new URL(url);

    // The cutoff does not depend on the entry, so compute it once for the whole feed
    // instead of once per entry; null means that filtering on the published date is
    // disabled.
    Date oldestAccepted = null;
    if (filterHoursSincePub != -1) {
        Calendar cutoff = Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT);
        cutoff.add(Calendar.HOUR, -filterHoursSincePub);
        oldestAccepted = cutoff.getTime();
    }

    List<SyndEntry> entries = feed.getEntries();
    for (SyndEntry entry : entries) {
        String targetURL = entry.getLink();
        // the link can be missing, e.g. when the feed uses guids instead of links
        if (StringUtils.isBlank(targetURL)) {
            targetURL = entry.getUri();
            if (StringUtils.isBlank(targetURL)) {
                continue;
            }
        }

        Outlink newLink = filterOutlink(sURL, targetURL, parentMetadata);
        if (newLink == null) {
            continue;
        }

        String title = entry.getTitle();
        if (StringUtils.isNotBlank(title)) {
            newLink.getMetadata().setValue("feed.title", title.trim());
        }

        Date publishedDate = entry.getPublishedDate();
        if (publishedDate != null) {
            // filter based on the published date
            if (oldestAccepted != null && publishedDate.before(oldestAccepted)) {
                LOG.info(
                        "{} has a published date {} which is more than {} hours old",
                        targetURL,
                        publishedDate,
                        filterHoursSincePub);
                continue;
            }
            newLink.getMetadata().setValue("feed.publishedDate", publishedDate.toString());
        }

        SyndContent description = entry.getDescription();
        if (description != null && StringUtils.isNotBlank(description.getValue())) {
            newLink.getMetadata().setValue("feed.description", description.getValue());
        }

        links.add(newLink);
    }

    return links;
}