private List parseFeed()

in core/src/main/java/org/apache/stormcrawler/bolt/FeedParserBolt.java [166:225]


    /**
     * Parses the raw content of a feed and converts its entries into outlinks.
     *
     * <p>For each entry the link is used as the target URL; when the link is blank the entry URI
     * (e.g. a guid) is used instead, and entries with neither are skipped. The target is passed
     * to {@code filterOutlink} and dropped when that returns {@code null}. Kept outlinks get the
     * metadata keys {@code feed.title}, {@code feed.publishedDate} and {@code feed.description}
     * whenever the corresponding value is available in the entry.
     *
     * <p>When {@code filterHoursSincePub} is not {@code -1}, entries whose published date is
     * older than that many hours are skipped. Entries without a published date are never
     * filtered on age.
     *
     * @param url URL of the feed, used as the source URL when filtering the outlinks
     * @param content raw bytes of the feed document
     * @param parentMetadata metadata of the feed document, handed to {@code filterOutlink}
     * @return the outlinks which survived the filtering, in feed order; possibly empty
     * @throws Exception propagated from the feed parsing or from the construction of the source
     *     URL
     */
    private List<Outlink> parseFeed(String url, byte[] content, Metadata parentMetadata)
            throws Exception {
        List<Outlink> links = new ArrayList<>();

        SyndFeed feed;
        try (ByteArrayInputStream is = new ByteArrayInputStream(content)) {
            SyndFeedInput input = new SyndFeedInput();
            feed = input.build(new InputSource(is));
        }

        URL sURL = new URL(url);

        // The age cutoff does not depend on the entry: compute it once so that all the
        // entries of a feed are compared against the same instant and no Calendar is
        // allocated per entry. Stays null when the age filter is disabled.
        Date oldestAllowed = null;
        if (filterHoursSincePub != -1) {
            Calendar cutoff = Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT);
            cutoff.add(Calendar.HOUR, -filterHoursSincePub);
            oldestAllowed = cutoff.getTime();
        }

        List<SyndEntry> entries = feed.getEntries();
        for (SyndEntry entry : entries) {
            String targetURL = entry.getLink();
            // the link can be missing, e.g. when the feed does not use links but guids:
            // fall back to the entry URI
            if (StringUtils.isBlank(targetURL)) {
                targetURL = entry.getUri();
                if (StringUtils.isBlank(targetURL)) {
                    continue;
                }
            }

            Outlink newLink = filterOutlink(sURL, targetURL, parentMetadata);
            if (newLink == null) {
                continue;
            }

            String title = entry.getTitle();
            if (StringUtils.isNotBlank(title)) {
                newLink.getMetadata().setValue("feed.title", title.trim());
            }

            Date publishedDate = entry.getPublishedDate();
            if (publishedDate != null) {
                // filter based on the published date
                if (oldestAllowed != null && publishedDate.before(oldestAllowed)) {
                    LOG.info(
                            "{} has a published date {} which is more than {} hours old",
                            targetURL,
                            publishedDate,
                            filterHoursSincePub);
                    continue;
                }
                newLink.getMetadata().setValue("feed.publishedDate", publishedDate.toString());
            }

            SyndContent description = entry.getDescription();
            if (description != null && StringUtils.isNotBlank(description.getValue())) {
                newLink.getMetadata().setValue("feed.description", description.getValue());
            }

            links.add(newLink);
        }

        return links;
    }