private List parseSiteMap()

in core/src/main/java/org/apache/stormcrawler/bolt/SiteMapParserBolt.java [182:315]


    /**
     * Parses the content of a sitemap document and converts its entries into outlinks.
     *
     * <p>The content is handed to {@code parser}; when {@code contentType} is blank or contains
     * {@code octet-stream} the parser is left to detect the format itself. The time spent parsing
     * is reported to {@code averagedMetrics}.
     *
     * <ul>
     *   <li>For a sitemap index, every sub-sitemap becomes an outlink created with {@code
     *       isSitemapKey} set to {@code "true"}. If {@code scheduleSitemapsWithDelay} is positive,
     *       every retained sub-sitemap after the first one gets an increasing {@link
     *       DefaultScheduler#DELAY_METADATA} value (multiples of {@code
     *       scheduleSitemapsWithDelay}).
     *   <li>For a regular sitemap, every URL becomes an outlink created with {@code isSitemapKey}
     *       set to {@code "false"} and is passed to {@code parseExtensionAttributes}.
     * </ul>
     *
     * <p>In both cases the last-modified date, when present, is stored under {@code
     * sitemap.lastModified} (empty string otherwise). Unless {@code filterHoursSinceModified} is
     * {@code -1}, entries whose last-modified date is older than that many hours are skipped;
     * entries without a last-modified date are never skipped on that basis. Entries for which
     * {@code filterOutlink} returns {@code null} are skipped as well.
     *
     * @param url URL of the sitemap document, also used as the source URL of the outlinks
     * @param content raw bytes of the sitemap document
     * @param contentType content type reported for the document, may be blank
     * @param parentMetadata metadata of the sitemap document, forwarded to {@code filterOutlink}
     * @return the outlinks which passed the date and outlink filters, possibly empty
     * @throws UnknownFormatException declared by this method; presumably raised by the sitemap
     *     parser when the format is not recognised
     * @throws IOException if {@code url} is malformed; may also be propagated from the parser
     */
    private List<Outlink> parseSiteMap(
            String url, byte[] content, String contentType, Metadata parentMetadata)
            throws UnknownFormatException, IOException {

        URL sURL = new URL(url);
        long start = System.currentTimeMillis();
        AbstractSiteMap siteMap;
        // let the parser guess what the mimetype is
        if (StringUtils.isBlank(contentType) || contentType.contains("octet-stream")) {
            siteMap = parser.parseSiteMap(content, sURL);
        } else {
            siteMap = parser.parseSiteMap(contentType, content, sURL);
        }
        long end = System.currentTimeMillis();
        averagedMetrics.update(end - start);

        // The cutoff is the same for every entry of the document: compute it once instead of
        // once per entry. It stays null when the date filter is disabled (-1).
        Date cutoff = null;
        if (filterHoursSinceModified != -1) {
            Calendar rightNow = Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT);
            rightNow.add(Calendar.HOUR, -filterHoursSinceModified);
            cutoff = rightNow.getTime();
        }

        List<Outlink> links = new ArrayList<>();

        if (siteMap.isIndex()) {
            SiteMapIndex smi = (SiteMapIndex) siteMap;

            int delay = 0;

            // keep the subsitemaps as outlinks
            // they will be fetched and parsed in the following steps
            for (AbstractSiteMap asm : smi.getSitemaps()) {
                String target = asm.getUrl().toExternalForm();

                Date lastModified = asm.getLastModified();
                // filter based on the published date
                if (isOlderThanCutoff(target, lastModified, cutoff)) {
                    continue;
                }
                String lastModifiedValue = lastModified == null ? "" : lastModified.toString();

                Outlink ol =
                        filterOutlink(
                                sURL,
                                target,
                                parentMetadata,
                                isSitemapKey,
                                "true",
                                "sitemap.lastModified",
                                lastModifiedValue);
                if (ol == null) {
                    continue;
                }

                // add a delay: the first retained sub-sitemap is left without one, each
                // following one is pushed back by a further scheduleSitemapsWithDelay
                if (this.scheduleSitemapsWithDelay > 0) {
                    if (delay > 0) {
                        ol.getMetadata()
                                .setValue(DefaultScheduler.DELAY_METADATA, Integer.toString(delay));
                    }
                    delay += this.scheduleSitemapsWithDelay;
                }

                links.add(ol);
                LOG.debug("{} : [sitemap] {}", url, target);
            }
        }
        // sitemap files
        else {
            SiteMap sm = (SiteMap) siteMap;
            // TODO see what we can do with the LastModified info
            for (SiteMapURL smurl : sm.getSiteMapUrls()) {
                // TODO handle priority (smurl.getPriority()) in metadata
                // TODO convert the frequency (smurl.getChangeFrequency()) into a numerical
                // value and handle it in metadata

                String target = smurl.getUrl().toExternalForm();

                Date lastModified = smurl.getLastModified();
                // filter based on the published date
                if (isOlderThanCutoff(target, lastModified, cutoff)) {
                    continue;
                }
                String lastModifiedValue = lastModified == null ? "" : lastModified.toString();

                Outlink ol =
                        filterOutlink(
                                sURL,
                                target,
                                parentMetadata,
                                isSitemapKey,
                                "false",
                                "sitemap.lastModified",
                                lastModifiedValue);

                if (ol == null) {
                    continue;
                }
                parseExtensionAttributes(smurl, ol.getMetadata());
                links.add(ol);
                LOG.debug("{} : [sitemap] {}", url, target);
            }
        }

        return links;
    }

    /**
     * Tells whether a sitemap entry must be skipped because it was last modified before the
     * cutoff, logging the decision at INFO level when it is.
     *
     * @param target URL of the entry, used for logging only
     * @param lastModified last-modified date of the entry, {@code null} if it has none
     * @param cutoff oldest accepted date, {@code null} when the date filter is disabled
     * @return {@code true} only if both dates are set and {@code lastModified} is before {@code
     *     cutoff}
     */
    private boolean isOlderThanCutoff(String target, Date lastModified, Date cutoff) {
        if (lastModified == null || cutoff == null || !lastModified.before(cutoff)) {
            return false;
        }
        LOG.info(
                "{} has a modified date {} which is more than {} hours old",
                target,
                lastModified,
                filterHoursSinceModified);
        return true;
    }