protected List<Outlink> toOutlinks()

in core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java [484:544]


    /**
     * Converts the raw links extracted from a page into filtered, de-duplicated outlinks.
     *
     * <p>Every key of {@code slinks} is handed to {@code filterOutlink} together with the parsed
     * source URL and the metadata. Links for which it returns {@code null} are dropped and
     * counted under {@code outlink_filtered}. Links that end up with the same target URL are
     * merged into a single {@code Outlink}. When {@code maxOutlinksPerPage} is non-negative,
     * processing stops as soon as that many distinct outlinks have been collected.
     *
     * @param url source URL against which the links are filtered
     * @param metadata metadata passed to {@code filterOutlink} for every link
     * @param slinks target URLs mapped to their anchors; {@code null} is treated as empty
     * @return a new mutable list of the retained outlinks, in no particular order; empty when
     *     there are no links or when {@code url} cannot be parsed
     */
    protected List<Outlink> toOutlinks(
            String url, Metadata metadata, Map<String, List<String>> slinks) {

        if (slinks == null || slinks.isEmpty()) {
            return new LinkedList<>();
        }

        URL sourceUrl;
        try {
            sourceUrl = new URL(url);
        } catch (MalformedURLException e) {
            // we would have known by now as previous components check whether
            // the URL is valid; keep the exception so the cause is not lost
            LOG.error("MalformedURLException on {}", url, e);
            eventCounter.scope("error_invalid_source_url").incr();
            return new LinkedList<>();
        }

        // keyed by the target URL of the filtered outlink, so that distinct raw
        // links leading to the same target are merged
        final Map<String, Outlink> outlinks = new HashMap<>();

        for (Map.Entry<String, List<String>> linkEntry : slinks.entrySet()) {

            // got enough
            if (maxOutlinksPerPage >= 0 && outlinks.size() >= maxOutlinksPerPage) {
                LOG.info(
                        "Found {} unique links for {} trimming to {}",
                        slinks.size(),
                        url,
                        maxOutlinksPerPage);
                break;
            }

            Outlink ol = filterOutlink(sourceUrl, linkEntry.getKey(), metadata);
            if (ol == null) {
                eventCounter.scope("outlink_filtered").incr();
                continue;
            }

            // the same link could already be there post-normalisation, in which
            // case the anchors are attached to the instance registered first
            final Outlink previous = outlinks.putIfAbsent(ol.getTargetURL(), ol);
            if (previous == null) {
                eventCounter.scope("outlink_kept").incr();
            } else {
                ol = previous;
            }

            final List<String> anchors = linkEntry.getValue();
            if (trackAnchors && anchors != null && !anchors.isEmpty()) {
                ol.getMetadata().addValues(ANCHORS_KEY_NAME, anchors);
                // sets the first anchor of this entry; for a merged duplicate this
                // replaces the anchor set by an earlier entry
                // NOTE(review): confirm whether the earlier anchor should be kept
                ol.setAnchor(anchors.get(0));
            }
        }

        return new LinkedList<>(outlinks.values());
    }