public void filter()

in core/src/main/java/org/apache/stormcrawler/jsoup/LinkParseFilter.java [61:123]


    /**
     * Evaluates the configured expressions against the document and adds the URLs they yield to
     * the outlinks of the parse result.
     *
     * <p>Each extracted value is resolved against the source URL, passed through the URL filters
     * and, unless it is rejected or already known, turned into an {@code Outlink} whose metadata
     * is derived from the metadata of the source document. Outlinks already present in {@code
     * parse} are kept; the result holds at most one outlink per target URL.
     *
     * <p>A failure while evaluating one expression, or while handling one extracted link, is
     * logged and does not prevent the remaining expressions and links from being processed. If
     * {@code URL} itself is malformed, an error is logged and the outlinks are left untouched.
     *
     * @param URL the URL of the document being parsed
     * @param content the raw content of the document (not used by this method)
     * @param doc the document the expressions are evaluated against
     * @param parse the parse result whose outlinks are replaced with the merged, deduplicated set
     */
    public void filter(
            String URL, byte[] content, org.jsoup.nodes.Document doc, ParseResult parse) {

        ParseData parseData = parse.get(URL);
        Metadata metadata = parseData.getMetadata();

        // keyed by target URL so that every target is kept only once
        Map<String, Outlink> dedup = new HashMap<>();

        for (Outlink o : parse.getOutlinks()) {
            dedup.put(o.getTargetURL(), o);
        }

        java.net.URL sourceUrl;
        try {
            sourceUrl = new URL(URL);
        } catch (MalformedURLException e1) {
            // we would have known by now as previous components check whether
            // the URL is valid
            LOG.error("MalformedURLException on {}", URL);
            return;
        }

        // applies the expressions in the order in which they are produced
        for (List<LabelledExpression> leList : expressions.values()) {
            for (LabelledExpression le : leList) {
                List<String> values;
                try {
                    values = le.evaluate(doc);
                } catch (Exception e) {
                    // the exception is passed as the last argument, without a
                    // placeholder, so that the logger reports it with its stack trace
                    LOG.error("Error evaluating {}", le.key, e);
                    continue;
                }
                if (values == null || values.isEmpty()) {
                    continue;
                }

                for (String value : values) {
                    if (value == null) {
                        continue;
                    }
                    // handled per link: one value which cannot be resolved or
                    // filtered must not discard the remaining values
                    try {
                        // resolve URL
                        String resolved =
                                URLUtil.resolveURL(sourceUrl, value).toExternalForm();

                        // apply filtering
                        String target = urlFilters.filter(sourceUrl, metadata, resolved);
                        if (target == null) {
                            continue;
                        }

                        // check whether we already have this link
                        if (dedup.containsKey(target)) {
                            continue;
                        }

                        // create outlink
                        Outlink ol = new Outlink(target);

                        // get the metadata for the outlink from the parent one
                        Metadata metadataOL =
                                metadataTransfer.getMetaForOutlink(target, URL, metadata);

                        ol.setMetadata(metadataOL);
                        dedup.put(ol.getTargetURL(), ol);
                    } catch (Exception e) {
                        LOG.error("Error processing link {} found by {}", value, le.key, e);
                    }
                }
            }
        }

        parse.setOutlinks(new ArrayList<>(dedup.values()));
    }