in core/src/main/java/org/apache/stormcrawler/jsoup/LinkParseFilter.java [61:123]
/**
 * Evaluates the configured expressions against {@code doc} and adds the links they yield to
 * the outlinks of {@code parse}.
 *
 * <p>Every value returned by an expression is resolved against {@code URL}, passed through the
 * URL filters and, unless it is rejected or an outlink with the same target URL already
 * exists, turned into a new {@link Outlink} whose metadata is obtained from the metadata
 * transfer using the metadata of the source document. The outlinks of {@code parse} are then
 * replaced by the deduplicated set (pre-existing outlinks plus the new ones).
 *
 * <p>A value that cannot be resolved into a URL is logged and skipped; it does not prevent the
 * remaining values of the same expression from being processed. If {@code URL} itself is
 * malformed the method logs an error and returns without modifying {@code parse}.
 *
 * @param URL the URL of the document being parsed, used as the base for resolving links
 * @param content the raw content of the document; not used by this method
 * @param doc the jsoup document the expressions are evaluated against
 * @param parse the parse result whose outlinks are read and then replaced
 */
public void filter(
        String URL, byte[] content, org.jsoup.nodes.Document doc, ParseResult parse) {

    ParseData parseData = parse.get(URL);
    Metadata metadata = parseData.getMetadata();

    // index the outlinks found so far by target URL so that we never emit
    // two outlinks pointing to the same target
    Map<String, Outlink> dedup = new HashMap<>();
    for (Outlink o : parse.getOutlinks()) {
        dedup.put(o.getTargetURL(), o);
    }

    java.net.URL sourceUrl;
    try {
        sourceUrl = new URL(URL);
    } catch (MalformedURLException e1) {
        // we would have known by now as previous components check whether
        // the URL is valid
        LOG.error("MalformedURLException on {}", URL);
        return;
    }

    // applies the expressions in the order in which they are produced
    for (List<LabelledExpression> leList : expressions.values()) {
        for (LabelledExpression le : leList) {
            try {
                List<String> values = le.evaluate(doc);
                if (values == null || values.isEmpty()) {
                    continue;
                }
                for (String value : values) {
                    // resolve URL; a single unresolvable link must not
                    // discard the remaining links of this expression
                    String target;
                    try {
                        target = URLUtil.resolveURL(sourceUrl, value).toExternalForm();
                    } catch (Exception e) {
                        LOG.warn(
                                "Could not resolve outlink {} found in {}: {}",
                                value,
                                URL,
                                e.toString());
                        continue;
                    }

                    // apply filtering
                    target = urlFilters.filter(sourceUrl, metadata, target);
                    if (target == null) {
                        continue;
                    }

                    // check whether we already have this link
                    if (dedup.containsKey(target)) {
                        continue;
                    }

                    // create outlink
                    Outlink ol = new Outlink(target);

                    // get the metadata for the outlink from the parent one
                    Metadata metadataOL =
                            metadataTransfer.getMetaForOutlink(target, URL, metadata);
                    ol.setMetadata(metadataOL);
                    dedup.put(ol.getTargetURL(), ol);
                }
            } catch (Exception e) {
                // the exception is passed as the trailing argument (no
                // placeholder) so that the logger reports it with its stack trace
                LOG.error("Error evaluating {}", le.key, e);
            }
        }
    }

    parse.setOutlinks(new ArrayList<>(dedup.values()));
}