in core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java [484:544]
/**
 * Turns the raw links extracted from a page into a de-duplicated list of {@link Outlink}s.
 *
 * <p>Each key of {@code slinks} is passed to {@code filterOutlink} together with the source URL
 * and the parent metadata; a {@code null} result means the link is dropped and counted under
 * {@code outlink_filtered}. Surviving links are keyed by their target URL, so two raw links
 * that end up with the same target are merged into a single outlink. When anchor tracking is
 * enabled, the anchors of every merged entry are appended to the outlink metadata under
 * {@code ANCHORS_KEY_NAME}, and the outlink's anchor is set to the first anchor of the entry
 * processed most recently.
 *
 * <p>If {@code maxOutlinksPerPage} is non-negative, processing stops as soon as that many
 * distinct outlinks have been collected.
 *
 * @param url the URL of the page the links were found on
 * @param metadata the metadata of the source page, handed to {@code filterOutlink}
 * @param slinks raw target URLs mapped to the anchor texts found for them
 * @return a new list of the outlinks kept; empty if {@code slinks} is empty or {@code url}
 *     cannot be parsed (the latter is counted under {@code error_invalid_source_url})
 */
protected List<Outlink> toOutlinks(
        String url, Metadata metadata, Map<String, List<String>> slinks) {
    if (slinks.isEmpty()) {
        return new LinkedList<>();
    }

    final URL pageUrl;
    try {
        pageUrl = new URL(url);
    } catch (MalformedURLException e) {
        // not expected in practice: earlier components already check
        // that the URL is valid
        LOG.error("MalformedURLException on {}", url);
        eventCounter.scope("error_invalid_source_url").incrBy(1);
        return new LinkedList<>();
    }

    // outlinks kept so far, indexed by their (possibly normalised) target URL
    final Map<String, Outlink> keptByTarget = new HashMap<>();

    for (Map.Entry<String, List<String>> entry : slinks.entrySet()) {
        // a negative maximum means "no limit"
        final boolean limitReached =
                maxOutlinksPerPage >= 0 && keptByTarget.size() >= maxOutlinksPerPage;
        if (limitReached) {
            LOG.info(
                    "Found {} unique links for {} trimming to {}",
                    slinks.size(),
                    url,
                    maxOutlinksPerPage);
            break;
        }

        final Outlink candidate = filterOutlink(pageUrl, entry.getKey(), metadata);
        if (candidate == null) {
            eventCounter.scope("outlink_filtered").incr();
            continue;
        }

        // after filtering, several raw links may point at the same target;
        // in that case keep working on the outlink registered first
        final Outlink existing = keptByTarget.get(candidate.getTargetURL());
        final boolean isNew = existing == null;
        final Outlink outlink = isNew ? candidate : existing;

        final List<String> anchors = entry.getValue();
        if (trackAnchors && !anchors.isEmpty()) {
            outlink.getMetadata().addValues(ANCHORS_KEY_NAME, anchors);
            // the first anchor of this entry becomes the outlink's anchor
            outlink.setAnchor(anchors.get(0));
        }

        if (isNew) {
            keptByTarget.put(outlink.getTargetURL(), outlink);
            eventCounter.scope("outlink_kept").incr();
        }
    }

    return new LinkedList<>(keptByTarget.values());
}