in src/java/org/apache/nutch/scoring/webgraph/WebGraph.java [346:410]
public void reduce(Text key, Iterable<NutchWritable> values,
    Context context)
    throws IOException, InterruptedException {
  // Emits the outlinks recorded for the page identified by `key`.
  //
  // Pass 1 buffers a copy of every LinkDatum found in `values` and tracks
  // the timestamp that identifies the most recent fetch. If a
  // BooleanWritable holding `true` is encountered, the page is treated as
  // gone and the method returns without writing anything.
  //
  // Pass 2 writes only the links that carry that timestamp and that pass
  // the limitPages / limitDomains / ignoreHost / ignoreDomain filters.

  // ---- pass 1: buffer links, find the timestamp of the latest fetch ----
  long newestTimestamp = 0L;
  List<LinkDatum> collected = new ArrayList<>();

  for (NutchWritable wrapped : values) {
    final Writable payload = wrapped.get();

    if (payload instanceof LinkDatum) {
      LinkDatum link = (LinkDatum) payload;
      long linkTimestamp = link.getTimestamp();

      // 0L doubles as the "nothing seen yet" marker, so the first link
      // always replaces it; afterwards only a strictly larger value does.
      boolean nothingSeenYet = (newestTimestamp == 0L);
      if (nothingSeenYet || linkTimestamp > newestTimestamp) {
        newestTimestamp = linkTimestamp;
      }

      // Buffer a copy rather than the iterated instance (presumably
      // because the framework may reuse value objects -- TODO confirm).
      collected.add(WritableUtils.clone(link, conf));
      context.getCounter("WebGraph.outlinks", "added links").increment(1);
      continue;
    }

    if (payload instanceof BooleanWritable) {
      // According to the original author's note the mapper only emits
      // this marker when it is true; the flag is still checked here.
      boolean pageGone = ((BooleanWritable) payload).get();
      if (pageGone) {
        // The page is gone: emit none of its outlinks.
        context.getCounter("WebGraph.outlinks", "removed links").increment(1);
        return;
      }
    }
  }

  // ---- pass 2: filter the buffered links and write the survivors ----
  String fromUrl = key.toString();
  String fromDomain = URLUtil.getDomainName(fromUrl);
  String fromHost = URLUtil.getHost(fromUrl);

  // Domains and pages already written for this key; consulted only when
  // limitDomains / limitPages are enabled, but always updated.
  Set<String> seenDomains = new HashSet<>();
  Set<String> seenPages = new HashSet<>();

  for (LinkDatum outlink : collected) {
    String targetUrl = outlink.getUrl();
    String targetDomain = URLUtil.getDomainName(targetUrl);
    String targetHost = URLUtil.getHost(targetUrl);
    String targetPage = URLUtil.getPage(targetUrl);
    outlink.setLinkType(LinkDatum.OUTLINK);

    // Only links from the most recent fetch are eligible.
    if (outlink.getTimestamp() != newestTimestamp) {
      continue;
    }
    // At most one link per target page when limitPages is set.
    if (limitPages && seenPages.contains(targetPage)) {
      continue;
    }
    // At most one link per target domain when limitDomains is set.
    if (limitDomains && seenDomains.contains(targetDomain)) {
      continue;
    }
    // Drop links that stay on the source host when ignoreHost is set.
    if (ignoreHost && targetHost.equalsIgnoreCase(fromHost)) {
      continue;
    }
    // Drop links that stay on the source domain when ignoreDomain is set.
    if (ignoreDomain && targetDomain.equalsIgnoreCase(fromDomain)) {
      continue;
    }

    context.write(key, outlink);
    seenPages.add(targetPage);
    seenDomains.add(targetDomain);
  }
}