in src/java/org/apache/nutch/crawl/LinkDb.java [103:171]
public void map(Text key, ParseData parseData, Context context)
    throws IOException, InterruptedException {
  /*
   * Inverts the outlinks of one parsed page. The key is the URL of the page
   * the links come from; for every outlink that survives host filtering,
   * normalization and URL filtering, one record is written whose key is the
   * target URL and whose value holds a single inlink (source URL + anchor).
   * If the source URL itself is rejected, nothing is written for the page.
   */
  String sourceUrl = key.toString();
  // The host used for internal/external comparison is taken from the raw
  // key, before any normalization is applied.
  String sourceHost = getHost(sourceUrl);

  if (urlNormalizers != null) {
    try {
      // normalize the url
      sourceUrl = urlNormalizers.normalize(sourceUrl,
          URLNormalizers.SCOPE_LINKDB);
    } catch (Exception e) {
      LOG.warn("Skipping {} :", sourceUrl, e);
      return; // discard all outlinks
    }
  }
  if (sourceUrl == null) {
    return; // normalizer rejected the source: discard all outlinks
  }
  if (urlFilters != null) {
    try {
      // filter the url
      sourceUrl = urlFilters.filter(sourceUrl);
    } catch (Exception e) {
      LOG.warn("Skipping {} :", sourceUrl, e);
      return; // discard all outlinks
    }
    if (sourceUrl == null) {
      return; // filter rejected the source: discard all outlinks
    }
  }

  Outlink[] links = parseData.getOutlinks();
  // One container is reused for every record; it is emptied before each use.
  Inlinks collected = new Inlinks();

  for (Outlink link : links) {
    String targetUrl = link.getToUrl();

    if (ignoreInternalLinks || ignoreExternalLinks) {
      String targetHost = getHost(targetUrl);
      if (targetHost == null) {
        continue; // no host could be determined: skip in either mode
      }
      boolean sameHost = targetHost.equals(sourceHost);
      // When both flags are set, the internal-link rule takes precedence.
      boolean skip = ignoreInternalLinks ? sameHost : !sameHost;
      if (skip) {
        continue;
      }
    }

    if (urlNormalizers != null) {
      try {
        // normalize the url
        targetUrl = urlNormalizers.normalize(targetUrl,
            URLNormalizers.SCOPE_LINKDB);
      } catch (Exception e) {
        LOG.warn("Skipping {} :", targetUrl, e);
        continue;
      }
    }
    if (targetUrl == null) {
      continue;
    }
    if (urlFilters != null) {
      try {
        // filter the url
        targetUrl = urlFilters.filter(targetUrl);
      } catch (Exception e) {
        LOG.warn("Skipping {} :", targetUrl, e);
        continue;
      }
      if (targetUrl == null) {
        continue;
      }
    }

    collected.clear();
    // truncate long anchors
    String rawAnchor = link.getAnchor();
    String anchorText = rawAnchor.length() > maxAnchorLength
        ? rawAnchor.substring(0, maxAnchorLength)
        : rawAnchor;
    // collect inverted link
    collected.add(new Inlink(sourceUrl, anchorText));
    context.write(new Text(targetUrl), collected);
  }
}