in src/java/org/apache/nutch/parse/ParseOutputFormat.java [401:467]
/**
 * Decides whether an outlink is kept and, if so, returns it after URL
 * normalization and filtering.
 *
 * <p>
 * The link is dropped ({@code null} is returned) when:
 * <ul>
 * <li>{@code toUrl} equals {@code fromUrl};</li>
 * <li>internal/external link checks are enabled and {@code toUrl} is not a
 * well-formed URL;</li>
 * <li>{@code ignoreExternalLinks} is set and the target's host (or domain, in
 * {@code "bydomain"} mode) differs from {@code origin}, unless in host mode
 * {@code exemptionFilters} exempts the link;</li>
 * <li>{@code ignoreInternalLinks} is set and the target's host (or domain)
 * equals {@code origin}, or cannot be determined;</li>
 * <li>normalization or filtering yields {@code null} or throws.</li>
 * </ul>
 *
 * @param fromUrl
 *          URL of the page containing the link; must not be {@code null}
 * @param toUrl
 *          URL the link points to
 * @param origin
 *          value the target's lower-cased host (or domain, in
 *          {@code "bydomain"} mode) is compared against
 * @param ignoreInternalLinks
 *          if true, links whose host/domain equals {@code origin} are dropped
 * @param ignoreExternalLinks
 *          if true, links whose host/domain differs from {@code origin} are
 *          dropped
 * @param ignoreExternalLinksMode
 *          {@code "bydomain"} (case-insensitive) to compare domains; any
 *          other value, including {@code null}, compares hosts
 * @param filters
 *          URL filters applied after normalization; may be {@code null}
 * @param exemptionFilters
 *          consulted only in host mode for external links; may be
 *          {@code null}, in which case no external link is exempted
 * @param normalizers
 *          URL normalizers applied first; may be {@code null}
 * @param urlNormalizerScope
 *          scope passed to {@code normalizers.normalize}
 * @return the normalized and filtered URL, or {@code null} if the link is to
 *         be skipped
 */
public static String filterNormalize(String fromUrl, String toUrl,
    String origin, boolean ignoreInternalLinks, boolean ignoreExternalLinks,
    String ignoreExternalLinksMode, URLFilters filters,
    URLExemptionFilters exemptionFilters, URLNormalizers normalizers,
    String urlNormalizerScope) {
  // ignore links to self
  if (fromUrl.equals(toUrl)) {
    return null;
  }

  if (ignoreExternalLinks || ignoreInternalLinks) {
    URL targetURL;
    try {
      targetURL = new URL(toUrl);
    } catch (MalformedURLException e) {
      return null; // skip it
    }

    boolean byDomain = "bydomain".equalsIgnoreCase(ignoreExternalLinksMode);

    // Resolve the comparison key once. The null check happens BEFORE
    // lower-casing so that a missing host/domain is handled as "unknown"
    // instead of raising a NullPointerException.
    String target = byDomain ? URLUtil.getDomainName(targetURL)
        : targetURL.getHost();
    if (target != null) {
      // NOTE(review): default-locale toLowerCase() is kept on purpose;
      // origin is presumably lower-cased the same way by the caller --
      // confirm before switching both sides to Locale.ROOT.
      target = target.toLowerCase();
    }
    boolean internal = target != null && target.equals(origin);

    if (ignoreExternalLinks && !internal) {
      // Exemptions are only honoured in host mode; in "bydomain" mode every
      // link outside the origin domain is dropped.
      boolean exempted = !byDomain && exemptionFilters != null
          && exemptionFilters.isExempted(fromUrl, toUrl);
      if (!exempted) {
        return null; // skip it, this external url is not exempted
      }
    }

    // An undeterminable host/domain is skipped as well when internal links
    // are ignored.
    if (ignoreInternalLinks && (internal || target == null)) {
      return null; // skip it
    }
  }

  try {
    if (normalizers != null) {
      toUrl = normalizers.normalize(toUrl, urlNormalizerScope); // normalize
    }
    if (filters != null) {
      toUrl = filters.filter(toUrl); // filter the url
    }
  } catch (Exception e) {
    // Best effort: a link that cannot be normalized or filtered is skipped
    // rather than failing the caller.
    return null;
  }
  // may be null when normalization or filtering rejected the url
  return toUrl;
}