in core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java [207:251]
/**
 * Checks {@code url} against the configured rule scopes, from the most specific to the least
 * specific, and stops at the first scope for which {@code checkScope} reports a match.
 *
 * <p>The scopes are consulted in this order:
 *
 * <ol>
 *   <li>the rules registered for the exact host name of the URL;
 *   <li>the rules registered for each suffix of the host name, shortest first (for {@code
 *       www.example.com}: {@code com}, {@code example.com}, {@code www.example.com});
 *   <li>every metadata scope whose key is present in {@code metadata} with a value equal
 *       (ignoring case) to the scope's value;
 *   <li>the global rules.
 * </ol>
 *
 * @param url the URL to check, in a form accepted by {@link URL#URL(String)}
 * @param metadata metadata compared against the metadata-scoped rules; when {@code null} the
 *     metadata-scoped rules are skipped and the remaining scopes are still evaluated
 * @return {@code true} if any of the scopes matches the URL, {@code false} otherwise
 * @throws MalformedURLException if {@code url} cannot be parsed
 */
public boolean filter(String url, Metadata metadata) throws MalformedURLException {
    final URL u = new URL(url);

    // first try the full hostname
    final String hostname = u.getHost();
    if (checkScope(hostNameRules.get(hostname), u)) {
        return true;
    }

    // then on the various components of the domain: the candidate is grown from the
    // right-most label towards the full host name, one label per iteration
    final String[] domainParts = hostname.split("\\.");
    String domain = null;
    for (int i = domainParts.length - 1; i >= 0; i--) {
        domain = domain == null ? domainParts[i] : domainParts[i] + "." + domain;
        if (checkScope(domainRules.get(domain), u)) {
            return true;
        }
    }

    // check on parent's URL metadata; without metadata there is nothing to compare the
    // metadata scopes against, so they are skipped instead of failing on a null reference
    if (metadata != null) {
        for (MDScope scope : metadataRules) {
            final String[] vals = metadata.getValues(scope.getKey());
            if (vals == null) {
                continue;
            }
            for (String v : vals) {
                if (v.equalsIgnoreCase(scope.getValue())) {
                    // logged when the metadata pair matches, before the scope's rules
                    // have been evaluated against the URL
                    FastURLFilter.LOG.debug(
                            "Filtering {} matching metadata {}:{}",
                            url,
                            scope.getKey(),
                            scope.getValue());
                    if (checkScope(scope, u)) {
                        return true;
                    }
                }
            }
        }
    }

    // last resort: the rules that apply to every URL
    return checkScope(globalRules, u);
}