in src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java [221:311]
/**
 * Checks whether an authority string (host, optional port, nothing else) is
 * acceptable.
 * <p>
 * The authority is accepted only if all of the following hold:
 * <ul>
 * <li>it is non-null and matches {@code AUTHORITY_PATTERN};</li>
 * <li>its host group is either a dotted-quad matching
 * {@code IP_V4_DOMAIN_PATTERN} whose four components are each at most 255, or
 * a name matching {@code DOMAIN_PATTERN} that consists of at least two atoms
 * and whose last atom is at least two characters long and starts with a
 * character accepted by {@code ALPHA_PATTERN};</li>
 * <li>its port group, when present, matches {@code PORT_PATTERN};</li>
 * <li>its trailing "extra" group is blank or null.</li>
 * </ul>
 *
 * @param authority
 *          the authority value to validate; {@code null} is rejected
 * @return {@code true} if the authority passes every check above,
 *         {@code false} otherwise
 */
private boolean isValidAuthority(String authority) {
  if (authority == null) {
    return false;
  }

  Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authority);
  if (!authorityMatcher.matches()) {
    return false;
  }

  String hostIP = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP);
  if (hostIP == null) {
    // Defensive: a non-participating group yields null, which would make
    // Pattern.matcher(...) throw instead of reporting an invalid authority.
    return false;
  }

  // Decide whether the host part is an IPv4 address or a hostname.
  Matcher ipV4Matcher = IP_V4_DOMAIN_PATTERN.matcher(hostIP);
  if (ipV4Matcher.matches()) {
    // IP address: every one of the four components must be in 0..255.
    for (int i = 1; i <= 4; i++) {
      String ipSegment = ipV4Matcher.group(i);
      if (ipSegment == null || ipSegment.isEmpty()) {
        return false;
      }
      try {
        if (Integer.parseInt(ipSegment) > 255) {
          return false;
        }
      } catch (NumberFormatException e) {
        return false;
      }
    }
  } else {
    // Neither an IPv4 address nor a well-formed hostname: reject.
    if (!DOMAIN_PATTERN.matcher(hostIP).matches()) {
      return false;
    }

    // Walk the atoms of the hostname. Only the number of atoms and the last
    // one (the top-level label) are needed, so no intermediate array is
    // kept; this avoids any index-out-of-bounds risk from a pre-sized array.
    String topLevel = null;
    int segCount = 0;
    Matcher atomMatcher = ATOM_PATTERN.matcher(hostIP);
    while (atomMatcher.find()) {
      topLevel = atomMatcher.group();
      segCount++;
    }

    // Make sure there's a host name preceding the top-level label. Checked
    // first so that topLevel is guaranteed non-null below.
    if (segCount < 2) {
      return false;
    }
    if (topLevel.length() < 2) {
      return false;
    }
    // The rightmost label never starts with a digit: its first character
    // must be alphabetic.
    if (!ALPHA_PATTERN.matcher(topLevel.substring(0, 1)).matches()) {
      return false;
    }
  }

  String port = authorityMatcher.group(PARSE_AUTHORITY_PORT);
  if (port != null && !PORT_PATTERN.matcher(port).matches()) {
    return false;
  }

  // Anything left over after host and port makes the authority invalid.
  String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA);
  return isBlankOrNull(extra);
}