in core/src/main/java/org/apache/stormcrawler/filtering/basic/BasicURLNormalizer.java [329:386]
/**
 * Rewrites the escape sequences found in a URL path in two passes.
 *
 * <p>Pass one replaces every match of {@code illegalEscapePattern} with the single character
 * whose code is the hexadecimal value of the match's first capture group. Pass two visits
 * every match of {@code unescapeRulePattern} in the resulting string: when the hexadecimal
 * value of its first capture group is below 128 and flagged in {@code unescapedCharacters},
 * the match is replaced by that character; otherwise the matched text is kept but
 * upper-cased using {@link Locale#ROOT}.
 *
 * <p>NOTE(review): the exact syntax both patterns accept is defined outside this method —
 * confirm against their declarations.
 *
 * @param path the path to process
 * @return the rewritten path; when pass two finds no match, the output of pass one (the
 *     very same {@code path} instance if pass one found nothing either)
 */
private String unescapePath(String path) {
    // Pass 1: substitute each illegal escape with the character it denotes.
    Matcher illegal = illegalEscapePattern.matcher(path);
    if (illegal.find()) {
        StringBuilder decoded = new StringBuilder();
        int copiedUpTo = 0;
        do {
            // Copy the text between the previous match and this one verbatim.
            decoded.append(path, copiedUpTo, illegal.start());
            decoded.append((char) Integer.parseInt(illegal.group(1), 16));
            copiedUpTo = illegal.end();
        } while (illegal.find());
        // Copy the tail that follows the last match.
        decoded.append(path, copiedUpTo, path.length());
        path = decoded.toString();
    }

    // Pass 2: decode the escapes that are allowed to appear literally and
    // upper-case all the others.
    Matcher escape = unescapeRulePattern.matcher(path);
    boolean found = escape.find();
    if (!found) {
        return path;
    }

    StringBuilder result = new StringBuilder();
    int cursor = 0;
    while (found) {
        // Copy the text between the previous match and this one verbatim.
        result.append(path, cursor, escape.start());

        // Numeric value of the hexadecimal digits captured by group 1.
        int code = Integer.parseInt(escape.group(1), 16);
        boolean keepEscaped = code >= 128 || !unescapedCharacters[code];
        if (keepEscaped) {
            // Leave the sequence in place, normalised to upper case.
            result.append(escape.group().toUpperCase(Locale.ROOT));
        } else {
            // This character is flagged as one to emit unescaped.
            result.append((char) code);
        }

        cursor = escape.end();
        found = escape.find();
    }
    // Copy the tail that follows the last match.
    result.append(path, cursor, path.length());
    return result.toString();
}