in src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java [128:230]
/**
 * Parses the replacement configuration text and registers one
 * {@link FieldReplacer} per replacement line.
 *
 * <p>The text is split into lines with {@code LINE_SPLIT}; each line is
 * split into a name and a value with {@code NAME_VALUE_SPLIT}. Lines that
 * do not split, and lines whose value is three characters or shorter, are
 * skipped silently.</p>
 *
 * <ul>
 * <li>A line named {@code HOSTMATCH} sets the host pattern for the lines
 * that follow and clears any active URL pattern.</li>
 * <li>A line named {@code URLMATCH} sets the URL pattern for the lines that
 * follow.</li>
 * <li>Any other line is a replacement: the first character of the value is
 * the separator, followed by the pattern, the separator, the replacement
 * and, optionally, the separator and an integer flags value. A name of the
 * form {@code from:to} selects a different target field.</li>
 * </ul>
 *
 * <p>Each replacer is stored in {@code FIELDREPLACERS_BY_URL} under the
 * active URL pattern if there is one, otherwise in
 * {@code FIELDREPLACERS_BY_HOST} under the active host pattern.</p>
 *
 * @param propertyValue
 *          the raw configuration text; {@code null} or blank is a no-op
 */
private void parseConf(String propertyValue) {
  if (propertyValue == null || propertyValue.trim().isEmpty()) {
    return;
  }

  // Until a match line says otherwise, replacements apply to every host.
  Pattern currentHost = Pattern.compile(".*");
  Pattern currentUrl = null;

  Matcher lines = LINE_SPLIT.matcher(propertyValue);
  while (lines.find()) {
    final String line = lines.group();
    if (line == null || line.length() == 0) {
      continue;
    }

    // Separate the field name from its value.
    Matcher nameValue = NAME_VALUE_SPLIT.matcher(line.trim());
    if (!nameValue.find()) {
      continue;
    }
    String sourceField = nameValue.group(1).trim();
    String rest = nameValue.group(2);
    if (sourceField == null || rest == null) {
      continue;
    }

    // Special case: start of a new host section.
    if (HOSTMATCH.equals(sourceField)) {
      // A new host section always drops the previous URL restriction.
      currentUrl = null;
      try {
        currentHost = Pattern.compile(rest);
      } catch (PatternSyntaxException pse) {
        LOG.error("hostmatch pattern " + rest + " does not compile: "
            + pse.getMessage());
        // Park the following replacers under a pattern meant to match no
        // host, which deactivates this invalid section.
        currentHost = Pattern.compile("willnotmatchanyhost");
      }
      continue;
    }

    // Special case: start of a new URL section.
    if (URLMATCH.equals(sourceField)) {
      try {
        currentUrl = Pattern.compile(rest);
      } catch (PatternSyntaxException pse) {
        LOG.error("urlmatch pattern " + rest + " does not compile: "
            + pse.getMessage());
        // Park the following replacers under a pattern meant to match no
        // url, which deactivates this invalid section.
        currentUrl = Pattern.compile("willnotmatchanyurl");
      }
      continue;
    }

    // Values of three characters or fewer are ignored without logging.
    if (rest.length() <= 3) {
      continue;
    }

    // "from:to" names a separate target field; a leading colon does not.
    String targetField = sourceField;
    int colon = sourceField.indexOf(':');
    if (colon > 0) {
      targetField = sourceField.substring(colon + 1);
      sourceField = sourceField.substring(0, colon);
    }

    // The first character of the value is the separator for the rest.
    String sep = rest.substring(0, 1);
    String spec = rest.substring(1);
    int patternEnd = spec.indexOf(sep);
    if (patternEnd < 0) {
      LOG.error("Pattern '" + line
          + "', not parseable. Missing separator " + sep);
      continue;
    }
    String pattern = spec.substring(0, patternEnd);
    String tail = spec.substring(patternEnd + 1);

    // The replacement runs to the next separator, or to the end of the
    // value when there is no further separator.
    int replacementEnd = tail.indexOf(sep);
    String replacement = tail;
    if (replacementEnd >= 0) {
      replacement = tail.substring(0, replacementEnd);
    }

    // Anything after the separator that closes the replacement is the
    // flags component and must parse as an integer.
    int flags = 0;
    if (tail.length() > replacement.length() + 1) {
      String flagText = tail.substring(replacement.length() + 1).trim();
      try {
        flags = Integer.parseInt(flagText);
      } catch (NumberFormatException e) {
        LOG.error("Pattern " + line + ", has invalid flags component");
        continue;
      }
    }
    // Only positive flag values are handed on; otherwise null is passed.
    Integer flagsOrNull = null;
    if (flags > 0) {
      flagsOrNull = Integer.valueOf(flags);
    }

    FieldReplacer replacer = new FieldReplacer(sourceField, targetField,
        pattern, replacement, flagsOrNull);

    // File the replacer under the active URL pattern if one is set,
    // otherwise under the active host pattern.
    if (currentUrl != null) {
      List<FieldReplacer> forUrl = FIELDREPLACERS_BY_URL.get(currentUrl);
      if (forUrl == null) {
        forUrl = new ArrayList<FieldReplacer>();
      }
      forUrl.add(replacer);
      FIELDREPLACERS_BY_URL.put(currentUrl, forUrl);
    } else {
      List<FieldReplacer> forHost = FIELDREPLACERS_BY_HOST.get(currentHost);
      if (forHost == null) {
        forHost = new ArrayList<FieldReplacer>();
      }
      forHost.add(replacer);
      FIELDREPLACERS_BY_HOST.put(currentHost, forHost);
    }
  }
}