in opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java [332:386]
/**
 * Breaks a single raw token into the sub-tokens that should appear in the sentence.
 * <p>
 * Steps, in order:
 * <ol>
 * <li>a leading {@code «} is emitted as its own token;</li>
 * <li>one trailing {@code »}, {@code :}, {@code ,} or {@code !} is detached and
 * appended after everything else;</li>
 * <li>if {@code splitHyphenatedTokens} is set and the remainder matches
 * {@code HYPHEN_PATTERN}, it is split around the hyphen;</li>
 * <li>otherwise, if something was stripped and the remainder is longer than one
 * character and does not match {@code ALPHANUMERIC_PATTERN}, the remainder is
 * processed recursively; else it is emitted unchanged.</li>
 * </ol>
 * A remainder that became empty after stripping is never emitted, so the result
 * contains no empty token produced by this method itself.
 *
 * @param tok the raw token; {@code null} or empty yields an empty list
 * @return the sub-tokens in sentence order, never {@code null}
 */
private List<String> processTok(String tok) {
  List<String> out = new ArrayList<>();
  if (tok == null || tok.isEmpty()) {
    // Nothing to split; also keeps charAt(0) below from throwing.
    return out;
  }

  String original = tok;
  LinkedList<String> suffix = new LinkedList<>();

  // Detach a leading opening quotation mark as a separate token.
  char first = tok.charAt(0);
  if (first == '«') {
    out.add(Character.toString(first));
    tok = tok.substring(1);
  }

  // Detach one trailing punctuation mark. The emptiness check matters because
  // the token may have consisted of the opening quotation mark only.
  if (!tok.isEmpty()) {
    char last = tok.charAt(tok.length() - 1);
    if (last == '»' || last == ':' || last == ',' || last == '!') {
      suffix.add(Character.toString(last));
      tok = tok.substring(0, tok.length() - 1);
    }
  }

  boolean tokAdded = false;

  // lets split all hyphens
  if (this.splitHyphenatedTokens && tok.contains("-") && tok.length() > 1) {
    Matcher matcher = HYPHEN_PATTERN.matcher(tok);
    if (matcher.matches()) {
      String firstTok = null;
      String hyphen = "-";
      String secondTok = null;
      String rest = null;
      // NOTE(review): group numbering depends on HYPHEN_PATTERN, which is declared
      // elsewhere; presumably the alternatives are "word-", "-word..." and
      // "word-word..." -- confirm against the pattern definition.
      if (matcher.group(1) != null) {
        firstTok = matcher.group(2);
      } else if (matcher.group(3) != null) {
        secondTok = matcher.group(4);
        rest = matcher.group(5);
      } else if (matcher.group(6) != null) {
        firstTok = matcher.group(7);
        secondTok = matcher.group(8);
        rest = matcher.group(9);
      }
      // addIfNotEmpty is expected to skip the parts left unset above -- TODO confirm.
      addIfNotEmpty(firstTok, out);
      addIfNotEmpty(hyphen, out);
      addIfNotEmpty(secondTok, out);
      addIfNotEmpty(rest, out);
      tokAdded = true;
    }
  }

  // Skip a remainder that stripping reduced to nothing; emitting it would put an
  // empty token into the sentence.
  if (!tokAdded && !tok.isEmpty()) {
    if (!original.equals(tok) && tok.length() > 1
        && !ALPHANUMERIC_PATTERN.matcher(tok).matches()) {
      // Something was stripped and the rest still holds non-alphanumeric
      // characters: run the same splitting on the shortened token.
      out.addAll(processTok(tok));
    } else {
      out.add(tok);
    }
  }

  out.addAll(suffix);
  return out;
}