in opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/en/TokenSampleStream.java [57:109]
/**
 * Builds a {@link TokenSample} from the current {@code line} and then reads the next line
 * from {@code in}.
 *
 * <p>The current line is split on whitespace. Each token is appended to a detokenized
 * sentence, with a single space inserted between two tokens only when the rules below ask
 * for one, and the character span of every token inside that sentence is recorded. The
 * bracket placeholders {@code -LRB-}, {@code -LCB-}, {@code -RRB-} and {@code -RCB-} are
 * replaced by {@code (}, <code>{</code>, {@code )} and <code>}</code> respectively.
 *
 * <p>If reading the next line fails with an {@link IOException}, the error is logged and
 * {@code line} is set to {@code null}.
 *
 * @return the sample holding the detokenized text of the current line and one span per token.
 * @throws NullPointerException if the current {@code line} is {@code null}.
 */
public TokenSample next() {
  String[] tokens = line.split("\\s+");
  if (tokens.length == 0) {
    // No tokens on this line (e.g. it consisted only of whitespace): reset the quote state.
    evenq = true;
  }
  StringBuilder sb = new StringBuilder(line.length());
  List<Span> spans = new ArrayList<>();
  // Offset in the detokenized text at which the next token will start.
  int length = 0;
  for (int ti = 0; ti < tokens.length; ti++) {
    String token = tokens[ti];
    // NOTE(review): lastToken is the raw previous token, so a preceding "-LRB-"/"-LCB-" is
    // not seen as "(" / "{" by the checks below - confirm whether that is intended.
    String lastToken = ti - 1 >= 0 ? tokens[ti - 1] : "";
    boolean isFinalToken = ti == tokens.length - 1;
    boolean afterOpeningBracket = lastToken.equals("(") || lastToken.equals("{");
    token = switch (token) {
      case "-LRB-" -> "(";
      case "-LCB-" -> "{";
      case "-RRB-" -> ")";
      case "-RCB-" -> "}";
      default -> token;
    };
    if (sb.length() != 0) {
      if (!alphaNumeric.matcher(token).find() || token.startsWith("'")
          || token.equalsIgnoreCase("n't")) {
        // Symbols and clitics attach to the preceding token, except for the symbols listed
        // here, which are preceded by a space. A double quote counts only while evenq is
        // set and it is not the final token of the line.
        boolean spacedSymbol = token.equals("``") || token.equals("--") || token.equals("$")
            || token.equals("(") || token.equals("&") || token.equals("#")
            || (token.equals("\"") && evenq && !isFinalToken);
        // No space directly after an opening bracket. Both brackets must be ruled out:
        // a disjunction of the two negated comparisons would always be true.
        if (spacedSymbol && !afterOpeningBracket) {
          length++;
        }
      }
      else {
        // Ordinary tokens get a leading space unless they directly follow an opening quote,
        // an opening bracket, or a "$" / "#" sign.
        if (!lastToken.equals("``") && (!lastToken.equals("\"") || evenq)
            && !afterOpeningBracket && !lastToken.equals("$") && !lastToken.equals("#")) {
          length++;
        }
      }
    }
    if (token.equals("\"")) {
      // Toggle the quote state; a quote that ends the line always leaves it set.
      evenq = isFinalToken || !evenq;
    }
    if (sb.length() < length) {
      // The offset was advanced above, so materialize the separating space.
      sb.append(" ");
    }
    sb.append(token);
    spans.add(new Span(length, length + token.length()));
    length += token.length();
  }
  try {
    line = in.readLine();
  } catch (IOException e) {
    logger.error(e.getLocalizedMessage(), e);
    line = null;
  }
  return new TokenSample(sb.toString(), spans.toArray(new Span[0]));
}