in opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStream.java [100:171]
public NameSample read() throws IOException {
  // Returns the next buffered sample. When the buffer is empty, documents are pulled
  // from the underlying stream until one of them contributes at least one sample.
  // A document that contributes nothing (e.g. only its open/close tag lines) is
  // skipped instead of being reported as end of stream; null is returned only once
  // the underlying stream itself returns null.
  while (nameSamples.isEmpty()) {
    String doc = samples.read();
    if (doc == null) {
      return null;
    }

    // Only the first sample created from a document is flagged to clear adaptive data.
    boolean clearAdaptiveData = true;

    try (BufferedReader docIn = new BufferedReader(new StringReader(doc))) {
      String line;
      while ((line = docIn.readLine()) != null) {
        if (line.startsWith(TAG_DOC_OPEN)) {
          continue;
        }
        if (line.equals(TAG_DOC_CLOSE)) {
          break;
        }

        String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(line);
        List<Span> entities = new ArrayList<>();
        List<String> cleanedTokens = new ArrayList<>(tokens.length);

        // Index of the next token that will be appended to cleanedTokens.
        int tokenIndex = 0;
        // Token index at which the currently open entity starts; -1 when none is open.
        int entityBeginIndex = -1;
        String entityType = null;
        // True while consuming the whitespace-separated pieces of an opening tag.
        boolean insideStartEnmaxTag = false;

        for (String token : tokens) {
          // Split here, next part of tag is in new token
          if (token.startsWith(TAG_ENAMEX_OPEN)) {
            insideStartEnmaxTag = true;
            continue;
          }

          if (insideStartEnmaxTag) {
            if (token.startsWith(TYPE)) {
              // The type value runs from the end of the TYPE prefix to the next quote.
              int typeEnd = token.indexOf("\"", TYPE.length());
              entityType = StringUtil.toLowerCase(token.substring(TYPE.length(), typeEnd));
            }

            if (!token.contains(SYMBOL_CLOSE)) {
              // Still inside the opening tag: this piece carries no sentence token.
              continue;
            }

            // The piece containing SYMBOL_CLOSE is kept as the entity's first token;
            // it is passed through convertToken below like every other token.
            entityBeginIndex = tokenIndex;
            insideStartEnmaxTag = false;
          }

          if (token.endsWith(TAG_ENAMEX_CLOSE)) {
            // Span end is exclusive, hence tokenIndex + 1 to include the current token.
            entities.add(new Span(entityBeginIndex, tokenIndex + 1, entityType));
            entityBeginIndex = -1;
          }

          cleanedTokens.add(convertToken(token));
          tokenIndex++;
        }

        nameSamples.add(new NameSample(cleanedTokens.toArray(new String[0]),
            entities.toArray(new Span[0]), clearAdaptiveData));
        clearAdaptiveData = false;
      }
    }
  }

  return nameSamples.remove(0);
}