in opennlp-tools/src/main/java/opennlp/tools/tokenize/TokSpanEventStream.java [98:168]
/**
 * Generates maxent training events for the tokenizer from a single {@link TokenSample}.
 * <p>
 * The sample's text is first split into whitespace-delimited candidate tokens; each
 * annotated (gold) token span that falls inside a candidate is then turned into events:
 * a {@code NO_SPLIT} event for every character position strictly inside the gold token,
 * and a {@code SPLIT} event at the gold token's end when that end is not also the end of
 * the candidate token (i.e. where the tokenizer must learn to cut).
 *
 * @param tokenSample the sample containing the raw text and its gold token spans
 * @return an iterator over the generated training events; empty if the sample has no tokens
 */
protected Iterator<Event> createEvents(TokenSample tokenSample) {
List<Event> events = new ArrayList<>(50);
Span[] tokens = tokenSample.getTokenSpans();
String text = tokenSample.getText();
if (tokens.length > 0) {
// Restrict processing to the substring covered by the gold tokens.
// NOTE(review): assumes tokens[] is ordered by offset — first token starts the
// sentence and the last one ends it; verify against TokenSample's contract.
int start = tokens[0].getStart();
int end = tokens[tokens.length - 1].getEnd();
String sent = text.substring(start, end);
// Whitespace-split candidates; offsets here are relative to 'sent', not 'text'.
Span[] candTokens = WhitespaceTokenizer.INSTANCE.tokenizePos(sent);
// Indices into tokens[] delimiting the gold tokens matched inside the current
// candidate. lastTrainingToken persists across candidates so each inner scan
// resumes where the previous candidate left off (again relies on sorted order).
int firstTrainingToken = -1;
int lastTrainingToken = -1;
for (Span candToken : candTokens) {
Span cSpan = candToken;
String ctok = sent.substring(cSpan.getStart(), cSpan.getEnd());
//adjust cSpan to text offsets
cSpan = new Span(cSpan.getStart() + start, cSpan.getEnd() + start);
//should we skip this token
// Single characters can never be split further; optionally skip tokens
// matched by 'alphaNumeric' (instance field — presumably a precompiled
// Pattern for plain alphanumeric runs; confirm in the enclosing class).
if (ctok.length() > 1 && (!skipAlphaNumerics || !alphaNumeric.matcher(ctok).matches())) {
//find offsets of annotated tokens inside of candidate tokens
boolean foundTrainingTokens = false;
for (int ti = lastTrainingToken + 1; ti < tokens.length; ti++) {
if (cSpan.contains(tokens[ti])) {
// Gold token lies fully inside this candidate — record the range.
if (!foundTrainingTokens) {
firstTrainingToken = ti;
foundTrainingTokens = true;
}
lastTrainingToken = ti;
}
else if (cSpan.getEnd() < tokens[ti].getEnd()) {
// Gold token extends past this candidate: no later token can be
// contained either (sorted order), so stop scanning.
break;
}
else if (tokens[ti].getEnd() < cSpan.getStart()) {
//keep looking
// Gold token ends before this candidate begins — advance.
}
else {
// Gold token straddles a candidate boundary: inconsistent
// annotation relative to whitespace tokenization; warn and skip.
logger.warn("Bad training token: {} cand: {} token={}", tokens[ti], cSpan,
text.substring(tokens[ti].getStart(), tokens[ti].getEnd()));
}
}
// create training data
if (foundTrainingTokens) {
for (int ti = firstTrainingToken; ti <= lastTrainingToken; ti++) {
Span tSpan = tokens[ti];
// cStart converts absolute text offsets back to positions within ctok.
int cStart = cSpan.getStart();
// Every interior character position of a gold token is a NO_SPLIT.
// 'cg' is the context generator field of the enclosing class.
for (int i = tSpan.getStart() + 1; i < tSpan.getEnd(); i++) {
String[] context = cg.getContext(ctok, i - cStart);
events.add(new Event(TokenizerME.NO_SPLIT, context));
}
// If the gold token ends mid-candidate, that position is a SPLIT.
// (A token ending exactly at the candidate end needs no decision —
// whitespace already separates it.)
if (tSpan.getEnd() != cSpan.getEnd()) {
String[] context = cg.getContext(ctok, tSpan.getEnd() - cStart);
events.add(new Event(TokenizerME.SPLIT, context));
}
}
}
}
}
}
return events.iterator();
}