in opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java [246:317]
/**
 * Appends the tokens of a single leaf to {@code sentence} and registers or
 * extends named-entity spans in {@code names}.
 *
 * <p>The method cooperates with the {@code leftContractionPart} field: a leaf
 * whose secondary tag contains {@code <sam->} is not emitted in full; its last
 * underscore-separated piece is stored in that field and combined with the
 * lexeme of the next leaf through
 * {@link PortugueseContractionUtility#toContraction(String, String)}.
 *
 * @param leaf     the leaf whose lexeme and secondary tag are read
 * @param sentence the token list being built; new tokens are appended to it
 * @param names    the span list being built; a span may be appended, and the
 *                 last span may be replaced by a longer one
 */
private void processLeaf(Leaf leaf, List<String> sentence, List<Span> names) {
  // Becomes true once this leaf's tokens were written as part of a contraction.
  boolean tokensEmitted = false;

  if (leftContractionPart != null) {
    // A left part is pending from an earlier leaf: try to merge it with this lexeme.
    String contraction = PortugueseContractionUtility.toContraction(
        leftContractionPart, leaf.getLexeme());
    if (contraction == null) {
      // No contraction could be built. Emit the pending left part on its own
      // and let the current lexeme go through the regular path below.
      sentence.add(leftContractionPart);
    } else {
      for (String piece : WHITESPACE_PATTERN.split(contraction)) {
        sentence.add(piece);
      }
      tokensEmitted = true;
    }
    leftContractionPart = null;
  }

  final String secondaryTag = leaf.getSecondaryTag();
  String entityType = null;
  // Set when the tag contains <NER2>: this leaf may continue the previous span.
  boolean mayExtendPrevious = false;

  if (secondaryTag != null) {
    if (!tokensEmitted && secondaryTag.contains("<sam->")) {
      // Emit every piece but the last; the last one waits for the next leaf.
      // NOTE(review): an empty split result would index position -1 here;
      // confirm the input can never produce one.
      String[] pieces = UNDERLINE_PATTERN.split(leaf.getLexeme());
      final int lastPiece = pieces.length - 1;
      for (int i = 0; i < lastPiece; i++) {
        sentence.add(pieces[i]);
      }
      leftContractionPart = pieces[lastPiece];
      return;
    }
    mayExtendPrevious = secondaryTag.contains("<NER2>");
    entityType = getNER(secondaryTag);
  }

  // Index of the first token this leaf contributes on the regular path.
  final int entityStart = sentence.size();
  if (!tokensEmitted) {
    sentence.addAll(processLexeme(leaf.getLexeme()));
  }
  if (entityType != null) {
    // NOTE(review): when the tokens were already emitted by the contraction
    // branch, entityStart equals sentence.size() and the span is empty;
    // confirm this is intended.
    names.add(new Span(entityStart, sentence.size(), entityType));
  }

  if (mayExtendPrevious && !names.isEmpty()) {
    // Stretch the most recent span over this leaf, but only when that span
    // ended exactly one token before the current end of the sentence.
    final int lastIndex = names.size() - 1;
    final Span previous = names.get(lastIndex);
    if (previous.getEnd() == sentence.size() - 1) {
      names.set(lastIndex,
          new Span(previous.getStart(), sentence.size(), previous.getType()));
    }
  }
}