in opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentence.java [192:264]
/**
 * Attaches named entity spans to this sentence and removes overlapping entities.
 * <p>
 * For every entity, the token IDs that belong to this sentence are mapped to their
 * token indices and the entity is stored as a span running from the smallest index
 * (inclusive) to the largest index plus one (exclusive). Entities that have no token in
 * this sentence are ignored. The collected spans are then sorted by start index; whenever
 * two spans overlap, a warning is logged and only the longer one is kept (on equal length,
 * the one that comes first in the sorted order).
 *
 * @param entityIDtoEntityType Maps an entity ID to the type of that entity.
 * @param entityIDsToTokens    Maps an entity ID to the IDs of the tokens it covers.
 * @return {@code true} if no overlapping entities were found, {@code false} if at least
 *         one overlap had to be resolved.
 * @throws IOException Thrown if the sentence has not been tokenized yet.
 */
boolean addNamedEntities(Map<Integer, String> entityIDtoEntityType,
                         Map<Integer, List<Integer>> entityIDsToTokens) throws IOException {
  if (sentenceTokens == null) {
    throw new IOException("Named entity labels provided for an un-tokenized sentence.");
  }

  // For each named entity identify its span within this sentence.
  for (Map.Entry<Integer, List<Integer>> namedEntity : entityIDsToTokens.entrySet()) {
    int entityID = namedEntity.getKey();
    String type = entityIDtoEntityType.get(entityID);
    int start = sentenceTokens.size();
    int end = 0;
    boolean entityInThisSentence = false;
    for (int tokenID : namedEntity.getValue()) {
      if (!tokensById.containsKey(tokenID)) {
        continue; // token belongs to another sentence
      }
      entityInThisSentence = true;
      int tokenIndex = tokensById.get(tokenID);
      if (tokenIndex < start) {
        start = tokenIndex;
      }
      // 'end' is exclusive, so the candidate to compare against is tokenIndex + 1.
      // Comparing the bare index against 'end' truncated entities on adjacent tokens
      // and produced an empty span for a single token at index 0.
      if (tokenIndex + 1 > end) {
        end = tokenIndex + 1;
      }
    }
    if (entityInThisSentence) {
      namedEntities.add(new Span(start, end, type));
    }
  }

  namedEntities.sort(Comparator.comparingInt(Span::getStart));

  // Sweep over the spans in start order. The kept spans never overlap each other and
  // are ordered by start, so a new candidate can only collide with the last kept span.
  boolean fileWithoutIssues = true;
  List<Span> namedEntitiesNoOverlaps = new ArrayList<>(namedEntities.size());
  for (Span candidate : namedEntities) {
    int lastKeptIndex = namedEntitiesNoOverlaps.size() - 1;
    if (lastKeptIndex < 0) {
      namedEntitiesNoOverlaps.add(candidate);
      continue;
    }
    Span lastKept = namedEntitiesNoOverlaps.get(lastKeptIndex);
    // candidate.contains(lastKept) covers spans sharing a start where the later one is
    // longer; neither lastKept.contains(candidate) nor crosses() reports that case.
    boolean overlapping = lastKept.contains(candidate)
        || candidate.contains(lastKept)
        || lastKept.crosses(candidate);
    if (!overlapping) {
      namedEntitiesNoOverlaps.add(candidate);
      continue;
    }
    logger.warn("Named entities overlap. This is forbidden in OpenNLP." +
        "\n\tKeeping the longer of them.");
    fileWithoutIssues = false;
    if (candidate.length() > lastKept.length()) {
      // The candidate starts at or after lastKept, so it cannot reach back into any
      // earlier kept span; replacing in place keeps the list overlap-free.
      namedEntitiesNoOverlaps.set(lastKeptIndex, candidate);
    }
  }

  if (!fileWithoutIssues) {
    namedEntities = Collections.unmodifiableList(namedEntitiesNoOverlaps);
  }
  return fileWithoutIssues;
}