boolean addNamedEntities()

in opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentence.java [192:264]


  /**
   * Adds named entity annotations to this sentence as token-level {@link Span spans}.
   * <p>
   * For every entity, the tokens that are registered in {@code tokensById} are looked up
   * and the entity is stored as one span reaching from its left-most to its right-most
   * token (end exclusive). Entities without any token in this sentence are skipped.
   * Afterwards the spans are sorted by start position and overlapping spans are resolved
   * by keeping the longer one; on equal length the span that sorts first is kept.
   *
   * @param entityIDtoEntityType Maps an entity ID to the entity type used as span type.
   *                             An ID without an entry results in a span whose type
   *                             is {@code null}.
   * @param entityIDsToTokens    Maps an entity ID to the IDs of the tokens forming the entity.
   * @return {@code true} if no overlapping named entities were encountered,
   *         {@code false} if at least one overlap had to be resolved.
   * @throws IOException Thrown if the sentence has not been tokenized yet.
   */
  boolean addNamedEntities(Map<Integer, String> entityIDtoEntityType,
                           Map<Integer, List<Integer>> entityIDsToTokens) throws IOException {
    if (sentenceTokens == null) {
      throw new IOException("Named entity labels provided for an un-tokenized sentence.");
    }

    // For each named entity, identify the token range it covers within this sentence.
    for (Map.Entry<Integer, List<Integer>> namedEntity : entityIDsToTokens.entrySet()) {

      int entityID = namedEntity.getKey();
      String type = entityIDtoEntityType.get(entityID);

      int start = sentenceTokens.size();
      int end = 0;
      boolean entityInThisSentence = false;
      for (int tokenID : namedEntity.getValue()) {
        if (tokensById.containsKey(tokenID)) {
          entityInThisSentence = true;
          int tokenIndex = tokensById.get(tokenID);
          start = Math.min(start, tokenIndex);
          // The span end is exclusive, hence it has to lie one past the right-most token.
          end = Math.max(end, tokenIndex + 1);
        }
      }

      if (entityInThisSentence) {
        namedEntities.add(new Span(start, end, type));
      }
    }

    namedEntities.sort(Comparator.comparingInt(Span::getStart));

    // Sweep over the sorted spans and compare each one with the span kept last.
    // As the spans are ordered by start, a span that does not overlap the last kept
    // span cannot overlap any span kept before it either.
    boolean fileWithoutIssues = true;
    List<Span> namedEntitiesNoOverlaps = new ArrayList<>(namedEntities.size());
    for (Span candidate : namedEntities) {
      if (namedEntitiesNoOverlaps.isEmpty()) {
        namedEntitiesNoOverlaps.add(candidate);
        continue;
      }

      int lastIndex = namedEntitiesNoOverlaps.size() - 1;
      Span lastKept = namedEntitiesNoOverlaps.get(lastIndex);
      // Containment is checked in both directions: two spans with the same start
      // are ordered arbitrarily, so the longer one may come second.
      boolean overlapping = lastKept.contains(candidate)
          || candidate.contains(lastKept)
          || lastKept.crosses(candidate);

      if (overlapping) {
        logger.warn("Named entities overlap. This is forbidden in OpenNLP." +
            "\n\tKeeping the longer of them.");
        fileWithoutIssues = false;
        if (candidate.length() > lastKept.length()) {
          namedEntitiesNoOverlaps.set(lastIndex, candidate);
        }
      } else {
        namedEntitiesNoOverlaps.add(candidate);
      }
    }

    if (!fileWithoutIssues) {
      namedEntities = Collections.unmodifiableList(namedEntitiesNoOverlaps);
    }

    return fileWithoutIssues;
  }