public List find()

in geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java [53:168]


  /**
   * Links every name span of the document to candidate gazetteer entries.
   * <p>
   * For each name in each sentence the gazetteer is queried once per where
   * clause of the document's {@code AdminBoundaryContext} (or once with a
   * default clause when the context produced none). The distinct hits are
   * attached to a new {@code LinkedSpan}, each hit receiving a
   * {@code "normlucene"} score derived from its {@code "lucene"} score. All
   * configured scorers are then run over the collected spans, and finally each
   * span's candidates are sorted best-first by the sum of their ranking scores
   * and truncated to at most {@code topN} entries.
   *
   * @param doctext          the full document text
   * @param sentences        sentence spans, relative to {@code doctext}
   * @param tokensBySentence token spans, indexed by sentence
   * @param namesBySentence  name spans, indexed by sentence; resolved against
   *                         the tokens of the same sentence
   * @return one linked span per name that produced at least one gazetteer hit;
   *         never {@code null}
   * @throws IllegalArgumentException if the linker properties have not been set
   */
  public List<LinkedSpan> find(String doctext, Span[] sentences, Span[][] tokensBySentence, Span[][] namesBySentence) {
    if (linkerProperties == null) {
      throw new IllegalArgumentException("EntityLinkerProperties cannot be null");
    }
    ArrayList<LinkedSpan> spans = new ArrayList<>();

    AdminBoundaryContext context = countryContext.getContext(doctext);
    for (int s = 0; s < sentences.length; s++) {
      Span[] names = namesBySentence[s];

      Span[] tokenSpans = tokensBySentence[s];
      String[] tokens = Span.spansToStrings(tokenSpans, sentences[s].getCoveredText(doctext));

      String[] matches = Span.spansToStrings(names, tokens);

      for (int i = 0; i < matches.length; i++) {

        ArrayList<BaseLink> geoNamesEntries = new ArrayList<>();
        if (!context.getWhereClauses().isEmpty()) {
          for (String whereclause : context.getWhereClauses()) {
            addDistinctEntries(geoNamesEntries, gazateerSearcher.find(matches[i], topN, whereclause));
          }
        } else {
          // no where clauses were generated, so fall back to the default clause
          // that looks at the entire index
          addDistinctEntries(geoNamesEntries,
                  gazateerSearcher.find(matches[i], topN, " gaztype:usgs geonames regions "));
        }
        if (geoNamesEntries.isEmpty()) {
          continue;
        }
        // Normalize for every name. The previous guard tested the result list
        // ("spans") instead of the candidates, so the first linked name of a
        // document never received a "normlucene" score.
        normalizeLuceneScores(geoNamesEntries);

        LinkedSpan<BaseLink> newspan = new LinkedSpan<>(geoNamesEntries, names[i], 0);
        newspan.setSearchTerm(matches[i]);
        newspan.setLinkedEntries(geoNamesEntries);
        newspan.setSentenceid(s);
        spans.add(newspan);
      }

    }

    if (!scorers.isEmpty()) {
      for (LinkedEntityScorer scorer : scorers) {
        scorer.score(spans, doctext, sentences, linkerProperties, context);
      }
    }
    /*
     * sort the data with the best score on top based on the sum of the ranking
     * scores from the score map of each baselink object
     */
    for (LinkedSpan<BaseLink> s : spans) {
      ArrayList<BaseLink> linkedData = s.getLinkedEntries();
      linkedData.sort(Collections.reverseOrder((o1, o2)
              -> Double.compare(sumRankingScores(o1.getScoreMap()), sumRankingScores(o2.getScoreMap()))));

      // prune the list to topN: drop the tail in one step so that exactly the
      // best-ranked entries survive (a non-positive topN keeps nothing)
      int keep = Math.max(topN, 0);
      if (linkedData.size() > keep) {
        linkedData.subList(keep, linkedData.size()).clear();
      }
    }

    return spans;
  }

  /** Score-map keys whose values are summed to rank the candidates of a span. */
  private static final String[] RANKING_SCORE_KEYS = {
    "typescore", "countrycontext", "placenamedicecoef", "provincecontext", "geohashbin", "normlucene"
  };

  /** Appends every element of {@code found} that {@code target} does not already contain. */
  private static void addDistinctEntries(List<BaseLink> target, List<? extends BaseLink> found) {
    for (BaseLink entry : found) {
      if (!target.contains(entry)) {
        target.add(entry);
      }
    }
  }

  /**
   * Stores a {@code "normlucene"} score on every entry that has a
   * {@code "lucene"} score, obtained by passing that score, a minimum of 0 and
   * the highest {@code "lucene"} score among {@code entries} to
   * {@code normalize}. Entries without a {@code "lucene"} score are skipped.
   */
  private void normalizeLuceneScores(List<BaseLink> entries) {
    Double maxscore = 0d;
    for (BaseLink entry : entries) {
      Double deNormScore = entry.getScoreMap().get("lucene");
      if (deNormScore != null && deNormScore.compareTo(maxscore) > 0) {
        maxscore = deNormScore;
      }
    }
    for (BaseLink entry : entries) {
      Double deNormScore = entry.getScoreMap().get("lucene");
      if (deNormScore == null) {
        continue;
      }
      Double normalized = normalize(deNormScore, 0d, maxscore);
      entry.getScoreMap().put("normlucene", normalized);
    }
  }

  /**
   * Sums the ranking scores found in {@code scoreMap}; a missing or
   * {@code null} score counts as 0. Each entry is summed on its own so the
   * resulting ordering is consistent even when score maps hold different keys.
   */
  private static double sumRankingScores(Map<String, Double> scoreMap) {
    double sum = 0d;
    for (String key : RANKING_SCORE_KEYS) {
      Double value = scoreMap.get(key);
      if (value != null) {
        sum += value;
      }
    }
    return sum;
  }