private LinkedSpan simpleProximityAnalysis()

in geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java [107:171]


  private LinkedSpan<BaseLink> simpleProximityAnalysis(Span[] sentences, Map<String, Set<Integer>> countryHits, LinkedSpan<BaseLink> span, Integer maxAllowedDistance) {
    Double score = 0.0;
    /*
     * get the index of the actual span, beginning of sentence //should generate
     * tokens from sentence and create a char offset... //could have large
     * sentences due to poor sentence detection or wonky doc text
     */
    int sentenceIdx = span.getSentenceid();
    int sentIndexInDoc = sentences[sentenceIdx].getStart();
    /*
     * create a map of all the span's proximal country mentions in the document
     * Map< countrycode, set of <distances from this NamedEntity>>
     */
    Map<String, Set<Integer>> distancesFromCodeMap = new HashMap<>();
    //map = Map<countrycode, Set <of distances this span is from all the mentions of the code>>
    for (String cCode : countryHits.keySet()) {
      // iterate over all the regex start values and calculate an offset
      for (Integer cHit : countryHits.get(cCode)) {
        Integer absDist = Math.abs(sentIndexInDoc - cHit);
        //only include near mentions based on a heuristic
        //TODO make this a property
        //  if (absDist < maxAllowedDistance) {
        if (distancesFromCodeMap.containsKey(cCode)) {
          distancesFromCodeMap.get(cCode).add(absDist);
        } else {
          HashSet<Integer> newset = new HashSet<>();
          newset.add(absDist);
          distancesFromCodeMap.put(cCode, newset);
        }
      }
    }
    //we now know how far this named entity is from every country mention in the document

    /*
     * the gaz matches that have a country code that have mentions in the doc
     * that are closest to the Named Entity should return the best score.
     * Analyzemap generates a likelihood score that the toponym from the gaz is
     * referring to one of the countries, i.e, Map<countrycode, prob that this
     * span is referring to the toponym form this code key>
     */
    Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, span);
    for (BaseLink link : span.getLinkedEntries()) {
      //getItemParentId is the country code
      String spanCountryCode = link.getItemParentID();
      if (scoreMap.containsKey(spanCountryCode)) {

        score = scoreMap.get(spanCountryCode);
        ///does the name extracted match a country name?
        if (nameCodesMap.containsKey(link.getItemName().toLowerCase()) || regexMatch(link.getItemName(), link.getItemParentID())) {
          //if so, is it the correct country code for that name?
          if (nameCodesMap.get(link.getItemName().toLowerCase()).contains(link.getItemParentID())) {
            //boost the score because it is likely that this is the location in the text, so add 50% to the score or set to 1
            score = (score + .75) > 1.0 ? 1d : (score + .75);

            if (link.getItemParentID().equals(dominantCode)) {
              score = (score + .25) > 1.0 ? 1d : (score + .25);
            }
          }
        }
      }

      link.getScoreMap().put("countrycontext", score);
    }
    return span;
  }