geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java [73:148]: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - /** * Stores nameCodesMap on this scorer, updates the dominant code from countryHits, and then runs simpleProximityAnalysis once for every entry of linkedData. The value returned for each entry is only assigned to the loop variable, so no list element is replaced by this loop. docText is not read in this method. * * @param linkedData the spans to analyse; the same list instance is returned * @param countryHits passed to setDominantCode and to simpleProximityAnalysis * @param nameCodesMap saved in the nameCodesMap field * @param docText not used in this method * @param sentences forwarded to simpleProximityAnalysis * @param maxAllowedDist forwarded to simpleProximityAnalysis * @return linkedData, the list that was passed in */ public List score(List linkedData, Map> countryHits, Map> nameCodesMap, String docText, Span[] sentences, Integer maxAllowedDist) { this.nameCodesMap = nameCodesMap; setDominantCode(countryHits); for (LinkedSpan linkedspan : linkedData) { linkedspan = simpleProximityAnalysis(sentences, countryHits, linkedspan, maxAllowedDist); } return linkedData; } /** * Sets the class level variable dominantCode to the key of countryHits whose value reports the largest size(), i.e. the code with the most recorded mentions. The comparison is strict, so on a tie the key met first while iterating keySet() is kept. If countryHits is empty, dominantCode is left unchanged. * * @param countryHits map from a code to the collection of its mentions; every value must be non-null because size() is called on it */ private void setDominantCode(Map> countryHits) { int hits = -1; for (String code : countryHits.keySet()) { if (countryHits.get(code).size() > hits) { hits = countryHits.get(code).size(); dominantCode = code; } } } /** * Generates distances from each country mention to the span's location in the * doc text. Ultimately an attempt to ensure that ambiguously named toponyms * are resolved to the correct country and coordinate. * * @param sentences * @param countryHits * @param span * @return */ private LinkedSpan simpleProximityAnalysis(Span[] sentences, Map> countryHits, LinkedSpan span, Integer maxAllowedDistance) { Double score = 0.0; /* * get the index of the actual span, beginning of sentence //should generate * tokens from sentence and create a char offset... 
//could have large * sentences due to poor sentence detection or wonky doc text */ int sentenceIdx = span.getSentenceid(); int sentIndexInDoc = sentences[sentenceIdx].getStart(); /* * create a map of all the span's proximal country mentions in the document * Map< countrycode, set of > */ Map> distancesFromCodeMap = new HashMap<>(); //map = Map> for (String cCode : countryHits.keySet()) { // iterate over all the regex start values and calculate an offset for (Integer cHit : countryHits.get(cCode)) { Integer absDist = Math.abs(sentIndexInDoc - cHit); //only include near mentions based on a heuristic //TODO make this a property // if (absDist < maxAllowedDistance) { if (distancesFromCodeMap.containsKey(cCode)) { distancesFromCodeMap.get(cCode).add(absDist); } else { HashSet newset = new HashSet<>(); newset.add(absDist); distancesFromCodeMap.put(cCode, newset); } } } //we now know how far this named entity is from every country mention in the document /* * the gaz matches that have a country code that have mentions in the doc * that are closest to the Named Entity should return the best score. 
* Analyzemap generates a likelihood score that the toponym from the gaz is * referring to one of the countries, i.e, Map */ Map scoreMap = analyzeMap(distancesFromCodeMap, sentences, span); - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java [84:161]: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - /** * Stores nameCodesMap on this scorer, updates the dominant code from countryHits, and then runs simpleProximityAnalysis once for every entry of linkedData. The value returned for each entry is only assigned to the loop variable, so no list element is replaced by this loop. docText is not read in this method. * * @param linkedData the spans to analyse; the same list instance is returned * @param countryHits passed to setDominantCode and to simpleProximityAnalysis * @param nameCodesMap saved in the nameCodesMap field * @param docText not used in this method * @param sentences forwarded to simpleProximityAnalysis * @param maxAllowedDist forwarded to simpleProximityAnalysis * @return linkedData, the list that was passed in */ public List score(List linkedData, Map> countryHits, Map> nameCodesMap, String docText, Span[] sentences, Integer maxAllowedDist) { this.nameCodesMap = nameCodesMap; setDominantCode(countryHits); for (LinkedSpan linkedspan : linkedData) { linkedspan = simpleProximityAnalysis(sentences, countryHits, linkedspan, maxAllowedDist); } return linkedData; } /** * Sets the class level variable dominantCode to the key of countryHits whose value reports the largest size(), i.e. the code with the most recorded mentions. The comparison is strict, so on a tie the key met first while iterating keySet() is kept. If countryHits is empty, dominantCode is left unchanged. * * @param countryHits map from a code to the collection of its mentions; every value must be non-null because size() is called on it */ private void setDominantCode(Map> countryHits) { int hits = -1; for (String code : countryHits.keySet()) { if (countryHits.get(code).size() > hits) { hits = countryHits.get(code).size(); dominantCode = code; } } } /** * Generates distances from each country mention to the span's location in the * doc text. Ultimately an attempt to ensure that ambiguously named toponyms * are resolved to the correct country and coordinate. * * @param sentences * @param countryHits * @param span * @return */ private LinkedSpan simpleProximityAnalysis(Span[] sentences, Map> countryHits, LinkedSpan span, Integer maxAllowedDistance) { Double score = 0.0; /* * get the index of the actual span, begining of sentence //should generate * tokens from sentence and create a char offset... 
//could have large * sentences due to poor sentence detection or wonky doc text */ int sentenceIdx = span.getSentenceid(); int sentIndexInDoc = sentences[sentenceIdx].getStart(); /** * create a map of all the span's proximal country mentions in the document * Map< countrycode, set of > */ Map> distancesFromCodeMap = new HashMap<>(); //map = Map> for (String cCode : countryHits.keySet()) { //iterate over all the regex start values and calculate an offset for (Integer cHit : countryHits.get(cCode)) { Integer absDist = Math.abs(sentIndexInDoc - cHit); //only include near mentions based on a heuristic //TODO make this a property // if (absDist < maxAllowedDistance) { if (distancesFromCodeMap.containsKey(cCode)) { distancesFromCodeMap.get(cCode).add(absDist); } else { HashSet newset = new HashSet<>(); newset.add(absDist); distancesFromCodeMap.put(cCode, newset); } } //} } //we now know how far this named entity is from every country mention in the document /** * the gaz matches that have a country code that have mentions in the doc * that are closest to the Named Entity should return the best score. * Analyzemap generates a likelihood score that the toponym from the gaz is * referring to one of the countries, i.e, Map */ Map scoreMap = analyzeMap(distancesFromCodeMap, sentences, span); - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -