in geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java [117:184]
/**
 * Assigns a {@code "provincecontext"} score to every linked entry of {@code span}.
 * <p>
 * The character offset at which the span's sentence starts is compared against every
 * hit offset in {@code countryHits}, producing, per code, the set of absolute distances
 * between the sentence start and the mentions of that code. Those distances are handed to
 * {@code analyzeMap}, which returns a score per code. Each linked entry then receives the
 * score of its province code (0.0 when the code has no score), boosted when the entry's
 * name is registered in {@code nameCodesMap} for that same province code, and boosted
 * again when the province code equals {@code dominantCode}. Scores are capped at 1.0.
 * <p>
 * If {@code analyzeMap} returns an empty map the span is returned untouched, i.e. no
 * {@code "provincecontext"} entry is written at all.
 *
 * @param sentences          the sentence spans of the document; indexed by the span's sentence id
 * @param countryHits        code -> offsets at which that code was mentioned in the document
 * @param span               the span whose linked entries are scored in place
 * @param maxAllowedDistance currently unused; the distance cut-off it was meant to drive is disabled
 * @return the same {@code span} instance that was passed in
 */
private LinkedSpan<BaseLink> simpleProximityAnalysis(Span[] sentences, Map<String, Set<Integer>> countryHits,
    LinkedSpan<BaseLink> span, Integer maxAllowedDistance) {
  /*
   * The start offset of the sentence containing the span is used as the span's position.
   * Sentences may be long (poor sentence detection, odd document text), so this is only
   * an approximation of the span's real character offset.
   */
  int sentenceIdx = span.getSentenceid();
  int sentIndexInDoc = sentences[sentenceIdx].getStart();

  // code -> set of absolute distances between this span's sentence start and each mention of the code
  Map<String, Set<Integer>> distancesFromCodeMap = new HashMap<>();
  for (Map.Entry<String, Set<Integer>> hitsForCode : countryHits.entrySet()) {
    String cCode = hitsForCode.getKey();
    for (Integer cHit : hitsForCode.getValue()) {
      int absDist = Math.abs(sentIndexInDoc - cHit);
      // TODO: make the cut-off a property and only keep hits with absDist < maxAllowedDistance.
      Set<Integer> distances = distancesFromCodeMap.get(cCode);
      if (distances == null) {
        // Must start from an empty set and add the value: HashSet(int) takes an initial
        // capacity, not an element, so passing absDist to the constructor would lose it.
        distances = new HashSet<>();
        distancesFromCodeMap.put(cCode, distances);
      }
      distances.add(absDist);
    }
  }

  // Turn the raw distances into one score per code.
  Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, span);
  if (scoreMap.isEmpty()) {
    return span;
  }

  for (BaseLink link : span.getLinkedEntries()) {
    GazetteerEntry entry = (GazetteerEntry) link;
    String provinceCode = entry.getProvinceCode();

    // Reset per link so an entry without a score never inherits the previous entry's score.
    double score = 0.0;
    Double proximityScore = scoreMap.get(provinceCode);
    if (proximityScore != null) {
      score = proximityScore;

      // Does the extracted name match a known province name (directly or via regex)?
      String nameKey = link.getItemName().toLowerCase();
      if (nameCodesMap.containsKey(nameKey) || regexMatch(link.getItemName(), link.getItemParentID())) {
        // The regex branch can succeed without a nameCodesMap entry, so guard the lookup.
        boolean codeMatchesName = nameCodesMap.get(nameKey) != null
            && nameCodesMap.get(nameKey).contains(provinceCode);
        if (codeMatchesName) {
          // The name is registered for this very province code: boost by .75, capped at 1.
          // TODO: make this smarter
          score = Math.min(1.0, score + .75);
          if (provinceCode.equals(dominantCode)) {
            // Additional boost when the province is the dominant code, capped at 1.
            score = Math.min(1.0, score + .25);
          }
        }
      }
    }
    link.getScoreMap().put("provincecontext", score);
  }
  return span;
}