in geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java [53:168]
/**
 * Links the names found in a document to gazetteer entries.
 *
 * <p>For every name of every sentence the gazetteer is queried once per where
 * clause of the document's {@link AdminBoundaryContext}, or once with a
 * default clause when the context produced none. The hits for a name are
 * de-duplicated, their {@code "lucene"} score is normalized into
 * {@code "normlucene"}, and the name is wrapped in a {@link LinkedSpan}.
 * All configured scorers then run over the complete span list, after which
 * each span's entries are sorted best-first and cut down to at most
 * {@code topN} entries.
 *
 * @param doctext          the full document text
 * @param sentences        sentence spans; each sentence's text is extracted from {@code doctext}
 * @param tokensBySentence token spans per sentence, resolved against that sentence's text
 * @param namesBySentence  name spans per sentence, resolved against that sentence's tokens
 * @return one linked span for each name that produced at least one gazetteer hit
 * @throws IllegalArgumentException if the linker properties have not been set
 */
public List<LinkedSpan> find(String doctext, Span[] sentences, Span[][] tokensBySentence, Span[][] namesBySentence) {
  if (linkerProperties == null) {
    throw new IllegalArgumentException("EntityLinkerProperties cannot be null");
  }
  ArrayList<LinkedSpan> spans = new ArrayList<>();
  AdminBoundaryContext context = countryContext.getContext(doctext);

  for (int s = 0; s < sentences.length; s++) {
    Span[] names = namesBySentence[s];
    String[] tokens = Span.spansToStrings(tokensBySentence[s], sentences[s].getCoveredText(doctext));
    String[] matches = Span.spansToStrings(names, tokens);

    for (int i = 0; i < matches.length; i++) {
      ArrayList<BaseLink> geoNamesEntries = new ArrayList<>();
      if (!context.getWhereClauses().isEmpty()) {
        for (String whereclause : context.getWhereClauses()) {
          addUniqueEntries(geoNamesEntries, gazateerSearcher.find(matches[i], topN, whereclause));
        }
      } else {
        // No where clauses were generated, so fall back to the default clause.
        addUniqueEntries(geoNamesEntries,
            gazateerSearcher.find(matches[i], topN, " gaztype:usgs geonames regions "));
      }
      if (geoNamesEntries.isEmpty()) {
        continue;
      }

      // Normalize for every name. The previous guard tested the result list
      // instead of the candidates, so the first linked name in a document was
      // never given a "normlucene" score.
      normalizeLuceneScores(geoNamesEntries);

      LinkedSpan<BaseLink> newspan = new LinkedSpan<>(geoNamesEntries, names[i], 0);
      newspan.setSearchTerm(matches[i]);
      newspan.setLinkedEntries(geoNamesEntries);
      newspan.setSentenceid(s);
      spans.add(newspan);
    }
  }

  for (LinkedEntityScorer scorer : scorers) {
    scorer.score(spans, doctext, sentences, linkerProperties, context);
  }

  for (LinkedSpan<BaseLink> linkedSpan : spans) {
    ArrayList<BaseLink> linkedData = linkedSpan.getLinkedEntries();

    // Best score first. Each entry's sum is computed independently so the
    // ordering stays consistent even when two entries carry different sets
    // of scores.
    linkedData.sort((left, right) -> Double.compare(
        sumSortScores(right.getScoreMap()), sumSortScores(left.getScoreMap())));

    // Prune to topN by dropping the tail. The former iterator loop called
    // remove() before next(), which discarded the topN-th entry and kept the
    // last (worst) one.
    int keep = Math.max(topN, 0);
    if (linkedData.size() > keep) {
      linkedData.subList(keep, linkedData.size()).clear();
    }
  }
  return spans;
}

/** Score-map keys whose values are summed to rank the entries of a span. */
private static final String[] SORT_SCORE_KEYS = {
  "typescore",
  "countrycontext",
  "placenamedicecoef",
  "provincecontext",
  "geohashbin",
  "normlucene"
};

/** Appends each found entry to {@code target} unless an equal entry is already present. */
private static void addUniqueEntries(ArrayList<BaseLink> target, List<? extends BaseLink> found) {
  for (BaseLink entry : found) {
    if (!target.contains(entry)) {
      target.add(entry);
    }
  }
}

/**
 * Stores a {@code "normlucene"} score on every entry, computed by normalizing
 * its {@code "lucene"} score against 0 and the highest {@code "lucene"} score
 * in the list. Every entry is expected to carry a {@code "lucene"} score.
 */
private void normalizeLuceneScores(List<BaseLink> entries) {
  Double maxScore = 0d;
  for (BaseLink entry : entries) {
    Double rawScore = entry.getScoreMap().get("lucene");
    if (rawScore.compareTo(maxScore) > 0) {
      maxScore = rawScore;
    }
  }
  for (BaseLink entry : entries) {
    Double rawScore = entry.getScoreMap().get("lucene");
    Double normalized = normalize(rawScore, 0d, maxScore);
    entry.getScoreMap().put("normlucene", normalized);
  }
}

/** Sums the ranking scores present in {@code scoreMap}; absent scores count as 0. */
private static double sumSortScores(Map<String, Double> scoreMap) {
  double sum = 0d;
  for (String key : SORT_SCORE_KEYS) {
    Double value = scoreMap.get(key);
    if (value != null) {
      sum += value;
    }
  }
  return sum;
}