in caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/NameFinderJob.java [105:218]
/**
 * Runs the name finder over every sentence of the current text and stores the
 * detected names in {@code nameList}.
 * <p>
 * The name finder is created lazily on the first run from {@code modelPath} and
 * {@code modelTypes}. For every sentence, the tokens covered by an already
 * verified name are handed to the name finder via {@code setRestriction}, and
 * the name tokens collected from the sentences processed so far are handed
 * over via {@code setNameOnlyTokens}.
 *
 * @param monitor the progress monitor supplied by the job framework; it is not
 *     consulted by this implementation
 * @return a {@code CANCEL} status carrying the {@link IOException} if the
 *     models could not be loaded, otherwise an {@code OK} status
 */
protected synchronized IStatus run(IProgressMonitor monitor) {
  // TODO: Check if model path changed, compared to last run, if so reload
  // TODO: Check if the model itself changed, compared to last run, if so reload
  if (nameFinder == null) {
    try {
      nameFinder = new MultiModelNameFinder(modelPath, modelTypes);
    } catch (IOException e) {
      // Severity stays CANCEL for existing callers; the exception is attached
      // so that the cause of the failure is not lost.
      return new Status(IStatus.CANCEL, OpenNLPPlugin.ID, e.getMessage(), e);
    }
  }

  // From here on nameFinder is never null: a failed load returned above.
  nameFinder.clearAdaptiveData();
  nameList = new ArrayList<>();

  // TODO: Name tokens, should be for the entire text,
  // not just the prev sentences ...
  Set<String> nameTokens = new HashSet<>();

  for (Span sentence : sentences) {
    List<Span> sentenceTokens = collectSentenceTokens(sentence);

    String[] tokenStrings = new String[sentenceTokens.size()];
    for (int i = 0; i < tokenStrings.length; i++) {
      tokenStrings[i] = sentenceTokens.get(i).getCoveredText(text).toString();
    }

    // Maps a token index within the sentence to its "<type>-<outcome>" label.
    Map<Integer, String> verifiedNameTokens = new HashMap<>();

    // Note: This is slow!
    // iterate over names, to find token indexes
    if (verifiedNames != null) {
      for (Span verifiedName : verifiedNames) {
        // Need better mechanism here, first token in entity should be start!
        boolean isStart = true;

        for (int i = 0; i < sentenceTokens.size(); i++) {
          if (!verifiedName.contains(sentenceTokens.get(i))) {
            continue;
          }

          String outcome = isStart ? NameFinderME.START : NameFinderME.CONTINUE;
          isStart = false;

          // TODO: Overlapping names are dangerous here!
          // TODO: We could use type information here ...
          // as part of the outcome!
          verifiedNameTokens.put(i, verifiedName.getType() + "-" + outcome);

          if (isNameOnlyTokenCandidate(tokenStrings[i])) {
            nameTokens.add(verifiedName.getType() + "-" + tokenStrings[i]);
          }
        }
      }
    }

    nameFinder.setRestriction(verifiedNameTokens);
    nameFinder.setNameOnlyTokens(nameTokens);

    // TODO: Use multiple name finders here ....
    ConfidenceSpan[] names = nameFinder.find(tokenStrings);

    for (ConfidenceSpan name : names) {
      // Name spans are token indexes within the sentence; translate them to
      // character offsets through the corresponding token spans.
      int beginIndex = sentenceTokens.get(name.getStart()).getStart();
      int endIndex = sentenceTokens.get(name.getEnd() - 1).getEnd();
      String coveredText = text.substring(beginIndex, endIndex);

      nameList.add(new PotentialAnnotation(beginIndex, endIndex, coveredText,
          name.getConfidence(), name.getType()));
    }
  }

  // TODO: If there is a problem return an error status,
  // and calling client can fetch error message via method call
  // Use OpenNLPPlugin to log errors ...
  return new Status(IStatus.OK, OpenNLPPlugin.ID, "OK");
}

/**
 * Collects, in iteration order of {@code tokens}, all tokens contained in the
 * given sentence.
 */
private List<Span> collectSentenceTokens(Span sentence) {
  List<Span> sentenceTokens = new ArrayList<>();
  for (Span token : tokens) {
    if (sentence.contains(token)) {
      sentenceTokens.add(token);
    }
  }
  return sentenceTokens;
}

/**
 * Decides whether a token of a verified name is added to the name-only tokens,
 * based on the {@code ignoreShortTokens}, {@code onlyConsiderAllLetterTokens}
 * and {@code onlyConsiderInitialLetterTokens} settings.
 */
private boolean isNameOnlyTokenCandidate(String token) {
  // Tokens shorter than four characters are rejected before the pattern is
  // computed, since the pattern is not needed for that decision.
  if (ignoreShortTokens && token.length() < 4) {
    return false;
  }

  StringPattern pattern = StringPattern.recognize(token);

  if (onlyConsiderAllLetterTokens && !pattern.isAllLetter()) {
    return false;
  }

  return !onlyConsiderInitialLetterTokens || pattern.isInitialCapitalLetter();
}