in caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java [341:530]
/**
 * Reads the name finder configuration from the type system preference store,
 * gathers sentence, token and existing name spans from the CAS of the current
 * input and schedules the name finder job.
 * <p>
 * As a side effect the {@code nameTypeNames} field is replaced with the trimmed,
 * comma separated value of the name type preference, and {@code confirmedEntities}
 * is cleared and refilled with the name annotations currently in the CAS.
 * <p>
 * If a required preference is empty, a configured type is missing from the type
 * system, or the CAS contains no sentence or no token inside a sentence, a message
 * is set on the name finder view and the job is not scheduled. If the CAS has no
 * document text the method returns without scheduling and without a message.
 * <p>
 * The name finder job is only reconfigured after all of these checks have passed,
 * so a failed check never leaves it partially updated.
 */
void runNameFinder() {

  // TODO: Check if sentences do overlap
  // TODO: Check if tokens do overlap
  // TODO: Check that tokens do not intersect with sentence span

  IPreferenceStore store = editor.getCasDocumentProvider()
      .getTypeSystemPreferenceStore(editor.getEditorInput());

  // TODO: All preferences should be retrieved when the name finder executed!
  // Just move it down the run method ...
  nameTypeNames = store.getString(OpenNLPPreferenceConstants.NAME_TYPE).split(",");

  for (int i = 0; i < nameTypeNames.length; i++) {
    nameTypeNames[i] = nameTypeNames[i].trim();

    if (nameTypeNames[i].isEmpty()) {
      nameFinderView.setMessage("Name type name(s) must be set!");
      return;
    }
  }

  confirmedEntities.clear();

  for (String nameTypeName : nameTypeNames) {
    Type nameType = input.getCAS().getTypeSystem().getType(nameTypeName);

    // Report the missing type instead of returning silently, so the user
    // can see why the name finder did not run.
    if (nameType == null) {
      nameFinderView.setMessage("Name type " + nameTypeName + " does not exist in type system!");
      return;
    }

    FSIndex<AnnotationFS> nameAnnotations = input.getCAS()
        .getAnnotationIndex(nameType);

    for (AnnotationFS nameAnnotation : nameAnnotations) {
      // TODO: Entity must have a type ...
      PotentialAnnotation entity = new PotentialAnnotation(nameAnnotation.getBegin(),
          nameAnnotation.getEnd(), nameAnnotation.getCoveredText(), null,
          nameAnnotation.getType().getName());

      confirmedEntities.add(entity); // TODO: This needs to go into a second list!
    }
  }

  // NOTE(review): a new listener instance is registered on every invocation,
  // including invocations which return early below - confirm the listener is
  // removed elsewhere, or register it only once.
  nameFinder.addJobChangeListener(new NameFinderJobListener());

  String sentenceTypeName = store.getString(OpenNLPPreferenceConstants.SENTENCE_TYPE);

  if (sentenceTypeName.isEmpty()) {
    nameFinderView.setMessage("Sentence type is not set!");
    return;
  }

  String[] modelPaths =
      store.getString(OpenNLPPreferenceConstants.NAME_FINDER_MODEL_PATH).split(",");

  for (int i = 0; i < modelPaths.length; i++) {
    modelPaths[i] = modelPaths[i].trim();

    if (modelPaths[i].isEmpty()) {
      nameFinderView.setMessage("Model path is not set!");
      return;
    }
  }

  CAS cas = input.getCAS();

  String additionalSentenceTypes =
      store.getString(OpenNLPPreferenceConstants.ADDITIONAL_SENTENCE_TYPE);

  String text = cas.getDocumentText();

  // Without document text nothing is configured or scheduled.
  if (text == null) {
    return;
  }

  Type[] sentenceTypes = UIMAUtil.splitTypes(
      sentenceTypeName + "," + additionalSentenceTypes, ',', cas.getTypeSystem());

  if (sentenceTypes == null) {
    nameFinderView.setMessage("Sentence type does not exist in type system!");
    return;
  }

  String tokenName = store.getString(OpenNLPPreferenceConstants.TOKEN_TYPE);

  if (tokenName.isEmpty()) {
    nameFinderView.setMessage("Token type name is not set!");
    return;
  }

  Type tokenType = cas.getTypeSystem().getType(tokenName);

  if (tokenType == null) {
    nameFinderView.setMessage("Token type does not exist in type system!");
    return;
  }

  List<Span> sentences = new ArrayList<>();
  List<Span> tokens = new ArrayList<>();

  for (Iterator<AnnotationFS> sentenceIterator =
      UIMAUtil.createMultiTypeIterator(cas, sentenceTypes);
      sentenceIterator.hasNext();) {

    AnnotationFS sentenceAnnotation = sentenceIterator.next();

    // TODO: Add code to detect overlapping sentences ... not allowed!

    sentences.add(new Span(sentenceAnnotation.getBegin(), sentenceAnnotation.getEnd()));

    // Performance Note:
    // The following code has O(n^2) complexity, can be optimized
    // by using a token iterate over all tokens and manual weaving.

    FSIndex<AnnotationFS> allTokens = cas.getAnnotationIndex(tokenType);

    ContainingConstraint containingConstraint =
        new ContainingConstraint(sentenceAnnotation);

    Iterator<AnnotationFS> containingTokens = cas.createFilteredIterator(
        allTokens.iterator(), containingConstraint);

    while (containingTokens.hasNext()) {
      AnnotationFS token = containingTokens.next();
      tokens.add(new Span(token.getBegin(), token.getEnd()));
    }
  }

  List<Span> nameSpans = new ArrayList<>();

  for (String nameTypeName : nameTypeNames) {
    Type nameType = cas.getTypeSystem().getType(nameTypeName);

    if (nameType == null) {
      nameFinderView.setMessage("Name type " + nameTypeName + " does not exist in type system!");
      return;
    }

    FSIndex<AnnotationFS> nameAnnotations = cas.getAnnotationIndex(nameType);

    for (AnnotationFS nameAnnotation : nameAnnotations) {
      nameSpans.add(new Span(nameAnnotation.getBegin(), nameAnnotation.getEnd(),
          nameAnnotation.getType().getName()));
    }
  }

  // Finish all validation before touching the name finder, so that a failed
  // check does not leave it with a mix of old and new data.
  if (sentences.isEmpty()) {
    nameFinderView.setMessage("CAS must at least contain one sentence!");
    return;
  }

  if (tokens.isEmpty()) {
    nameFinderView.setMessage("CAS must at least contain one token within a sentence!");
    return;
  }

  // Bug: Changing the data of the name finder will cause an issue if it is already running!
  nameFinder.setText(text);
  nameFinder.setSentences(sentences.toArray(new Span[0]));
  nameFinder.setTokens(tokens.toArray(new Span[0]));
  nameFinder.setModelPath(modelPaths, nameTypeNames);

  if (!nameFinder.isSystem()) {
    nameFinder.setSystem(true);
  }

  boolean isRecallBoostingEnabled =
      store.getBoolean(OpenNLPPreferenceConstants.ENABLE_CONFIRMED_NAME_DETECTION);

  // The existing name annotations are only handed over when confirmed name
  // detection is enabled, otherwise the verified names are reset to null.
  nameFinder.setVerifiedNames(
      isRecallBoostingEnabled ? nameSpans.toArray(new Span[0]) : null);

  nameFinder.setIgnoreShortTokens(store.getBoolean(
      OpenNLPPreferenceConstants.IGNORE_SHORT_TOKENS));

  nameFinder.setOnlyConsiderAllLetterTokens(store.getBoolean(
      OpenNLPPreferenceConstants.ONLY_CONSIDER_ALL_LETTER_TOKENS));

  nameFinder.setOnlyConsiderInitialCapitalLetterTokens(store.getBoolean(
      OpenNLPPreferenceConstants.ONLY_CONSIDER_INITIAL_CAPITAL_TOKENS));

  nameFinder.schedule();
}