in ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java [308:552]
/**
 * Feature-extraction entry point: builds one classifier {@code Instance<String>} per
 * {@code EventMention} / {@code EntityMention} found in each sentence of the CAS.
 * <p>
 * Feature groups gathered per mention: token-context features, closest assertion cue-phrase
 * features, an anatomical-site entity-type flag, entity/tree extractor features, and
 * (optionally) domain-adapted feature-function variants of each.  When training, instances
 * are (optionally feature-selected, optionally down-sampled) and written via
 * {@code this.dataWriter}.
 * <p>
 * Domain adaptation: if {@code fileToDomain} maps this document to a domain AND a
 * feature-function adaptor ({@code ffDomainAdaptor}) is configured, the adaptor is primed
 * with the domain and applied to selected features; with a mapping but no adaptor, the
 * domain is instead added as a plain "Domain" feature on every instance.
 *
 * @param jCas the CAS to process; mention/token/sentence annotations are read from the
 *             view returned by {@code getAnnotationView(jCas)}
 * @throws AnalysisEngineProcessException on downstream extraction/writing failure
 */
public void process( JCas jCas ) throws AnalysisEngineProcessException {
getLogger().info( "Processing ..." );
String documentId = DocIdUtil.getDocumentID( jCas );
String domainId = "";
String domainFeature = null;
// No feature-function extractors configured => domain adaptation is a no-op; null the
// adaptor so the ffDomainAdaptor-null checks below take the non-adapted code paths.
if ( this.featureFunctionExtractors.size() <= 0 ) {
this.ffDomainAdaptor = null;
}
if ( documentId != null ) {
getLogger().debug( "processing next doc: " + documentId );
// set the domain to be FeatureFunction'ed into all extractors
if ( !fileToDomain.isEmpty() && ffDomainAdaptor != null ) {
domainId = fileToDomain.get( documentId );
// if domain is not found, no warning -- just considers general domain
ffDomainAdaptor.setDomain( domainId );
} else if ( !fileToDomain.isEmpty() ) {
// Domain mapping exists but no adaptor: fall back to emitting the domain as a
// plain per-instance feature (may be null if the doc has no mapping; guarded below).
domainFeature = fileToDomain.get( documentId );
}
} else {
getLogger().debug( "processing next doc (doc id is null)" );
}
// Reset per-document sequence state (previous-label feature for the first mention).
this.lastLabel = "<BEGIN>";
// // get gold standard relation instances during testing for error analysis
// if (! this.isTraining() && printErrors) {
// JCas goldView;
// try {
// goldView = jCas.getView("GoldView");
// } catch(CASException e) {
// throw new AnalysisEngineProcessException(e);
// }
//
// //categoryLookup = createCategoryLookup(goldView);
// }
final JCas annotationView = getAnnotationView( jCas );
// Map<IdentifiedAnnotation, Collection<Sentence>> coveringSentenceMap = JCasUtil.indexCovering(annotationView, IdentifiedAnnotation.class, Sentence.class);
// Map<Sentence, Collection<BaseToken>> tokensCoveredInSentenceMap = JCasUtil.indexCovered(annotationView, Sentence.class, BaseToken.class);
// Map<IdentifiedAnnotation, Collection<Zone>> coveringZoneMap =
// JCasUtil.indexCovering(jCas, IdentifiedAnnotation.class, Zone.class);
// Map<IdentifiedAnnotation, Collection<Sentence>> coveringSents =
// JCasUtil.indexCovering(jCas, IdentifiedAnnotation.class, Sentence.class);
// List<Instance<String>> instances = new ArrayList<String>>();
// generate a list of training instances for each sentence in the document
// Use an indexed map. This is faster than calling select and then selectCovering within a loop.
final Map<Sentence, List<Annotation>> sentenceAnnotationMap
= JCasUtil.indexCovered( annotationView, Sentence.class, Annotation.class );
// Reusable buckets, cleared per sentence to avoid reallocating three collections per loop.
// Faster than calling JCasUtil methods for each which has to iterate through the full cas each time.
final Collection<IdentifiedAnnotation> entities = new ArrayList<>();
final Collection<AssertionCuePhraseAnnotation> cues = new ArrayList<>();
final Collection<BaseToken> baseTokens = new ArrayList<>();
for(Sentence coveringSent : JCasUtil.select(annotationView, Sentence.class)){
Collection<Annotation> coveredAnnotations = sentenceAnnotationMap.get(coveringSent);
// Sort Annotations into *Mention, assertion cues and BaseTokens in one loop.
// Faster than calling JCasUtil methods for each which has to iterate through the full cas each time.
entities.clear();
cues.clear();
baseTokens.clear();
for ( Annotation annotation : coveredAnnotations ) {
if ( annotation instanceof EventMention || annotation instanceof EntityMention ) {
entities.add( (IdentifiedAnnotation)annotation );
} else if ( annotation instanceof AssertionCuePhraseAnnotation ) {
cues.add( (AssertionCuePhraseAnnotation)annotation );
} else if ( annotation instanceof BaseToken ) {
baseTokens.add( (BaseToken)annotation );
}
}
// Build one classifier instance per mention in this sentence.
for ( IdentifiedAnnotation identifiedAnnotation : entities ) {
// Debug-trace negated mentions only (polarity -1 == negated in cTAKES CONST).
if ( identifiedAnnotation.getPolarity() == -1 ) {
getLogger().debug( String.format( " - identified annotation: [%d-%d] polarity %d (%s)",
identifiedAnnotation.getBegin(),
identifiedAnnotation.getEnd(),
identifiedAnnotation.getPolarity(),
identifiedAnnotation.getClass().getName() ) );
}
Instance<String> instance = new Instance<>();
// Plain domain feature (only set when a mapping exists but no ff adaptor -- see above).
if ( domainFeature != null ) {
instance.add( new Feature( "Domain", domainFeature ) );
}
// // extract all features that require only the entity mention annotation
// instance.addAll(tokenFeatureExtractor.extract(jCas, entityMention));
// extract all features that require the token and sentence annotations
//Sentence sentence = sentenceList.iterator().next();
/*
if (sentence != null)
{
for (ContextExtractor<IdentifiedAnnotation> extractor : this.contextFeatureExtractors) {
instance.addAll(extractor.extractWithin(annotationView, entityMention, sentence));
}
} else
{
// TODO extract context features for annotations that don't fall within a sentence
LOGGER.log(Level.WARN, "FIXME/TODO: generate context features for entities that don't fall within a sentence");
}
*/
/*
for (ContextExtractor<BaseToken> extractor : this.tokenContextFeatureExtractors) {
instance.addAll(extractor.extract(annotationView, entityMention));
}
*/
// only use extract this version if not doing domain adaptation
// (when adapting, the featureFunctionExtractors loop below wraps these extractors instead
// -- NOTE(review): presumed; confirm against the extractor setup in initialize()).
if ( ffDomainAdaptor == null ) {
for ( CleartkExtractor<IdentifiedAnnotation, BaseToken> extractor : this.tokenCleartkExtractors ) {
// instance.addAll(extractor.extractWithin(annotationView, entityMention, sentence));
// if ( coveringSent != null ) {
instance.addAll( extractor
.extractWithin( annotationView, identifiedAnnotation, coveringSent ) );
// } else {
// instance.addAll( extractor.extract( annotationView, identifiedAnnotation ) );
// }
}
}
// Find the assertion cue phrase nearest to this mention, measured in intervening
// BaseTokens within the sentence.
int closest = Integer.MAX_VALUE;
AssertionCuePhraseAnnotation closestCue = null;
for ( AssertionCuePhraseAnnotation cue : cues ) {
// It is much faster to count between BaseTokens already isolated within the same sentence.
final int betweenCount = countBetween( cue, identifiedAnnotation, baseTokens );
if ( betweenCount < closest ) {
closestCue = cue;
closest = betweenCount;
}
// instance.addAll(cuePhraseInWindowExtractor.extractBetween(jCas, cue, entityOrEventMention));
}
// Only use the cue if it is within 20 tokens of the mention (21 = hard-coded window cutoff).
if ( closestCue != null && closest < 21 ) {
instance.add( new Feature( "ClosestCue_Word", closestCue.getCoveredText() ) );
// instance.add(new Feature("ClosestCue_Phrase", closestCue.getCuePhrase()));
instance.add( new Feature( "ClosestCue_PhraseFamily", closestCue.getCuePhraseAssertionFamily() ) );
instance.add( new Feature( "ClosestCue_PhraseCategory", closestCue.getCuePhraseCategory() ) );
// add hack-ey domain adaptation to these hacked-in features
if ( !fileToDomain.isEmpty() && ffDomainAdaptor != null ) {
instance.addAll( ffDomainAdaptor
.apply( new Feature( "ClosestCue_Word", closestCue.getCoveredText() ) ) );
instance.addAll( ffDomainAdaptor
.apply( new Feature( "ClosestCue_PhraseFamily", closestCue
.getCuePhraseAssertionFamily() ) ) );
instance.addAll( ffDomainAdaptor
.apply( new Feature( "ClosestCue_PhraseCategory", closestCue.getCuePhraseCategory() ) ) );
}
}
// }
// if (cuePhraseFeatures != null && !cuePhraseFeatures.isEmpty())
// {
// instance.addAll(cuePhraseFeatures);
// }
// 7/9/13 SRH trying to make it work just for anatomical site
int eemTypeId = identifiedAnnotation.getTypeID();
if ( eemTypeId == CONST.NE_TYPE_ID_ANATOMICAL_SITE ) {
// 7/9/13 srh modified per tmiller so it's binary but not numeric feature
//instance.add(new Feature("ENTITY_TYPE_" + entityOrEventMention.getTypeID()));
instance.add( new Feature( "ENTITY_TYPE_ANAT_SITE" ) );
// add hack-ey domain adaptation to these hacked-in features
if ( !fileToDomain.isEmpty() && ffDomainAdaptor != null ) {
instance.addAll( ffDomainAdaptor.apply( new Feature( "ENTITY_TYPE_ANAT_SITE" ) ) );
}
}
/* This hurts recall more than it helps precision
else if (eemTypeId == CONST.NE_TYPE_ID_DRUG) {
// 7/10 adding drug
instance.add(new Feature("ENTITY_TYPE_DRUG"));
}
*/
// only extract these features if not doing domain adaptation
if ( ffDomainAdaptor == null ) {
for ( FeatureExtractor1<IdentifiedAnnotation> extractor : this.entityFeatureExtractors ) {
instance.addAll( extractor.extract( jCas, identifiedAnnotation ) );
}
}
// Tree (parse-fragment) features are extracted regardless of domain adaptation.
for ( FeatureExtractor1<IdentifiedAnnotation> extractor : this.entityTreeExtractors ) {
instance.addAll( extractor.extract( jCas, identifiedAnnotation ) );
}
// List<Feature> zoneFeatures = extractZoneFeatures(coveringZoneMap, entityOrEventMention);
// if (zoneFeatures != null && !zoneFeatures.isEmpty())
// {
// instance.addAll(zoneFeatures);
// }
// Lower-case the values of string features IN PLACE, except tree-fragment / WORD / NEG
// features (by name prefix or infix), which are case-sensitive and skipped.
List<Feature> feats = instance.getFeatures();
// List<Feature> lcFeats = new ArrayList<Feature>();
for ( Feature feat : feats ) {
if ( feat instanceof TreeFeature ||
(feat.getName() != null && (feat.getName().startsWith( "TreeFrag" ) ||
feat.getName().startsWith( "WORD" ) ||
feat.getName().startsWith( "NEG" ))) ) {
continue;
}
if ( feat.getName() != null &&
(feat.getName().contains( "_TreeFrag" ) || feat.getName().contains( "_WORD" ) ||
feat.getName().contains( "_NEG" )) ) {
continue;
}
if ( feat.getValue() instanceof String ) {
feat.setValue( ((String)feat.getValue()).toLowerCase() );
}
}
// Domain-adapted feature functions over the mention itself (requires both a domain map
// and an adaptor).
if ( !fileToDomain.isEmpty() && ffDomainAdaptor != null ) {
for ( FeatureFunctionExtractor<IdentifiedAnnotation> extractor : this.featureFunctionExtractors ) {
// TODO: extend to the case where the extractors take a different argument besides entityOrEventMention
instance.addAll( extractor.extract( jCas, identifiedAnnotation ) );
}
}
// grab the output label
// NOTE(review): setClassLabel is defined elsewhere; presumably it sets the gold outcome
// when training and applies the classifier otherwise -- confirm in its implementation.
setClassLabel( identifiedAnnotation, instance );
if ( this.isTraining() ) {
// apply feature selection, if necessary
if ( this.featureSelection != null ) {
feats = this.featureSelection.transform( feats );
}
// ensures that the (possibly) transformed feats are used
if ( instance.getOutcome() != null ) {
// Randomly down-sample training data to the configured fraction.
if ( coin.nextDouble() < this.portionOfDataToUse ) {
this.dataWriter.write( new Instance<>( instance.getOutcome(), feats ) );
}
}
}
}
}
}