in ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/WindowedAssertionCleartkAnalysisEngine.java [278:487]
/**
 * Processes one document: for each sentence (in character order), collects the annotations the
 * sentence covers, then builds a ClearTK feature {@code Instance} for every entity/event mention
 * in that sentence. When training, instances are (optionally feature-selected, optionally
 * downsampled and) written to {@code dataWriter}; the class label is attached via
 * {@code setClassLabel}.
 * <p>
 * Domain adaptation has two modes, keyed off {@code fileToDomain} and {@code ffDomainAdaptor}:
 * <ul>
 *   <li>map present AND adaptor present: the document's domain is pushed into the adaptor so
 *       {@code featureFunctionExtractors} emit domain-conditioned feature copies;</li>
 *   <li>map present, adaptor absent: the domain is added as a single plain "Domain" feature.</li>
 * </ul>
 *
 * @param jCas the CAS for the current document
 * @throws AnalysisEngineProcessException propagated from view lookup or feature extraction
 */
public void process( JCas jCas ) throws AnalysisEngineProcessException {
   String documentId = DocIdUtil.getDocumentID( jCas );
   String domainId = "";
   String domainFeature = null;
   // With no feature-function extractors configured, domain adaptation cannot be applied;
   // null the adaptor so the (ffDomainAdaptor == null) branches below take effect.
   if ( this.featureFunctionExtractors.size() <= 0 ) {
      this.ffDomainAdaptor = null;
   }
   if ( documentId != null ) {
      LOGGER.debug( "processing next doc: " + documentId );
      // set the domain to be FeatureFunction'ed into all extractors
      if ( !fileToDomain.isEmpty() && ffDomainAdaptor != null ) {
         // NOTE: fileToDomain.get(..) may return null for an unmapped document.
         domainId = fileToDomain.get( documentId );
         // if domain is not found, no warning -- just considers general domain
         ffDomainAdaptor.setDomain( domainId );
      } else if ( !fileToDomain.isEmpty() ) {
         // No adaptor: fall back to emitting the domain as a plain feature on each instance.
         domainFeature = fileToDomain.get( documentId );
      }
   } else {
      LOGGER.debug( "processing next doc (doc id is null)" );
   }
   // Reset the sequence label carried across instances within a document.
   this.lastLabel = "<BEGIN>";
   final JCas annotationView = getAnnotationView( jCas );
   // generate a list of training instances for each sentence in the document
   // Use an indexed map. This is faster than calling select and then selectCovering within a loop.
   final Map<Sentence, List<Annotation>> sentenceAnnotationMap
         = JCasUtil.indexCovered( annotationView, Sentence.class, Annotation.class );
   // Faster than calling JCasUtil methods for each which has to iterate through the full cas each time.
   // These three lists are reused (cleared) per sentence to avoid per-iteration allocation.
   final List<IdentifiedAnnotation> entities = new ArrayList<>();
   final List<AssertionCuePhraseAnnotation> cues = new ArrayList<>();
   final List<BaseToken> baseTokens = new ArrayList<>();
   // 25 Dec 2018 10:51:49 INFO CleartkAnalysisEngine - Assigning Attributes ...
   // 25 Dec 2018 14:35:45 INFO CleartkAnalysisEngine - Finished Assigning Attributes
   // Rather than iterate through all features again, just sort the sentences that have already been fetched.
   // As far as I can tell, order should be unnecessary.
   // Using a treemap that is sorted during putAll prevents the need to run a Map.get(..) - fast, but not that fast.
   // 25 Dec 2018 14:52:37 INFO CleartkAnalysisEngine - Assigning Attributes ...
   // 25 Dec 2018 18:32:24 INFO CleartkAnalysisEngine - Finished Assigning Attributes
   //
   // TODO : Windowed Assertion:
   // 26 Dec 2018 16:21:30 INFO CleartkAnalysisEngine - Assigning Attributes ...
   // 26 Dec 2018 17:38:11 INFO CleartkAnalysisEngine - Finished Assigning Attributes
   // Sort sentences by begin offset so windowed extractors see them in document order.
   final TreeMap<Sentence, Collection<Annotation>> sentenceTreeMap
         = new TreeMap<>( Comparator.comparingInt( Sentence::getBegin ) );
   sentenceTreeMap.putAll( sentenceAnnotationMap );
   // History needs full list of sentences
   final List<Sentence> sentenceList = new ArrayList<>( sentenceTreeMap.keySet() );
   // Hand the ordered sentence list to every windowed extractor up front; the per-sentence
   // window (sentence + index + tokens) is set inside the entity loop below.
   for ( FeatureExtractor1<IdentifiedAnnotation> extractor : this.entityFeatureExtractors ) {
      if ( extractor instanceof AbstractWindowedFeatureExtractor1 ) {
         // ((AbstractWindowedFeatureExtractor1)extractor).setSentences( new ArrayList<>( sentenceTreeMap.keySet() ) );
         ((AbstractWindowedFeatureExtractor1)extractor).setSentences( sentenceList );
      }
   }
   for ( FeatureExtractor1<IdentifiedAnnotation> extractor : this.entityTreeExtractors ) {
      if ( extractor instanceof AbstractWindowedFeatureExtractor1 ) {
         // ((AbstractWindowedFeatureExtractor1)extractor).setSentences( new ArrayList<>( sentenceTreeMap.keySet() ) );
         ((AbstractWindowedFeatureExtractor1)extractor).setSentences( sentenceList );
      }
   }
   int sentenceIndex = -1;
   for ( Map.Entry<Sentence, Collection<Annotation>> sortedEntry : sentenceTreeMap.entrySet() ) {
      sentenceIndex++;
      final Sentence coveringSent = sortedEntry.getKey();
      final List<Annotation> coveredAnnotations = new ArrayList<>( sortedEntry.getValue() );
      coveredAnnotations.sort( Comparator.comparingInt( Annotation::getBegin ) );
      // _windowedContexts.forEach( c -> c.setWindow( coveredAnnotations ) );
      // Sort Annotations into *Mention, assertion cues and BaseTokens in one loop.
      // Faster than calling JCasUtil methods for each which has to iterate through the full cas each time.
      entities.clear();
      cues.clear();
      baseTokens.clear();
      for ( Annotation annotation : coveredAnnotations ) {
         if ( annotation instanceof EventMention || annotation instanceof EntityMention ) {
            entities.add( (IdentifiedAnnotation)annotation );
         } else if ( annotation instanceof AssertionCuePhraseAnnotation ) {
            cues.add( (AssertionCuePhraseAnnotation)annotation );
         } else if ( annotation instanceof BaseToken ) {
            baseTokens.add( (BaseToken)annotation );
         }
      }
      // Point the shared windowed contexts at this sentence's tokens before extraction.
      _windowedContexts.forEach( c -> c.setWindow( baseTokens ) );
      for ( IdentifiedAnnotation identifiedAnnotation : entities ) {
         // Polarity -1 (negated) mentions are logged for debugging only; all are processed.
         if ( identifiedAnnotation.getPolarity() == -1 ) {
            LOGGER.debug( String.format( " - identified annotation: [%d-%d] polarity %d (%s)",
                  identifiedAnnotation.getBegin(),
                  identifiedAnnotation.getEnd(),
                  identifiedAnnotation.getPolarity(),
                  identifiedAnnotation.getClass().getName() ) );
         }
         Instance<String> instance = new Instance<>();
         // domainFeature is only non-null in the "map present, no adaptor" mode (see above).
         if ( domainFeature != null ) {
            instance.add( new Feature( "Domain", domainFeature ) );
         }
         // only use extract this version if not doing domain adaptation
         if ( ffDomainAdaptor == null ) {
            for ( CleartkExtractor<IdentifiedAnnotation, BaseToken> extractor : this.tokenCleartkExtractors ) {
               instance.addAll( extractor
                     .extractWithin( annotationView, identifiedAnnotation, coveringSent ) );
            }
         }
         // Find the assertion cue phrase with the fewest tokens between it and this mention.
         int closest = Integer.MAX_VALUE;
         AssertionCuePhraseAnnotation closestCue = null;
         for ( AssertionCuePhraseAnnotation cue : cues ) {
            // It is much faster to count between BaseTokens already isolated within the same sentence.
            final int betweenCount = countBetween( cue, identifiedAnnotation, baseTokens );
            if ( betweenCount < closest ) {
               closestCue = cue;
               closest = betweenCount;
            }
         }
         // Only add cue features if the closest cue is within 20 tokens
         // -- presumably a tuned proximity threshold; TODO confirm rationale for 21.
         if ( closestCue != null && closest < 21 ) {
            instance.add( new Feature( "ClosestCue_Word", closestCue.getCoveredText() ) );
            instance.add( new Feature( "ClosestCue_PhraseFamily", closestCue.getCuePhraseAssertionFamily() ) );
            instance.add( new Feature( "ClosestCue_PhraseCategory", closestCue.getCuePhraseCategory() ) );
            // add hack-ey domain adaptation to these hacked-in features
            if ( !fileToDomain.isEmpty() && ffDomainAdaptor != null ) {
               instance.addAll( ffDomainAdaptor
                     .apply( new Feature( "ClosestCue_Word", closestCue.getCoveredText() ) ) );
               instance.addAll( ffDomainAdaptor
                     .apply( new Feature( "ClosestCue_PhraseFamily", closestCue
                           .getCuePhraseAssertionFamily() ) ) );
               instance.addAll( ffDomainAdaptor
                     .apply( new Feature( "ClosestCue_PhraseCategory", closestCue.getCuePhraseCategory() ) ) );
            }
         }
         // 7/9/13 SRH trying to make it work just for anatomical site
         int eemTypeId = identifiedAnnotation.getTypeID();
         if ( eemTypeId == CONST.NE_TYPE_ID_ANATOMICAL_SITE ) {
            // 7/9/13 srh modified per tmiller so it's binary but not numeric feature
            instance.add( new Feature( "ENTITY_TYPE_ANAT_SITE" ) );
            // add hack-ey domain adaptation to these hacked-in features
            if ( !fileToDomain.isEmpty() && ffDomainAdaptor != null ) {
               instance.addAll( ffDomainAdaptor.apply( new Feature( "ENTITY_TYPE_ANAT_SITE" ) ) );
            }
         }
         // only extract these features if not doing domain adaptation
         if ( ffDomainAdaptor == null ) {
            for ( FeatureExtractor1<IdentifiedAnnotation> extractor : this.entityFeatureExtractors ) {
               // Windowed extractors need the current sentence window set before each extract.
               if ( extractor instanceof AbstractWindowedFeatureExtractor1 ) {
                  ((AbstractWindowedFeatureExtractor1)extractor).setWindow( coveringSent, sentenceIndex, baseTokens );
               }
               instance.addAll( extractor.extract( jCas, identifiedAnnotation ) );
            }
         }
         // Tree extractors run regardless of domain adaptation mode.
         for ( FeatureExtractor1<IdentifiedAnnotation> extractor : this.entityTreeExtractors ) {
            if ( extractor instanceof AbstractWindowedFeatureExtractor1 ) {
               ((AbstractWindowedFeatureExtractor1)extractor).setWindow( coveringSent, sentenceIndex, baseTokens );
            }
            instance.addAll( extractor.extract( jCas, identifiedAnnotation ) );
         }
         // Lowercase String-valued features in place, EXCEPT tree/word/negation features
         // (matched by name prefix or infix) whose case is significant to the model.
         List<Feature> feats = instance.getFeatures();
         for ( Feature feat : feats ) {
            if ( feat instanceof TreeFeature ||
                 (feat.getName() != null && (feat.getName().startsWith( "TreeFrag" ) ||
                                             feat.getName().startsWith( "WORD" ) ||
                                             feat.getName().startsWith( "NEG" ))) ) {
               continue;
            }
            if ( feat.getName() != null &&
                 (feat.getName().contains( "_TreeFrag" ) || feat.getName().contains( "_WORD" ) ||
                  feat.getName().contains( "_NEG" )) ) {
               continue;
            }
            if ( feat.getValue() instanceof String ) {
               feat.setValue( ((String)feat.getValue()).toLowerCase() );
            }
         }
         // Domain-adaptation mode: feature functions expand features with domain-tagged copies.
         if ( !fileToDomain.isEmpty() && ffDomainAdaptor != null ) {
            for ( FeatureFunctionExtractor<IdentifiedAnnotation> extractor : this.featureFunctionExtractors ) {
               // TODO: extend to the case where the extractors take a different argument besides entityOrEventMention
               instance.addAll( extractor.extract( jCas, identifiedAnnotation ) );
            }
         }
         // grab the output label
         setClassLabel( identifiedAnnotation, instance );
         if ( this.isTraining() ) {
            // apply feature selection, if necessary
            if ( this.featureSelection != null ) {
               feats = this.featureSelection.transform( feats );
            }
            // ensures that the (possibly) transformed feats are used
            if ( instance.getOutcome() != null ) {
               // Randomly downsample training data to the configured portion.
               if ( coin.nextDouble() < this.portionOfDataToUse ) {
                  this.dataWriter.write( new Instance<>( instance.getOutcome(), feats ) );
               }
            }
         }
      }
   }
}