in ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/WindowedAssertionCleartkAnalysisEngine.java [278:487]
/**
 * Processes one document: for each sentence (in character order), collects the annotations the
 * sentence covers, then builds a ClearTK feature {@code Instance} for every entity/event mention
 * in that sentence. When training, instances are (optionally feature-selected, optionally
 * downsampled and) written to {@code dataWriter}; the class label is attached via
 * {@code setClassLabel}.
 * <p>
 * Domain adaptation has two modes, keyed off {@code fileToDomain} and {@code ffDomainAdaptor}:
 * <ul>
 *   <li>map present AND adaptor present: the document's domain is pushed into the adaptor so
 *       {@code featureFunctionExtractors} emit domain-conditioned feature copies;</li>
 *   <li>map present, adaptor absent: the domain is added as a single plain "Domain" feature.</li>
 * </ul>
 *
 * @param jCas the CAS for the current document
 * @throws AnalysisEngineProcessException propagated from view lookup or feature extraction
 */
public void process( JCas jCas ) throws AnalysisEngineProcessException {
   String documentId = DocIdUtil.getDocumentID( jCas );
   String domainId = "";
   String domainFeature = null;
   // With no feature-function extractors configured, domain adaptation cannot be applied;
   // null the adaptor so the (ffDomainAdaptor == null) branches below take effect.
   if ( this.featureFunctionExtractors.size() <= 0 ) {
      this.ffDomainAdaptor = null;
   }
   if ( documentId != null ) {
      LOGGER.debug( "processing next doc: " + documentId );
      // set the domain to be FeatureFunction'ed into all extractors
      if ( !fileToDomain.isEmpty() && ffDomainAdaptor != null ) {
         // NOTE: fileToDomain.get(..) may return null for an unmapped document.
         domainId = fileToDomain.get( documentId );
         // if domain is not found, no warning -- just considers general domain
         ffDomainAdaptor.setDomain( domainId );
      } else if ( !fileToDomain.isEmpty() ) {
         // No adaptor: fall back to emitting the domain as a plain feature on each instance.
         domainFeature = fileToDomain.get( documentId );
      }
   } else {
      LOGGER.debug( "processing next doc (doc id is null)" );
   }
   // Reset the sequence label carried across instances within a document.
   this.lastLabel = "<BEGIN>";
   final JCas annotationView = getAnnotationView( jCas );
   // generate a list of training instances for each sentence in the document
   // Use an indexed map. This is faster than calling select and then selectCovering within a loop.
   final Map<Sentence, List<Annotation>> sentenceAnnotationMap
         = JCasUtil.indexCovered( annotationView, Sentence.class, Annotation.class );
   // Faster than calling JCasUtil methods for each which has to iterate through the full cas each time.
   // These three lists are reused (cleared) per sentence to avoid per-iteration allocation.
   final List<IdentifiedAnnotation> entities = new ArrayList<>();
   final List<AssertionCuePhraseAnnotation> cues = new ArrayList<>();
   final List<BaseToken> baseTokens = new ArrayList<>();
   // 25 Dec 2018 10:51:49 INFO CleartkAnalysisEngine - Assigning Attributes ...
   // 25 Dec 2018 14:35:45 INFO CleartkAnalysisEngine - Finished Assigning Attributes
   // Rather than iterate through all features again, just sort the sentences that have already been fetched.
   // As far as I can tell, order should be unnecessary.
   // Using a treemap that is sorted during putAll prevents the need to run a Map.get(..) - fast, but not that fast.
   // 25 Dec 2018 14:52:37 INFO CleartkAnalysisEngine - Assigning Attributes ...
   // 25 Dec 2018 18:32:24 INFO CleartkAnalysisEngine - Finished Assigning Attributes
   //
   // TODO : Windowed Assertion:
   // 26 Dec 2018 16:21:30 INFO CleartkAnalysisEngine - Assigning Attributes ...
   // 26 Dec 2018 17:38:11 INFO CleartkAnalysisEngine - Finished Assigning Attributes
   // Sort sentences by begin offset so windowed extractors see them in document order.
   final TreeMap<Sentence, Collection<Annotation>> sentenceTreeMap
         = new TreeMap<>( Comparator.comparingInt( Sentence::getBegin ) );
   sentenceTreeMap.putAll( sentenceAnnotationMap );
   // History needs full list of sentences
   final List<Sentence> sentenceList = new ArrayList<>( sentenceTreeMap.keySet() );
   // Hand the ordered sentence list to every windowed extractor up front; the per-sentence
   // window (sentence + index + tokens) is set inside the entity loop below.
   for ( FeatureExtractor1<IdentifiedAnnotation> extractor : this.entityFeatureExtractors ) {
      if ( extractor instanceof AbstractWindowedFeatureExtractor1 ) {
         // ((AbstractWindowedFeatureExtractor1)extractor).setSentences( new ArrayList<>( sentenceTreeMap.keySet() ) );
         ((AbstractWindowedFeatureExtractor1)extractor).setSentences( sentenceList );
      }
   }
   for ( FeatureExtractor1<IdentifiedAnnotation> extractor : this.entityTreeExtractors ) {
      if ( extractor instanceof AbstractWindowedFeatureExtractor1 ) {
         // ((AbstractWindowedFeatureExtractor1)extractor).setSentences( new ArrayList<>( sentenceTreeMap.keySet() ) );
         ((AbstractWindowedFeatureExtractor1)extractor).setSentences( sentenceList );
      }
   }
   int sentenceIndex = -1;
   for ( Map.Entry<Sentence, Collection<Annotation>> sortedEntry : sentenceTreeMap.entrySet() ) {
      sentenceIndex++;
      final Sentence coveringSent = sortedEntry.getKey();
      final List<Annotation> coveredAnnotations = new ArrayList<>( sortedEntry.getValue() );
      coveredAnnotations.sort( Comparator.comparingInt( Annotation::getBegin ) );
      // _windowedContexts.forEach( c -> c.setWindow( coveredAnnotations ) );
      // Sort Annotations into *Mention, assertion cues and BaseTokens in one loop.
      // Faster than calling JCasUtil methods for each which has to iterate through the full cas each time.
      entities.clear();
      cues.clear();
      baseTokens.clear();
      for ( Annotation annotation : coveredAnnotations ) {
         if ( annotation instanceof EventMention || annotation instanceof EntityMention ) {
            entities.add( (IdentifiedAnnotation)annotation );
         } else if ( annotation instanceof AssertionCuePhraseAnnotation ) {
            cues.add( (AssertionCuePhraseAnnotation)annotation );
         } else if ( annotation instanceof BaseToken ) {
            baseTokens.add( (BaseToken)annotation );
         }
      }
      // Point the shared windowed contexts at this sentence's tokens before extraction.
      _windowedContexts.forEach( c -> c.setWindow( baseTokens ) );
      for ( IdentifiedAnnotation identifiedAnnotation : entities ) {
         // Polarity -1 (negated) mentions are logged for debugging only; all are processed.
         if ( identifiedAnnotation.getPolarity() == -1 ) {
            LOGGER.debug( String.format( " - identified annotation: [%d-%d] polarity %d (%s)",
                  identifiedAnnotation.getBegin(),
                  identifiedAnnotation.getEnd(),
                  identifiedAnnotation.getPolarity(),
                  identifiedAnnotation.getClass().getName() ) );
         }
         Instance<String> instance = new Instance<>();
         // domainFeature is only non-null in the "map present, no adaptor" mode (see above).
         if ( domainFeature != null ) {
            instance.add( new Feature( "Domain", domainFeature ) );
         }
         // only use extract this version if not doing domain adaptation
         if ( ffDomainAdaptor == null ) {
            for ( CleartkExtractor<IdentifiedAnnotation, BaseToken> extractor : this.tokenCleartkExtractors ) {
               instance.addAll( extractor
                     .extractWithin( annotationView, identifiedAnnotation, coveringSent ) );
            }
         }
         // Find the assertion cue phrase with the fewest tokens between it and this mention.
         int closest = Integer.MAX_VALUE;
         AssertionCuePhraseAnnotation closestCue = null;
         for ( AssertionCuePhraseAnnotation cue : cues ) {
            // It is much faster to count between BaseTokens already isolated within the same sentence.
            final int betweenCount = countBetween( cue, identifiedAnnotation, baseTokens );
            if ( betweenCount < closest ) {
               closestCue = cue;
               closest = betweenCount;
            }
         }
         // Only add cue features if the closest cue is within 20 tokens
         // -- presumably a tuned proximity threshold; TODO confirm rationale for 21.
         if ( closestCue != null && closest < 21 ) {
            instance.add( new Feature( "ClosestCue_Word", closestCue.getCoveredText() ) );
            instance.add( new Feature( "ClosestCue_PhraseFamily", closestCue.getCuePhraseAssertionFamily() ) );
            instance.add( new Feature( "ClosestCue_PhraseCategory", closestCue.getCuePhraseCategory() ) );
            // add hack-ey domain adaptation to these hacked-in features
            if ( !fileToDomain.isEmpty() && ffDomainAdaptor != null ) {
               instance.addAll( ffDomainAdaptor
                     .apply( new Feature( "ClosestCue_Word", closestCue.getCoveredText() ) ) );
               instance.addAll( ffDomainAdaptor
                     .apply( new Feature( "ClosestCue_PhraseFamily", closestCue
                           .getCuePhraseAssertionFamily() ) ) );
               instance.addAll( ffDomainAdaptor
                     .apply( new Feature( "ClosestCue_PhraseCategory", closestCue.getCuePhraseCategory() ) ) );
            }
         }
         // 7/9/13 SRH trying to make it work just for anatomical site
         int eemTypeId = identifiedAnnotation.getTypeID();
         if ( eemTypeId == CONST.NE_TYPE_ID_ANATOMICAL_SITE ) {
            // 7/9/13 srh modified per tmiller so it's binary but not numeric feature
            instance.add( new Feature( "ENTITY_TYPE_ANAT_SITE" ) );
            // add hack-ey domain adaptation to these hacked-in features
            if ( !fileToDomain.isEmpty() && ffDomainAdaptor != null ) {
               instance.addAll( ffDomainAdaptor.apply( new Feature( "ENTITY_TYPE_ANAT_SITE" ) ) );
            }
         }
         // only extract these features if not doing domain adaptation
         if ( ffDomainAdaptor == null ) {
            for ( FeatureExtractor1<IdentifiedAnnotation> extractor : this.entityFeatureExtractors ) {
               // Windowed extractors need the current sentence window set before each extract.
               if ( extractor instanceof AbstractWindowedFeatureExtractor1 ) {
                  ((AbstractWindowedFeatureExtractor1)extractor).setWindow( coveringSent, sentenceIndex, baseTokens );
               }
               instance.addAll( extractor.extract( jCas, identifiedAnnotation ) );
            }
         }
         // Tree extractors run regardless of domain adaptation mode.
         for ( FeatureExtractor1<IdentifiedAnnotation> extractor : this.entityTreeExtractors ) {
            if ( extractor instanceof AbstractWindowedFeatureExtractor1 ) {
               ((AbstractWindowedFeatureExtractor1)extractor).setWindow( coveringSent, sentenceIndex, baseTokens );
            }
            instance.addAll( extractor.extract( jCas, identifiedAnnotation ) );
         }
         // Lowercase String-valued features in place, EXCEPT tree/word/negation features
         // (matched by name prefix or infix) whose case is significant to the model.
         List<Feature> feats = instance.getFeatures();
         for ( Feature feat : feats ) {
            if ( feat instanceof TreeFeature ||
                 (feat.getName() != null && (feat.getName().startsWith( "TreeFrag" ) ||
                                             feat.getName().startsWith( "WORD" ) ||
                                             feat.getName().startsWith( "NEG" ))) ) {
               continue;
            }
            if ( feat.getName() != null &&
                 (feat.getName().contains( "_TreeFrag" ) || feat.getName().contains( "_WORD" ) ||
                  feat.getName().contains( "_NEG" )) ) {
               continue;
            }
            if ( feat.getValue() instanceof String ) {
               feat.setValue( ((String)feat.getValue()).toLowerCase() );
            }
         }
         // Domain-adaptation mode: feature functions expand features with domain-tagged copies.
         if ( !fileToDomain.isEmpty() && ffDomainAdaptor != null ) {
            for ( FeatureFunctionExtractor<IdentifiedAnnotation> extractor : this.featureFunctionExtractors ) {
               // TODO: extend to the case where the extractors take a different argument besides entityOrEventMention
               instance.addAll( extractor.extract( jCas, identifiedAnnotation ) );
            }
         }
         // grab the output label
         setClassLabel( identifiedAnnotation, instance );
         if ( this.isTraining() ) {
            // apply feature selection, if necessary
            if ( this.featureSelection != null ) {
               feats = this.featureSelection.transform( feats );
            }
            // ensures that the (possibly) transformed feats are used
            if ( instance.getOutcome() != null ) {
               // Randomly downsample training data to the configured portion.
               if ( coin.nextDouble() < this.portionOfDataToUse ) {
                  this.dataWriter.write( new Instance<>( instance.getOutcome(), feats ) );
               }
            }
         }
      }
   }
}