in ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java [308:552]
/**
 * Feature-extraction entry point: builds one classifier {@code Instance<String>} per
 * {@code EventMention} / {@code EntityMention} found in each sentence of the CAS.
 * <p>
 * Feature groups gathered per mention: token-context features, closest assertion cue-phrase
 * features, an anatomical-site entity-type flag, entity/tree extractor features, and
 * (optionally) domain-adapted feature-function variants of each.  When training, instances
 * are (optionally feature-selected, optionally down-sampled) and written via
 * {@code this.dataWriter}.
 * <p>
 * Domain adaptation: if {@code fileToDomain} maps this document to a domain AND a
 * feature-function adaptor ({@code ffDomainAdaptor}) is configured, the adaptor is primed
 * with the domain and applied to selected features; with a mapping but no adaptor, the
 * domain is instead added as a plain "Domain" feature on every instance.
 *
 * @param jCas the CAS to process; mention/token/sentence annotations are read from the
 *             view returned by {@code getAnnotationView(jCas)}
 * @throws AnalysisEngineProcessException on downstream extraction/writing failure
 */
public void process( JCas jCas ) throws AnalysisEngineProcessException {
getLogger().info( "Processing ..." );
String documentId = DocIdUtil.getDocumentID( jCas );
String domainId = "";
String domainFeature = null;
// No feature-function extractors configured => domain adaptation is a no-op; null the
// adaptor so the ffDomainAdaptor-null checks below take the non-adapted code paths.
if ( this.featureFunctionExtractors.size() <= 0 ) {
this.ffDomainAdaptor = null;
}
if ( documentId != null ) {
getLogger().debug( "processing next doc: " + documentId );
// set the domain to be FeatureFunction'ed into all extractors
if ( !fileToDomain.isEmpty() && ffDomainAdaptor != null ) {
domainId = fileToDomain.get( documentId );
// if domain is not found, no warning -- just considers general domain
ffDomainAdaptor.setDomain( domainId );
} else if ( !fileToDomain.isEmpty() ) {
// Domain mapping exists but no adaptor: fall back to emitting the domain as a
// plain per-instance feature (may be null if the doc has no mapping; guarded below).
domainFeature = fileToDomain.get( documentId );
}
} else {
getLogger().debug( "processing next doc (doc id is null)" );
}
// Reset per-document sequence state (previous-label feature for the first mention).
this.lastLabel = "<BEGIN>";
// // get gold standard relation instances during testing for error analysis
// if (! this.isTraining() && printErrors) {
// JCas goldView;
// try {
// goldView = jCas.getView("GoldView");
// } catch(CASException e) {
// throw new AnalysisEngineProcessException(e);
// }
//
// //categoryLookup = createCategoryLookup(goldView);
// }
final JCas annotationView = getAnnotationView( jCas );
// Map<IdentifiedAnnotation, Collection<Sentence>> coveringSentenceMap = JCasUtil.indexCovering(annotationView, IdentifiedAnnotation.class, Sentence.class);
// Map<Sentence, Collection<BaseToken>> tokensCoveredInSentenceMap = JCasUtil.indexCovered(annotationView, Sentence.class, BaseToken.class);
// Map<IdentifiedAnnotation, Collection<Zone>> coveringZoneMap =
// JCasUtil.indexCovering(jCas, IdentifiedAnnotation.class, Zone.class);
// Map<IdentifiedAnnotation, Collection<Sentence>> coveringSents =
// JCasUtil.indexCovering(jCas, IdentifiedAnnotation.class, Sentence.class);
// List<Instance<String>> instances = new ArrayList<String>>();
// generate a list of training instances for each sentence in the document
// Use an indexed map. This is faster than calling select and then selectCovering within a loop.
final Map<Sentence, List<Annotation>> sentenceAnnotationMap
= JCasUtil.indexCovered( annotationView, Sentence.class, Annotation.class );
// Reusable buckets, cleared per sentence to avoid reallocating three collections per loop.
// Faster than calling JCasUtil methods for each which has to iterate through the full cas each time.
final Collection<IdentifiedAnnotation> entities = new ArrayList<>();
final Collection<AssertionCuePhraseAnnotation> cues = new ArrayList<>();
final Collection<BaseToken> baseTokens = new ArrayList<>();
for(Sentence coveringSent : JCasUtil.select(annotationView, Sentence.class)){
Collection<Annotation> coveredAnnotations = sentenceAnnotationMap.get(coveringSent);
// Sort Annotations into *Mention, assertion cues and BaseTokens in one loop.
// Faster than calling JCasUtil methods for each which has to iterate through the full cas each time.
entities.clear();
cues.clear();
baseTokens.clear();
for ( Annotation annotation : coveredAnnotations ) {
if ( annotation instanceof EventMention || annotation instanceof EntityMention ) {
entities.add( (IdentifiedAnnotation)annotation );
} else if ( annotation instanceof AssertionCuePhraseAnnotation ) {
cues.add( (AssertionCuePhraseAnnotation)annotation );
} else if ( annotation instanceof BaseToken ) {
baseTokens.add( (BaseToken)annotation );
}
}
// Build one classifier instance per mention in this sentence.
for ( IdentifiedAnnotation identifiedAnnotation : entities ) {
// Debug-trace negated mentions only (polarity -1 == negated in cTAKES CONST).
if ( identifiedAnnotation.getPolarity() == -1 ) {
getLogger().debug( String.format( " - identified annotation: [%d-%d] polarity %d (%s)",
identifiedAnnotation.getBegin(),
identifiedAnnotation.getEnd(),
identifiedAnnotation.getPolarity(),
identifiedAnnotation.getClass().getName() ) );
}
Instance<String> instance = new Instance<>();
// Plain domain feature (only set when a mapping exists but no ff adaptor -- see above).
if ( domainFeature != null ) {
instance.add( new Feature( "Domain", domainFeature ) );
}
// // extract all features that require only the entity mention annotation
// instance.addAll(tokenFeatureExtractor.extract(jCas, entityMention));
// extract all features that require the token and sentence annotations
//Sentence sentence = sentenceList.iterator().next();
/*
if (sentence != null)
{
for (ContextExtractor<IdentifiedAnnotation> extractor : this.contextFeatureExtractors) {
instance.addAll(extractor.extractWithin(annotationView, entityMention, sentence));
}
} else
{
// TODO extract context features for annotations that don't fall within a sentence
LOGGER.log(Level.WARN, "FIXME/TODO: generate context features for entities that don't fall within a sentence");
}
*/
/*
for (ContextExtractor<BaseToken> extractor : this.tokenContextFeatureExtractors) {
instance.addAll(extractor.extract(annotationView, entityMention));
}
*/
// only use extract this version if not doing domain adaptation
// (when adapting, the featureFunctionExtractors loop below wraps these extractors instead
// -- NOTE(review): presumed; confirm against the extractor setup in initialize()).
if ( ffDomainAdaptor == null ) {
for ( CleartkExtractor<IdentifiedAnnotation, BaseToken> extractor : this.tokenCleartkExtractors ) {
// instance.addAll(extractor.extractWithin(annotationView, entityMention, sentence));
// if ( coveringSent != null ) {
instance.addAll( extractor
.extractWithin( annotationView, identifiedAnnotation, coveringSent ) );
// } else {
// instance.addAll( extractor.extract( annotationView, identifiedAnnotation ) );
// }
}
}
// Find the assertion cue phrase nearest to this mention, measured in intervening
// BaseTokens within the sentence.
int closest = Integer.MAX_VALUE;
AssertionCuePhraseAnnotation closestCue = null;
for ( AssertionCuePhraseAnnotation cue : cues ) {
// It is much faster to count between BaseTokens already isolated within the same sentence.
final int betweenCount = countBetween( cue, identifiedAnnotation, baseTokens );
if ( betweenCount < closest ) {
closestCue = cue;
closest = betweenCount;
}
// instance.addAll(cuePhraseInWindowExtractor.extractBetween(jCas, cue, entityOrEventMention));
}
// Only use the cue if it is within 20 tokens of the mention (21 = hard-coded window cutoff).
if ( closestCue != null && closest < 21 ) {
instance.add( new Feature( "ClosestCue_Word", closestCue.getCoveredText() ) );
// instance.add(new Feature("ClosestCue_Phrase", closestCue.getCuePhrase()));
instance.add( new Feature( "ClosestCue_PhraseFamily", closestCue.getCuePhraseAssertionFamily() ) );
instance.add( new Feature( "ClosestCue_PhraseCategory", closestCue.getCuePhraseCategory() ) );
// add hack-ey domain adaptation to these hacked-in features
if ( !fileToDomain.isEmpty() && ffDomainAdaptor != null ) {
instance.addAll( ffDomainAdaptor
.apply( new Feature( "ClosestCue_Word", closestCue.getCoveredText() ) ) );
instance.addAll( ffDomainAdaptor
.apply( new Feature( "ClosestCue_PhraseFamily", closestCue
.getCuePhraseAssertionFamily() ) ) );
instance.addAll( ffDomainAdaptor
.apply( new Feature( "ClosestCue_PhraseCategory", closestCue.getCuePhraseCategory() ) ) );
}
}
// }
// if (cuePhraseFeatures != null && !cuePhraseFeatures.isEmpty())
// {
// instance.addAll(cuePhraseFeatures);
// }
// 7/9/13 SRH trying to make it work just for anatomical site
int eemTypeId = identifiedAnnotation.getTypeID();
if ( eemTypeId == CONST.NE_TYPE_ID_ANATOMICAL_SITE ) {
// 7/9/13 srh modified per tmiller so it's binary but not numeric feature
//instance.add(new Feature("ENTITY_TYPE_" + entityOrEventMention.getTypeID()));
instance.add( new Feature( "ENTITY_TYPE_ANAT_SITE" ) );
// add hack-ey domain adaptation to these hacked-in features
if ( !fileToDomain.isEmpty() && ffDomainAdaptor != null ) {
instance.addAll( ffDomainAdaptor.apply( new Feature( "ENTITY_TYPE_ANAT_SITE" ) ) );
}
}
/* This hurts recall more than it helps precision
else if (eemTypeId == CONST.NE_TYPE_ID_DRUG) {
// 7/10 adding drug
instance.add(new Feature("ENTITY_TYPE_DRUG"));
}
*/
// only extract these features if not doing domain adaptation
if ( ffDomainAdaptor == null ) {
for ( FeatureExtractor1<IdentifiedAnnotation> extractor : this.entityFeatureExtractors ) {
instance.addAll( extractor.extract( jCas, identifiedAnnotation ) );
}
}
// Tree (parse-fragment) features are extracted regardless of domain adaptation.
for ( FeatureExtractor1<IdentifiedAnnotation> extractor : this.entityTreeExtractors ) {
instance.addAll( extractor.extract( jCas, identifiedAnnotation ) );
}
// List<Feature> zoneFeatures = extractZoneFeatures(coveringZoneMap, entityOrEventMention);
// if (zoneFeatures != null && !zoneFeatures.isEmpty())
// {
// instance.addAll(zoneFeatures);
// }
// Lower-case the values of string features IN PLACE, except tree-fragment / WORD / NEG
// features (by name prefix or infix), which are case-sensitive and skipped.
List<Feature> feats = instance.getFeatures();
// List<Feature> lcFeats = new ArrayList<Feature>();
for ( Feature feat : feats ) {
if ( feat instanceof TreeFeature ||
(feat.getName() != null && (feat.getName().startsWith( "TreeFrag" ) ||
feat.getName().startsWith( "WORD" ) ||
feat.getName().startsWith( "NEG" ))) ) {
continue;
}
if ( feat.getName() != null &&
(feat.getName().contains( "_TreeFrag" ) || feat.getName().contains( "_WORD" ) ||
feat.getName().contains( "_NEG" )) ) {
continue;
}
if ( feat.getValue() instanceof String ) {
feat.setValue( ((String)feat.getValue()).toLowerCase() );
}
}
// Domain-adapted feature functions over the mention itself (requires both a domain map
// and an adaptor).
if ( !fileToDomain.isEmpty() && ffDomainAdaptor != null ) {
for ( FeatureFunctionExtractor<IdentifiedAnnotation> extractor : this.featureFunctionExtractors ) {
// TODO: extend to the case where the extractors take a different argument besides entityOrEventMention
instance.addAll( extractor.extract( jCas, identifiedAnnotation ) );
}
}
// grab the output label
// NOTE(review): setClassLabel is defined elsewhere; presumably it sets the gold outcome
// when training and applies the classifier otherwise -- confirm in its implementation.
setClassLabel( identifiedAnnotation, instance );
if ( this.isTraining() ) {
// apply feature selection, if necessary
if ( this.featureSelection != null ) {
feats = this.featureSelection.transform( feats );
}
// ensures that the (possibly) transformed feats are used
if ( instance.getOutcome() != null ) {
// Randomly down-sample training data to the configured fraction.
if ( coin.nextDouble() < this.portionOfDataToUse ) {
this.dataWriter.write( new Instance<>( instance.getOutcome(), feats ) );
}
}
}
}
}
}