private void processDocument()

in ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java [283:428]


  /**
   * Runs mention-to-cluster coreference over every {@link Markable} covered by a {@link Segment} of the document.
   * <p>
   * Steps, in order:
   * <ol>
   *   <li>Map each markable to the node returned by {@code DependencyUtility.getNominalHeadNode} and hand that
   *       map to every extractor implementing {@link MarkableCacheRelationExtractor} and to {@code resetPairers}.</li>
   *   <li>For each markable, walk the candidate (cluster, mention) pairs and build a feature list from the
   *       relation extractors and the mention extractors. When the view name of the mention differs from the
   *       view name of the cluster's first member, an {@code IsCrossDoc} feature is added and every feature
   *       collected so far (including {@code IsCrossDoc}) is duplicated with a {@code CrossDoc_} name prefix.</li>
   *   <li>Training: write one instance per pair whose category is not {@code null}; stop at the first pair whose
   *       category is not {@code NO_RELATION_CATEGORY}.</li>
   *   <li>Classification: with {@code greedyFirst}, link to the first cluster predicted as related and stop;
   *       otherwise remember the related cluster with the highest score (strictly greater than 0.0) and link
   *       to it after all pairs have been examined.</li>
   *   <li>A markable that was not linked to any cluster becomes its own one-element cluster with category
   *       {@code "Identity"}.</li>
   * </ol>
   * Finally {@code createEventClusters} is invoked on the document.
   *
   * @param jCas           view holding the markables and segments being processed
   * @param prevCas        passed through to {@code getCandidateRelationArgumentPairs}
   * @param relationLookup lookup from a (cluster, mention) pair to its relation, used to obtain the training
   *                       category; assumes at most one relation per pair
   * @throws AnalysisEngineProcessException declared for the delegated extraction, writing and classification calls
   */
  private void processDocument( final JCas jCas, final JCas prevCas, final Map<CollectionTextRelationIdentifiedAnnotationPair, CollectionTextRelationIdentifiedAnnotationRelation>
          relationLookup) throws AnalysisEngineProcessException {
    // Cache the nominal head node of every markable once, then share it with the cache-aware extractors.
    final Map<Markable,ConllDependencyNode> depHeadMap = new HashMap<>();
    for ( Markable markable : JCasUtil.select( jCas, Markable.class ) ) {
      depHeadMap.put( markable, DependencyUtility.getNominalHeadNode( jCas, markable ) );
    }
    for ( RelationFeaturesExtractor<?, ?> featEx : this.relationExtractors ) {
      if ( featEx instanceof MarkableCacheRelationExtractor ) {
        ((MarkableCacheRelationExtractor)featEx).setCache( depHeadMap );
      }
    }
    for ( FeatureExtractor1<?> featEx : this.mentionExtractors ) {
      if ( featEx instanceof MarkableCacheRelationExtractor ) {
        ((MarkableCacheRelationExtractor)featEx).setCache( depHeadMap );
      }
    }
    this.resetPairers( jCas, depHeadMap );

    final Map<Segment, List<Markable>> segmentMarkables = JCasUtil.indexCovered( jCas, Segment.class, Markable.class );
    for ( Segment segment : JCasUtil.select( jCas, Segment.class ) ) {
      for ( Markable mention : segmentMarkables.get( segment ) ) {
        boolean singleton = true;
        double maxScore = 0.0;
        CollectionTextRelation maxCluster = null;
        final String mentionView = mention.getView().getViewName();

        for ( CollectionTextRelationIdentifiedAnnotationPair pair : this.getCandidateRelationArgumentPairs( jCas, mention, prevCas ) ) {
          final CollectionTextRelation cluster = pair.getCluster();
          final Markable firstElement = JCasUtil.select( cluster.getMembers(), Markable.class ).iterator().next();
          final String clusterHeadView = firstElement.getView().getViewName();
          final boolean crossDoc = !mentionView.equals( clusterHeadView );

          // apply all the feature extractors to extract the list of features
          final List<Feature> features = new ArrayList<>();
          for ( RelationFeaturesExtractor<CollectionTextRelation, IdentifiedAnnotation> extractor : this.relationExtractors ) {
            final List<Feature> feats = extractor.extract( jCas, cluster, mention );
            if ( feats != null ) {
              features.addAll( feats );
            }
          }
          for ( FeatureExtractor1<Markable> extractor : this.mentionExtractors ) {
            features.addAll( extractor.extract( jCas, mention ) );
          }

          // here is where feature conjunctions can go (dupFeatures)
          final List<Feature> dupFeatures = new ArrayList<>();
          if ( crossDoc ) {
            // IsCrossDoc is added before duplication, so it receives a CrossDoc_ copy as well.
            features.add( new Feature( "IsCrossDoc", true ) );
            for ( Feature feature : features ) {
              dupFeatures.add( new Feature( "CrossDoc_" + feature.getName(), feature.getValue() ) );
            }
          }
          features.addAll( dupFeatures );

          // sanity check on feature values: replace null values and report them
          for ( Feature feature : features ) {
            if ( feature.getValue() == null ) {
              feature.setValue( "NULL" );
              LOGGER.warn( String.format( "Null value found in %s from %s", feature, features ) );
            }
          }

          if ( this.isTraining() ) {
            // during training, feed the features to the data writer
            final String category = this.getRelationCategory( relationLookup, cluster, mention );
            if ( category == null ) {
              continue;
            }

            // create a classification instance and write it to the training data
            this.dataWriter.write( new Instance<>( category, features ) );
            if ( !category.equals( NO_RELATION_CATEGORY ) ) {
              if ( crossDoc ) {
                LOGGER.info("Writing positive instance linking mention [" + mention.getCoveredText() + "] to cluster with elements from previous document");
              }
              singleton = false;
              break;
            }
          } else {
            // during classification feed the features to the classifier and create annotations
            if ( crossDoc ) {
              LOGGER.info("Comparing new mention to cluster with elements from previous document");
            }
            final String predictedCategory = this.classify( features );
            if ( predictedCategory.equals( NO_RELATION_CATEGORY ) ) {
              // scores are only needed for a predicted relation, so skip scoring here
              continue;
            }

            // TODO look at scores in classifier and try best-pair rather than first-pair?
            final Map<String, Double> scores = this.classifier.score( features );
            final Double score = scores.get( predictedCategory );
            if ( greedyFirst ) {
              createRelation( jCas, cluster, mention, predictedCategory, score );
              singleton = false;
              if ( crossDoc ) {
                LOGGER.info("Linking new mention to cluster with elements from previous document");
              }
              // break here for "closest-first" greedy decoding strategy (Soon et al., 2001), terminology from Lasalle and Denis (2013),
              // for "best first" need to keep track of all relations with scores and only keep the highest
              break;
            }
            if ( score > maxScore ) {
              maxScore = score;
              maxCluster = cluster;
            }
          }
        }

        if ( !this.isTraining() && !greedyFirst && maxCluster != null ) {
          // best-first decoding: make a link with the max cluster
          createRelation( jCas, maxCluster, mention, "CoreferenceClusterMember", maxScore );
          // the mention now belongs to maxCluster, so it must not also become its own singleton cluster
          singleton = false;
        }

        // if we got this far and never matched up the markable then add it to list.
        // do this even during training -- adds non-chain markables to antecedent list which will be seen during testing.
        if ( singleton ) {
          // make the markable its own cluster:
          final CollectionTextRelation chain = new CollectionTextRelation( jCas );
          chain.setCategory( "Identity" );
          final NonEmptyFSList list = new NonEmptyFSList( jCas );
          list.setHead( mention );
          list.setTail( new EmptyFSList( jCas ) );
          chain.setMembers( list );
          chain.addToIndexes();
          list.addToIndexes();
          list.getTail().addToIndexes();
        }
      }
    }
    createEventClusters( jCas );
  }