in ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java [283:428]
/**
 * Visits every {@link Markable} covered by each {@link Segment} of {@code jCas} and compares it with the
 * candidate clusters returned by {@code getCandidateRelationArgumentPairs}.
 * <ul>
 * <li>Training: one instance is written per candidate whose category is known; the candidate loop stops at
 * the first candidate whose category is not {@code NO_RELATION_CATEGORY}.</li>
 * <li>Classification with {@code greedyFirst}: the mention is linked to the first candidate for which a
 * relation is predicted.</li>
 * <li>Classification without {@code greedyFirst}: the mention is linked to the candidate with the highest
 * score among those for which a relation is predicted.</li>
 * </ul>
 * A mention that ends up linked to no cluster becomes a new single-member cluster. Finally
 * {@code createEventClusters} is invoked.
 *
 * @param jCas           CAS holding the markables and segments being processed
 * @param prevCas        handed unchanged to {@code getCandidateRelationArgumentPairs}; presumably the CAS of a
 *                       previous document (TODO confirm against callers)
 * @param relationLookup lookup from (cluster, mention) pair to its relation, consulted during training only;
 *                       assumes that there will be at most one relation per pair
 * @throws AnalysisEngineProcessException if one of the calls made while processing the document raises it
 */
private void processDocument( final JCas jCas, final JCas prevCas, final Map<CollectionTextRelationIdentifiedAnnotationPair, CollectionTextRelationIdentifiedAnnotationRelation>
      relationLookup) throws AnalysisEngineProcessException {
  // Compute the nominal head node of every markable once and share it with the cache-aware extractors
  // and the pairers.
  final Map<Markable, ConllDependencyNode> depHeadMap = new HashMap<>();
  for ( Markable markable : JCasUtil.select( jCas, Markable.class ) ) {
    depHeadMap.put( markable, DependencyUtility.getNominalHeadNode( jCas, markable ) );
  }
  for ( RelationFeaturesExtractor<?, ?> featEx : this.relationExtractors ) {
    if ( featEx instanceof MarkableCacheRelationExtractor ) {
      ((MarkableCacheRelationExtractor)featEx).setCache( depHeadMap );
    }
  }
  for ( FeatureExtractor1<?> featEx : this.mentionExtractors ) {
    if ( featEx instanceof MarkableCacheRelationExtractor ) {
      ((MarkableCacheRelationExtractor)featEx).setCache( depHeadMap );
    }
  }
  this.resetPairers( jCas, depHeadMap );

  final Map<Segment, List<Markable>> segmentMarkables = JCasUtil.indexCovered( jCas, Segment.class, Markable.class );
  for ( Segment segment : JCasUtil.select( jCas, Segment.class ) ) {
    for ( Markable mention : segmentMarkables.get( segment ) ) {
      boolean singleton = true;
      // NOTE(review): with a 0.0 start, best-first decoding ignores candidates whose score is not
      // strictly positive -- confirm that this matches the score range of the classifier in use.
      double maxScore = 0.0;
      CollectionTextRelation maxCluster = null;
      final String mentionView = mention.getView().getViewName();

      for ( CollectionTextRelationIdentifiedAnnotationPair pair : this.getCandidateRelationArgumentPairs( jCas, mention, prevCas ) ) {
        final CollectionTextRelation cluster = pair.getCluster();
        // NOTE(review): assumes every candidate cluster has at least one Markable member.
        final Markable firstElement = JCasUtil.select( cluster.getMembers(), Markable.class ).iterator().next();
        final String clusterHeadView = firstElement.getView().getViewName();
        // The candidate is "cross-document" when its first member lives in a different view than the mention.
        final boolean crossDoc = !mentionView.equals( clusterHeadView );

        // apply all the feature extractors to extract the list of features
        final List<Feature> features = new ArrayList<>();
        for ( RelationFeaturesExtractor<CollectionTextRelation, IdentifiedAnnotation> extractor : this.relationExtractors ) {
          final List<Feature> feats = extractor.extract( jCas, cluster, mention );
          if ( feats != null ) {
            features.addAll( feats );
          }
        }
        for ( FeatureExtractor1<Markable> extractor : this.mentionExtractors ) {
          final List<Feature> feats = extractor.extract( jCas, mention );
          // Same null tolerance as for the relation extractors above.
          if ( feats != null ) {
            features.addAll( feats );
          }
        }

        // here is where feature conjunctions can go (dupFeatures)
        final List<Feature> dupFeatures = new ArrayList<>();
        if ( crossDoc ) {
          // "IsCrossDoc" is added first on purpose: it is duplicated together with the other features,
          // which keeps the emitted feature names unchanged.
          features.add( new Feature( "IsCrossDoc", true ) );
          for ( Feature feature : features ) {
            dupFeatures.add( new Feature( "CrossDoc_" + feature.getName(), feature.getValue() ) );
          }
        }
        features.addAll( dupFeatures );

        // sanity check on feature values
        for ( Feature feature : features ) {
          if ( feature.getValue() == null ) {
            feature.setValue( "NULL" );
            LOGGER.warn( String.format( "Null value found in %s from %s", feature, features ) );
          }
        }

        if ( this.isTraining() ) {
          // during training, feed the features to the data writer
          final String category = this.getRelationCategory( relationLookup, cluster, mention );
          if ( category == null ) {
            continue;
          }
          // create a classification instance and write it to the training data
          this.dataWriter.write( new Instance<>( category, features ) );
          if ( !category.equals( NO_RELATION_CATEGORY ) ) {
            if ( crossDoc ) {
              LOGGER.info("Writing positive instance linking mention [" + mention.getCoveredText() + "] to cluster with elements from previous document");
            }
            singleton = false;
            break;
          }
        } else {
          // during classification feed the features to the classifier and create annotations
          if ( crossDoc ) {
            LOGGER.info("Comparing new mention to cluster with elements from previous document");
          }
          final String predictedCategory = this.classify( features );
          // add a relation annotation if a true relation was predicted
          if ( !predictedCategory.equals( NO_RELATION_CATEGORY ) ) {
            // Scores are only needed once a relation has been predicted.
            final Map<String, Double> scores = this.classifier.score( features );
            final Double score = scores.get( predictedCategory );
            if ( greedyFirst ) {
              createRelation( jCas, cluster, mention, predictedCategory, score );
              singleton = false;
              if ( crossDoc ) {
                LOGGER.info("Linking new mention to cluster with elements from previous document");
              }
              // break here for "closest-first" greedy decoding strategy (Soon et al., 2001), terminology from Lasalle and Denis (2013),
              // for "best first" need to keep track of all relations with scores and only keep the highest
              break;
            }
            if ( score != null && score > maxScore ) {
              maxScore = score;
              maxCluster = cluster;
            }
          }
        }
      }

      if ( !this.isTraining() && !greedyFirst && maxCluster != null ) {
        // make a link with the max cluster
        createRelation( jCas, maxCluster, mention, "CoreferenceClusterMember", maxScore );
        // The mention now belongs to maxCluster, so it must not also be turned into its own cluster below.
        singleton = false;
      }

      // if we got this far and never matched up the markable then add it to list.
      // do this even during training -- adds non-chain markables to antecedent list which will be seen during testing.
      if ( singleton ) {
        // make the markable its own cluster:
        final CollectionTextRelation chain = new CollectionTextRelation( jCas );
        chain.setCategory( "Identity" );
        final NonEmptyFSList list = new NonEmptyFSList( jCas );
        list.setHead( mention );
        list.setTail( new EmptyFSList( jCas ) );
        chain.setMembers( list );
        chain.addToIndexes();
        list.addToIndexes();
        list.getTail().addToIndexes();
      }
    }
  }

  createEventClusters( jCas );
}