in ctakes-coreference/src/main/java/org/apache/ctakes/coreference/cc/ODIEVectorFileWriter.java [187:366]
/**
 * Emits training instances for every anaphor/antecedent candidate pair of one document.
 *
 * <p>For each {@code MarkablePairSet} indexed in the CAS, every antecedent in its list is
 * labeled with {@code labeler.isGoldPair(anaphor, antecedent)}; the label stored on the
 * {@code BooleanLabeledFS} itself is not used. The per-type positive/negative counters
 * are updated for every pair. When {@code printVectors} is set, one libsvm-style line
 * ({@code label index:value ...}) is written per pair to
 * {@code <outputDir>/<type>/vectors/<docId>.libsvm}. When {@code printTrees} is set, the
 * path tree between the two markables is written to the tree writer of the anaphor's type.
 *
 * <p>Does nothing if this writer has not been initialized. If the per-document vector
 * files cannot be opened, the failure is reported on stderr and vector output is skipped
 * for this document only; counting and tree output still run.
 *
 * @param jcas the CAS of the document to process
 */
public void process(JCas jcas) {
    if (!initialized) {
        return;
    }

    // Resolve the document id. The null check has to come before the id is dereferenced,
    // otherwise the fallback id below can never take effect.
    String docId = DocIdUtil.getDocumentID( jcas );
    if (docId == null) {
        docId = "141471681_1";
    } else {
        // Keep only the last path segment; lastIndexOf returns -1 when there is no '/',
        // in which case the whole id is kept.
        docId = docId.substring(docId.lastIndexOf('/') + 1);
    }
    System.out.println("creating vectors for "+docId);

    // The labeler is built from the gold standard directory, the document id and the
    // document's markables.
    FSIterator markIter = jcas.getAnnotationIndex(Markable.type).iterator();
    LinkedList<Annotation> lm = FSIteratorToList.convert(markIter);
    labeler = new GoldStandardLabeler(goldStandardDir, docId, lm);

    // Open the three per-document vector files. The writer fields are only updated once
    // all three files are open, so a partial failure cannot leave them half-assigned.
    boolean writeVectors = printVectors;
    if (writeVectors) {
        PrintWriter ne = null;
        PrintWriter dem = null;
        try {
            ne = new PrintWriter(outputDir + "/" + CorefConsts.NE + "/vectors/" + docId + ".libsvm");
            dem = new PrintWriter(outputDir + "/" + CorefConsts.DEM + "/vectors/" + docId + ".libsvm");
            PrintWriter pron = new PrintWriter(outputDir + "/" + CorefConsts.PRON + "/vectors/" + docId + ".libsvm");
            neOut = ne;
            demOut = dem;
            pronOut = pron;
        } catch (FileNotFoundException e) {
            System.err.println("Could not open vector output files for " + docId
                    + "; vector output is skipped for this document.");
            e.printStackTrace();
            // Release whatever was opened before the failure.
            if (ne != null) {
                ne.close();
            }
            if (dem != null) {
                dem.close();
            }
            writeVectors = false;
        }
    }

    try {
        FSIterator iter = jcas.getJFSIndexRepository().getAllIndexedFS(MarkablePairSet.type);
        while (iter.hasNext()) {
            MarkablePairSet pair = (MarkablePairSet) iter.next();
            Markable anaphor = pair.getAnaphor();

            // The anaphor's class decides which counters and which output files are used;
            // anything that is neither a named-entity nor a demonstrative markable is
            // treated as a pronoun.
            boolean isNe = anaphor instanceof NEMarkable;
            boolean isDem = !isNe && anaphor instanceof DemMarkable;

            PrintWriter vectorWriter = null;
            if (writeVectors) {
                vectorWriter = isNe ? neOut : (isDem ? demOut : pronOut);
            }
            PrintWriter treeWriter = null;
            if (printTrees) {
                treeWriter = isNe ? neTreeOut : (isDem ? demTreeOut : pronTreeOut);
            }

            // Walk the linked list of candidate antecedents for this anaphor.
            FSList pairList = pair.getAntecedentList();
            while (pairList instanceof NonEmptyFSList) {
                NonEmptyFSList node = (NonEmptyFSList) pairList;
                BooleanLabeledFS labeledProb = (BooleanLabeledFS) node.getHead();
                Markable antecedent = (Markable) labeledProb.getFeature();

                boolean positive = labeler.isGoldPair(anaphor, antecedent);
                if (isNe) {
                    if (positive) {
                        posNeInst++;
                    } else {
                        negNeInst++;
                    }
                } else if (isDem) {
                    if (positive) {
                        posDemInst++;
                    } else {
                        negDemInst++;
                    }
                } else {
                    if (positive) {
                        posPronInst++;
                    } else {
                        negPronInst++;
                    }
                }

                if (writeVectors) {
                    svm_node[] nodes = vecCreator.getNodeFeatures(anaphor, antecedent, jcas);
                    // One line per instance: "<label> <index>:<value> ..."
                    vectorWriter.print(positive ? 1 : 0);
                    for (svm_node inst : nodes) {
                        vectorWriter.print(" ");
                        vectorWriter.print(inst.index);
                        vectorWriter.print(":");
                        vectorWriter.print(inst.value);
                    }
                    vectorWriter.println();
                    vectorWriter.flush();
                }

                if (printTrees) {
                    TreebankNode antecedentNode = MarkableTreeUtils.markableNode(jcas, antecedent.getBegin(), antecedent.getEnd());
                    TreebankNode anaphorNode = MarkableTreeUtils.markableNode(jcas, anaphor.getBegin(), anaphor.getEnd());
                    debug.println(TreeUtils.tree2str(antecedentNode));
                    debug.println(TreeUtils.tree2str(anaphorNode));
                    SimpleTree pathTree = TreeExtractor.extractPathTree(antecedentNode, anaphorNode);
                    // NOTE(review): the path-enclosed tree is not used below. The call is
                    // kept because it receives the JCas and may have side effects that are
                    // not visible from here -- confirm before removing.
                    TreeExtractor.extractPathEnclosedTree(antecedentNode, anaphorNode, jcas);
                    String treeStr = pathTree.toString();
                    // One line per instance: "+1"/"-1", then the tree between |BT| and
                    // |ET| with the space between adjacent sibling brackets removed.
                    treeWriter.print(positive ? "+1" : "-1");
                    treeWriter.print(" |BT| ");
                    treeWriter.print(treeStr.replaceAll("\\) \\(", ")("));
                    treeWriter.println(" |ET|");
                }

                pairList = node.getTail();
                // NOTE: If this is in place, then we will only output negative examples backwards until we reach
                // the actual coreferent entity. This may have the effect of suggesting that further away markables
                // are _more_ likely to be coreferent, which is an assumption that probably does not hold up in the
                // test set configuration. Try commenting this feature out to see if it makes the feature more useful.
                // if(label == 1) break;
            }
        }
    } finally {
        // Close the per-document vector files even if pair processing fails part-way.
        if (writeVectors) {
            neOut.close();
            demOut.close();
            pronOut.close();
        }
    }
}