in ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/ThymeAnaforaCrossDocCorefXmlReader.java [125:357]
/**
 * Reads one Anafora cross-document coreference XML file and adds the annotations it describes
 * to the gold views of the patient CAS.
 *
 * <p>For every {@code <annotations>} element the method works in three passes:
 * <ol>
 *   <li>each {@code <entity>} of type {@code Markable} or {@code EVENT} becomes a
 *       {@link Markable} (plus an {@link EventMention} for events) in the gold view of the note
 *       named in the entity id;</li>
 *   <li>each {@code <relation>} of type {@code Identical} whose mentions all come from a single
 *       note becomes a sequence of binary {@link CoreferenceRelation}s plus one
 *       {@link CollectionTextRelation} chain; relations spanning several notes are queued;</li>
 *   <li>each queued cross-document relation is turned into one chain that absorbs the
 *       within-document chains of its members.</li>
 * </ol>
 * Malformed entities and relations are reported through {@code error(...)} / the logger and
 * skipped, so one bad annotation does not abort the whole file.
 *
 * @param patientJcas patient-level CAS whose views are searched for the gold view of each note
 * @param xmlFile     Anafora XML file to read
 * @param notes       note name (the third {@code @}-separated field of annotation ids) mapped
 *                    to the text of that note
 * @throws AnalysisEngineProcessException if the XML file cannot be read or parsed, or if the
 *                                        JCas of a cross-document chain cannot be obtained
 */
private void processXmlfile(JCas patientJcas, File xmlFile, Map<String,String> notes) throws AnalysisEngineProcessException {
  // load the XML
  Element dataElem;
  try {
    dataElem = new SAXBuilder().build(xmlFile.toURI().toURL()).getRootElement();
  } catch (JDOMException | IOException e) {
    // MalformedURLException is an IOException, so it is covered by this handler too.
    throw new AnalysisEngineProcessException(e);
  }

  // Find the gold view of every note. The view list is fetched once; when several gold views
  // match one note name, the last one in iteration order wins.
  Map<String, JCas> goldCases = new HashMap<>();
  for (JCas view : PatientViewUtil.getAllViews(patientJcas)) {
    String viewName = view.getViewName();
    if (!viewName.contains(PatientViewUtil.GOLD_PREFIX)) {
      continue;
    }
    for (String docName : notes.keySet()) {
      if (viewName.contains(docName)) {
        goldCases.put(docName, view);
      }
    }
  }

  for (Element annotationsElem : dataElem.getChildren("annotations")) {
    // keep track of entity ids as we read entities so that we can find them from the
    // relation annotations later:
    Map<String, Markable> idToMarkable = new HashMap<>();

    for (Element entityElem : annotationsElem.getChildren("entity")) {
      String id = removeSingleChildText(entityElem, "id", null);
      String entNoteName = noteNameOfAnaforaId(id); // e.g., ID001_clinic_001
      if (entNoteName == null) {
        error("Entity id does not contain a note name", id);
        continue;
      }
      String entNote = notes.get(entNoteName);
      JCas entCas = goldCases.get(entNoteName);
      if (entNote == null || entCas == null) {
        error("No note text or gold view for note " + entNoteName, id);
        continue;
      }
      int docLen = entNote.length();

      Element spanElem = removeSingleChild(entityElem, "span", id);
      String type = removeSingleChildText(entityElem, "type", id);
      // The properties of an entity are not used, but the child is still consumed like the
      // other children of the element.
      removeSingleChild(entityElem, "properties", id);
      if (spanElem == null) {
        // NOTE(review): assumes removeSingleChild can return null for a missing child.
        error("Entity without span", id);
        continue;
      }

      // UIMA doesn't support disjoint spans, so take the span enclosing everything
      int begin = Integer.MAX_VALUE;
      int end = Integer.MIN_VALUE;
      for (String spanString : spanElem.getText().split(";")) {
        String[] beginEndStrings = spanString.split(",");
        if (beginEndStrings.length != 2) {
          error("span not of the format 'number,number'", id);
          continue; // ignore this piece instead of indexing past the array
        }
        int spanBegin;
        int spanEnd;
        try {
          spanBegin = Integer.parseInt(beginEndStrings[0]);
          spanEnd = Integer.parseInt(beginEndStrings[1]);
        } catch (NumberFormatException e) {
          error("span not of the format 'number,number'", id);
          continue;
        }
        if (spanBegin < begin && spanBegin >= 0) {
          begin = spanBegin;
        }
        if (spanEnd > end && spanEnd <= docLen) {
          end = spanEnd;
        }
      }
      // begin > end also catches the case where no piece contributed a usable begin or end
      // and the corresponding sentinel value is still in place.
      if (begin > end || end < 0 || end > docLen) {
        error("Illegal begin or end boundary", id);
        continue;
      }

      boolean isEvent = "EVENT".equals(type);
      if (!isEvent && !"Markable".equals(type)) {
        LOGGER.warn(String.format("Skipping entity type %s because the handler hasn't been written.", type));
        continue;
      }

      end = trimTrailingLineBreaks(entNote, begin, end);
      if (isEvent) {
        EventMention event = new EventMention(entCas, begin, end);
        event.addToIndexes();
      }
      // Events take part in coreference chains through a Markable with the same span.
      Markable markable = new Markable(entCas, begin, end);
      markable.addToIndexes();
      idToMarkable.put(id, markable);
    }

    Map<Markable, CollectionTextRelation> markable2chain = new HashMap<>();
    List<List<Markable>> xDocLists = new ArrayList<>();

    for (Element relationElem : annotationsElem.getChildren("relation")) {
      String id = removeSingleChildText(relationElem, "id", null);
      // gold view of the note named in the relation id (null if the id or the note is unknown)
      JCas relCas = goldCases.get(noteNameOfAnaforaId(id));
      String type = removeSingleChildText(relationElem, "type", id);
      Element propertiesElem = removeSingleChild(relationElem, "properties", id);

      if (!"Identical".equals(type)) {
        LOGGER.warn(String.format("This script cannot process relations of type %s yet.", type));
        continue;
      }
      if (propertiesElem == null) {
        // NOTE(review): assumes removeSingleChild can return null for a missing child.
        error("Identical relation without properties", id);
        continue;
      }

      // Build list of Markables from FirstInstance and Coreferring_String annotations, and
      // remember which notes the mentions come from:
      Set<String> chainNotes = new HashSet<>();
      List<Markable> markables = new ArrayList<>();

      String mention = removeSingleChildText(propertiesElem, "FirstInstance", id);
      String mentionNote = noteNameOfAnaforaId(mention);
      if (mentionNote != null) {
        chainNotes.add(mentionNote);
      }
      Markable antecedent = idToMarkable.get(mention);
      if (antecedent != null) {
        markables.add(antecedent);
      } else {
        error("Null markable as FirstInstance", id);
      }

      for (Element coref : propertiesElem.getChildren("Coreferring_String")) {
        mention = coref.getText();
        mentionNote = noteNameOfAnaforaId(mention);
        if (mentionNote != null) {
          chainNotes.add(mentionNote);
        }
        Markable anaphor = idToMarkable.get(mention);
        if (anaphor != null) {
          markables.add(anaphor);
        } else {
          error("Null markable as Coreferring_String", id);
        }
      }
      propertiesElem.removeChildren("Coreferring_String");

      if (chainNotes.size() > 1) {
        // if this list of markables has more than one note reference in it, save the list of
        // markables for later
        if (markables.isEmpty()) {
          error("Cross-document chain without any known markable", id);
        } else {
          xDocLists.add(markables);
        }
        continue;
      }

      // this is a within-document coref chain, so build it and add to indexes.
      if (markables.size() < 2) {
        error("Coreference chain of length <= 1", id);
        continue;
      }
      if (relCas == null) {
        error("No gold view for the note of this relation", id);
        continue;
      }

      // Iterate over markable list creating binary coref relations between neighbours:
      for (int antInd = 0; antInd < markables.size() - 1; antInd++) {
        CoreferenceRelation pair = new CoreferenceRelation(relCas);
        pair.setCategory("Identity");
        RelationArgument arg1 = new RelationArgument(relCas);
        arg1.setArgument(markables.get(antInd));
        arg1.setRole("antecedent");
        pair.setArg1(arg1);
        RelationArgument arg2 = new RelationArgument(relCas);
        arg2.setArgument(markables.get(antInd + 1));
        arg2.setRole("anaphor");
        pair.setArg2(arg2);
        pair.addToIndexes();
      }

      // Create FSList from markable list and add to collection text relation:
      CollectionTextRelation chain = new CollectionTextRelation(relCas);
      FSList list = ListFactory.buildList(relCas, markables);
      list.addToIndexes();
      chain.setMembers(list);
      chain.addToIndexes();

      StringBuilder chainText = new StringBuilder("Creating new chain in thyme anafora reader: W/in doc chain ");
      for (Markable m : markables) {
        chainText.append(" -> ").append(m.getCoveredText());
        markable2chain.put(m, chain);
      }
      LOGGER.debug(chainText.toString());
    }

    // after processing all relations, go back to the queued cross-doc markable lists and use
    // them to create cross-doc chains
    for (List<Markable> mlist : xDocLists) {
      // the first markable is from the first document, we'll use its CAS as the home of the
      // chain (queued lists are never empty)
      JCas chainCas;
      try {
        chainCas = mlist.get(0).getCAS().getJCas();
      } catch (CASException e) {
        throw new AnalysisEngineProcessException(e);
      }

      List<Markable> ptMarkableList = new ArrayList<>();
      // guards against adding a markable twice, e.g. when two members of the cross-doc list
      // belong to the same within-document chain
      Set<Markable> seen = new HashSet<>();
      for (Markable child : mlist) {
        CollectionTextRelation inDocChain = markable2chain.get(child);
        if (inDocChain == null) {
          // this markable is not part of a within-document chain, so it is added by itself
          if (seen.add(child)) {
            ptMarkableList.add(child);
          }
          continue;
        }
        // dump the markables from this chain into the java list and remove the uima
        // structures for the chain and its list of markables:
        FSList members = inDocChain.getMembers();
        for (Markable member : JCasUtil.select(members, Markable.class)) {
          if (seen.add(member)) {
            ptMarkableList.add(member);
          }
        }
        members.removeFromIndexes();
        inDocChain.removeFromIndexes();
      }

      // put the big java list of markables into a coref chain structure and add to indexes:
      FSList newMembers = FSCollectionFactory.createFSList(chainCas, ptMarkableList);
      newMembers.addToIndexes();
      CollectionTextRelation ptChain = new CollectionTextRelation(chainCas);
      ptChain.setMembers(newMembers);
      ptChain.addToIndexes();
    }
  }
}

/**
 * Extracts the note name from an Anafora annotation id, i.e. the third {@code @}-separated
 * field of the id.
 *
 * @param id annotation id, may be {@code null}
 * @return the note name, or {@code null} if {@code id} is {@code null} or has fewer than three
 *         fields
 */
private static String noteNameOfAnaforaId(String id) {
  if (id == null) {
    return null;
  }
  String[] parts = id.split("@");
  return parts.length > 2 ? parts[2] : null;
}

/**
 * Moves the end of a span left past trailing {@code '\n'} and {@code '\r'} characters without
 * ever moving it before the beginning of the span.
 *
 * @param text  text the offsets refer to
 * @param begin inclusive begin offset of the span
 * @param end   exclusive end offset of the span, {@code begin <= end <= text.length()}
 * @return the adjusted end offset, never smaller than {@code begin}
 */
private static int trimTrailingLineBreaks(String text, int begin, int end) {
  int trimmedEnd = end;
  while (trimmedEnd > begin) {
    char last = text.charAt(trimmedEnd - 1);
    if (last != '\n' && last != '\r') {
      break;
    }
    trimmedEnd--;
  }
  return trimmedEnd;
}