private void processXmlfile()

in ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/ThymeAnaforaCrossDocCorefXmlReader.java [125:357]


    /**
     * Reads one Anafora XML file and adds its gold coreference annotations to the gold views of a patient CAS.
     *
     * <p>For every {@code annotations} element the method:
     * <ol>
     *   <li>creates a {@link Markable} (and, for {@code EVENT} entities, also an {@link EventMention}) in the gold
     *       view of the note named in the entity id, using the smallest span enclosing all listed sub-spans;</li>
     *   <li>for every {@code Identical} relation whose mentions all come from one note, creates pairwise
     *       {@link CoreferenceRelation}s between consecutive mentions and a {@link CollectionTextRelation} chain;</li>
     *   <li>for every {@code Identical} relation whose mentions span several notes, builds one chain in the CAS of
     *       its first markable, absorbing (and un-indexing) the within-document chains of its members.</li>
     * </ol>
     * Entities and relations that cannot be resolved (malformed id or span, unknown note, missing gold view) are
     * reported and skipped instead of aborting the whole file.
     *
     * @param patientJcas the patient-level CAS whose views are searched for per-note gold views
     * @param xmlFile     the Anafora XML file to read
     * @param notes       map from note name (the third {@code @}-separated field of annotation ids) to note text
     * @throws AnalysisEngineProcessException if the XML cannot be loaded or a markable's JCas cannot be obtained
     */
    private void processXmlfile(JCas patientJcas, File xmlFile, Map<String,String> notes) throws AnalysisEngineProcessException {
        // load the XML
        Element dataElem;
        try {
            dataElem = new SAXBuilder().build(xmlFile.toURI().toURL()).getRootElement();
        } catch (JDOMException | IOException e) {
            // MalformedURLException is an IOException, so it is covered here as well.
            throw new AnalysisEngineProcessException(e);
        }

        // Map every note name to its gold view; when several views match, the last one wins.
        Map<String, JCas> goldCases = new HashMap<>();
        for (String docName : notes.keySet()) {
            for (JCas goldView : PatientViewUtil.getAllViews(patientJcas)) {
                String viewName = goldView.getViewName();
                if (viewName.contains(docName) && viewName.contains(PatientViewUtil.GOLD_PREFIX)) {
                    goldCases.put(docName, goldView);
                }
            }
        }

        for (Element annotationsElem : dataElem.getChildren("annotations")) {
            // keep track of entity ids as we read entities so that we can find them from the map annotations later:
            Map<String, Annotation> idToAnnotation = Maps.newHashMap();

            for (Element entityElem : annotationsElem.getChildren("entity")) {
                String id = removeSingleChildText(entityElem, "id", null);
                String entNoteName = noteNameFromAnaforaId(id);  // which note is this entity in: e.g., ID001_clinic_001
                String entNote = entNoteName == null ? null : notes.get(entNoteName);
                JCas entCas = entNoteName == null ? null : goldCases.get(entNoteName);
                if (entNote == null || entCas == null) {
                    LOGGER.warn(String.format("Skipping entity %s because its note text or gold view could not be found.", id));
                    continue;
                }
                Element spanElem = removeSingleChild(entityElem, "span", id);
                String type = removeSingleChildText(entityElem, "type", id);
                // The entity's properties are not used here; the call is kept for whatever side effects
                // removeSingleChild has (it is defined outside this method).
                removeSingleChild(entityElem, "properties", id);

                // UIMA doesn't support disjoint spans, so take the span enclosing
                // everything
                int[] span = enclosingSpan(spanElem.getText(), entNote.length(), id);
                if (span == null) {
                    continue;
                }

                boolean isEvent = "EVENT".equals(type);
                if (!isEvent && !"Markable".equals(type)) {
                    LOGGER.warn(String.format("Skipping entity type %s because the handler hasn't been written.", type));
                    continue;
                }

                int begin = span[0];
                int end = trimTrailingLineBreaks(entNote, begin, span[1]);
                if (isEvent) {
                    EventMention event = new EventMention(entCas, begin, end);
                    event.addToIndexes();
                }
                // NOTE(review): an earlier comment said the markable should live in the document CAS because
                // dependency parses are needed, but the code has always created it in the gold view - confirm.
                Markable markable = new Markable(entCas, begin, end);
                markable.addToIndexes();
                idToAnnotation.put(id, markable);
            }

            Map<Markable, CollectionTextRelation> markable2chain = new HashMap<>();
            List<List<Markable>> xDocLists = new ArrayList<>();
            for (Element relationElem : annotationsElem.getChildren("relation")) {
                String id = removeSingleChildText(relationElem, "id", null);
                String relNoteName = noteNameFromAnaforaId(id);  // which note is this relation in
                JCas relCas = relNoteName == null ? null : goldCases.get(relNoteName);
                String type = removeSingleChildText(relationElem, "type", id);
                Element propertiesElem = removeSingleChild(relationElem, "properties", id);

                if (!"Identical".equals(type)) {
                    LOGGER.warn(String.format("This script cannot process relations of type %s yet.", type));
                    continue;
                }

                // Build list of Markables from FirstInstance and Coreferring_String annotations:
                Set<String> chainNotes = new HashSet<>();
                List<Markable> markables = new ArrayList<>();
                String firstInstance = removeSingleChildText(propertiesElem, "FirstInstance", id);
                addChainMention(firstInstance, idToAnnotation, chainNotes, markables, "Null markable as FirstInstance", id);
                for (Element coref : propertiesElem.getChildren("Coreferring_String")) {
                    addChainMention(coref.getText(), idToAnnotation, chainNotes, markables, "Null markable as Coreferring_String", id);
                }

                if (chainNotes.size() > 1) {
                    // if this list of markables has more than one note reference in it, save the list of markables for later
                    xDocLists.add(markables);
                } else if (relCas == null) {
                    error("No gold view found for the note of this relation", id);
                } else {
                    // this is a within-document coref chain, so build it and add to indexes.
                    buildWithinDocChain(relCas, markables, markable2chain, id);
                }
                propertiesElem.removeChildren("Coreferring_String");
            }

            // after processing all relations, go back to the queued cross-doc markable lists and use them to create cross-doc chains
            for (List<Markable> mlist : xDocLists) {
                if (mlist.isEmpty()) {
                    // None of the mentions resolved to a markable, so there is no CAS to build the chain in.
                    LOGGER.warn("Skipping cross-document chain because none of its mentions resolved to a markable.");
                    continue;
                }
                // the first markable is from the first document, we'll use that as the basis for the chain
                // and then add the markables from the subsequent doc chains to the end.
                JCas chainCas;
                try {
                    chainCas = mlist.get(0).getCAS().getJCas();
                } catch (CASException e) {
                    throw new AnalysisEngineProcessException(e);
                }

                List<Markable> ptMarkableList = new ArrayList<>();
                // Several mentions of this list may belong to the same within-document chain; absorb each
                // chain only once so the merged chain does not contain duplicated members.
                Set<CollectionTextRelation> mergedChains = new HashSet<>();
                for (Markable child : mlist) {
                    CollectionTextRelation inDocChain = markable2chain.get(child);
                    if (inDocChain == null) {
                        // this markable wasn't in a coref chain in its own document, so add the markable
                        // itself to the list of all markables in this chain.
                        ptMarkableList.add(child);
                    } else if (mergedChains.add(inDocChain)) {
                        // dump the markables from this chain into a java list and remove the
                        // uima structures for the chain and its list of markables:
                        FSList members = inDocChain.getMembers();
                        ptMarkableList.addAll(JCasUtil.select(members, Markable.class));
                        members.removeFromIndexes();
                        inDocChain.removeFromIndexes();
                    }
                }
                // put the big java list of markables into a coref chain structure and add to indexes:
                FSList newMembers = FSCollectionFactory.createFSList(chainCas, ptMarkableList);
                newMembers.addToIndexes();
                CollectionTextRelation ptChain = new CollectionTextRelation(chainCas);
                ptChain.setMembers(newMembers);
                ptChain.addToIndexes();
            }
        }
    }

    /**
     * Extracts the note name, i.e. the third {@code @}-separated field, from an annotation id.
     *
     * @return the note name, or {@code null} if {@code id} is {@code null} or has fewer than three fields
     */
    private static String noteNameFromAnaforaId(String id) {
        if (id == null) {
            return null;
        }
        String[] parts = id.split("@");
        return parts.length > 2 ? parts[2] : null;
    }

    /**
     * Computes the smallest span enclosing all {@code begin,end} pairs of a {@code ;}-separated span string.
     * Begin offsets below zero and end offsets beyond {@code docLen} are ignored when computing the bounds.
     *
     * @return a two-element array {@code {begin, end}}, or {@code null} (after reporting an error for
     *         {@code id}) if a pair is malformed or no valid enclosing span exists
     */
    private int[] enclosingSpan(String spanText, int docLen, String id) {
        int begin = Integer.MAX_VALUE;
        int end = Integer.MIN_VALUE;
        for (String spanString : spanText.split(";")) {
            String[] beginEndStrings = spanString.split(",");
            if (beginEndStrings.length != 2) {
                error("span not of the format 'number,number'", id);
                return null;
            }
            int spanBegin;
            int spanEnd;
            try {
                spanBegin = Integer.parseInt(beginEndStrings[0]);
                spanEnd = Integer.parseInt(beginEndStrings[1]);
            } catch (NumberFormatException e) {
                error("span not of the format 'number,number'", id);
                return null;
            }
            if (spanBegin < begin && spanBegin >= 0) {
                begin = spanBegin;
            }
            if (spanEnd > end && spanEnd <= docLen) {
                end = spanEnd;
            }
        }
        // begin is either a valid offset or still MAX_VALUE, and end is either a valid offset or still
        // MIN_VALUE, so this single comparison also catches the case where no valid bound was found.
        if (begin > end) {
            error("Illegal begin or end boundary", id);
            return null;
        }
        return new int[] {begin, end};
    }

    /**
     * Moves {@code end} left past trailing {@code '\n'} and {@code '\r'} characters of {@code text},
     * never going below {@code begin}.
     *
     * @return the adjusted end offset
     */
    private static int trimTrailingLineBreaks(String text, int begin, int end) {
        while (end > begin && (text.charAt(end - 1) == '\n' || text.charAt(end - 1) == '\r')) {
            end--;
        }
        return end;
    }

    /**
     * Records one chain mention: adds the note named in the mention id to {@code chainNotes} and the markable
     * registered under that id to {@code markables}; reports {@code errorMessage} for relation {@code id} if
     * no markable is registered under the mention id.
     */
    private void addChainMention(String mention, Map<String, Annotation> idToAnnotation, Set<String> chainNotes,
                                 List<Markable> markables, String errorMessage, String id) {
        String mentionNote = noteNameFromAnaforaId(mention);
        if (mentionNote != null) {
            chainNotes.add(mentionNote);
        }
        Annotation chainElement = mention == null ? null : idToAnnotation.get(mention);
        if (chainElement instanceof Markable) {
            markables.add((Markable) chainElement);
        } else {
            error(errorMessage, id);
        }
    }

    /**
     * Creates the within-document structures for one chain in {@code relCas}: a {@link CoreferenceRelation}
     * between each pair of consecutive markables and a {@link CollectionTextRelation} holding all of them.
     * Every markable of the chain is registered in {@code markable2chain}. Chains with fewer than two
     * markables are reported as an error and produce nothing.
     */
    private void buildWithinDocChain(JCas relCas, List<Markable> markables,
                                     Map<Markable, CollectionTextRelation> markable2chain, String id) {
        if (markables.size() < 2) {
            error("Coreference chain of length <= 1", id);
            return;
        }
        // create set of binary relations from consecutive chain elements:
        for (int antInd = 0; antInd < markables.size() - 1; antInd++) {
            CoreferenceRelation pair = new CoreferenceRelation(relCas);
            pair.setCategory("Identity");
            RelationArgument arg1 = new RelationArgument(relCas);
            arg1.setArgument(markables.get(antInd));
            arg1.setRole("antecedent");
            pair.setArg1(arg1);
            RelationArgument arg2 = new RelationArgument(relCas);
            arg2.setArgument(markables.get(antInd + 1));
            arg2.setRole("anaphor");
            pair.setArg2(arg2);
            pair.addToIndexes();
        }
        // Create FSList from markable list and add to collection text relation:
        CollectionTextRelation chain = new CollectionTextRelation(relCas);
        FSList list = ListFactory.buildList(relCas, markables);
        list.addToIndexes();
        chain.setMembers(list);
        chain.addToIndexes();
        StringBuilder chainText = new StringBuilder("Creating new chain in thyme anafora reader: W/in doc chain ");
        for (Markable m : markables) {
            chainText.append(" -> ").append(m.getCoveredText());
            markable2chain.put(m, chain);
        }
        LOGGER.info(chainText.toString());
    }