public void nodesToText()

in wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java [94:187]


    /**
     * Renders the given parsed wiki nodes as plain text into {@code buffer} and records
     * character-offset annotations for the links, paragraphs and headings encountered.
     *
     * <p>All offsets are read from {@code CountingAppendable.currentPosition}. If
     * {@code buffer} is not already a {@code CountingAppendable} it is wrapped in a new one,
     * so offsets then start from that wrapper's initial position.
     *
     * <p>Handling per node type:
     * <ul>
     *   <li>{@code WPATag}: the tag body is rendered; when both the wikilink title and target
     *       attributes are present and the target does not start with {@code "#"}, an
     *       annotation spanning the rendered body is added to {@code wikilinks}.</li>
     *   <li>{@code ContentToken}: its content is appended verbatim.</li>
     *   <li>{@code List}, {@code WPList}, {@code WPTable}: skipped entirely.</li>
     *   <li>Any other {@code TagNode}: {@code ref} tags are dropped, image nodes are delegated
     *       to {@code imageNodeToText}, everything else is rendered recursively. Afterwards a
     *       paragraph, heading or link annotation is recorded depending on the tag name.</li>
     * </ul>
     *
     * @param nodes  the nodes to render; {@code null} or empty renders nothing
     * @param buffer the sink receiving the plain text
     * @param model  the wiki model, used here to track the recursion depth
     * @throws IOException if appending to the buffer fails
     */
    public void nodesToText(List<?> nodes, Appendable buffer, IWikiModel model) throws IOException {
        // Reuse an existing counting buffer so offsets stay consistent across recursive calls.
        CountingAppendable countingBuffer = buffer instanceof CountingAppendable counting
                ? counting
                : new CountingAppendable(buffer);

        if (nodes == null || nodes.isEmpty()) {
            return;
        }

        try {
            int level = model.incrementRecursionLevel();
            if (level > Configuration.RENDERER_RECURSION_LIMIT) {
                // The finally block below still restores the recursion level on this early exit.
                countingBuffer.append("Error - recursion limit exceeded"
                        + " rendering tags in PlainTextConverter#nodesToText().");
                return;
            }
            for (Object node : nodes) {
                if (node instanceof WPATag tag) {
                    // extract wikilink annotations
                    String wikilinkLabel = tag.getAttributes().get(WIKILINK_TITLE_ATTR_KEY);
                    String wikilinkTarget = tag.getAttributes().get(WIKILINK_TARGET_ATTR_KEY);

                    // The link text is always rendered, whether or not it gets annotated.
                    int start = countingBuffer.currentPosition;
                    tag.getBodyString(countingBuffer);
                    int end = countingBuffer.currentPosition;

                    // Only annotate links that carry both a label and a target; a missing
                    // target previously caused a NullPointerException here. Targets starting
                    // with '#' are not annotated.
                    if (wikilinkLabel != null
                            && wikilinkTarget != null
                            && !wikilinkTarget.startsWith("#")) {
                        // TODO: wikilink label is not important, since that is the covered text?
                        wikilinks.add(new Annotation(start, end, wikilinkLabel, wikilinkTarget));
                    }
                } else if (node instanceof ContentToken contentToken) {
                    countingBuffer.append(contentToken.getContent());
                } else if (node instanceof List
                        || node instanceof WPList
                        || node instanceof WPTable) {
                    // ignore lists and tables since they most of the time
                    // do not hold grammatically correct
                    // interesting sentences that are representative of the
                    // language.
                    // NOTE: this check must stay ahead of the generic TagNode branch below,
                    // otherwise such nodes could be rendered there — presumably WPList and
                    // WPTable are TagNode subtypes; verify against the wiki library.
                } else if (node instanceof TagNode tagNode) {
                    Map<String, String> attributes = tagNode.getAttributes();
                    Map<String, Object> oAttributes = tagNode.getObjectAttributes();
                    String tagName = tagNode.getName();
                    int tagBegin = countingBuffer.currentPosition;

                    boolean hasSpecialHandling = false;
                    if ("ref".equals(tagName)) {
                        // ignore the references since they do not hold
                        // interesting text content
                        hasSpecialHandling = true;
                    } else if (oAttributes != null
                            && oAttributes.get(WIKIOBJECT_ATTR_KEY) instanceof ImageFormat iformat) {
                        // the caption of images often holds well-formed
                        // sentences with links to entities
                        hasSpecialHandling = true;
                        imageNodeToText(tagNode, iformat, countingBuffer, model);
                    }
                    if (!hasSpecialHandling) {
                        nodesToText(tagNode.getChildren(), countingBuffer, model);
                    }

                    // Annotations are recorded before the blank-line separator is appended, so
                    // their end offset does not include the trailing "\n\n".
                    if (PARAGRAPH_TAGS.contains(tagName)) {
                        paragraphs.add(new Annotation(tagBegin,
                                countingBuffer.currentPosition, "paragraph", tagName));
                        countingBuffer.append("\n\n");
                    } else if (HEADING_TAGS.contains(tagName)) {
                        headers.add(new Annotation(tagBegin,
                                countingBuffer.currentPosition, "heading", tagName));
                        countingBuffer.append("\n\n");
                    } else if ("a".equals(tagName)) {
                        // NOTE(review): href is null when the attribute is absent; the
                        // annotation is still recorded — confirm consumers tolerate that.
                        String href = attributes.get(HREF_ATTR_KEY);

                        // TODO: How to get covered text here? Is not needed anyway right?!
                        wikilinks.add(new Annotation(tagBegin, countingBuffer.currentPosition,
                                "", href));
                    }
                }
            }
        } finally {
            model.decrementRecursionLevel();
        }
    }