in wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java [94:187]
/**
 * Renders parsed wiki nodes as plain text and records annotations for the rendered spans.
 *
 * <p>All output goes through a {@link CountingAppendable} so that the output position can be
 * read before and after a node is rendered. Those positions are used as the begin/end offsets
 * of the annotations added to {@code wikilinks}, {@code paragraphs} and {@code headers}.
 *
 * <p>The method recurses into the children of generic tag nodes. The recursion depth is tracked
 * on {@code model}; once it exceeds {@code Configuration.RENDERER_RECURSION_LIMIT} an error
 * message is written to the output instead of the nodes.
 *
 * @param nodes  the nodes to render; {@code null} or an empty list renders nothing
 * @param buffer the output sink; wrapped in a {@link CountingAppendable} unless it already is one
 * @param model  the wiki model used to track the recursion level and to render image captions
 * @throws IOException if writing to {@code buffer} fails
 */
public void nodesToText(List<?> nodes, Appendable buffer, IWikiModel model) throws IOException {
  // Reuse an existing counting wrapper so positions stay consistent across recursive calls.
  CountingAppendable countingBuffer;
  if (buffer instanceof CountingAppendable) {
    countingBuffer = (CountingAppendable) buffer;
  } else {
    countingBuffer = new CountingAppendable(buffer);
  }

  if (nodes == null || nodes.isEmpty()) {
    return;
  }

  try {
    int level = model.incrementRecursionLevel();
    if (level > Configuration.RENDERER_RECURSION_LIMIT) {
      countingBuffer.append("Error - recursion limit exceeded"
          + " rendering tags in PlainTextConverter#nodesToText().");
      return;
    }

    for (Object node : nodes) {
      if (node instanceof WPATag tag) {
        // Wiki link: the body text is always rendered, the annotation is optional.
        String wikilinkLabel = tag.getAttributes().get(WIKILINK_TITLE_ATTR_KEY);
        String wikilinkTarget = tag.getAttributes().get(WIKILINK_TARGET_ATTR_KEY);

        int start = countingBuffer.currentPosition;
        tag.getBodyString(countingBuffer);
        int end = countingBuffer.currentPosition;

        // Annotate only labelled links that have a target which is not a pure in-page
        // anchor ("#..."). A missing target is skipped rather than dereferenced.
        // NOTE: a former filter on labels containing ':' was disabled, so labels with a
        // namespace-style prefix are annotated as well.
        if (wikilinkLabel != null
            && wikilinkTarget != null
            && !wikilinkTarget.startsWith("#")) {
          // TODO: wikilink label is not important, since that is the covered text?
          wikilinks.add(new Annotation(start, end, wikilinkLabel, wikilinkTarget));
        }
      } else if (node instanceof ContentToken contentToken) {
        countingBuffer.append(contentToken.getContent());
      } else if (node instanceof List || node instanceof WPList || node instanceof WPTable) {
        // Ignore lists and tables since they most of the time do not hold grammatically
        // correct, interesting sentences that are representative of the language.
        // This check must stay ahead of the generic TagNode branch below.
      } else if (node instanceof TagNode tagNode) {
        Map<String, String> attributes = tagNode.getAttributes();
        Map<String, Object> oAttributes = tagNode.getObjectAttributes();
        String tagName = tagNode.getName();
        int tagBegin = countingBuffer.currentPosition;

        boolean hasSpecialHandling = false;
        if ("ref".equals(tagName)) {
          // References do not hold interesting text content: render nothing for them.
          hasSpecialHandling = true;
        } else if (oAttributes != null
            && oAttributes.get(WIKIOBJECT_ATTR_KEY) instanceof ImageFormat iformat) {
          // The caption of images often holds well-formed sentences with links to entities.
          hasSpecialHandling = true;
          imageNodeToText(tagNode, iformat, countingBuffer, model);
        }
        if (!hasSpecialHandling) {
          nodesToText(tagNode.getChildren(), countingBuffer, model);
        }

        // The annotation covers the text rendered for this tag; the blank-line separator
        // is appended afterwards and is therefore not part of the annotated span.
        if (PARAGRAPH_TAGS.contains(tagName)) {
          paragraphs.add(new Annotation(tagBegin,
              countingBuffer.currentPosition, "paragraph", tagName));
          countingBuffer.append("\n\n");
        } else if (HEADING_TAGS.contains(tagName)) {
          headers.add(new Annotation(tagBegin,
              countingBuffer.currentPosition, "heading", tagName));
          countingBuffer.append("\n\n");
        } else if ("a".equals(tagName)) {
          String href = attributes.get(HREF_ATTR_KEY);
          // TODO: How to get covered text here? Is not needed anyway right?!
          wikilinks.add(new Annotation(tagBegin, countingBuffer.currentPosition,
              "", href));
        }
      }
    }
  } finally {
    // Always balance the increment above, including on the recursion-limit early return.
    model.decrementRecursionLevel();
  }
}