in core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java [160:208]
/**
 * Walks the tree under {@code root} depth-first, calling {@code visitor.head} when a node is
 * first reached and {@code visitor.tail} once all of its children have been visited.
 *
 * <p>The walk is abandoned as soon as {@code maxSize > 0} and {@code builder.length()} has
 * reached {@code maxSize}; in that case {@code tail} is not called for nodes that are still
 * open. The builder is only inspected here, never written to; presumably the visitor appends
 * to it (confirm at the call site).
 *
 * <p>The visitor may remove or replace the current node from within {@code head}:
 *
 * <ul>
 *   <li>a replaced node is swapped for its replacement, which is then traversed (without a
 *       second {@code head} call) and tailed;
 *   <li>a removed node is neither descended into nor tailed; traversal resumes at its former
 *       next sibling or, if it was the last child, its ancestors are tailed until one with a
 *       next sibling is found. Ancestors are never passed to {@code head} a second time;
 *   <li>if the root itself is removed, the traversal ends.
 * </ul>
 *
 * @param visitor callback receiving head/tail events; must not be null
 * @param root node at which the traversal starts (depth 0); must not be null
 * @param maxSize size threshold for {@code builder}; values {@code <= 0} disable the check
 * @param builder buffer whose length is compared with {@code maxSize}; must not be null when
 *     {@code maxSize > 0}
 */
public static void traverse(
        NodeVisitor visitor, Node root, int maxSize, StringBuilder builder) {
    Validate.notNull(visitor, "null visitor in traverse method");
    Validate.notNull(root, "null root node in traverse method");
    // node sitting at depth 0; follows the root if head() replaces it so that the
    // termination check below still matches
    Node top = root;
    Node node = root;
    int depth = 0;
    while (node != null) {
        // interrupts if too much text has already been produced
        if (maxSize > 0 && builder.length() >= maxSize) return;
        // remember parent to find nodes that get removed or replaced in .head
        Node parent = node.parentNode();
        int origSize = parent != null ? parent.childNodeSize() : 0;
        Node next = node.nextSibling();
        visitor.head(node, depth); // visit current node
        if (parent != null && !node.hasParent()) { // removed or replaced
            if (origSize == parent.childNodeSize()) { // replaced
                // replace ditches parent but keeps sibling index
                Node replacement = parent.childNode(node.siblingIndex());
                if (node == top) {
                    top = replacement;
                }
                node = replacement;
            } else { // removed
                if (node == top) {
                    return; // the root is gone: nothing of the subtree is left to visit
                }
                node = next;
                // The removed node was the last child: close the ancestors (tail only)
                // until one of them has a next sibling. Jumping back to the parent and
                // looping would call head() on it again and re-visit all of its children.
                while (node == null && parent != null) {
                    depth--;
                    visitor.tail(parent, depth);
                    if (parent == top) {
                        return; // back at the root: traversal is complete
                    }
                    node = parent.nextSibling();
                    parent = parent.parentNode();
                }
                continue; // don't tail removed
            }
        }
        if (node.childNodeSize() > 0) { // descend
            node = node.childNode(0);
            depth++;
        } else {
            // when no more siblings, ascend
            while (node.nextSibling() == null && depth > 0) {
                visitor.tail(node, depth);
                node = node.parentNode();
                depth--;
            }
            visitor.tail(node, depth);
            if (node == top) break;
            node = node.nextSibling();
        }
    }
}