in core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java [114:152]
/**
 * Walks the tree rooted at {@code node} and appends the text of its text nodes to {@code
 * accum}, skipping any text located inside an element whose tag name is listed in {@code
 * excludedTags}.
 *
 * <p>A single space is inserted before block elements and {@code <br>} tags, and after a block
 * element that is immediately followed by a text node, so that {@code <div>One</div>Two} yields
 * "One Two" rather than "OneTwo".
 *
 * <p>The walk is delegated to {@code traverse}, which also receives {@code maxTextSize} and the
 * accumulator (presumably to stop once enough text has been collected; see that method).
 *
 * @param node root of the subtree to extract text from
 * @param accum buffer the extracted text is appended to
 */
private void _text(Node node, final StringBuilder accum) {
    traverse(
            new NodeVisitor() {
                // Outermost excluded element currently being traversed, or null when the
                // traversal is not inside an excluded subtree.
                private Node excluded = null;

                @Override
                public void head(Node node, int depth) {
                    if (excluded == null && node instanceof TextNode) {
                        TextNode textNode = (TextNode) node;
                        appendNormalisedText(accum, textNode);
                    } else if (node instanceof Element) {
                        Element element = (Element) node;
                        // Only remember the OUTERMOST excluded element. Overwriting the
                        // marker with a nested excluded element would end the exclusion
                        // as soon as the nested element closes, leaking the remaining
                        // text of the outer element into the output.
                        if (excluded == null && excludedTags.contains(element.tagName())) {
                            excluded = element;
                        }
                        if (accum.length() > 0
                                && (element.isBlock() || "br".equals(element.tag().getName()))
                                && !lastCharIsWhitespace(accum)) {
                            accum.append(' ');
                        }
                    }
                }

                @Override
                public void tail(Node node, int depth) {
                    // make sure there is a space between block tags and immediately
                    // following text nodes <div>One</div>Two should be "One Two".
                    if (node instanceof Element) {
                        Element element = (Element) node;
                        // Leaving the element that started the exclusion: resume
                        // collecting text.
                        if (element == excluded) {
                            excluded = null;
                        }
                        if (element.isBlock()
                                && (node.nextSibling() instanceof TextNode)
                                && !lastCharIsWhitespace(accum)) {
                            accum.append(' ');
                        }
                    }
                }
            },
            node,
            maxTextSize,
            accum);
}