private void _text()

in core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java [114:152]


    /**
     * Appends the text found under {@code node} to {@code accum}, skipping every text node
     * located inside an element whose tag name is listed in {@code excludedTags}.
     *
     * <p>A single space is inserted around block elements and {@code br} tags so that adjacent
     * pieces of text do not run together. The walk itself is delegated to {@code traverse},
     * which also receives {@code maxTextSize} and {@code accum} (presumably to stop once enough
     * text has been collected; confirm against its definition).
     *
     * @param node root of the subtree to extract text from
     * @param accum buffer the extracted text is appended to
     */
    private void _text(Node node, final StringBuilder accum) {
        traverse(
                new NodeVisitor() {

                    // Outermost excluded element currently being visited, or null when the
                    // visitor is not inside an excluded subtree.
                    private Node excluded = null;

                    public void head(Node node, int depth) {
                        if (excluded == null && node instanceof TextNode) {
                            TextNode textNode = (TextNode) node;
                            appendNormalisedText(accum, textNode);
                        } else if (node instanceof Element) {
                            Element element = (Element) node;
                            // Remember only the outermost excluded element. Overwriting it
                            // with a nested excluded element would end the exclusion as soon
                            // as the nested element closes, leaking the remaining text of the
                            // outer one.
                            if (excluded == null && excludedTags.contains(element.tagName())) {
                                excluded = element;
                            }
                            if (accum.length() > 0
                                    && (element.isBlock() || element.tag().getName().equals("br"))
                                    && !lastCharIsWhitespace(accum)) {
                                accum.append(' ');
                            }
                        }
                    }

                    public void tail(Node node, int depth) {
                        // make sure there is a space between block tags and immediately
                        // following text nodes <div>One</div>Two should be "One Two".
                        if (node instanceof Element) {
                            Element element = (Element) node;
                            // Leaving the element that started the exclusion: text is
                            // collected again from here on.
                            if (element == excluded) {
                                excluded = null;
                            }
                            if (element.isBlock()
                                    && (node.nextSibling() instanceof TextNode)
                                    && !lastCharIsWhitespace(accum)) {
                                accum.append(' ');
                            }
                        }
                    }
                },
                node,
                maxTextSize,
                accum);
    }