src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java [152:201]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  private boolean getTextHelper(StringBuffer sb, Node node,
      boolean abortOnNestedAnchors, int anchorDepth) {
    boolean abort = false;
    NodeWalker walker = new NodeWalker(node);

    while (walker.hasNext()) {

      Node currentNode = walker.nextNode();
      String nodeName = currentNode.getNodeName();
      short nodeType = currentNode.getNodeType();
      Node previousSibling = currentNode.getPreviousSibling();
      if (previousSibling != null
          && blockNodes.contains(previousSibling.getNodeName().toLowerCase())) {
        appendParagraphSeparator(sb);
      } else if (blockNodes.contains(nodeName.toLowerCase())) {
        appendParagraphSeparator(sb);
      }

      if ("script".equalsIgnoreCase(nodeName)) {
        walker.skipChildren();
      }
      if ("style".equalsIgnoreCase(nodeName)) {
        walker.skipChildren();
      }
      if (abortOnNestedAnchors && "a".equalsIgnoreCase(nodeName)) {
        anchorDepth++;
        if (anchorDepth > 1) {
          abort = true;
          break;
        }
      }
      if (nodeType == Node.COMMENT_NODE) {
        walker.skipChildren();
      }
      if (nodeType == Node.TEXT_NODE) {
        // cleanup and trim the value
        String text = currentNode.getNodeValue();
        text = text.replaceAll("\\s+", " ");
        text = text.trim();
        if (text.length() > 0) {
          appendSpace(sb);
          sb.append(text);
        } else {
          appendParagraphSeparator(sb);
        }
      }
    }

    return abort;
  }
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java [154:203]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  private boolean getTextHelper(StringBuffer sb, Node node,
      boolean abortOnNestedAnchors, int anchorDepth) {
    boolean abort = false;
    NodeWalker walker = new NodeWalker(node);

    while (walker.hasNext()) {

      Node currentNode = walker.nextNode();
      String nodeName = currentNode.getNodeName();
      short nodeType = currentNode.getNodeType();
      Node previousSibling = currentNode.getPreviousSibling();
      if (previousSibling != null
          && blockNodes.contains(previousSibling.getNodeName().toLowerCase())) {
        appendParagraphSeparator(sb);
      } else if (blockNodes.contains(nodeName.toLowerCase())) {
        appendParagraphSeparator(sb);
      }

      if ("script".equalsIgnoreCase(nodeName)) {
        walker.skipChildren();
      }
      if ("style".equalsIgnoreCase(nodeName)) {
        walker.skipChildren();
      }
      if (abortOnNestedAnchors && "a".equalsIgnoreCase(nodeName)) {
        anchorDepth++;
        if (anchorDepth > 1) {
          abort = true;
          break;
        }
      }
      if (nodeType == Node.COMMENT_NODE) {
        walker.skipChildren();
      }
      if (nodeType == Node.TEXT_NODE) {
        // cleanup and trim the value
        String text = currentNode.getNodeValue();
        text = text.replaceAll("\\s+", " ");
        text = text.trim();
        if (text.length() > 0) {
          appendSpace(sb);
          sb.append(text);
        } else {
          appendParagraphSeparator(sb);
        }
      }
    }

    return abort;
  }
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



