src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java [134:417]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
      boolean abortOnNestedAnchors) {
    if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) {
      return true;
    }
    return false;
  }

  /**
   * This is a convinience method, equivalent to
   * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
   * @param sb a {@link StringBuffer} used to store content text 
   * found beneath the DOM node... if any exists
   * @param node a DOM {@link Node} to check for content text
   */
  public void getText(StringBuffer sb, Node node) {
    getText(sb, node, false);
  }

  // returns true if abortOnNestedAnchors is true and we find nested
  // anchors
  private boolean getTextHelper(StringBuffer sb, Node node,
      boolean abortOnNestedAnchors, int anchorDepth) {
    boolean abort = false;
    NodeWalker walker = new NodeWalker(node);

    while (walker.hasNext()) {

      Node currentNode = walker.nextNode();
      String nodeName = currentNode.getNodeName();
      short nodeType = currentNode.getNodeType();
      Node previousSibling = currentNode.getPreviousSibling();
      if (previousSibling != null
          && blockNodes.contains(previousSibling.getNodeName().toLowerCase())) {
        appendParagraphSeparator(sb);
      } else if (blockNodes.contains(nodeName.toLowerCase())) {
        appendParagraphSeparator(sb);
      }

      if ("script".equalsIgnoreCase(nodeName)) {
        walker.skipChildren();
      }
      if ("style".equalsIgnoreCase(nodeName)) {
        walker.skipChildren();
      }
      if (abortOnNestedAnchors && "a".equalsIgnoreCase(nodeName)) {
        anchorDepth++;
        if (anchorDepth > 1) {
          abort = true;
          break;
        }
      }
      if (nodeType == Node.COMMENT_NODE) {
        walker.skipChildren();
      }
      if (nodeType == Node.TEXT_NODE) {
        // cleanup and trim the value
        String text = currentNode.getNodeValue();
        text = text.replaceAll("\\s+", " ");
        text = text.trim();
        if (text.length() > 0) {
          appendSpace(sb);
          sb.append(text);
        } else {
          appendParagraphSeparator(sb);
        }
      }
    }

    return abort;
  }

  /**
   * Conditionally append a paragraph/line break to StringBuffer unless last
   * character a already indicates a paragraph break. Also remove trailing space
   * before paragraph break.
   *
   * @param buffer
   *          StringBuffer to append paragraph break
   */
  private void appendParagraphSeparator(StringBuffer buffer) {
    if (buffer.length() == 0) {
      return;
    }
    char lastChar = buffer.charAt(buffer.length() - 1);
    if ('\n' != lastChar) {
      // remove white space before paragraph break
      while (lastChar == ' ') {
        buffer.deleteCharAt(buffer.length() - 1);
        lastChar = buffer.charAt(buffer.length() - 1);
      }
      if ('\n' != lastChar) {
        buffer.append('\n');
      }
    }
  }

  /**
   * Conditionally append a space to StringBuffer unless last character is a
   * space or line/paragraph break.
   *
   * @param buffer
   *          StringBuffer to append space
   */
  private void appendSpace(StringBuffer buffer) {
    if (buffer.length() == 0) {
      return;
    }
    char lastChar = buffer.charAt(buffer.length() - 1);
    if (' ' != lastChar && '\n' != lastChar) {
      buffer.append(' ');
    }
  }

  /**
   * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
   * append the content text found beneath the first <code>title</code> node to
   * the <code>StringBuffer</code>.
   * @param sb a {@link StringBuffer} used to store content text 
   * found beneath the DOM node... if any exists
   * @param node a DOM {@link Node} to check for content text
   * @return true if a title node was found, false otherwise
   */
  public boolean getTitle(StringBuffer sb, Node node) {

    NodeWalker walker = new NodeWalker(node);

    while (walker.hasNext()) {

      Node currentNode = walker.nextNode();
      String nodeName = currentNode.getNodeName();
      short nodeType = currentNode.getNodeType();

      if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
        return false;
      }

      if (nodeType == Node.ELEMENT_NODE) {
        if ("title".equalsIgnoreCase(nodeName)) {
          getText(sb, currentNode);
          return true;
        }
      }
    }

    return false;
  }

  /**
   * If Node contains a BASE tag then it's HREF is returned.
   * @param node a DOM {@link Node} to check for a BASE tag
   * @return HREF if one exists
   */
  public String getBase(Node node) {

    NodeWalker walker = new NodeWalker(node);

    while (walker.hasNext()) {

      Node currentNode = walker.nextNode();
      String nodeName = currentNode.getNodeName();
      short nodeType = currentNode.getNodeType();

      // is this node a BASE tag?
      if (nodeType == Node.ELEMENT_NODE) {

        if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
          return null;
        }

        if ("base".equalsIgnoreCase(nodeName)) {
          NamedNodeMap attrs = currentNode.getAttributes();
          for (int i = 0; i < attrs.getLength(); i++) {
            Node attr = attrs.item(i);
            if ("href".equalsIgnoreCase(attr.getNodeName())) {
              return attr.getNodeValue();
            }
          }
        }
      }
    }

    // no.
    return null;
  }

  private boolean hasOnlyWhiteSpace(Node node) {
    String val = node.getNodeValue();
    for (int i = 0; i < val.length(); i++) {
      if (!Character.isWhitespace(val.charAt(i)))
        return false;
    }
    return true;
  }

  // this only covers a few cases of empty links that are symptomatic
  // of nekohtml's DOM-fixup process...
  private boolean shouldThrowAwayLink(Node node, NodeList children,
      int childLen, LinkParams params) {
    if (childLen == 0) {
      // this has no inner structure
      if (params.childLen == 0)
        return false;
      else
        return true;
    } else if ((childLen == 1)
        && (children.item(0).getNodeType() == Node.ELEMENT_NODE)
        && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) {
      // single nested link
      return true;

    } else if (childLen == 2) {

      Node c0 = children.item(0);
      Node c1 = children.item(1);

      if ((c0.getNodeType() == Node.ELEMENT_NODE)
          && (params.elName.equalsIgnoreCase(c0.getNodeName()))
          && (c1.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c1)) {
        // single link followed by whitespace node
        return true;
      }

      if ((c1.getNodeType() == Node.ELEMENT_NODE)
          && (params.elName.equalsIgnoreCase(c1.getNodeName()))
          && (c0.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)) {
        // whitespace node followed by single link
        return true;
      }

    } else if (childLen == 3) {
      Node c0 = children.item(0);
      Node c1 = children.item(1);
      Node c2 = children.item(2);

      if ((c1.getNodeType() == Node.ELEMENT_NODE)
          && (params.elName.equalsIgnoreCase(c1.getNodeName()))
          && (c0.getNodeType() == Node.TEXT_NODE)
          && (c2.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)
          && hasOnlyWhiteSpace(c2)) {
        // single link surrounded by whitespace nodes
        return true;
      }
    }

    return false;
  }

  /**
   * This method finds all anchors below the supplied DOM <code>node</code>, and
   * creates appropriate {@link Outlink} records for each (relative to the
   * supplied <code>base</code> URL), and adds them to the <code>outlinks</code>
   * {@link ArrayList}.
   * 
   * <p>
   * 
   * Links without inner structure (tags, text, etc) are discarded, as are links
   * which contain only single nested links and empty text nodes (this is a
   * common DOM-fixup artifact, at least with nekohtml).
   * 
   * @param base the canonical {@link URL}
   * @param outlinks the {@link ArrayList} of {@link Outlink}'s associated 
   * with the base URL
   * @param node a {@link Node} under which to discover anchors
   */
  public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {

    NodeWalker walker = new NodeWalker(node);
    while (walker.hasNext()) {

      Node currentNode = walker.nextNode();
      String nodeName = currentNode.getNodeName();
      short nodeType = currentNode.getNodeType();
      NodeList children = currentNode.getChildNodes();
      int childLen = (children != null) ? children.getLength() : 0;

      if (nodeType == Node.ELEMENT_NODE) {

        nodeName = nodeName.toLowerCase();
        LinkParams params = (LinkParams) linkParams.get(nodeName);
        if (params != null) {
          if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {

            StringBuffer linkText = new StringBuffer();
            getText(linkText, currentNode, true);
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java [132:415]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
      boolean abortOnNestedAnchors) {
    if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) {
      return true;
    }
    return false;
  }

  /**
   * This is a convinience method, equivalent to
   * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
   * @param sb a {@link StringBuffer} used to store content text 
   * found beneath the DOM node... if any exists
   * @param node a DOM {@link Node} to check for content text
   */
  public void getText(StringBuffer sb, Node node) {
    getText(sb, node, false);
  }

  // returns true if abortOnNestedAnchors is true and we find nested
  // anchors
  private boolean getTextHelper(StringBuffer sb, Node node,
      boolean abortOnNestedAnchors, int anchorDepth) {
    boolean abort = false;
    NodeWalker walker = new NodeWalker(node);

    while (walker.hasNext()) {

      Node currentNode = walker.nextNode();
      String nodeName = currentNode.getNodeName();
      short nodeType = currentNode.getNodeType();
      Node previousSibling = currentNode.getPreviousSibling();
      if (previousSibling != null
          && blockNodes.contains(previousSibling.getNodeName().toLowerCase())) {
        appendParagraphSeparator(sb);
      } else if (blockNodes.contains(nodeName.toLowerCase())) {
        appendParagraphSeparator(sb);
      }

      if ("script".equalsIgnoreCase(nodeName)) {
        walker.skipChildren();
      }
      if ("style".equalsIgnoreCase(nodeName)) {
        walker.skipChildren();
      }
      if (abortOnNestedAnchors && "a".equalsIgnoreCase(nodeName)) {
        anchorDepth++;
        if (anchorDepth > 1) {
          abort = true;
          break;
        }
      }
      if (nodeType == Node.COMMENT_NODE) {
        walker.skipChildren();
      }
      if (nodeType == Node.TEXT_NODE) {
        // cleanup and trim the value
        String text = currentNode.getNodeValue();
        text = text.replaceAll("\\s+", " ");
        text = text.trim();
        if (text.length() > 0) {
          appendSpace(sb);
          sb.append(text);
        } else {
          appendParagraphSeparator(sb);
        }
      }
    }

    return abort;
  }

  /**
   * Conditionally append a paragraph/line break to StringBuffer unless last
   * character a already indicates a paragraph break. Also remove trailing space
   * before paragraph break.
   *
   * @param buffer
   *          StringBuffer to append paragraph break
   */
  private void appendParagraphSeparator(StringBuffer buffer) {
    if (buffer.length() == 0) {
      return;
    }
    char lastChar = buffer.charAt(buffer.length() - 1);
    if ('\n' != lastChar) {
      // remove white space before paragraph break
      while (lastChar == ' ') {
        buffer.deleteCharAt(buffer.length() - 1);
        lastChar = buffer.charAt(buffer.length() - 1);
      }
      if ('\n' != lastChar) {
        buffer.append('\n');
      }
    }
  }

  /**
   * Conditionally append a space to StringBuffer unless last character is a
   * space or line/paragraph break.
   *
   * @param buffer
   *          StringBuffer to append space
   */
  private void appendSpace(StringBuffer buffer) {
    if (buffer.length() == 0) {
      return;
    }
    char lastChar = buffer.charAt(buffer.length() - 1);
    if (' ' != lastChar && '\n' != lastChar) {
      buffer.append(' ');
    }
  }

  /**
   * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
   * append the content text found beneath the first <code>title</code> node to
   * the <code>StringBuffer</code>.
   * @param sb a {@link StringBuffer} used to store content text 
   * found beneath the DOM node... if any exists
   * @param node a DOM {@link Node} to check for content text
   * @return true if a title node was found, false otherwise
   */
  public boolean getTitle(StringBuffer sb, Node node) {

    NodeWalker walker = new NodeWalker(node);

    while (walker.hasNext()) {

      Node currentNode = walker.nextNode();
      String nodeName = currentNode.getNodeName();
      short nodeType = currentNode.getNodeType();

      if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
        return false;
      }

      if (nodeType == Node.ELEMENT_NODE) {
        if ("title".equalsIgnoreCase(nodeName)) {
          getText(sb, currentNode);
          return true;
        }
      }
    }

    return false;
  }

  /**
   * If Node contains a BASE tag then it's HREF is returned.
   * @param node a DOM {@link Node} to check for a BASE tag
   * @return HREF if one exists
   * */
  public String getBase(Node node) {

    NodeWalker walker = new NodeWalker(node);

    while (walker.hasNext()) {

      Node currentNode = walker.nextNode();
      String nodeName = currentNode.getNodeName();
      short nodeType = currentNode.getNodeType();

      // is this node a BASE tag?
      if (nodeType == Node.ELEMENT_NODE) {

        if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
          return null;
        }

        if ("base".equalsIgnoreCase(nodeName)) {
          NamedNodeMap attrs = currentNode.getAttributes();
          for (int i = 0; i < attrs.getLength(); i++) {
            Node attr = attrs.item(i);
            if ("href".equalsIgnoreCase(attr.getNodeName())) {
              return attr.getNodeValue();
            }
          }
        }
      }
    }

    // no.
    return null;
  }

  private boolean hasOnlyWhiteSpace(Node node) {
    String val = node.getNodeValue();
    for (int i = 0; i < val.length(); i++) {
      if (!Character.isWhitespace(val.charAt(i)))
        return false;
    }
    return true;
  }

  // this only covers a few cases of empty links that are symptomatic
  // of nekohtml's DOM-fixup process...
  private boolean shouldThrowAwayLink(Node node, NodeList children,
      int childLen, LinkParams params) {
    if (childLen == 0) {
      // this has no inner structure
      if (params.childLen == 0)
        return false;
      else
        return true;
    } else if ((childLen == 1)
        && (children.item(0).getNodeType() == Node.ELEMENT_NODE)
        && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) {
      // single nested link
      return true;

    } else if (childLen == 2) {

      Node c0 = children.item(0);
      Node c1 = children.item(1);

      if ((c0.getNodeType() == Node.ELEMENT_NODE)
          && (params.elName.equalsIgnoreCase(c0.getNodeName()))
          && (c1.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c1)) {
        // single link followed by whitespace node
        return true;
      }

      if ((c1.getNodeType() == Node.ELEMENT_NODE)
          && (params.elName.equalsIgnoreCase(c1.getNodeName()))
          && (c0.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)) {
        // whitespace node followed by single link
        return true;
      }

    } else if (childLen == 3) {
      Node c0 = children.item(0);
      Node c1 = children.item(1);
      Node c2 = children.item(2);

      if ((c1.getNodeType() == Node.ELEMENT_NODE)
          && (params.elName.equalsIgnoreCase(c1.getNodeName()))
          && (c0.getNodeType() == Node.TEXT_NODE)
          && (c2.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)
          && hasOnlyWhiteSpace(c2)) {
        // single link surrounded by whitespace nodes
        return true;
      }
    }

    return false;
  }

  /**
   * This method finds all anchors below the supplied DOM <code>node</code>, and
   * creates appropriate {@link Outlink} records for each (relative to the
   * supplied <code>base</code> URL), and adds them to the <code>outlinks</code>
   * {@link ArrayList}.
   * 
   * <p>
   * 
   * Links without inner structure (tags, text, etc) are discarded, as are links
   * which contain only single nested links and empty text nodes (this is a
   * common DOM-fixup artifact, at least with nekohtml).
   * 
   * @param base the canonical {@link URL}
   * @param outlinks the {@link ArrayList} of {@link Outlink}'s associated 
   * with the base URL
   * @param node a {@link Node} under which to discover anchors
   */
  public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {

    NodeWalker walker = new NodeWalker(node);
    while (walker.hasNext()) {

      Node currentNode = walker.nextNode();
      String nodeName = currentNode.getNodeName();
      short nodeType = currentNode.getNodeType();
      NodeList children = currentNode.getChildNodes();
      int childLen = (children != null) ? children.getLength() : 0;

      if (nodeType == Node.ELEMENT_NODE) {

        nodeName = nodeName.toLowerCase();
        LinkParams params = (LinkParams) linkParams.get(nodeName);
        if (params != null) {
          if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {

            StringBuffer linkText = new StringBuffer();
            getText(linkText, currentNode, true);
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



