private boolean shouldThrowAwayLink()

in src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java [328:377]


  private boolean shouldThrowAwayLink(Node node, NodeList children,
      int childLen, LinkParams params) {
    if (childLen == 0) {
      // this has no inner structure
      if (params.childLen == 0)
        return false;
      else
        return true;
    } else if ((childLen == 1)
        && (children.item(0).getNodeType() == Node.ELEMENT_NODE)
        && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) {
      // single nested link
      return true;

    } else if (childLen == 2) {

      Node c0 = children.item(0);
      Node c1 = children.item(1);

      if ((c0.getNodeType() == Node.ELEMENT_NODE)
          && (params.elName.equalsIgnoreCase(c0.getNodeName()))
          && (c1.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c1)) {
        // single link followed by whitespace node
        return true;
      }

      if ((c1.getNodeType() == Node.ELEMENT_NODE)
          && (params.elName.equalsIgnoreCase(c1.getNodeName()))
          && (c0.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)) {
        // whitespace node followed by single link
        return true;
      }

    } else if (childLen == 3) {
      Node c0 = children.item(0);
      Node c1 = children.item(1);
      Node c2 = children.item(2);

      if ((c1.getNodeType() == Node.ELEMENT_NODE)
          && (params.elName.equalsIgnoreCase(c1.getNodeName()))
          && (c0.getNodeType() == Node.TEXT_NODE)
          && (c2.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)
          && hasOnlyWhiteSpace(c2)) {
        // single link surrounded by whitespace nodes
        return true;
      }
    }

    return false;
  }