public void getOutlinks()

in src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java [398:493]


  public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {

    NodeWalker walker = new NodeWalker(node);
    while (walker.hasNext()) {

      Node currentNode = walker.nextNode();
      String nodeName = currentNode.getNodeName();
      short nodeType = currentNode.getNodeType();
      NodeList children = currentNode.getChildNodes();
      int childLen = (children != null) ? children.getLength() : 0;

      if (nodeType == Node.ELEMENT_NODE) {

        nodeName = nodeName.toLowerCase();
        LinkParams params = (LinkParams) linkParams.get(nodeName);
        if (params != null) {
          if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {

            StringBuffer linkText = new StringBuffer();
            getText(linkText, currentNode, true);
            if (linkText.toString().trim().length() == 0) {
              // try harder - use img alt if present
              NodeWalker subWalker = new NodeWalker(currentNode);
              while (subWalker.hasNext()) {
                Node subNode = subWalker.nextNode();
                if (subNode.getNodeType() == Node.ELEMENT_NODE) {
                  if (subNode.getNodeName().toLowerCase().equals("img")) {
                    NamedNodeMap subAttrs = subNode.getAttributes();
                    Node alt = subAttrs.getNamedItem("alt");
                    if (alt != null) {
                      String altTxt = alt.getTextContent();
                      if (altTxt != null && altTxt.trim().length() > 0) {
                        if (linkText.length() > 0)
                          linkText.append(' ');
                        linkText.append(altTxt);
                      }
                    }
                  } else {
                    // ignore other types of elements

                  }
                } else if (subNode.getNodeType() == Node.TEXT_NODE) {
                  String txt = subNode.getTextContent();
                  if (txt != null && txt.length() > 0) {
                    if (linkText.length() > 0)
                      linkText.append(' ');
                    linkText.append(txt);
                  }
                }
              }
            }

            NamedNodeMap attrs = currentNode.getAttributes();
            String target = null;
            boolean noFollow = false;
            boolean post = false;
            for (int i = 0; i < attrs.getLength(); i++) {
              Node attr = attrs.item(i);
              String attrName = attr.getNodeName();
              if (params.attrName.equalsIgnoreCase(attrName)) {
                target = attr.getNodeValue();
              } else if ("rel".equalsIgnoreCase(attrName)
                  && NOFOLLOW_PATTERN.matcher(attr.getNodeValue()).find()) {
                noFollow = true;
              } else if ("method".equalsIgnoreCase(attrName)
                  && "post".equalsIgnoreCase(attr.getNodeValue())) {
                post = true;
              }
            }
            if (target != null && !noFollow && !post)
              try {

                URL url = URLUtil.resolveURL(base, target);
                Outlink outlink = new Outlink(url.toString(), linkText
                    .toString().trim());
                outlinks.add(outlink);

                // NUTCH-2433 - Keep the node name where the URL was found into
                // the outlink metadata
                if (keepNodenames) {
                  MapWritable metadata = new MapWritable();
                  metadata.put(new Text(srcTagMetaName), new Text(nodeName));
                  outlink.setMetadata(metadata);
                }

              } catch (MalformedURLException e) {
                // don't care
              }
          }
          // this should not have any children, skip them
          if (params.childLen == 0)
            continue;
        }
      }
    }
  }