in src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java [398:493]
/**
 * Walks the DOM subtree rooted at {@code node} and appends an
 * {@link Outlink} to {@code outlinks} for every element whose lower-cased tag
 * name has an entry in {@code linkParams} and which is not rejected by
 * {@code shouldThrowAwayLink}.
 *
 * <p>
 * The link target is the value of the attribute named by the matching
 * {@code LinkParams.attrName} (compared case-insensitively), resolved against
 * {@code base}. The anchor text is produced by {@code getText}; when that
 * yields only whitespace, the {@code alt} values of {@code img} elements and
 * the content of text nodes below the element are used instead.
 * </p>
 *
 * <p>
 * No outlink is added when the element has no target attribute, when it has
 * a {@code rel} attribute matching {@code NOFOLLOW_PATTERN}, when it has a
 * {@code method} attribute equal to {@code "post"} (ignoring case), or when
 * the target cannot be turned into a valid URL.
 * </p>
 *
 * <p>
 * If {@code keepNodenames} is set, the lower-cased tag name is stored in the
 * outlink's metadata under the key {@code srcTagMetaName}.
 * </p>
 *
 * @param base
 *          URL against which link targets are resolved
 * @param outlinks
 *          list that receives the extracted outlinks
 * @param node
 *          root of the DOM subtree to scan
 */
public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {
  NodeWalker walker = new NodeWalker(node);
  while (walker.hasNext()) {
    Node currentNode = walker.nextNode();
    if (currentNode.getNodeType() != Node.ELEMENT_NODE) {
      continue;
    }

    // Lower-case with Locale.ROOT so that the lookup key does not depend on
    // the JVM default locale (a Turkish default would map 'I' to a dotless
    // 'i' and tags such as IMG or LINK would no longer be found).
    String nodeName = currentNode.getNodeName().toLowerCase(
        java.util.Locale.ROOT);
    LinkParams params = (LinkParams) linkParams.get(nodeName);
    if (params == null) {
      continue;
    }

    NodeList children = currentNode.getChildNodes();
    int childLen = (children != null) ? children.getLength() : 0;
    // NOTE(review): no subtree is skipped by this method, even for tags whose
    // params.childLen is 0 - confirm whether the children of such elements
    // were meant to be skipped.
    if (shouldThrowAwayLink(currentNode, children, childLen, params)) {
      continue;
    }

    StringBuffer linkText = new StringBuffer();
    getText(linkText, currentNode, true);
    if (linkText.toString().trim().length() == 0) {
      // try harder - use img alt if present
      appendImgAltFallbackText(linkText, currentNode);
    }

    NamedNodeMap attrs = currentNode.getAttributes();
    String target = null;
    boolean noFollow = false;
    boolean post = false;
    for (int i = 0; i < attrs.getLength(); i++) {
      Node attr = attrs.item(i);
      String attrName = attr.getNodeName();
      if (params.attrName.equalsIgnoreCase(attrName)) {
        target = attr.getNodeValue();
      } else if ("rel".equalsIgnoreCase(attrName)
          && NOFOLLOW_PATTERN.matcher(attr.getNodeValue()).find()) {
        noFollow = true;
      } else if ("method".equalsIgnoreCase(attrName)
          && "post".equalsIgnoreCase(attr.getNodeValue())) {
        post = true;
      }
    }
    if (target == null || noFollow || post) {
      continue;
    }

    try {
      URL url = URLUtil.resolveURL(base, target);
      Outlink outlink = new Outlink(url.toString(), linkText.toString()
          .trim());
      outlinks.add(outlink);
      // NUTCH-2433 - Keep the node name where the URL was found into
      // the outlink metadata
      if (keepNodenames) {
        MapWritable metadata = new MapWritable();
        metadata.put(new Text(srcTagMetaName), new Text(nodeName));
        outlink.setMetadata(metadata);
      }
    } catch (MalformedURLException ignored) {
      // Best effort: a target that does not form a valid URL simply yields
      // no outlink.
    }
  }
}

/**
 * Appends fallback anchor text to {@code linkText}: the non-blank
 * {@code alt} attribute values of {@code img} elements and the non-empty
 * content of text nodes returned by a {@code NodeWalker} started at
 * {@code root}. A single space is inserted before each appended piece when
 * {@code linkText} is not empty.
 */
private void appendImgAltFallbackText(StringBuffer linkText, Node root) {
  NodeWalker subWalker = new NodeWalker(root);
  while (subWalker.hasNext()) {
    Node subNode = subWalker.nextNode();
    short subType = subNode.getNodeType();
    String piece = null;
    if (subType == Node.ELEMENT_NODE) {
      // Only img elements contribute; other element types are ignored.
      // Locale.ROOT keeps the tag comparison independent of the JVM locale.
      if (subNode.getNodeName().toLowerCase(java.util.Locale.ROOT)
          .equals("img")) {
        Node alt = subNode.getAttributes().getNamedItem("alt");
        if (alt != null) {
          String altTxt = alt.getTextContent();
          if (altTxt != null && altTxt.trim().length() > 0) {
            piece = altTxt;
          }
        }
      }
    } else if (subType == Node.TEXT_NODE) {
      String txt = subNode.getTextContent();
      if (txt != null && txt.length() > 0) {
        piece = txt;
      }
    }
    if (piece != null) {
      if (linkText.length() > 0) {
        linkText.append(' ');
      }
      linkText.append(piece);
    }
  }
}