src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java [134:417]: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - boolean abortOnNestedAnchors) { if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) { return true; } return false; } /** * This is a convinience method, equivalent to * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}. * @param sb a {@link StringBuffer} used to store content text * found beneath the DOM node... if any exists * @param node a DOM {@link Node} to check for content text */ public void getText(StringBuffer sb, Node node) { getText(sb, node, false); } // returns true if abortOnNestedAnchors is true and we find nested // anchors private boolean getTextHelper(StringBuffer sb, Node node, boolean abortOnNestedAnchors, int anchorDepth) { boolean abort = false; NodeWalker walker = new NodeWalker(node); while (walker.hasNext()) { Node currentNode = walker.nextNode(); String nodeName = currentNode.getNodeName(); short nodeType = currentNode.getNodeType(); Node previousSibling = currentNode.getPreviousSibling(); if (previousSibling != null && blockNodes.contains(previousSibling.getNodeName().toLowerCase())) { appendParagraphSeparator(sb); } else if (blockNodes.contains(nodeName.toLowerCase())) { appendParagraphSeparator(sb); } if ("script".equalsIgnoreCase(nodeName)) { walker.skipChildren(); } if ("style".equalsIgnoreCase(nodeName)) { walker.skipChildren(); } if (abortOnNestedAnchors && "a".equalsIgnoreCase(nodeName)) { anchorDepth++; if (anchorDepth > 1) { abort = true; break; } } if (nodeType == Node.COMMENT_NODE) { walker.skipChildren(); } if (nodeType == Node.TEXT_NODE) { // cleanup and trim the value String text = currentNode.getNodeValue(); text = text.replaceAll("\\s+", " "); text = text.trim(); if (text.length() > 0) { appendSpace(sb); sb.append(text); } else { appendParagraphSeparator(sb); } } } return abort; } /** * Conditionally append a paragraph/line break to 
StringBuffer unless last * character a already indicates a paragraph break. Also remove trailing space * before paragraph break. * * @param buffer * StringBuffer to append paragraph break */ private void appendParagraphSeparator(StringBuffer buffer) { if (buffer.length() == 0) { return; } char lastChar = buffer.charAt(buffer.length() - 1); if ('\n' != lastChar) { // remove white space before paragraph break while (lastChar == ' ') { buffer.deleteCharAt(buffer.length() - 1); lastChar = buffer.charAt(buffer.length() - 1); } if ('\n' != lastChar) { buffer.append('\n'); } } } /** * Conditionally append a space to StringBuffer unless last character is a * space or line/paragraph break. * * @param buffer * StringBuffer to append space */ private void appendSpace(StringBuffer buffer) { if (buffer.length() == 0) { return; } char lastChar = buffer.charAt(buffer.length() - 1); if (' ' != lastChar && '\n' != lastChar) { buffer.append(' '); } } /** * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will * append the content text found beneath the first title node to * the StringBuffer. * @param sb a {@link StringBuffer} used to store content text * found beneath the DOM node... if any exists * @param node a DOM {@link Node} to check for content text * @return true if a title node was found, false otherwise */ public boolean getTitle(StringBuffer sb, Node node) { NodeWalker walker = new NodeWalker(node); while (walker.hasNext()) { Node currentNode = walker.nextNode(); String nodeName = currentNode.getNodeName(); short nodeType = currentNode.getNodeType(); if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD return false; } if (nodeType == Node.ELEMENT_NODE) { if ("title".equalsIgnoreCase(nodeName)) { getText(sb, currentNode); return true; } } } return false; } /** * If Node contains a BASE tag then it's HREF is returned. 
* @param node a DOM {@link Node} to check for a BASE tag
 * @return HREF if one exists
 */
public String getBase(Node node) {
  NodeWalker nodes = new NodeWalker(node);

  while (nodes.hasNext()) {
    Node candidate = nodes.nextNode();

    // only elements can be BODY or BASE tags
    if (candidate.getNodeType() != Node.ELEMENT_NODE) {
      continue;
    }

    String tagName = candidate.getNodeName();
    if ("body".equalsIgnoreCase(tagName)) { // stop after HEAD
      return null;
    }
    if (!"base".equalsIgnoreCase(tagName)) {
      continue;
    }

    // BASE tag found: look for its href attribute, ignoring case
    NamedNodeMap attributes = candidate.getAttributes();
    int attributeCount = attributes.getLength();
    int index = 0;
    while (index < attributeCount) {
      Node attribute = attributes.item(index);
      if ("href".equalsIgnoreCase(attribute.getNodeName())) {
        return attribute.getNodeValue();
      }
      index++;
    }
  }

  // no usable BASE tag was found
  return null;
}

/**
 * Tells whether the value of the given node consists solely of whitespace
 * characters. An empty value counts as whitespace-only.
 */
private boolean hasOnlyWhiteSpace(Node node) {
  String value = node.getNodeValue();
  for (char c : value.toCharArray()) {
    if (!Character.isWhitespace(c)) {
      return false;
    }
  }
  return true;
}

// this only covers a few cases of empty links that are symptomatic
// of nekohtml's DOM-fixup process...
private boolean shouldThrowAwayLink(Node node, NodeList children, int childLen, LinkParams params) { if (childLen == 0) { // this has no inner structure if (params.childLen == 0) return false; else return true; } else if ((childLen == 1) && (children.item(0).getNodeType() == Node.ELEMENT_NODE) && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) { // single nested link return true; } else if (childLen == 2) { Node c0 = children.item(0); Node c1 = children.item(1); if ((c0.getNodeType() == Node.ELEMENT_NODE) && (params.elName.equalsIgnoreCase(c0.getNodeName())) && (c1.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c1)) { // single link followed by whitespace node return true; } if ((c1.getNodeType() == Node.ELEMENT_NODE) && (params.elName.equalsIgnoreCase(c1.getNodeName())) && (c0.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)) { // whitespace node followed by single link return true; } } else if (childLen == 3) { Node c0 = children.item(0); Node c1 = children.item(1); Node c2 = children.item(2); if ((c1.getNodeType() == Node.ELEMENT_NODE) && (params.elName.equalsIgnoreCase(c1.getNodeName())) && (c0.getNodeType() == Node.TEXT_NODE) && (c2.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0) && hasOnlyWhiteSpace(c2)) { // single link surrounded by whitespace nodes return true; } } return false; } /** * This method finds all anchors below the supplied DOM node, and * creates appropriate {@link Outlink} records for each (relative to the * supplied base URL), and adds them to the outlinks * {@link ArrayList}. * *

* * Links without inner structure (tags, text, etc) are discarded, as are links * which contain only single nested links and empty text nodes (this is a * common DOM-fixup artifact, at least with nekohtml). * * @param base the canonical {@link URL} * @param outlinks the {@link ArrayList} of {@link Outlink}'s associated * with the base URL * @param node a {@link Node} under which to discover anchors */ public void getOutlinks(URL base, ArrayList outlinks, Node node) { NodeWalker walker = new NodeWalker(node); while (walker.hasNext()) { Node currentNode = walker.nextNode(); String nodeName = currentNode.getNodeName(); short nodeType = currentNode.getNodeType(); NodeList children = currentNode.getChildNodes(); int childLen = (children != null) ? children.getLength() : 0; if (nodeType == Node.ELEMENT_NODE) { nodeName = nodeName.toLowerCase(); LinkParams params = (LinkParams) linkParams.get(nodeName); if (params != null) { if (!shouldThrowAwayLink(currentNode, children, childLen, params)) { StringBuffer linkText = new StringBuffer(); getText(linkText, currentNode, true); - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java [132:415]: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - boolean abortOnNestedAnchors) { if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) { return true; } return false; } /** * This is a convinience method, equivalent to * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}. * @param sb a {@link StringBuffer} used to store content text * found beneath the DOM node... 
if any exists * @param node a DOM {@link Node} to check for content text */ public void getText(StringBuffer sb, Node node) { getText(sb, node, false); } // returns true if abortOnNestedAnchors is true and we find nested // anchors private boolean getTextHelper(StringBuffer sb, Node node, boolean abortOnNestedAnchors, int anchorDepth) { boolean abort = false; NodeWalker walker = new NodeWalker(node); while (walker.hasNext()) { Node currentNode = walker.nextNode(); String nodeName = currentNode.getNodeName(); short nodeType = currentNode.getNodeType(); Node previousSibling = currentNode.getPreviousSibling(); if (previousSibling != null && blockNodes.contains(previousSibling.getNodeName().toLowerCase())) { appendParagraphSeparator(sb); } else if (blockNodes.contains(nodeName.toLowerCase())) { appendParagraphSeparator(sb); } if ("script".equalsIgnoreCase(nodeName)) { walker.skipChildren(); } if ("style".equalsIgnoreCase(nodeName)) { walker.skipChildren(); } if (abortOnNestedAnchors && "a".equalsIgnoreCase(nodeName)) { anchorDepth++; if (anchorDepth > 1) { abort = true; break; } } if (nodeType == Node.COMMENT_NODE) { walker.skipChildren(); } if (nodeType == Node.TEXT_NODE) { // cleanup and trim the value String text = currentNode.getNodeValue(); text = text.replaceAll("\\s+", " "); text = text.trim(); if (text.length() > 0) { appendSpace(sb); sb.append(text); } else { appendParagraphSeparator(sb); } } } return abort; } /** * Conditionally append a paragraph/line break to StringBuffer unless last * character a already indicates a paragraph break. Also remove trailing space * before paragraph break. 
* * @param buffer * StringBuffer to append paragraph break */ private void appendParagraphSeparator(StringBuffer buffer) { if (buffer.length() == 0) { return; } char lastChar = buffer.charAt(buffer.length() - 1); if ('\n' != lastChar) { // remove white space before paragraph break while (lastChar == ' ') { buffer.deleteCharAt(buffer.length() - 1); lastChar = buffer.charAt(buffer.length() - 1); } if ('\n' != lastChar) { buffer.append('\n'); } } } /** * Conditionally append a space to StringBuffer unless last character is a * space or line/paragraph break. * * @param buffer * StringBuffer to append space */ private void appendSpace(StringBuffer buffer) { if (buffer.length() == 0) { return; } char lastChar = buffer.charAt(buffer.length() - 1); if (' ' != lastChar && '\n' != lastChar) { buffer.append(' '); } } /** * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will * append the content text found beneath the first title node to * the StringBuffer. * @param sb a {@link StringBuffer} used to store content text * found beneath the DOM node... if any exists * @param node a DOM {@link Node} to check for content text * @return true if a title node was found, false otherwise */ public boolean getTitle(StringBuffer sb, Node node) { NodeWalker walker = new NodeWalker(node); while (walker.hasNext()) { Node currentNode = walker.nextNode(); String nodeName = currentNode.getNodeName(); short nodeType = currentNode.getNodeType(); if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD return false; } if (nodeType == Node.ELEMENT_NODE) { if ("title".equalsIgnoreCase(nodeName)) { getText(sb, currentNode); return true; } } } return false; } /** * If Node contains a BASE tag then it's HREF is returned. 
* @param node a DOM {@link Node} to check for a BASE tag
 * @return HREF if one exists
 *
 */
public String getBase(Node node) {
  NodeWalker nodes = new NodeWalker(node);

  while (nodes.hasNext()) {
    Node candidate = nodes.nextNode();

    // only elements can be BODY or BASE tags
    if (candidate.getNodeType() != Node.ELEMENT_NODE) {
      continue;
    }

    String tagName = candidate.getNodeName();
    if ("body".equalsIgnoreCase(tagName)) { // stop after HEAD
      return null;
    }
    if (!"base".equalsIgnoreCase(tagName)) {
      continue;
    }

    // BASE tag found: look for its href attribute, ignoring case
    NamedNodeMap attributes = candidate.getAttributes();
    int attributeCount = attributes.getLength();
    int index = 0;
    while (index < attributeCount) {
      Node attribute = attributes.item(index);
      if ("href".equalsIgnoreCase(attribute.getNodeName())) {
        return attribute.getNodeValue();
      }
      index++;
    }
  }

  // no usable BASE tag was found
  return null;
}

/**
 * Tells whether the value of the given node consists solely of whitespace
 * characters. An empty value counts as whitespace-only.
 */
private boolean hasOnlyWhiteSpace(Node node) {
  String value = node.getNodeValue();
  for (char c : value.toCharArray()) {
    if (!Character.isWhitespace(c)) {
      return false;
    }
  }
  return true;
}

// this only covers a few cases of empty links that are symptomatic
// of nekohtml's DOM-fixup process...
private boolean shouldThrowAwayLink(Node node, NodeList children, int childLen, LinkParams params) { if (childLen == 0) { // this has no inner structure if (params.childLen == 0) return false; else return true; } else if ((childLen == 1) && (children.item(0).getNodeType() == Node.ELEMENT_NODE) && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) { // single nested link return true; } else if (childLen == 2) { Node c0 = children.item(0); Node c1 = children.item(1); if ((c0.getNodeType() == Node.ELEMENT_NODE) && (params.elName.equalsIgnoreCase(c0.getNodeName())) && (c1.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c1)) { // single link followed by whitespace node return true; } if ((c1.getNodeType() == Node.ELEMENT_NODE) && (params.elName.equalsIgnoreCase(c1.getNodeName())) && (c0.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)) { // whitespace node followed by single link return true; } } else if (childLen == 3) { Node c0 = children.item(0); Node c1 = children.item(1); Node c2 = children.item(2); if ((c1.getNodeType() == Node.ELEMENT_NODE) && (params.elName.equalsIgnoreCase(c1.getNodeName())) && (c0.getNodeType() == Node.TEXT_NODE) && (c2.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0) && hasOnlyWhiteSpace(c2)) { // single link surrounded by whitespace nodes return true; } } return false; } /** * This method finds all anchors below the supplied DOM node, and * creates appropriate {@link Outlink} records for each (relative to the * supplied base URL), and adds them to the outlinks * {@link ArrayList}. * *