in src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java [57:196]
/**
 * Recursively walks the DOM rooted at {@code node} and records what the
 * {@code <meta>} and {@code <base>} elements found on the way declare.
 *
 * <p>
 * The walk does not descend into a {@code <body>} element: as soon as one is
 * reached, that subtree is skipped entirely. Every other node (element or
 * not) has its children visited, including the children of {@code <meta>}
 * and {@code <base>} elements themselves.
 *
 * @param metaTags
 *          holder that receives the extracted values and flags
 * @param node
 *          node to inspect; its children are visited recursively
 * @param currURL
 *          URL that relative URLs are resolved against; when {@code null}
 *          URL strings are parsed as absolute URLs
 */
private static final void getMetaTagsHelper(HTMLMetaTags metaTags, Node node,
    URL currURL) {
  if (node.getNodeType() == Node.ELEMENT_NODE) {
    String elementName = node.getNodeName();
    if ("body".equalsIgnoreCase(elementName)) {
      // META tags should not be under body
      return;
    }
    if ("meta".equalsIgnoreCase(elementName)) {
      processMetaElement(metaTags, node, currURL);
    } else if ("base".equalsIgnoreCase(elementName)) {
      processBaseElement(metaTags, node, currURL);
    }
  }
  NodeList children = node.getChildNodes();
  if (children != null) {
    int len = children.getLength();
    for (int i = 0; i < len; i++) {
      getMetaTagsHelper(metaTags, children.item(i), currURL);
    }
  }
}

/**
 * Reads the {@code name}, {@code http-equiv} and {@code content} attributes
 * of a {@code <meta>} element (attribute names are matched
 * case-insensitively) and forwards them to the matching handler. An element
 * without a {@code content} attribute is ignored.
 */
private static void processMetaElement(HTMLMetaTags metaTags, Node meta,
    URL currURL) {
  NamedNodeMap attrs = meta.getAttributes();
  Node nameNode = null;
  Node equivNode = null;
  Node contentNode = null;
  // Retrieves name, http-equiv and content attributes
  for (int i = 0; i < attrs.getLength(); i++) {
    Node attr = attrs.item(i);
    String attrName = attr.getNodeName().toLowerCase(Locale.ROOT);
    if (attrName.equals("name")) {
      nameNode = attr;
    } else if (attrName.equals("http-equiv")) {
      equivNode = attr;
    } else if (attrName.equals("content")) {
      contentNode = attr;
    }
  }
  if (contentNode == null) {
    // neither a named nor an http-equiv tag carries information without
    // content
    return;
  }
  String content = contentNode.getNodeValue();
  // A tag may carry both name and http-equiv; both are then processed,
  // name first.
  if (nameNode != null) {
    applyNamedMeta(metaTags,
        nameNode.getNodeValue().toLowerCase(Locale.ROOT), content, currURL);
  }
  if (equivNode != null) {
    applyHttpEquivMeta(metaTags,
        equivNode.getNodeValue().toLowerCase(Locale.ROOT), content, currURL);
  }
}

/**
 * Handles {@code <meta name=... content=...>}: stores the pair in the
 * general tags and interprets the names {@code robots} (via
 * {@code Nutch.ROBOTS_METATAG}), {@code pragma}, {@code refresh} and
 * {@code content-location}.
 *
 * @param name
 *          lower-cased value of the name attribute
 * @param content
 *          value of the content attribute, original case
 */
private static void applyNamedMeta(HTMLMetaTags metaTags, String name,
    String content, URL currURL) {
  metaTags.getGeneralTags().add(name, content);
  if (Nutch.ROBOTS_METATAG.equals(name)) {
    applyRobotsDirectives(metaTags, content.toLowerCase(Locale.ROOT));
  }
  // meta names added/transformed by Tika
  else if (name.equals("pragma")) {
    if (content.toLowerCase(Locale.ROOT).contains("no-cache")) {
      metaTags.setNoCache();
    }
  } else if (name.equals("refresh")) {
    // NOTE(review): the content is lower-cased here but passed unchanged in
    // the http-equiv case; kept as is - confirm against setRefresh whether
    // the difference is intended.
    setRefresh(metaTags, content.toLowerCase(Locale.ROOT), currURL);
  } else if (name.equals("content-location")) {
    URL url = resolveUrl(currURL, content);
    if (url != null) {
      metaTags.setBaseHref(url);
    }
    // otherwise: malformed URL, base-href not set
  }
}

/**
 * Sets the no-index / no-follow / no-cache flags according to a lower-cased
 * robots directive string. Directives are detected by substring search, so
 * separators and surrounding whitespace do not matter. The directive
 * {@code all} requires no action and is therefore not looked up.
 */
private static void applyRobotsDirectives(HTMLMetaTags metaTags,
    String directives) {
  if (directives.contains("none")) {
    metaTags.setNoIndex();
    metaTags.setNoFollow();
  }
  if (directives.contains("noindex")) {
    metaTags.setNoIndex();
  }
  if (directives.contains("nofollow")) {
    metaTags.setNoFollow();
  }
  if (directives.contains("noarchive")) {
    metaTags.setNoCache();
  }
}

/**
 * Handles {@code <meta http-equiv=... content=...>}: stores the pair in the
 * http-equiv tags and interprets {@code pragma} and {@code refresh}.
 *
 * @param name
 *          lower-cased value of the http-equiv attribute
 * @param content
 *          value of the content attribute, original case
 */
private static void applyHttpEquivMeta(HTMLMetaTags metaTags, String name,
    String content, URL currURL) {
  metaTags.getHttpEquivTags().setProperty(name, content);
  if ("pragma".equals(name)) {
    if (content.toLowerCase(Locale.ROOT).contains("no-cache")) {
      metaTags.setNoCache();
    }
  } else if ("refresh".equals(name)) {
    setRefresh(metaTags, content, currURL);
  }
}

/**
 * Handles a {@code <base>} element: if it has an {@code href} attribute that
 * yields a valid URL, that URL becomes the base href of {@code metaTags}.
 */
private static void processBaseElement(HTMLMetaTags metaTags, Node base,
    URL currURL) {
  Node hrefNode = base.getAttributes().getNamedItem("href");
  if (hrefNode == null) {
    return;
  }
  URL url = resolveUrl(currURL, hrefNode.getNodeValue());
  if (url != null) {
    metaTags.setBaseHref(url);
  }
}

/**
 * Builds a URL from {@code urlString}, resolving it against {@code currURL}
 * unless that is {@code null}.
 *
 * @return the URL, or {@code null} if {@code urlString} is malformed
 */
private static URL resolveUrl(URL currURL, String urlString) {
  try {
    if (currURL == null) {
      return new URL(urlString);
    }
    return new URL(currURL, urlString);
  } catch (MalformedURLException ignored) {
    // best effort: a malformed URL simply leaves the base href untouched
    return null;
  }
}