in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java [114:181]
/**
 * Handles the start of an HTML element.
 *
 * <p>In order, this method:
 * <ol>
 *   <li>records the {@code lang} attribute of the {@code HTML} element as
 *       {@link Metadata#CONTENT_LANGUAGE};</li>
 *   <li>updates the script/title/body/discard nesting counters;</li>
 *   <li>while no body has been entered and nothing is being discarded,
 *       processes {@code META}, {@code BASE}, {@code LINK} and
 *       {@code SCRIPT} elements;</li>
 *   <li>while inside the body and nothing is being discarded, emits the
 *       element under the name returned by {@code mapper.mapSafeElement};</li>
 *   <li>clears the title buffer;</li>
 *   <li>hands {@code data:} URIs found in a {@code src} attribute and
 *       non-blank {@code IFRAME} {@code srcdoc} values to their handlers.</li>
 * </ol>
 *
 * <p>Element names are compared against upper-case literals, so this method
 * presumably expects the upstream parser to report upper-case names
 * (TODO confirm against the parser configuration).
 *
 * @param uri   namespace URI of the element (not used here)
 * @param local local name of the element (not used here)
 * @param name  element name that all comparisons in this method are made against
 * @param atts  attributes of the element; only valid for the duration of this call
 * @throws SAXException if one of the downstream handlers fails
 */
public void startElement(String uri, String local, String name, Attributes atts)
        throws SAXException {
    if ("HTML".equals(name)) {
        String lang = atts.getValue("lang");
        if (lang != null) {
            metadata.set(Metadata.CONTENT_LANGUAGE, lang);
        }
    }

    // Nesting counters: once a tracked element has been entered, every
    // nested start tag bumps the counter as well.
    if ("SCRIPT".equals(name)) {
        scriptLevel++;
    }
    if ("TITLE".equals(name) || titleLevel > 0) {
        titleLevel++;
    }
    if ("BODY".equals(name) || "FRAMESET".equals(name) || bodyLevel > 0) {
        bodyLevel++;
    }
    if (mapper.isDiscardElement(name) || discardLevel > 0) {
        discardLevel++;
    }

    // Elements seen before any BODY/FRAMESET and outside discarded content.
    if (bodyLevel == 0 && discardLevel == 0) {
        if ("META".equals(name)) {
            String content = atts.getValue("content");
            if (content != null) {
                // TIKA-478: For cases where we have either a name or
                // "http-equiv", assume that XHTMLContentHandler will emit
                // these in the <head>, thus passing them through safely.
                String httpEquiv = atts.getValue("http-equiv");
                String metaName = atts.getValue("name");
                String property = atts.getValue("property");
                if (httpEquiv != null) {
                    addHtmlMetadata(httpEquiv, content);
                } else if (metaName != null) {
                    // Record the meta tag in the metadata
                    addHtmlMetadata(metaName, content);
                } else if (property != null) {
                    // TIKA-983: Handle <meta property="og:xxx" content="yyy" /> tags
                    metadata.add(HTML.PREFIX_HTML_META + property, content);
                }
            }
        } else if ("BASE".equals(name)) {
            String href = atts.getValue("href");
            if (href != null) {
                startElementWithSafeAttributes("base", atts);
                xhtml.endElement("base");
                metadata.set(Metadata.CONTENT_LOCATION, resolve(href));
            }
        } else if ("LINK".equals(name)) {
            startElementWithSafeAttributes("link", atts);
            xhtml.endElement("link");
        } else if ("SCRIPT".equals(name)) {
            // The SAX contract leaves the value of the Attributes argument
            // undefined once startElement returns (parsers may recycle the
            // instance), so keep a snapshot instead of the live reference.
            // NOTE(review): scriptAtts is only refreshed for SCRIPT elements
            // seen before the body; confirm this is intentional.
            scriptAtts = new org.xml.sax.helpers.AttributesImpl(atts);
        }
    }

    // Elements inside the body: emit only those the mapper considers safe.
    if (bodyLevel > 0 && discardLevel == 0) {
        String safe = mapper.mapSafeElement(name);
        if (safe != null) {
            startElementWithSafeAttributes(safe, atts);
        }
    }

    // The title buffer is reset on every start tag.
    title.setLength(0);

    String src = atts.getValue("src");
    if (src != null && src.startsWith("data:")) {
        //don't extract data if we're in a script
        //and the user doesn't want to extract scripts
        if (scriptLevel == 0 || extractScripts) {
            handleDataURIScheme(src);
        }
    }

    if ("IFRAME".equals(name)) {
        String srcDoc = atts.getValue("srcdoc");
        if (!StringUtils.isBlank(srcDoc)) {
            handleSrcDoc(srcDoc);
        }
    }
}