in src/main/java/com/microsoft/azure/spark/tools/job/YarnContainerLogFetcher.java [335:390]
/**
 * Extracts logs, keyed by log type, from an HTML log page.
 *
 * <p>The scanned elements are the children of the element that immediately follows the
 * element with id {@code navcell}. They are visited in document order:
 * <ul>
 *   <li>a {@code <p>} element whose first child node is recognized by
 *       {@code findLogTypeDomNode} announces the log type of the next {@code <pre>};
 *       any log already stored for that type is discarded;</li>
 *   <li>a {@code <pre>} element seen while a log type is pending supplies the log: its first
 *       child node, rendered as a string and HTML-unescaped, is stored under that type.
 *       The pending type is then cleared, so following {@code <pre>} elements are ignored
 *       until another log type paragraph is found.</li>
 * </ul>
 *
 * @param defaultType the log type to assume for a {@code <pre>} that appears before any
 *                    log type paragraph; when {@code null} such a {@code <pre>} is skipped
 * @param webPage     the HTML source of the log page
 * @return a map from log type to log content; an empty map when the page has no
 *         {@code navcell} element or that element has no following sibling element
 */
Map<String, String> parseLogsFromHtml(final String defaultType, final String webPage) {
    final Document doc = Jsoup.parse(webPage);
    final List<Element> logElements = Optional.ofNullable(doc.getElementById("navcell"))
            .map(Element::nextElementSibling)
            .map(Element::children)
            .orElse(null);

    if (logElements == null) {
        return emptyMap();
    }

    final HashMap<String, String> logTypeMap = new HashMap<>();

    // The log type waiting for its <pre> content; null means the next <pre> is not wanted.
    String pendingLogType = defaultType;

    for (final Element element : logElements) {
        final List<Node> children = element.childNodes();
        final boolean isParagraph = StringUtils.equalsIgnoreCase(element.tagName(), "p");
        final boolean isWantedPre = !isParagraph
                && StringUtils.equalsIgnoreCase(element.tagName(), "pre")
                && pendingLogType != null;

        if (isParagraph) {
            if (children.isEmpty()) {
                continue;
            }

            // In history server, the log type is announced by a paragraph in the page
            final String logTypeFound = findLogTypeDomNode(children.get(0));
            if (StringUtils.isNotEmpty(logTypeFound)) {
                // A new announcement supersedes any log saved earlier for the same type
                logTypeMap.remove(logTypeFound);
                pendingLogType = logTypeFound;
            }
            continue;
        }

        if (!isWantedPre) {
            continue;
        }

        if (children.isEmpty()) {
            // Found <pre></pre>: an empty log, so the pending log type is dropped
            pendingLogType = null;
            continue;
        }

        // Only the first child node of the <pre> is taken as the log. Its string form
        // carries HTML escape codes (e.g. `&gt;`, `&lt;`), which have to be unescaped.
        final String logs = StringEscapeUtils.unescapeHtml4(String.valueOf(children.get(0)));

        // Only non-empty logs are stored; storing one consumes the pending log type
        if (StringUtils.isNotEmpty(logs)) {
            logTypeMap.put(pendingLogType, logs);
            pendingLogType = null;
        }
    }

    return logTypeMap;
}