Map parseLogsFromHtml()

in src/main/java/com/microsoft/azure/spark/tools/job/YarnContainerLogFetcher.java [335:390]


    /**
     * Extracts log contents, keyed by log type, from a container log web page.
     *
     * <p>The element following the one with id {@code navcell} is scanned child by child:
     * a {@code <p>} whose first child node yields a log type (via {@code findLogTypeDomNode})
     * selects the type for the next {@code <pre>}, and the first child node of that
     * {@code <pre>} is taken as the log text for the type. Each selected type is consumed by
     * at most one stored {@code <pre>}; further {@code <pre>} elements are ignored until
     * another type is selected.
     *
     * @param defaultType the log type to use for a {@code <pre>} met before any {@code <p>}
     *                    has selected a type; when {@code null}, such {@code <pre>} elements
     *                    are skipped
     * @param webPage     the HTML page source to parse
     * @return a map from log type to unescaped log text; an empty map when the page has no
     *         {@code navcell} element or that element has no following sibling element
     */
    Map<String, String> parseLogsFromHtml(final String defaultType, final String webPage) {
        final Element navCell = Jsoup.parse(webPage).getElementById("navcell");
        final Element logCell = (navCell == null) ? null : navCell.nextElementSibling();
        final List<Element> logElements = (logCell == null) ? null : logCell.children();

        if (logElements == null) {
            return emptyMap();
        }

        final Map<String, String> logsByType = new HashMap<>();

        // The type that the next `<pre>` belongs to; null means "no type pending"
        String pendingLogType = defaultType;

        for (final Element element : logElements) {
            final List<Node> childNodes = element.childNodes();
            final String tag = element.tagName();

            if (StringUtils.equalsIgnoreCase(tag, "p")) {
                if (!childNodes.isEmpty()) {
                    // In history server, the log type is announced by a paragraph in the page
                    final String announcedType = findLogTypeDomNode(childNodes.get(0));

                    if (StringUtils.isNotEmpty(announcedType)) {
                        // Drop any log already stored for this type and wait for its `<pre>`
                        logsByType.remove(announcedType);
                        pendingLogType = announcedType;
                    }
                }

                continue;
            }

            if (pendingLogType == null || !StringUtils.equalsIgnoreCase(tag, "pre")) {
                continue;
            }

            if (childNodes.isEmpty()) {
                // `<pre></pre>` is an empty log: forget the pending type without storing anything
                pendingLogType = null;
                continue;
            }

            // Only the first child node of the `<pre>` is taken as the log, and HTML escape
            // codes in it, such as `2&gt;&lt;LOG_DIR&gt`, have to be unescaped
            final String logText = StringEscapeUtils.unescapeHtml4(String.valueOf(childNodes.get(0)));

            // Only non-empty logs are stored; storing one consumes the pending type
            if (StringUtils.isNotEmpty(logText)) {
                logsByType.put(pendingLogType, logText);
                pendingLogType = null;
            }
        }

        return logsByType;
    }