in Readability.js [792:894]
_prepArticle(articleContent) {
this._cleanStyles(articleContent);
// Check for data tables before we continue, to avoid removing items in
// those tables, which will often be isolated even though they're
// visually linked to other content-ful elements (text, images, etc.).
this._markDataTables(articleContent);
this._fixLazyImages(articleContent);
// Clean out junk from the article content
this._cleanConditionally(articleContent, "form");
this._cleanConditionally(articleContent, "fieldset");
this._clean(articleContent, "object");
this._clean(articleContent, "embed");
this._clean(articleContent, "footer");
this._clean(articleContent, "link");
this._clean(articleContent, "aside");
// Clean out elements with little content that have "share" in their id/class combinations from final top candidates,
// which means we don't remove the top candidates even they have "share".
var shareElementThreshold = this.DEFAULT_CHAR_THRESHOLD;
this._forEachNode(articleContent.children, function (topCandidate) {
this._cleanMatchedNodes(topCandidate, function (node, matchString) {
return (
this.REGEXPS.shareElements.test(matchString) &&
node.textContent.length < shareElementThreshold
);
});
});
this._clean(articleContent, "iframe");
this._clean(articleContent, "input");
this._clean(articleContent, "textarea");
this._clean(articleContent, "select");
this._clean(articleContent, "button");
this._cleanHeaders(articleContent);
// Do these last as the previous stuff may have removed junk
// that will affect these
this._cleanConditionally(articleContent, "table");
this._cleanConditionally(articleContent, "ul");
this._cleanConditionally(articleContent, "div");
// replace H1 with H2 as H1 should be only title that is displayed separately
this._replaceNodeTags(
this._getAllNodesWithTag(articleContent, ["h1"]),
"h2"
);
// Remove extra paragraphs
this._removeNodes(
this._getAllNodesWithTag(articleContent, ["p"]),
function (paragraph) {
// At this point, nasty iframes have been removed; only embedded video
// ones remain.
var contentElementCount = this._getAllNodesWithTag(paragraph, [
"img",
"embed",
"object",
"iframe",
]).length;
return (
contentElementCount === 0 && !this._getInnerText(paragraph, false)
);
}
);
this._forEachNode(
this._getAllNodesWithTag(articleContent, ["br"]),
function (br) {
var next = this._nextNode(br.nextSibling);
if (next && next.tagName == "P") {
br.remove();
}
}
);
// Remove single-cell tables
this._forEachNode(
this._getAllNodesWithTag(articleContent, ["table"]),
function (table) {
var tbody = this._hasSingleTagInsideElement(table, "TBODY")
? table.firstElementChild
: table;
if (this._hasSingleTagInsideElement(tbody, "TR")) {
var row = tbody.firstElementChild;
if (this._hasSingleTagInsideElement(row, "TD")) {
var cell = row.firstElementChild;
cell = this._setNodeTag(
cell,
this._everyNode(cell.childNodes, this._isPhrasingContent)
? "P"
: "DIV"
);
table.parentNode.replaceChild(cell, table);
}
}
}
);
},