in Readability.js [2444:2642]
_cleanConditionally(e, tag) {
if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
return;
}
// Gather counts for other typical elements embedded within.
// Traverse backwards so we can remove nodes at the same time
// without effecting the traversal.
//
// TODO: Consider taking into account original contentScore here.
this._removeNodes(this._getAllNodesWithTag(e, [tag]), function (node) {
// First check if this node IS data table, in which case don't remove it.
var isDataTable = function (t) {
return t._readabilityDataTable;
};
var isList = tag === "ul" || tag === "ol";
if (!isList) {
var listLength = 0;
var listNodes = this._getAllNodesWithTag(node, ["ul", "ol"]);
this._forEachNode(
listNodes,
list => (listLength += this._getInnerText(list).length)
);
isList = listLength / this._getInnerText(node).length > 0.9;
}
if (tag === "table" && isDataTable(node)) {
return false;
}
// Next check if we're inside a data table, in which case don't remove it as well.
if (this._hasAncestorTag(node, "table", -1, isDataTable)) {
return false;
}
if (this._hasAncestorTag(node, "code")) {
return false;
}
// keep element if it has a data tables
if (
[...node.getElementsByTagName("table")].some(
tbl => tbl._readabilityDataTable
)
) {
return false;
}
var weight = this._getClassWeight(node);
this.log("Cleaning Conditionally", node);
var contentScore = 0;
if (weight + contentScore < 0) {
return true;
}
if (this._getCharCount(node, ",") < 10) {
// If there are not very many commas, and the number of
// non-paragraph elements is more than paragraphs or other
// ominous signs, remove the element.
var p = node.getElementsByTagName("p").length;
var img = node.getElementsByTagName("img").length;
var li = node.getElementsByTagName("li").length - 100;
var input = node.getElementsByTagName("input").length;
var headingDensity = this._getTextDensity(node, [
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
]);
var embedCount = 0;
var embeds = this._getAllNodesWithTag(node, [
"object",
"embed",
"iframe",
]);
for (var i = 0; i < embeds.length; i++) {
// If this embed has attribute that matches video regex, don't delete it.
for (var j = 0; j < embeds[i].attributes.length; j++) {
if (this._allowedVideoRegex.test(embeds[i].attributes[j].value)) {
return false;
}
}
// For embed with <object> tag, check inner HTML as well.
if (
embeds[i].tagName === "object" &&
this._allowedVideoRegex.test(embeds[i].innerHTML)
) {
return false;
}
embedCount++;
}
var innerText = this._getInnerText(node);
// toss any node whose inner text contains nothing but suspicious words
if (
this.REGEXPS.adWords.test(innerText) ||
this.REGEXPS.loadingWords.test(innerText)
) {
return true;
}
var contentLength = innerText.length;
var linkDensity = this._getLinkDensity(node);
var textishTags = ["SPAN", "LI", "TD"].concat(
Array.from(this.DIV_TO_P_ELEMS)
);
var textDensity = this._getTextDensity(node, textishTags);
var isFigureChild = this._hasAncestorTag(node, "figure");
// apply shadiness checks, then check for exceptions
const shouldRemoveNode = () => {
const errs = [];
if (!isFigureChild && img > 1 && p / img < 0.5) {
errs.push(`Bad p to img ratio (img=${img}, p=${p})`);
}
if (!isList && li > p) {
errs.push(`Too many li's outside of a list. (li=${li} > p=${p})`);
}
if (input > Math.floor(p / 3)) {
errs.push(`Too many inputs per p. (input=${input}, p=${p})`);
}
if (
!isList &&
!isFigureChild &&
headingDensity < 0.9 &&
contentLength < 25 &&
(img === 0 || img > 2) &&
linkDensity > 0
) {
errs.push(
`Suspiciously short. (headingDensity=${headingDensity}, img=${img}, linkDensity=${linkDensity})`
);
}
if (
!isList &&
weight < 25 &&
linkDensity > 0.2 + this._linkDensityModifier
) {
errs.push(
`Low weight and a little linky. (linkDensity=${linkDensity})`
);
}
if (weight >= 25 && linkDensity > 0.5 + this._linkDensityModifier) {
errs.push(
`High weight and mostly links. (linkDensity=${linkDensity})`
);
}
if ((embedCount === 1 && contentLength < 75) || embedCount > 1) {
errs.push(
`Suspicious embed. (embedCount=${embedCount}, contentLength=${contentLength})`
);
}
if (img === 0 && textDensity === 0) {
errs.push(
`No useful content. (img=${img}, textDensity=${textDensity})`
);
}
if (errs.length) {
this.log("Checks failed", errs);
return true;
}
return false;
};
var haveToRemove = shouldRemoveNode();
// Allow simple lists of images to remain in pages
if (isList && haveToRemove) {
for (var x = 0; x < node.children.length; x++) {
let child = node.children[x];
// Don't filter in lists with li's that contain more than one child
if (child.children.length > 1) {
return haveToRemove;
}
}
let li_count = node.getElementsByTagName("li").length;
// Only allow the list to remain if every li contains an image
if (img == li_count) {
return false;
}
}
return haveToRemove;
}
return false;
});
},