in Readability.js [573:661]
_getArticleTitle() {
var doc = this._doc;
var curTitle = "";
var origTitle = "";
try {
curTitle = origTitle = doc.title.trim();
// If they had an element with id "title" in their HTML
if (typeof curTitle !== "string") {
curTitle = origTitle = this._getInnerText(
doc.getElementsByTagName("title")[0]
);
}
} catch (e) {
/* ignore exceptions setting the title. */
}
var titleHadHierarchicalSeparators = false;
function wordCount(str) {
return str.split(/\s+/).length;
}
// If there's a separator in the title, first remove the final part
const titleSeparators = /\|\-–—\\\/>»/.source;
if (new RegExp(`\\s[${titleSeparators}]\\s`).test(curTitle)) {
titleHadHierarchicalSeparators = /\s[\\\/>»]\s/.test(curTitle);
let allSeparators = Array.from(
origTitle.matchAll(new RegExp(`\\s[${titleSeparators}]\\s`, "gi"))
);
curTitle = origTitle.substring(0, allSeparators.pop().index);
// If the resulting title is too short, remove the first part instead:
if (wordCount(curTitle) < 3) {
curTitle = origTitle.replace(
new RegExp(`^[^${titleSeparators}]*[${titleSeparators}]`, "gi"),
""
);
}
} else if (curTitle.includes(": ")) {
// Check if we have an heading containing this exact string, so we
// could assume it's the full title.
var headings = this._getAllNodesWithTag(doc, ["h1", "h2"]);
var trimmedTitle = curTitle.trim();
var match = this._someNode(headings, function (heading) {
return heading.textContent.trim() === trimmedTitle;
});
// If we don't, let's extract the title out of the original title string.
if (!match) {
curTitle = origTitle.substring(origTitle.lastIndexOf(":") + 1);
// If the title is now too short, try the first colon instead:
if (wordCount(curTitle) < 3) {
curTitle = origTitle.substring(origTitle.indexOf(":") + 1);
// But if we have too many words before the colon there's something weird
// with the titles and the H tags so let's just use the original title instead
} else if (wordCount(origTitle.substr(0, origTitle.indexOf(":"))) > 5) {
curTitle = origTitle;
}
}
} else if (curTitle.length > 150 || curTitle.length < 15) {
var hOnes = doc.getElementsByTagName("h1");
if (hOnes.length === 1) {
curTitle = this._getInnerText(hOnes[0]);
}
}
curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " ");
// If we now have 4 words or fewer as our title, and either no
// 'hierarchical' separators (\, /, > or ») were found in the original
// title or we decreased the number of words by more than 1 word, use
// the original title.
var curTitleWordCount = wordCount(curTitle);
if (
curTitleWordCount <= 4 &&
(!titleHadHierarchicalSeparators ||
curTitleWordCount !=
wordCount(
origTitle.replace(new RegExp(`\\s[${titleSeparators}]\\s`, "g"), "")
) -
1)
) {
curTitle = origTitle;
}
return curTitle;
},