in Readability.js [1642:1757]
_getJSONLD(doc) {
var scripts = this._getAllNodesWithTag(doc, ["script"]);
var metadata;
this._forEachNode(scripts, function (jsonLdElement) {
if (
!metadata &&
jsonLdElement.getAttribute("type") === "application/ld+json"
) {
try {
// Strip CDATA markers if present
var content = jsonLdElement.textContent.replace(
/^\s*<!\[CDATA\[|\]\]>\s*$/g,
""
);
var parsed = JSON.parse(content);
if (Array.isArray(parsed)) {
parsed = parsed.find(it => {
return (
it["@type"] &&
it["@type"].match(this.REGEXPS.jsonLdArticleTypes)
);
});
if (!parsed) {
return;
}
}
var schemaDotOrgRegex = /^https?\:\/\/schema\.org\/?$/;
var matches =
(typeof parsed["@context"] === "string" &&
parsed["@context"].match(schemaDotOrgRegex)) ||
(typeof parsed["@context"] === "object" &&
typeof parsed["@context"]["@vocab"] == "string" &&
parsed["@context"]["@vocab"].match(schemaDotOrgRegex));
if (!matches) {
return;
}
if (!parsed["@type"] && Array.isArray(parsed["@graph"])) {
parsed = parsed["@graph"].find(it => {
return (it["@type"] || "").match(this.REGEXPS.jsonLdArticleTypes);
});
}
if (
!parsed ||
!parsed["@type"] ||
!parsed["@type"].match(this.REGEXPS.jsonLdArticleTypes)
) {
return;
}
metadata = {};
if (
typeof parsed.name === "string" &&
typeof parsed.headline === "string" &&
parsed.name !== parsed.headline
) {
// we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz
// put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either
// "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default.
var title = this._getArticleTitle();
var nameMatches = this._textSimilarity(parsed.name, title) > 0.75;
var headlineMatches =
this._textSimilarity(parsed.headline, title) > 0.75;
if (headlineMatches && !nameMatches) {
metadata.title = parsed.headline;
} else {
metadata.title = parsed.name;
}
} else if (typeof parsed.name === "string") {
metadata.title = parsed.name.trim();
} else if (typeof parsed.headline === "string") {
metadata.title = parsed.headline.trim();
}
if (parsed.author) {
if (typeof parsed.author.name === "string") {
metadata.byline = parsed.author.name.trim();
} else if (
Array.isArray(parsed.author) &&
parsed.author[0] &&
typeof parsed.author[0].name === "string"
) {
metadata.byline = parsed.author
.filter(function (author) {
return author && typeof author.name === "string";
})
.map(function (author) {
return author.name.trim();
})
.join(", ");
}
}
if (typeof parsed.description === "string") {
metadata.excerpt = parsed.description.trim();
}
if (parsed.publisher && typeof parsed.publisher.name === "string") {
metadata.siteName = parsed.publisher.name.trim();
}
if (typeof parsed.datePublished === "string") {
metadata.datePublished = parsed.datePublished.trim();
}
} catch (err) {
this.log(err.message);
}
}
});
return metadata ? metadata : {};
},