Prefer JSON-LD metadata object, when present

pull/609/head
Dan Burzo 4 years ago
parent b1d15c0ef9
commit f208c928cb

@ -53,6 +53,7 @@ function Readability(doc, options) {
this._serializer = options.serializer || function(el) {
return el.innerHTML;
};
this._preferJSONLD = !!options.preferJSONLD || true;
// Start with all flags set
this._flags = this.FLAG_STRIP_UNLIKELYS |
@ -135,7 +136,9 @@ Readability.prototype = {
whitespace: /^\s*$/,
hasContent: /\S$/,
srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g,
b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i
b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i,
// See: https://schema.org/Article
jsonLdArticleTypes: /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/
},
DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ],
@ -244,6 +247,21 @@ Readability.prototype = {
Array.prototype.forEach.call(nodeList, fn, this);
},
/**
* Iterate over a NodeList, and return the first node that passes
* the supplied test function
*
* For convenience, the current object context is applied to the provided
* test function.
*
* @param NodeList nodeList The NodeList.
* @param Function fn The test function.
* @return void
*/
_findNode: function(nodeList, fn) {
return Array.prototype.find.call(nodeList, fn, this);
},
/**
* Iterate over a NodeList, return true if any of the provided iterate
* function calls returns true, false otherwise.
@ -1292,12 +1310,82 @@ Readability.prototype = {
});
},
/**
* Try to extract metadata from JSON-LD object.
* For now, only Schema.org objects of type Article or its subtypes are supported.
* @return Object with any metadata that could be extracted (possibly none)
*/
_getJSONLD: function (doc) {
var scripts = this._getAllNodesWithTag(doc, ["script"]);
var jsonLdElement = this._findNode(scripts, function(el) {
return el.getAttribute('type') === 'application/ld+json';
});
if (jsonLdElement) {
try {
// Strip CDATA markers if present
var content = jsonLdElement.textContent.replace(/^\s*<!\[CDATA\[\s*|\s*\]\]>\s*$/g, '');
var parsed = JSON.parse(content);
var metadata = {};
if (
!parsed['@context'] ||
!parsed['@context'].match(/^https?\:\/\/schema\.org$/)
) {
return metadata;
}
if (!parsed['@type'] && Array.isArray(parsed['@graph'])) {
parsed = parsed['@graph'].find(it =>
(it['@type'] || '').match(
this.REGEXPS.jsonLdArticleTypes
)
);
}
if (
!parsed ||
!parsed['@type'] ||
!parsed['@type'].match(this.REGEXPS.jsonLdArticleTypes)
) {
return metadata;
}
if (typeof parsed.name === 'string') {
metadata.title = parsed.name;
} else if (typeof parsed.headline === 'string') {
metadata.title = parsed.headline;
}
if (parsed.author && typeof parsed.author.name === 'string') {
metadata.byline = parsed.author.name;
}
if (typeof parsed.description === 'string') {
metadata.excerpt = parsed.description.trim();
}
if (
parsed.publisher &&
typeof parsed.publisher.name === 'string'
) {
metadata.siteName = parsed.publisher.name;
}
return metadata;
} catch (err) {
// ignore malformed JSON-LD
console.error(err);
}
}
return {};
},
/**
* Attempts to get title, byline, excerpt and site name metadata for the article.
*
* @param {Object} jsonld object containing any metadata that
* could be extracted from JSON-LD object.
*
* @return Object with optional "title", "byline", "excerpt" and "siteName" properties
*/
_getArticleMetadata: function() {
_getArticleMetadata: function(jsonld) {
var metadata = {};
var values = {};
var metaElements = this._doc.getElementsByTagName("meta");
@ -1343,7 +1431,8 @@ Readability.prototype = {
});
// get title
metadata.title = values["dc:title"] ||
metadata.title = jsonld.title ||
values["dc:title"] ||
values["dcterm:title"] ||
values["og:title"] ||
values["weibo:article:title"] ||
@ -1356,12 +1445,14 @@ Readability.prototype = {
}
// get author
metadata.byline = values["dc:creator"] ||
metadata.byline = jsonld.byline ||
values["dc:creator"] ||
values["dcterm:creator"] ||
values["author"];
// get description
metadata.excerpt = values["dc:description"] ||
metadata.excerpt = jsonld.excerpt ||
values["dc:description"] ||
values["dcterm:description"] ||
values["og:description"] ||
values["weibo:article:description"] ||
@ -1370,7 +1461,8 @@ Readability.prototype = {
values["twitter:description"];
// get site name
metadata.siteName = values["og:site_name"];
metadata.siteName = jsonld.siteName ||
values["og:site_name"];
// in many sites the meta value is escaped with HTML entities,
// so here we need to unescape it
@ -2029,12 +2121,15 @@ Readability.prototype = {
// Unwrap image from noscript
this._unwrapNoscriptImages(this._doc);
// Extract JSON-LD metadata before removing scripts
var jsonLd = this._preferJSONLD ? this._getJSONLD(this._doc) : {};
// Remove script tags from the document.
this._removeScripts(this._doc);
this._prepDocument();
var metadata = this._getArticleMetadata();
var metadata = this._getArticleMetadata(jsonLd);
this._articleTitle = metadata.title;
var articleContent = this._grabArticle();

@ -2,7 +2,7 @@
"title": "Facebook Is Tracking Me Even Though Im Not on Facebook",
"byline": "By Daniel Kahn Gillmor, Senior Staff Technologist, ACLU Speech, Privacy, and Technology Project",
"dir": "ltr",
"excerpt": "I don't use Facebook. I'm not technophobic — I'm a geek. I've been using email since the early 1990s, I have accounts on hundreds of services around the net, and I do software development and internet protocol design both for work and for fun. I believe that a globe-spanning communications network like the internet can be a positive social force, and I publish much of my own work on the open web.",
"excerpt": "Facebook collects data about people who have never even opted in. But there are ways these non-users can protect themselves.",
"readerable": true,
"siteName": "American Civil Liberties Union"
}

@ -1,5 +1,5 @@
{
"title": "Obama admits US gun laws are his 'biggest frustration' - BBC News",
"title": "Obama admits US gun laws are his 'biggest frustration'",
"byline": null,
"excerpt": "President Barack Obama tells the BBC his failure to pass \"common sense gun safety laws\" is the greatest frustration of his presidency.",
"readerable": true,

@ -2,7 +2,7 @@
"title": "Tite diz que errou ao levar taça da Libertadores a Lula em 2012",
"byline": "21.dez.2018 às 10h55",
"dir": null,
"excerpt": "Após rechaçar um encontro da seleção brasileira com o presidente eleito Jair Bolsonaro, o técnico Tite declarou que errou ao levar a taça da Copa Libertadores de 2012, conquistada pelo Corinthians, ao ex-presidente Luiz Inácio Lula da Silva.",
"excerpt": "Na ocasião, técnico do Corinthians entregou réplica do troféu ao ex-presidente",
"siteName": "Folha de S.Paulo",
"readerable": true
}

@ -204,7 +204,7 @@
"description": "Na ocasião, técnico do Corinthians entregou réplica do troféu ao ex-presidente",
"datePublished": "2018-12-21T12:55:00Z",
"image": { "@type": "ImageObject", "url": "https://f.i.uol.com.br/fotografia/2018/12/21/15454034955c1cfc67131dc_1545403495_3x2_md.jpg", "width": "768", "height": "512" }
"image": { "@type": "ImageObject", "url": "https://f.i.uol.com.br/fotografia/2018/12/21/15454034955c1cfc67131dc_1545403495_3x2_md.jpg", "width": "768", "height": "512" },
"contentLocation": {
"@type": "Place",

@ -2,7 +2,7 @@
"title": "Node.js and CPU profiling on production (in real-time without downtime)",
"byline": "Vincent Vallet",
"dir": null,
"excerpt": "Why CPU monitoring is important?",
"siteName": "Medium",
"excerpt": "How to run a CPU profiling with Node.js on your production in real-time and without interruption of service.",
"siteName": "Voodoo Engineering",
"readerable": true
}

@ -1,8 +1,8 @@
{
"title": "The 21 best movies of 2017",
"byline": "By Alissa Wilkinson@alissamarie\n Updated Jul 24, 2018, 2:15pm EDT",
"title": "How to watch the 21 best films of 2017",
"byline": "Alissa Wilkinson",
"dir": null,
"excerpt": "How to watch the greatest movies of the year, from Lady Bird and Dunkirk to Get Out and The Big Sick.",
"excerpt": "It was an extraordinary year for movies.",
"siteName": "Vox",
"readerable": true
}

@ -2,7 +2,7 @@
"title": "Screenshot : «Vape Wave», «6 Days», «Alphonse Président»…",
"byline": "Par Alexandre Hervaud et Jérémy Piette",
"dir": null,
"excerpt": "Séries, documentaires, programmes jeunesse… Retrouvez les recommandations de Libération pour savoir quoi regarder sur vos écrans cette semaine. Pour dépasser...",
"siteName": "Libération.fr",
"excerpt": "Séries, documentaires, programmes jeunesse… Retrouvez les recommandations de Libération pour savoir quoi regarder sur vos écrans cette semaine.\nPour dépasser...",
"siteName": "Libération",
"readerable": true
}

@ -1,8 +1,8 @@
{
"title": "New Zealand - Wikipedia",
"byline": "Authority control",
"title": "New Zealand",
"byline": "Contributors to Wikimedia projects",
"dir": "ltr",
"excerpt": "Coordinates: 42°S 174°E / 42°S 174°E",
"siteName": null,
"siteName": "Wikimedia Foundation, Inc.",
"readerable": true
}

@ -1,8 +1,8 @@
{
"title": "Hermitian matrix - Wikipedia",
"byline": null,
"title": "Hermitian matrix",
"byline": "Contributors to Wikimedia projects",
"dir": "ltr",
"excerpt": "In mathematics, a Hermitian matrix (or self-adjoint matrix) is a complex square matrix that is equal to its own conjugate transpose—that is, the element in the i-th row and j-th column is equal to the complex conjugate of the element in the j-th row and i-th column, for all indices i and j:",
"siteName": null,
"siteName": "Wikimedia Foundation, Inc.",
"readerable": true
}

@ -158,23 +158,23 @@ function runTestsWithItems(label, domGenerationFn, source, expectedContent, expe
});
it("should extract expected title", function() {
expect(expectedMetadata.title).eql(result.title);
expect(result.title).eql(expectedMetadata.title);
});
it("should extract expected byline", function() {
expect(expectedMetadata.byline).eql(result.byline);
expect(result.byline).eql(expectedMetadata.byline);
});
it("should extract expected excerpt", function() {
expect(expectedMetadata.excerpt).eql(result.excerpt);
expect(result.excerpt).eql(expectedMetadata.excerpt);
});
it("should extract expected site name", function() {
expect(expectedMetadata.siteName).eql(result.siteName);
expect(result.siteName).eql(expectedMetadata.siteName);
});
expectedMetadata.dir && it("should extract expected direction", function() {
expect(expectedMetadata.dir).eql(result.dir);
expect(result.dir).eql(expectedMetadata.dir);
});
});
}
@ -279,7 +279,7 @@ describe("Test pages", function() {
runTestsWithItems("jsdom", function(source) {
var doc = new JSDOM(source, {
url: uri,
url: uri
}).window.document;
removeCommentNodesRecursively(doc);
return doc;

Loading…
Cancel
Save