From 8ca8a5f7e5c5026f717e05a10cec000f6f26b7a8 Mon Sep 17 00:00:00 2001 From: Sarah Doire Date: Thu, 6 Oct 2022 13:06:50 -0500 Subject: [PATCH] feat: add postlight.com custom extractor (#695) --- .gitignore | 2 + dist/mercury.js | 32 +- fixtures/postlight.com/1664999338243.html | 650 ++++++++++++++++++ src/extractors/custom/index.js | 1 + src/extractors/custom/postlight.com/index.js | 36 + .../custom/postlight.com/index.test.js | 111 +++ 6 files changed, 831 insertions(+), 1 deletion(-) create mode 100644 fixtures/postlight.com/1664999338243.html create mode 100644 src/extractors/custom/postlight.com/index.js create mode 100644 src/extractors/custom/postlight.com/index.test.js diff --git a/.gitignore b/.gitignore index f2984975..b839f4a6 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,5 @@ dist/mercury_test.js.map dist/mercury_test.web.js tmp/artifacts test-output.json +.tool-versions +.yarnrc.yml diff --git a/dist/mercury.js b/dist/mercury.js index 75df72d6..8ca35c9d 100644 --- a/dist/mercury.js +++ b/dist/mercury.js @@ -6139,6 +6139,35 @@ var SpektrumExtractor = { } }; +var PostlightComExtractor = { + domain: 'postlight.com', + title: { + selectors: [['meta[name="og:title"]', 'value']] + }, + author: { + selectors: [['meta[name="parsely-author"]', 'value']] + }, + date_published: { + selectors: [['meta[name="article:published_time"]', 'value']] + }, + dek: { + selectors: ['h2.single-hero__abstract'] + }, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + content: { + selectors: ['article.body'], + // Is there anything in the content you selected that needs transformed + // before it's consumable content? E.g., unusual lazy loaded images + transforms: {}, + // Is there anything that is in the result that shouldn't be? + // The clean selectors will remove anything that matches from + // the result + clean: ['section.pl-post-link'] + } +}; + var CustomExtractors = /*#__PURE__*/Object.freeze({ @@ -6285,7 +6314,8 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({ WwwEngadgetComExtractor: WwwEngadgetComExtractor, ArstechnicaComExtractor: ArstechnicaComExtractor, WwwNdtvComExtractor: WwwNdtvComExtractor, - SpektrumExtractor: SpektrumExtractor + SpektrumExtractor: SpektrumExtractor, + PostlightComExtractor: PostlightComExtractor }); var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) { diff --git a/fixtures/postlight.com/1664999338243.html b/fixtures/postlight.com/1664999338243.html new file mode 100644 index 00000000..35cd6f8a --- /dev/null +++ b/fixtures/postlight.com/1664999338243.html @@ -0,0 +1,650 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Three Ways to Be the Thermostat — Postlight — Digital Strategy, Design and Engineering + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + + + + + Get in touch +
+
+
+
+ +
+
+

Digital Transformation? In This Economy? Join Postlight on 10/20 for a panel discussion on Digital Strategy. RSVP.

+
+
+
+
+ +
+ + + + + + + +
+ + + + + + + + + + + + + + diff --git a/src/extractors/custom/index.js b/src/extractors/custom/index.js index 0fe77004..b91c2429 100644 --- a/src/extractors/custom/index.js +++ b/src/extractors/custom/index.js @@ -142,3 +142,4 @@ export * from './www.engadget.com'; export * from './arstechnica.com'; export * from './www.ndtv.com'; export * from './www.spektrum.de'; +export * from './postlight.com'; diff --git a/src/extractors/custom/postlight.com/index.js b/src/extractors/custom/postlight.com/index.js new file mode 100644 index 00000000..df7145f6 --- /dev/null +++ b/src/extractors/custom/postlight.com/index.js @@ -0,0 +1,36 @@ +export const PostlightComExtractor = { + domain: 'postlight.com', + + title: { + selectors: [['meta[name="og:title"]', 'value']], + }, + + author: { + selectors: [['meta[name="parsely-author"]', 'value']], + }, + + date_published: { + selectors: [['meta[name="article:published_time"]', 'value']], + }, + + dek: { + selectors: ['h2.single-hero__abstract'], + }, + + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']], + }, + + content: { + selectors: ['article.body'], + + // Is there anything in the content you selected that needs transformed + // before it's consumable content? E.g., unusual lazy loaded images + transforms: {}, + + // Is there anything that is in the result that shouldn't be? + // The clean selectors will remove anything that matches from + // the result + clean: ['section.pl-post-link'], + }, +}; diff --git a/src/extractors/custom/postlight.com/index.test.js b/src/extractors/custom/postlight.com/index.test.js new file mode 100644 index 00000000..8be71563 --- /dev/null +++ b/src/extractors/custom/postlight.com/index.test.js @@ -0,0 +1,111 @@ +import assert from 'assert'; +import URL from 'url'; +import cheerio from 'cheerio'; + +import Mercury from 'mercury'; +import getExtractor from 'extractors/get-extractor'; +import { excerptContent } from 'utils/text'; + +const fs = require('fs'); + +describe('PostlightComExtractor', () => { + describe('initial test case', () => { + let result; + let url; + beforeAll(() => { + url = 'https://postlight.com/insights/three-ways-to-be-the-thermostat'; + const html = fs.readFileSync( + './fixtures/postlight.com/1664999338243.html' + ); + result = Mercury.parse(url, { html, fallback: false }); + }); + + it('is selected properly', () => { + // This test should be passing by default. + // It sanity checks that the correct parser + // is being selected for URLs from this domain + const extractor = getExtractor(url); + assert.equal(extractor.domain, URL.parse(url).hostname); + }); + + it('returns the title', async () => { + // To pass this test, fill out the title selector + // in ./src/extractors/custom/postlight.com/index.js. + const { title } = await result; + + // Update these values with the expected values from + // the article. + assert.equal(title, `Three Ways to Be the Thermostat`); + }); + + it('returns the author', async () => { + // To pass this test, fill out the author selector + // in ./src/extractors/custom/postlight.com/index.js. + const { author } = await result; + + // Update these values with the expected values from + // the article. + assert.equal(author, 'Gina Trapani'); + }); + + it('returns the date_published', async () => { + // To pass this test, fill out the date_published selector + // in ./src/extractors/custom/postlight.com/index.js. + const { date_published } = await result; + + // Update these values with the expected values from + // the article. + assert.equal(date_published, `2022-10-05T16:00:00.000Z`); + }); + + it('returns the dek', async () => { + // To pass this test, fill out the dek selector + // in ./src/extractors/custom/postlight.com/index.js. + const { dek } = await result; + + // Update these values with the expected values from + // the article. + assert.equal( + dek, + 'Great leaders set the temperature, especially when stress is high.' + ); + }); + + it('returns the lead_image_url', async () => { + // To pass this test, fill out the lead_image_url selector + // in ./src/extractors/custom/postlight.com/index.js. + const { lead_image_url } = await result; + + // Update these values with the expected values from + // the article. + assert.equal( + lead_image_url, + `https://postlight.com/wp-content/uploads/2022/09/Be-The-Thermostat-1200-1.png?fit=1200%2C675` + ); + }); + + it('returns the content', async () => { + // To pass this test, fill out the content selector + // in ./src/extractors/custom/postlight.com/index.js. + // You may also want to make use of the clean and transform + // options. + const { content } = await result; + + const $ = cheerio.load(content || ''); + + const first13 = excerptContent( + $('*') + .first() + .text(), + 13 + ); + + // Update these values with the expected values from + // the article. + assert.equal( + first13, + 'One of the best pieces of advice I’ve ever received is: “Be the' + ); + }); + }); +});