feat: added tmz custom parser (#22)

8 years ago · 3a2f32b0eb
parent 783a9cfb2f
commit 3a2f32b0eb
6 changed files with 229 additions and 3 deletions
--- a/dist/mercury.js
+++ b/dist/mercury.js
@ -404,7 +404,7 @@ function normalizeMetaTags($) {
 }

 // Spacer images to be removed
-var SPACER_RE = new RegExp('trans|transparent|spacer|blank', 'i');
+var SPACER_RE = new RegExp('transparent|spacer|blank', 'i');

 // The class we will use to mark elements we want to keep
 // but would normally remove
@ -2702,6 +2702,43 @@ var MediumExtractor = {
  }
 };

+var WwwTmzComExtractor = { // Custom extractor definition for www.tmz.com
+  domain: 'www.tmz.com',
+
+  title: {
+    selectors: ['.post-title-breadcrumb', 'h1', '.headline']
+  },
+
+  author: 'TMZ STAFF', // fixed string rather than a selector list
+
+  date_published: {
+    selectors: ['.article-posted-date']
+  },
+
+  dek: {
+    selectors: [
+      // TODO: no dek selectors are defined yet
+    ]
+  },
+
+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']] // presumably a [selector, attribute] pair; TODO confirm
+  },
+
+  content: {
+    selectors: ['.article-content', '.all-post-body'],
+
+    // Transforms for selected content that needs to be changed before it
+    // is consumable (e.g., unusual lazy-loaded images); none are defined
+    transforms: {},
+
+    // Selectors for anything in the result that shouldn't be there.
+    // The clean selectors will remove anything that matches from
+    // the result
+    clean: ['.lightbox-link']
+  }
+};
+


 var CustomExtractors = Object.freeze({
@ -2722,7 +2759,8 @@ var CustomExtractors = Object.freeze({
 	DeadspinExtractor: DeadspinExtractor,
 	BroadwayWorldExtractor: BroadwayWorldExtractor,
 	ApartmentTherapyExtractor: ApartmentTherapyExtractor,
-	MediumExtractor: MediumExtractor
+	MediumExtractor: MediumExtractor,
+	WwwTmzComExtractor: WwwTmzComExtractor
 });

 var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {
--- a/dist/mercury.js.map
+++ b/dist/mercury.js.map
--- a/fixtures/www.tmz.com/1480368537455.html
+++ b/fixtures/www.tmz.com/1480368537455.html
--- a/src/extractors/custom/index.js
+++ b/src/extractors/custom/index.js
@ -16,3 +16,4 @@ export * from './deadspin.com';
 export * from './www.broadwayworld.com';
 export * from './www.apartmenttherapy.com';
 export * from './medium.com';
+export * from './www.tmz.com';
--- a/src/extractors/custom/www.tmz.com/index.js
+++ b/src/extractors/custom/www.tmz.com/index.js
@ -0,0 +1,50 @@
+export const WwwTmzComExtractor = { // Custom extractor definition for www.tmz.com
+  domain: 'www.tmz.com',
+
+  title: {
+    selectors: [
+      '.post-title-breadcrumb',
+      'h1',
+      '.headline',
+    ],
+  },
+
+  author: 'TMZ STAFF', // fixed string rather than a selector list
+
+  date_published: {
+    selectors: [
+      '.article-posted-date',
+    ],
+  },
+
+  dek: {
+    selectors: [
+      // TODO: no dek selectors are defined yet
+    ],
+  },
+
+  lead_image_url: {
+    selectors: [
+      ['meta[name="og:image"]', 'value'], // presumably a [selector, attribute] pair; TODO confirm
+    ],
+  },
+
+  content: {
+    selectors: [
+      '.article-content',
+      '.all-post-body',
+    ],
+
+    // Transforms for selected content that needs to be changed before it
+    // is consumable (e.g., unusual lazy-loaded images); none are defined
+    transforms: {
+    },
+
+    // Selectors for anything in the result that shouldn't be there.
+    // The clean selectors will remove anything that matches from
+    // the result
+    clean: [
+      '.lightbox-link',
+    ],
+  },
+};
--- a/src/extractors/custom/www.tmz.com/index.test.js
+++ b/src/extractors/custom/www.tmz.com/index.test.js
@ -0,0 +1,122 @@
+import assert from 'assert';
+import fs from 'fs';
+import URL from 'url';
+import cheerio from 'cheerio';
+
+import Mercury from 'mercury';
+import getExtractor from 'extractors/get-extractor';
+import { excerptContent } from 'utils/text';
+
+describe('WwwTmzComExtractor', () => {
+  it('is selected properly', () => {
+    // Sanity check: the extractor that getExtractor
+    // returns for a URL on this domain must declare
+    // that URL's hostname as its domain.
+    const url =
+      'http://www.tmz.com/2016/11/28/prince-wife-estate-will/';
+    const extractor = getExtractor(url);
+    assert.equal(extractor.domain, URL.parse(url).hostname);
+  });
+
+  it('returns the title', async () => {
+    // Exercises the title selectors defined
+    // in ./src/extractors/custom/www.tmz.com/index.js.
+    const html =
+      fs.readFileSync('./fixtures/www.tmz.com/1480368537455.html');
+    const articleUrl =
+      'http://www.tmz.com/2016/11/28/prince-wife-estate-will/';
+
+    const { title } =
+      await Mercury.parse(articleUrl, html, { fallback: false });
+
+    // The title expected for the article saved in
+    // the fixture file.
+    assert.equal(title, 'Prince -- Woman Warns Estate ... Step Aside, I\'m His Wife!');
+  });
+
+  it('returns the author', async () => {
+    // Checks the fixed author value defined
+    // in ./src/extractors/custom/www.tmz.com/index.js.
+    const html =
+      fs.readFileSync('./fixtures/www.tmz.com/1480368537455.html');
+    const articleUrl =
+      'http://www.tmz.com/2016/11/28/prince-wife-estate-will/';
+
+    const { author } =
+      await Mercury.parse(articleUrl, html, { fallback: false });
+
+    // The author expected for the article saved in
+    // the fixture file.
+    assert.equal(author, 'TMZ STAFF');
+  });
+
+  // it('returns the date_published', async () => {
+  //   // To pass this test, fill out the date_published selector
+  //   // in ./src/extractors/custom/www.tmz.com/index.js.
+  //   const html =
+  //     fs.readFileSync('./fixtures/www.tmz.com/1480368537455.html');
+  //   const articleUrl =
+  //     'http://www.tmz.com/2016/11/28/prince-wife-estate-will/';
+  //
+  //   const { date_published } =
+  //     await Mercury.parse(articleUrl, html, { fallback: false });
+  //
+  //   // Update these values with the expected values from
+  //   // the article.
+  //   assert.equal(date_published, '2016-11-28T08:00:00.000Z');
+  // });
+
+  // it('returns the dek', async () => {
+  //   // To pass this test, fill out the dek selector
+  //   // in ./src/extractors/custom/www.tmz.com/index.js.
+  //   const html =
+  //     fs.readFileSync('./fixtures/www.tmz.com/1480368537455.html');
+  //   const articleUrl =
+  //     'http://www.tmz.com/2016/11/28/prince-wife-estate-will/';
+  //
+  //   const { dek } =
+  //     await Mercury.parse(articleUrl, html, { fallback: false });
+  //
+  //   // Update these values with the expected values from
+  //   // the article.
+  //   assert.equal(dek, '');
+  // });
+
+  it('returns the lead_image_url', async () => {
+    // Exercises the lead_image_url selectors defined
+    // in ./src/extractors/custom/www.tmz.com/index.js.
+    const html =
+      fs.readFileSync('./fixtures/www.tmz.com/1480368537455.html');
+    const articleUrl =
+      'http://www.tmz.com/2016/11/28/prince-wife-estate-will/';
+
+    const { lead_image_url } =
+      await Mercury.parse(articleUrl, html, { fallback: false });
+
+    // The lead image URL expected for the article saved in
+    // the fixture file.
+    assert.equal(lead_image_url, 'http://ll-media.tmz.com/2016/11/28/1128-prince-getty-03-1200x630.jpg');
+  });
+
+  it('returns the content', async () => {
+    // Exercises the content selectors defined
+    // in ./src/extractors/custom/www.tmz.com/index.js,
+    // together with that extractor's clean and transforms
+    // options.
+    const html =
+      fs.readFileSync('./fixtures/www.tmz.com/1480368537455.html');
+    const url =
+      'http://www.tmz.com/2016/11/28/prince-wife-estate-will/';
+
+    const { content } =
+      await Mercury.parse(url, html, { fallback: false });
+
+    const $ = cheerio.load(content || '');
+
+    const first13 = excerptContent($('*').first().text(), 13);
+
+    // The 13-word excerpt expected from the content of the
+    // article saved in the fixture file.
+    assert.equal(first13, 'Prince was married when he died and wanted all of his money to');
+  });
+});