feat: extractor for the verge (#33)

* feat: extractor for the verge's standard article template
* feat: basic support for the verge feature template
* feat: allow multiple links to be previewed
* feat: content selector arrays

  Content selector arrays allow custom parsers to select multiple elements
  to match and include in the result.
* feat: updated verge parser to use multimatch selectors
* lint fix
* cleanup test builds
8 years ago · a0570f8e94
parent 233ca11a33
commit a0570f8e94
8 changed files with 305 additions and 14 deletions
--- a/fixtures/www.theverge.com/1480520999617.html
+++ b/fixtures/www.theverge.com/1480520999617.html
--- a/fixtures/www.theverge.com/1480526003318.html
+++ b/fixtures/www.theverge.com/1480526003318.html
--- a/24
+++ b/24
@ -7,18 +7,20 @@ execSync('MERCURY_TEST_BUILD=true npm run build')

 var Mercury = require('./dist/mercury_test')

-var url = process.argv[2]
+var urls = process.argv.slice(2) // every command-line argument after the script path is treated as a URL to preview

-console.log(`Fetching link`)
-Mercury.parse(url, null, { fallback: false }).then(function(result) {
-  var htmlFile = './preview.html'
-  var jsonFile = './preview.json'
+console.log(`Fetching link(s)`)

-  var html = `<h1>${result.title}</h1>${result.content}`
+urls.map(url => { // starts one Mercury.parse call per URL; the array returned by map is not used
+  Mercury.parse(url, null, { fallback: false }).then(function(result) {
+    var htmlFile = './preview.html' // NOTE(review): the same two output paths are used for every URL, so each result overwrites the previous preview files
+    var jsonFile = './preview.json'

-  fs.writeFileSync(htmlFile, html)
-  fs.writeFileSync(jsonFile, JSON.stringify(result))
-  execSync(`open ${htmlFile}`)
-  execSync(`open ${jsonFile}`)
-})
+    var html = `<h1>${result.title}</h1>${result.content}` // minimal page: the title as a heading followed by the extracted content

+    fs.writeFileSync(htmlFile, html)
+    fs.writeFileSync(jsonFile, JSON.stringify(result)) // the whole parse result, serialized as JSON
+    execSync(`open ${jsonFile}`) // passes each written file to the `open` command, JSON first and then HTML
+    execSync(`open ${htmlFile}`)
+  }) // NOTE(review): no rejection handler is attached to the promise returned by Mercury.parse
+})
--- a/src/extractors/custom/index.js
+++ b/src/extractors/custom/index.js
@ -21,3 +21,4 @@ export * from './www.washingtonpost.com';
 export * from './www.huffingtonpost.com';
 export * from './newrepublic.com';
 export * from './money.cnn.com';
+export * from './www.theverge.com';
--- a/src/extractors/custom/www.theverge.com/index.js
+++ b/src/extractors/custom/www.theverge.com/index.js
@ -0,0 +1,66 @@
+export const WwwThevergeComExtractor = { // custom extractor configuration for pages served from www.theverge.com
+  domain: 'www.theverge.com', // compared against the URL hostname by the 'is selected properly' test
+
+  title: {
+    selectors: [
+      'h1',
+    ],
+  },
+
+  author: {
+    selectors: [
+      ['meta[name="author"]', 'value'], // [selector, attribute] pair: the result is read from the `value` attribute of the matched element
+    ],
+  },
+
+  date_published: {
+    selectors: [
+      ['meta[name="article:published_time"]', 'value'],
+    ],
+  },
+
+  dek: {
+    selectors: [
+      'h2.p-dek',
+    ],
+  },
+
+  lead_image_url: {
+    selectors: [
+      ['meta[name="og:image"]', 'value'],
+    ],
+  },
+
+  content: {
+    selectors: [ // entries are tried in order and the first one that matches is used
+      // feature template multi-match: every selector in the array must match, and all matches are included
+      ['.c-entry-hero .e-image', '.c-entry-intro', '.c-entry-content'],
+      // regular post multi-match: hero image plus article body
+      ['.e-image--hero', '.c-entry-content'],
+      // feature template fallback (single element) when the multi-match above does not apply
+      '.l-wrapper .l-feature',
+      // regular post fallback: article body only, without the hero image
+      'div.c-entry-content',
+    ],
+
+    // Transform lazy-loaded images: the real <img> sits inside a <noscript> element
+    transforms: {
+      noscript: ($node) => {
+        const $children = $node.children();
+        if ($children.length === 1 && $children.get(0).tagName === 'img') { // only a <noscript> whose single child is an <img> is transformed
+          return 'span'; // presumably tells the transform step to turn the <noscript> into a <span> so the image is kept; confirm against transformElements
+        }
+
+        return null; // any other <noscript> is not given a replacement tag
+      },
+    },
+
+    // Is there anything that is in the result that shouldn't be?
+    // The clean selectors will remove anything that matches from
+    // the result
+    clean: [
+      '.aside',
+      'img.c-dynamic-image', // images come from noscript transform
+    ],
+  },
+};
--- a/src/extractors/custom/www.theverge.com/index.test.js
+++ b/src/extractors/custom/www.theverge.com/index.test.js
@ -0,0 +1,145 @@
+import assert from 'assert';
+import fs from 'fs';
+import URL from 'url';
+import cheerio from 'cheerio';
+
+import Mercury from 'mercury';
+import getExtractor from 'extractors/get-extractor';
+import { excerptContent } from 'utils/text';
+
+describe('WwwThevergeComExtractor', () => {
+  it('is selected properly', () => {
+    // Sanity check that needs no fixture: getExtractor
+    // must return the extractor whose `domain` equals
+    // the hostname of a URL from this site.
+    const url =
+      'http://www.theverge.com/2016/11/29/13774648/fcc-att-zero-rating-directv-net-neutrality-vs-tmobile';
+    const extractor = getExtractor(url);
+    assert.equal(extractor.domain, URL.parse(url).hostname);
+  });
+
+  it('returns the title', async () => {
+    // Exercises the title selector defined in
+    // ./src/extractors/custom/www.theverge.com/index.js against the saved article fixture.
+    const html =
+      fs.readFileSync('./fixtures/www.theverge.com/1480520999617.html');
+    const articleUrl =
+      'http://www.theverge.com/2016/11/29/13774648/fcc-att-zero-rating-directv-net-neutrality-vs-tmobile';
+
+    const { title } =
+      await Mercury.parse(articleUrl, html, { fallback: false });
+
+    // Expected headline of the fixture article; must be
+    // updated if the fixture is replaced.
+    assert.equal(title, 'AT&T just declared war on an open internet (and us)');
+  });
+
+  it('returns the author', async () => {
+    // Exercises the author selector defined in
+    // ./src/extractors/custom/www.theverge.com/index.js against the saved article fixture.
+    const html =
+      fs.readFileSync('./fixtures/www.theverge.com/1480520999617.html');
+    const articleUrl =
+      'http://www.theverge.com/2016/11/29/13774648/fcc-att-zero-rating-directv-net-neutrality-vs-tmobile';
+
+    const { author } =
+      await Mercury.parse(articleUrl, html, { fallback: false });
+
+    // Expected byline of the fixture article; must be
+    // updated if the fixture is replaced.
+    assert.equal(author, 'T.C. Sottek');
+  });
+
+  it('returns the date_published', async () => {
+    // Exercises the date_published selector defined in
+    // ./src/extractors/custom/www.theverge.com/index.js against the saved article fixture.
+    const html =
+      fs.readFileSync('./fixtures/www.theverge.com/1480520999617.html');
+    const articleUrl =
+      'http://www.theverge.com/2016/11/29/13774648/fcc-att-zero-rating-directv-net-neutrality-vs-tmobile';
+
+    const { date_published } =
+      await Mercury.parse(articleUrl, html, { fallback: false });
+
+    // Expected publication timestamp of the fixture article,
+    // as an ISO 8601 string in UTC.
+    assert.equal(date_published, '2016-11-29T15:00:19.000Z');
+  });
+
+  it('returns the dek', async () => {
+    // Exercises the dek selector defined in
+    // ./src/extractors/custom/www.theverge.com/index.js against the saved article fixture.
+    const html =
+      fs.readFileSync('./fixtures/www.theverge.com/1480520999617.html');
+    const articleUrl =
+      'http://www.theverge.com/2016/11/29/13774648/fcc-att-zero-rating-directv-net-neutrality-vs-tmobile';
+
+    const { dek } =
+      await Mercury.parse(articleUrl, html, { fallback: false });
+
+    // Expected sub-headline of the fixture article; must be
+    // updated if the fixture is replaced.
+    assert.equal(dek, '‘Mobilizing Your World’ sounds like a threat now');
+  });
+
+  it('returns the lead_image_url', async () => {
+    // Exercises the lead_image_url selector defined in
+    // ./src/extractors/custom/www.theverge.com/index.js against the saved article fixture.
+    const html =
+      fs.readFileSync('./fixtures/www.theverge.com/1480520999617.html');
+    const articleUrl =
+      'http://www.theverge.com/2016/11/29/13774648/fcc-att-zero-rating-directv-net-neutrality-vs-tmobile';
+
+    const { lead_image_url } =
+      await Mercury.parse(articleUrl, html, { fallback: false });
+
+    // Expected lead image URL of the fixture article; must be
+    // updated if the fixture is replaced.
+    assert.equal(lead_image_url, 'https://cdn0.vox-cdn.com/thumbor/v7kU2cISjo-wm6XceGk_kBuMBlA=/0x16:1024x592/1600x900/cdn0.vox-cdn.com/uploads/chorus_image/image/52042639/vrg_tc_attarmy_1024.1480431618.jpeg');
+  });
+
+  it('returns the content', async () => {
+    // Exercises the content selectors defined in
+    // ./src/extractors/custom/www.theverge.com/index.js
+    // (including its clean and transforms options)
+    // against the regular-post fixture.
+    const html =
+      fs.readFileSync('./fixtures/www.theverge.com/1480520999617.html');
+    const url =
+      'http://www.theverge.com/2016/11/29/13774648/fcc-att-zero-rating-directv-net-neutrality-vs-tmobile';
+
+    const { content } =
+      await Mercury.parse(url, html, { fallback: false });
+
+    const $ = cheerio.load(content || ''); // an empty string keeps cheerio.load usable when no content was extracted
+
+    const first13 = excerptContent($('*').first().text(), 13); // presumably the first 13 words of the extracted text; confirm against utils/text
+
+    // Opening words of the article body, plus a check that
+    // the hero image was included in the extracted content.
+    assert.equal(first13, 'Last year we won the open internet back, but the new regulations had');
+    assert.equal($('.e-image--hero').length, 1);
+  });
+
+  it('returns the content from a feature', async () => {
+    // Exercises the content selectors defined in
+    // ./src/extractors/custom/www.theverge.com/index.js
+    // (including its clean and transforms options)
+    // against the feature-template fixture.
+    const html =
+      fs.readFileSync('./fixtures/www.theverge.com/1480526003318.html');
+    const url =
+      'http://www.theverge.com/2016/10/31/13478080/microsoft-surface-studio-design-engineering-interview';
+
+    const { content } =
+      await Mercury.parse(url, html, { fallback: false });
+
+    const $ = cheerio.load(content || ''); // an empty string keeps cheerio.load usable when no content was extracted
+
+    const first13 = excerptContent($('*').first().text(), 13); // presumably the first 13 words of the extracted text; confirm against utils/text
+
+    // Opening words of the feature article; must be
+    // updated if the fixture is replaced.
+    assert.equal(first13, 'Microsoft’s Surface PCs are known for their hinges. From the first, launched alongside');
+  });
+});
--- a/src/extractors/root-extractor.js
+++ b/src/extractors/root-extractor.js
@ -39,9 +39,13 @@ export function transformElements($content, $, { transforms }) {
  return $content;
 }

-function findMatchingSelector($, selectors) {
+function findMatchingSelector($, selectors, extractHtml) {
  return selectors.find((selector) => {
    if (Array.isArray(selector)) {
+      if (extractHtml) {
+        return selector.reduce((acc, s) => acc && $(s).length > 0, true);
+      }
+
      const [s, attr] = selector;
      return $(s).length === 1 && $(s).attr(attr) && $(s).attr(attr).trim() !== '';
    }
@ -61,7 +65,7 @@ export function select(opts) {

  const { selectors, defaultCleaner = true } = extractionOpts;

-  const matchingSelector = findMatchingSelector($, selectors);
+  const matchingSelector = findMatchingSelector($, selectors, extractHtml);

  if (!matchingSelector) return null;

@ -71,8 +75,23 @@ export function select(opts) {

  // If the selector type requests html as its return type
  // transform and clean the element with provided selectors
+  let $content;
  if (extractHtml) {
-    let $content = $(matchingSelector);
+    // If matching selector is an array, we're considering this a
+    // multi-match selection, which allows the parser to choose several
+    // selectors to include in the result. Note that all selectors in the
+    // array must match in order for this selector to trigger
+    if (Array.isArray(matchingSelector)) {
+      $content = $(matchingSelector.join(','));
+      const $wrapper = $('<div></div>');
+      $content.each((index, element) => {
+        $wrapper.append(element);
+      });
+
+      $content = $wrapper;
+    } else {
+      $content = $(matchingSelector);
+    }

    // Wrap in div so transformation can take place on root element
    $content.wrap($('<div></div>'));
--- a/src/extractors/root-extractor.test.js
+++ b/src/extractors/root-extractor.test.js
@ -161,4 +161,60 @@ describe('select(opts)', () => {
    const result = select(opts);
    assert.equal(result, '2016-09-07T09:07:59.000Z');
  });
+
+  it('returns a node\'s html when it is a content selector', () => { // baseline case: a single string selector with extractHtml enabled
+    const html = `
+      <div><div class="content-is-here"><p>Wow what a piece of content</p></div></div>
+    `;
+    const $ = cheerio.load(html);
+    const opts = {
+      type: 'content',
+      $,
+      extractionOpts: {
+        selectors: ['.content-is-here'], // plain string entry, i.e. not a multi-match array
+      },
+      extractHtml: true, // asks select() for HTML as its return type
+    };
+
+    const result = select(opts);
+    assertClean(result, html); // assertClean is defined outside this hunk; presumably compares markup while ignoring whitespace (confirm)
+  });
+
+  it('handles multiple matches when the content selector is an array', () => { // multi-match case: both selected elements must appear in the result
+    const html = `
+      <div><div><img class="lead-image" src="#" /><div class="content-is-here"><p>Wow what a piece of content</p></div></div></div>
+    `;
+    const $ = cheerio.load(html);
+    const opts = {
+      type: 'content',
+      $,
+      extractionOpts: {
+        selectors: [['.lead-image', '.content-is-here']], // nested array: with extractHtml set, every listed selector must match and all matches are collected
+      },
+      extractHtml: true,
+    };
+
+    const result = select(opts);
+    assert.equal($(result).find('img.lead-image').length, 1); // the image matched by the first selector is in the result
+    assert.equal($(result).find('.content-is-here').length, 1); // the body matched by the second selector is in the result
+  });
+
+  it('skips multi-match if not all selectors are present', () => { // negative case: one unmatched selector disqualifies the whole array
+    const html = `
+      <div><div><img class="lead-image" src="#" /><div class="content-is-here"><p>Wow what a piece of content</p></div></div></div>
+    `;
+    const $ = cheerio.load(html);
+    const opts = {
+      type: 'content',
+      $,
+      extractionOpts: {
+        selectors: [['.lead-image', '.content-is-here', '.foo']], // '.foo' matches nothing in the html above
+      },
+      extractHtml: true,
+    };
+
+    const result = select(opts);
+
+    assert.equal(result, null); // no selector entry matched, so select() returns null
+  });
 });