feat: extractor for the verge (#33)

* feat: extractor for the verge's standard article template

* feat: basic support for the verge feature template

* feat: allow multiple links to be previewed

* feat: content selector arrays

Content selector arrays allow custom parsers to select multiple elements
to match and include in the result.

* feat: updated verge parser to use multimatch selectors

* lint fix

* cleanup test builds
pull/36/head
Silas Burton 8 years ago committed by Adam Pash
parent 233ca11a33
commit a0570f8e94

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -7,18 +7,20 @@ execSync('MERCURY_TEST_BUILD=true npm run build')
var Mercury = require('./dist/mercury_test')
var url = process.argv[2]
var urls = process.argv.slice(2)
console.log(`Fetching link`)
Mercury.parse(url, null, { fallback: false }).then(function(result) {
var htmlFile = './preview.html'
var jsonFile = './preview.json'
console.log(`Fetching link(s)`)
var html = `<h1>${result.title}</h1>${result.content}`
urls.map(url => {
Mercury.parse(url, null, { fallback: false }).then(function(result) {
var htmlFile = './preview.html'
var jsonFile = './preview.json'
fs.writeFileSync(htmlFile, html)
fs.writeFileSync(jsonFile, JSON.stringify(result))
execSync(`open ${htmlFile}`)
execSync(`open ${jsonFile}`)
})
var html = `<h1>${result.title}</h1>${result.content}`
fs.writeFileSync(htmlFile, html)
fs.writeFileSync(jsonFile, JSON.stringify(result))
execSync(`open ${jsonFile}`)
execSync(`open ${htmlFile}`)
})
})

@ -21,3 +21,4 @@ export * from './www.washingtonpost.com';
export * from './www.huffingtonpost.com';
export * from './newrepublic.com';
export * from './money.cnn.com';
export * from './www.theverge.com';

@ -0,0 +1,66 @@
/**
 * Custom extractor definition for www.theverge.com.
 *
 * Every field carries a `selectors` list. Entries written as
 * `[selector, attribute]` under the metadata fields name the attribute to
 * read; array entries under `content` are multi-match groups, where several
 * selectors are pulled into the result together.
 */
export const WwwThevergeComExtractor = {
  domain: 'www.theverge.com',

  title: {
    selectors: ['h1'],
  },

  author: {
    selectors: [['meta[name="author"]', 'value']],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  dek: {
    selectors: ['h2.p-dek'],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: [
      // Feature template: hero image + intro + body as one multi-match group.
      ['.c-entry-hero .e-image', '.c-entry-intro', '.c-entry-content'],
      // Regular post: hero image + body as one multi-match group.
      ['.e-image--hero', '.c-entry-content'],
      // Single-selector fallback for the feature template.
      '.l-wrapper .l-feature',
      // Single-selector fallback for a regular post.
      'div.c-entry-content',
    ],

    // Lazy-loaded images: a <noscript> whose only child is an <img> is
    // renamed to 'span'; any other <noscript> yields null.
    transforms: {
      noscript($node) {
        const $kids = $node.children();
        const wrapsSingleImage =
          $kids.length === 1 && $kids.get(0).tagName === 'img';
        return wrapsSingleImage ? 'span' : null;
      },
    },

    // Anything matching these selectors is removed from the extracted result.
    clean: [
      '.aside',
      // Dropped because the usable images come from the noscript transform.
      'img.c-dynamic-image',
    ],
  },
};

@ -0,0 +1,145 @@
import assert from 'assert';
import fs from 'fs';
import URL from 'url';
import cheerio from 'cheerio';
import Mercury from 'mercury';
import getExtractor from 'extractors/get-extractor';
import { excerptContent } from 'utils/text';
describe('WwwThevergeComExtractor', () => {
  // Saved page snapshots and the URLs they are parsed under.
  const ARTICLE_FIXTURE = './fixtures/www.theverge.com/1480520999617.html';
  const ARTICLE_URL =
    'http://www.theverge.com/2016/11/29/13774648/fcc-att-zero-rating-directv-net-neutrality-vs-tmobile';
  const FEATURE_FIXTURE = './fixtures/www.theverge.com/1480526003318.html';
  const FEATURE_URL =
    'http://www.theverge.com/2016/10/31/13478080/microsoft-surface-studio-design-engineering-interview';

  // Reads the fixture when called (not at suite definition time) and parses
  // it with `fallback: false`.
  const parseFixture = (pageUrl, fixturePath) =>
    Mercury.parse(pageUrl, fs.readFileSync(fixturePath), { fallback: false });

  // Loads extracted content into cheerio; a missing result becomes ''.
  const loadContent = content => cheerio.load(content || '');

  // Excerpt of the first 13 words of the first node's text.
  const leadingWords = $ => excerptContent($('*').first().text(), 13);

  it('is selected properly', () => {
    // Sanity check: the extractor chosen for a Verge URL must declare that
    // URL's hostname as its domain.
    const url =
      'http://www.theverge.com/2016/11/29/13774648/fcc-att-zero-rating-directv-net-neutrality-vs-tmobile';
    const { domain } = getExtractor(url);

    assert.equal(domain, URL.parse(url).hostname);
  });

  it('returns the title', async () => {
    const { title } = await parseFixture(ARTICLE_URL, ARTICLE_FIXTURE);

    assert.equal(title, 'AT&T just declared war on an open internet (and us)');
  });

  it('returns the author', async () => {
    const { author } = await parseFixture(ARTICLE_URL, ARTICLE_FIXTURE);

    assert.equal(author, 'T.C. Sottek');
  });

  it('returns the date_published', async () => {
    const { date_published } = await parseFixture(ARTICLE_URL, ARTICLE_FIXTURE);

    assert.equal(date_published, '2016-11-29T15:00:19.000Z');
  });

  it('returns the dek', async () => {
    const { dek } = await parseFixture(ARTICLE_URL, ARTICLE_FIXTURE);

    assert.equal(dek, 'Mobilizing Your World sounds like a threat now');
  });

  it('returns the lead_image_url', async () => {
    const { lead_image_url } = await parseFixture(ARTICLE_URL, ARTICLE_FIXTURE);

    assert.equal(lead_image_url, 'https://cdn0.vox-cdn.com/thumbor/v7kU2cISjo-wm6XceGk_kBuMBlA=/0x16:1024x592/1600x900/cdn0.vox-cdn.com/uploads/chorus_image/image/52042639/vrg_tc_attarmy_1024.1480431618.jpeg');
  });

  it('returns the content', async () => {
    // Regular post template: check the opening words of the body and that
    // exactly one hero image made it into the result.
    const { content } = await parseFixture(ARTICLE_URL, ARTICLE_FIXTURE);
    const $ = loadContent(content);

    assert.equal(leadingWords($), 'Last year we won the open internet back, but the new regulations had');
    assert.equal($('.e-image--hero').length, 1);
  });

  it('returns the content from a feature', async () => {
    // Feature template: only the opening words of the body are checked.
    const { content } = await parseFixture(FEATURE_URL, FEATURE_FIXTURE);
    const $ = loadContent(content);

    assert.equal(leadingWords($), 'Microsofts Surface PCs are known for their hinges. From the first, launched alongside');
  });
});

@ -39,9 +39,13 @@ export function transformElements($content, $, { transforms }) {
return $content;
}
function findMatchingSelector($, selectors) {
function findMatchingSelector($, selectors, extractHtml) {
return selectors.find((selector) => {
if (Array.isArray(selector)) {
if (extractHtml) {
return selector.reduce((acc, s) => acc && $(s).length > 0, true);
}
const [s, attr] = selector;
return $(s).length === 1 && $(s).attr(attr) && $(s).attr(attr).trim() !== '';
}
@ -61,7 +65,7 @@ export function select(opts) {
const { selectors, defaultCleaner = true } = extractionOpts;
const matchingSelector = findMatchingSelector($, selectors);
const matchingSelector = findMatchingSelector($, selectors, extractHtml);
if (!matchingSelector) return null;
@ -71,8 +75,23 @@ export function select(opts) {
// If the selector type requests html as its return type
// transform and clean the element with provided selectors
let $content;
if (extractHtml) {
let $content = $(matchingSelector);
// If matching selector is an array, we're considering this a
// multi-match selection, which allows the parser to choose several
// selectors to include in the result. Note that all selectors in the
// array must match in order for this selector to trigger
if (Array.isArray(matchingSelector)) {
$content = $(matchingSelector.join(','));
const $wrapper = $('<div></div>');
$content.each((index, element) => {
$wrapper.append(element);
});
$content = $wrapper;
} else {
$content = $(matchingSelector);
}
// Wrap in div so transformation can take place on root element
$content.wrap($('<div></div>'));

@ -161,4 +161,60 @@ describe('select(opts)', () => {
const result = select(opts);
assert.equal(result, '2016-09-07T09:07:59.000Z');
});
it('returns a node\'s html when it is a content selector', () => {
  // A plain string content selector combined with extractHtml: the result
  // is compared against the source markup via assertClean.
  const html = `
<div><div class="content-is-here"><p>Wow what a piece of content</p></div></div>
`;

  const result = select({
    type: 'content',
    $: cheerio.load(html),
    extractionOpts: {
      selectors: ['.content-is-here'],
    },
    extractHtml: true,
  });

  assertClean(result, html);
});
it('handles multiple matches when the content selector is an array', () => {
  // Both selectors in the array exist in the document, so both matched
  // elements are expected in the result.
  const html = `
<div><div><img class="lead-image" src="#" /><div class="content-is-here"><p>Wow what a piece of content</p></div></div></div>
`;
  const $ = cheerio.load(html);

  const result = select({
    type: 'content',
    $,
    extractionOpts: {
      selectors: [['.lead-image', '.content-is-here']],
    },
    extractHtml: true,
  });

  const $result = $(result);
  assert.equal($result.find('img.lead-image').length, 1);
  assert.equal($result.find('.content-is-here').length, 1);
});
it('skips multi-match if not all selectors are present', () => {
  // '.foo' matches nothing in this document, so the multi-match group is
  // expected to be rejected and select() to return null.
  const html = `
<div><div><img class="lead-image" src="#" /><div class="content-is-here"><p>Wow what a piece of content</p></div></div></div>
`;

  const result = select({
    type: 'content',
    $: cheerio.load(html),
    extractionOpts: {
      selectors: [['.lead-image', '.content-is-here', '.foo']],
    },
    extractHtml: true,
  });

  assert.equal(result, null);
});
});

Loading…
Cancel
Save