chore: update node and some deps (#209)

* chore: update .nvmrc

* added prettier and pre-commit hooks

* update docker image to new node

* add karma-cli to get web tests working

* explicitly install karma... seems to fix problem

* remove pre-built phantomjs

* swap install order
pull/212/head
Adam Pash 5 years ago committed by GitHub
parent 78adb2c2a0
commit e4b057f9ea
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -2,33 +2,33 @@ version: 2
jobs:
test-node:
docker:
- image: circleci/node:6.14-stretch
- image: circleci/node:8.10
steps:
- checkout
- run: "yarn install"
- run: "yarn lint:ci"
- run: "yarn build"
- run: "./scripts/pr-parser-preview.sh"
- run: "yarn test:node -- --maxWorkers=4"
- run: "yarn test:node --maxWorkers=4"
- store_artifacts:
path: tmp/artifacts
test-web:
docker:
- image: circleci/node:6.14-stretch
- image: circleci/node:8.10-browsers
steps:
- checkout
- run: "yarn install"
# For some reason phantomjs-prebuilt is failing w/yarn, but npm installing works
- run: "npm install phantomjs-prebuilt"
# Switch to 7 and lint
- run: "yarn test:web -- --maxWorkers=4"
- run: "yarn build:web -- --maxWorkers=4"
- run: "yarn add karma-cli --dev"
- run: "yarn test:web --maxWorkers=4"
- run: "yarn build:web --maxWorkers=4"
update-fixtures:
docker:
- image: circleci/node:6.14-stretch
- image: circleci/node:8.10
steps:
- checkout
- run: "yarn install"

@ -2,7 +2,7 @@
// Copy this file, and add rule overrides as needed.
{
"parser": "babel-eslint",
"extends": "airbnb",
"extends": ["airbnb", "prettier"],
"plugins": [
"babel"
],

@ -1 +1 @@
6.10
8.10

@ -0,0 +1,5 @@
{
"trailingComma": "es5",
"semi": true,
"singleQuote": true
}

@ -49,12 +49,14 @@
"changelog-maker": "^2.3.0",
"eslint": "^3.8.1",
"eslint-config-airbnb": "^12.0.0",
"eslint-config-prettier": "^3.5.0",
"eslint-import-resolver-babel-module": "^2.0.1",
"eslint-plugin-babel": "^3.3.0",
"eslint-plugin-import": "^1.16.0",
"eslint-plugin-jsx-a11y": "^2.2.3",
"eslint-plugin-react": "^6.4.1",
"express": "^4.14.0",
"husky": "^1.3.1",
"inquirer": "^1.1.3",
"jasmine-core": "^2.5.2",
"jest": "^16.0.2",
@ -62,16 +64,19 @@
"karma": "^1.3.0",
"karma-browserify": "^5.1.0",
"karma-chrome-launcher": "^2.0.0",
"karma-cli": "^2.0.0",
"karma-jasmine": "^1.0.2",
"karma-mocha": "^1.3.0",
"karma-phantomjs-launcher": "^1.0.2",
"karma-requirejs": "^1.1.0",
"lint-staged": "^8.1.0",
"mocha": "^3.1.2",
"nock": "^9.0.2",
"ora": "^0.3.0",
"phantomjs-polyfill-find": "ptim/phantomjs-polyfill-find",
"phantomjs-polyfill-string-includes": "^1.0.0",
"phantomjs-prebuilt": "^2.1.14",
"prettier": "^1.15.3",
"requirejs": "^2.3.2",
"rollup": "^0.36.3",
"rollup-plugin-babel": "^2.6.1",
@ -104,5 +109,21 @@
"request": "browser-request",
"iconv-lite": "./src/shims/iconv-lite",
"moment-timezone": "./node_modules/moment-timezone/builds/moment-timezone-with-data-2010-2020.min.js"
},
"husky": {
"hooks": {
"pre-commit": "lint-staged"
}
},
"lint-staged": {
"*.{js}": [
"eslint --fix",
"prettier --write",
"git add"
],
"*.{json,css,md}": [
"prettier --write",
"git add"
]
}
}

@ -8,8 +8,10 @@ let urls = [
title: 'Iraqi troops storm town south of Mosul',
},
{
url: 'https://www.cnn.com/2018/10/12/us/before-after-aerial-images-mexico-beach-devastation-trnd/index.html',
title: 'Before and after images show there\'s nothing left in some parts of Mexico Beach',
url:
'https://www.cnn.com/2018/10/12/us/before-after-aerial-images-mexico-beach-devastation-trnd/index.html',
title:
"Before and after images show there's nothing left in some parts of Mexico Beach",
},
];
@ -24,7 +26,8 @@ if (process.env.CI) {
if (cheerio.browser) {
require('../dist/mercury.web');
}
const Merc = typeof Mercury === 'undefined' ? require('../dist/mercury') : Mercury;
const Merc =
typeof Mercury === 'undefined' ? require('../dist/mercury') : Mercury;
describe('Is Mercury build working', () => {
beforeAll(() => {
@ -38,15 +41,17 @@ if (process.env.CI) {
});
urls.map(article =>
it(`gets this title right ${article.title}`, (done) => {
Merc.parse(article.url).then((result) => {
assert.equal(article.title, result.title);
done();
}).catch((e) => {
console.log(e.name, e.message); // eslint-disable-line no-console
assert.equal(true, false);
done();
});
it(`gets this title right ${article.title}`, done => {
Merc.parse(article.url)
.then(result => {
assert.equal(article.title, result.title);
done();
})
.catch(e => {
console.log(e.name, e.message); // eslint-disable-line no-console
assert.equal(true, false);
done();
});
})
);
});

@ -1,7 +1,7 @@
/* eslint-disable */
const bot = require('@jesses/circle-github-bot').default.create();
const Mercury = require('../dist/mercury.js');
const fs = require("fs");
const fs = require('fs');
const run = () => {
const screenshotPath = process.argv[2];
@ -11,33 +11,42 @@ const run = () => {
const html = fs.readFileSync(`${fixture}`);
// first parse is just to get the url
Mercury.parse('http://example.com', html, { fallback: false }).then(({ url, domain, excerpt, word_count, direction }) => {
// with the url, second pass will test the correct parser
Mercury.parse(url, html, { fallback: false }).then(json => {
// removing excerpt b/c this comes from content, not necessary
delete json.excerpt
// adding items that aren't pulled in custom parser w/out fallback
Object.assign(json, { url, domain, word_count, direction });
// a quick preview of the parsed content in an html file
const previewHtml = `<h1>${json.title}</h1><img src=${json.lead_image_url} /><p>${json.author}</p>${json.content}`
const jsonPath = `${screenshotPath}-parsed.json`;
const fixtureArtifactPath = `tmp/artifacts/${fixture}`;
const previewPath = `tmp/artifacts/${fixture}.preview.html`;
fs.writeFileSync(previewPath, previewHtml);
fs.writeFileSync(jsonPath, JSON.stringify(json));
fs.writeFileSync(fixtureArtifactPath, html);
bot.comment(process.env.GH_AUTH_TOKEN, `### 🤖 Automated Parsing Preview 🤖
Mercury.parse('http://example.com', html, { fallback: false }).then(
({ url, domain, excerpt, word_count, direction }) => {
// with the url, second pass will test the correct parser
Mercury.parse(url, html, { fallback: false }).then(json => {
// removing excerpt b/c this comes from content, not necessary
delete json.excerpt;
// adding items that aren't pulled in custom parser w/out fallback
Object.assign(json, { url, domain, word_count, direction });
// a quick preview of the parsed content in an html file
const previewHtml = `<h1>${json.title}</h1><img src=${
json.lead_image_url
} /><p>${json.author}</p>${json.content}`;
const jsonPath = `${screenshotPath}-parsed.json`;
const fixtureArtifactPath = `tmp/artifacts/${fixture}`;
const previewPath = `tmp/artifacts/${fixture}.preview.html`;
fs.writeFileSync(previewPath, previewHtml);
fs.writeFileSync(jsonPath, JSON.stringify(json));
fs.writeFileSync(fixtureArtifactPath, html);
bot.comment(
process.env.GH_AUTH_TOKEN,
`### 🤖 Automated Parsing Preview 🤖
**Commit:** \`${bot.env.commitMessage}\`
![Screenshot of fixture (this embed should work after repo is public)](${bot.artifactUrl(screenshotPath)})
![Screenshot of fixture (this embed should work after repo is public)](${bot.artifactUrl(
screenshotPath
)})
[Original Article](${url}) | ${bot.artifactLink(fixtureArtifactPath, 'HTML Fixture')} | ${bot.artifactLink(previewPath, 'Parsed Content Preview')}
[Original Article](${url}) | ${bot.artifactLink(
fixtureArtifactPath,
'HTML Fixture'
)} | ${bot.artifactLink(previewPath, 'Parsed Content Preview')}
<details>
<summary><b>Parsed JSON</b></summary>
@ -52,12 +61,15 @@ ${JSON.stringify(json, null, 2)}
**\`null\` fields**
${Object.keys(json).map(key => json[key] !== null ? '' : ` * \`${key}\n\``).join('') || 'None'}
${Object.keys(json)
.map(key => (json[key] !== null ? '' : ` * \`${key}\n\``))
.join('') || 'None'}
`
);
});
});
}
);
});
}
);
};
run();

@ -7,10 +7,7 @@ import inquirer from 'inquirer';
import ora from 'ora';
import { exec } from 'child_process';
import {
stripJunkTags,
makeLinksAbsolute,
} from 'utils/dom';
import { stripJunkTags, makeLinksAbsolute } from 'utils/dom';
import Mercury from '../dist/mercury';
import extractorTemplate from './templates/custom-extractor';
import extractorTestTemplate from './templates/custom-extractor-test';
@ -19,7 +16,8 @@ const questions = [
{
type: 'input',
name: 'website',
message: 'Paste a url to an article you\'d like to create or extend a parser for:',
message:
"Paste a url to an article you'd like to create or extend a parser for:",
validate(value) {
const { hostname } = URL.parse(value);
if (hostname) return true;
@ -74,7 +72,7 @@ const urlArg = process.argv[2];
if (urlArg) {
scaffoldCustomParser(urlArg);
} else {
inquirer.prompt(questions).then((answers) => {
inquirer.prompt(questions).then(answers => {
scaffoldCustomParser(answers.website);
});
}
@ -82,17 +80,17 @@ if (urlArg) {
function generateScaffold(url, file, result) {
const { hostname } = URL.parse(url);
const extractor = extractorTemplate(hostname, extractorName(hostname));
const extractorTest =
extractorTestTemplate(
file, url, getDir(url), result, extractorName(hostname)
);
const extractorTest = extractorTestTemplate(
file,
url,
getDir(url),
result,
extractorName(hostname)
);
fs.writeFileSync(`${getDir(url)}/index.js`, extractor);
fs.writeFileSync(`${getDir(url)}/index.test.js`, extractorTest);
fs.appendFileSync(
'./src/extractors/custom/index.js',
exportString(url),
);
fs.appendFileSync('./src/extractors/custom/index.js', exportString(url));
exec(`npm run lint-fix-quiet -- ${getDir(url)}/*.js`);
}
@ -116,9 +114,13 @@ function savePage($, [url], newParser) {
fs.writeFileSync(file, html);
Mercury.parse(url, html).then((result) => {
Mercury.parse(url, html).then(result => {
if (newParser) {
confirm(generateScaffold, [url, file, result], 'Generating parser and tests');
confirm(
generateScaffold,
[url, file, result],
'Generating parser and tests'
);
console.log(`Your custom site extractor has been set up. To get started building it, run
yarn watch:test -- ${hostname}
-- OR --

@ -3,7 +3,10 @@ const page = require('webpage').create();
const system = require('system');
const args = system.args;
const fixtures = args.slice(1)[0].slice(0, -1).split(',');
const fixtures = args
.slice(1)[0]
.slice(0, -1)
.split(',');
const totalRenders = fixtures.length;
var renderCount = 0;
@ -17,12 +20,13 @@ function pageRenderComplete() {
}
function capturePage() {
const fixturePath = fixtures[renderCount]
const fixturePath = fixtures[renderCount];
page.viewportSize = { width: 1366, height: 768 };
page.open(fixturePath, function() {
// set default background to white (otherwise can sometimes get transparent bg in png)
const script = "function() { \
const script =
"function() { \
var style = document.createElement('style'); \
var text = document.createTextNode('body { background: #fff }'); \
style.setAttribute('type', 'text/css'); \

@ -4,9 +4,8 @@
// require('phantomjs-prebuilt').path = './node_modules/.bin/phantomjs';
// }
module.exports = function (config) {
module.exports = function(config) {
config.set({
// base path that will be used to resolve all patterns (eg. files, exclude)
basePath: '',
@ -23,8 +22,7 @@ module.exports = function (config) {
],
// list of files to exclude
exclude: [
],
exclude: [],
// preprocess matching files before serving them to the browser
// available preprocessors: https://npmjs.org/browse/keyword/karma-preprocessor
@ -34,10 +32,7 @@ module.exports = function (config) {
browserify: {
debug: true,
transform: [
'brfs-babel',
'babelify',
],
transform: ['brfs-babel', 'babelify'],
},
// test results reporter to use
@ -61,7 +56,7 @@ module.exports = function (config) {
// start these browsers
// available browser launchers: https://npmjs.org/browse/keyword/karma-launcher
// browsers: ['PhantomJS'],
browsers: [(process.env.CI ? 'PhantomJS' : 'Chrome')],
browsers: [process.env.CI ? 'PhantomJS' : 'Chrome'],
// browsers: ['Chrome'],
// Continuous Integration mode
@ -71,6 +66,5 @@ module.exports = function (config) {
// Concurrency level
// how many browser should be started simultaneous
concurrency: Infinity,
});
};

@ -3,7 +3,7 @@ const express = require('express'); // eslint-disable-line import/no-extraneous-
const request = require('request');
const app = express();
var server
var server;
const start = () => {
app.use('/', (req, res) => {
@ -25,15 +25,17 @@ const start = () => {
});
server = app.listen(process.env.PORT || 3000);
}
};
const stop = () => {
server && server.close()
}
server && server.close();
};
if (!process.env.CI) {
start()
require('child_process').execSync('./node_modules/karma/bin/karma start ./scripts/karma.conf.js', {stdio:[0,1,2]});
stop()
start();
require('child_process').execSync(
'./node_modules/karma/bin/karma start ./scripts/karma.conf.js',
{ stdio: [0, 1, 2] }
);
stop();
}

@ -8,10 +8,7 @@ babelOpts.runtimeHelpers = true;
export default {
entry: './scripts/generate-custom-parser.js',
plugins: [
commonjs(),
babel(babelOpts),
],
plugins: [commonjs(), babel(babelOpts)],
format: 'cjs',
dest: 'dist/generate-custom-parser.js', // equivalent to --output
sourceMap: true,

@ -4,7 +4,7 @@ const { execFile, execFileSync } = require('child_process');
const fs = require('fs');
const path = require('path');
const URL = require('url');
const octokit = require('@octokit/rest')()
const octokit = require('@octokit/rest')();
const Mercury = require('../dist/mercury');
@ -17,112 +17,144 @@ execFile('find', ['fixtures', '-type', 'f'], (err, stdout) => {
// iterate through fixtures for fixtures older than 2 weeks
console.log('Finding fixtures to update...');
const fixturesToUpdate = fixtures.filter(fixture => {
const timestamp = path.basename(fixture).split(/\.html$/)[0].trim();
try {
const date = new Date(parseInt(timestamp, 10));
return now - date > twoWeeks;
} catch (e) {
// if fixture isn't a timestamp, ignore it
return false;
}
}).slice(0, 1);
const fixturesToUpdate = fixtures
.filter(fixture => {
const timestamp = path
.basename(fixture)
.split(/\.html$/)[0]
.trim();
try {
const date = new Date(parseInt(timestamp, 10));
return now - date > twoWeeks;
} catch (e) {
// if fixture isn't a timestamp, ignore it
return false;
}
})
.slice(0, 1);
console.log(`${fixturesToUpdate.length} fixtures are out of date`);
// iterate through fixtures and extract their URLs.
console.log('Extracting urls...');
const baseDomains = fixturesToUpdate.map(fixture => fixture.split("/")[1])
Promise.all(fixturesToUpdate.map((fixture, i) => {
const html = fs.readFileSync(fixture);
return Mercury.parse(`http://${baseDomains[i]}`, html)
})).then(parsedFixture => {
const fixturesAndUrls = fixturesToUpdate.reduce((acc, fixture, i) =>
acc.concat({
fixture,
url: parsedFixture[i].url,
baseDomain: baseDomains[i]
}), []);
const baseDomains = fixturesToUpdate.map(fixture => fixture.split('/')[1]);
Promise.all(
fixturesToUpdate.map((fixture, i) => {
const html = fs.readFileSync(fixture);
return Mercury.parse(`http://${baseDomains[i]}`, html);
})
).then(parsedFixture => {
const fixturesAndUrls = fixturesToUpdate.reduce(
(acc, fixture, i) =>
acc.concat({
fixture,
url: parsedFixture[i].url,
baseDomain: baseDomains[i],
}),
[]
);
console.log('Updating all fixtures');
const fns = fixturesAndUrls.map(fixtureAndUrl => {
return () => {
// console.log('Updating fixture for', fixtureAndUrl);
return updateFixture(fixtureAndUrl)
}
}).concat(() => {
return new Promise(res => {
console.log('changed bases', changeBase)
console.log(`otherMess`, otherMess);
res();
const fns = fixturesAndUrls
.map(fixtureAndUrl => {
return () => {
// console.log('Updating fixture for', fixtureAndUrl);
return updateFixture(fixtureAndUrl);
};
})
});
promiseSerial(fns)
})
.concat(() => {
return new Promise(res => {
console.log('changed bases', changeBase);
console.log(`otherMess`, otherMess);
res();
});
});
promiseSerial(fns);
});
});
const changeBase = [];
const otherMess = [];
const updateFixture = (({ fixture, url, baseDomain }) => {
const updateFixture = ({ fixture, url, baseDomain }) => {
return new Promise(res => {
Mercury.parse(url).then(({ url: updatedUrl }) => {
if (!updatedUrl) {
otherMess.push({ updatedUrl, url, fixture, baseDomain })
return res();
}
console.log(`updatedUrl`, updatedUrl);
const { hostname } = URL.parse(updatedUrl);
if (hostname !== baseDomain) {
console.log('Base URL has changed!!! Do something different');
console.log(`url`, url);
Mercury.parse(url)
.then(({ url: updatedUrl }) => {
if (!updatedUrl) {
otherMess.push({ updatedUrl, url, fixture, baseDomain });
return res();
}
console.log(`updatedUrl`, updatedUrl);
console.log(`hostname`, hostname);
changeBase.push({ fixture, url, baseDomain, newBaseDomain: hostname, updatedUrl });
return res();
}
execFile('yarn', ['generate-parser', url], (err, stdout) => {
// console.log(`stdout`, stdout);
const dirRe = new RegExp(`(${path.dirname(fixture)}\/\\d+\.html)`);
const newFixture = stdout.match(dirRe)[0]
console.log(`newFixture`, newFixture);
// replace old fixture with new fixture in tests
execFile('./scripts/find-and-replace.sh', [fixture, newFixture, 'src/extractors/custom/**/*.test.js'], (err, stdout) => {
// remove old fixture
fs.unlinkSync(fixture)
const { branchName, commitMessage } = doTestsPass(baseDomain) ? {
branchName: `chore-update-${baseDomain}-fixture`,
commitMessage: `chore: update ${baseDomain} fixture`
} : {
branchName: `fix-update-${baseDomain}-extractor`,
commitMessage: `fix: update ${baseDomain} extractor`
}
createAndPushBranch({ branchName, commitMessage })
createPR({ branchName, title: commitMessage })
const { hostname } = URL.parse(updatedUrl);
if (hostname !== baseDomain) {
console.log('Base URL has changed!!! Do something different');
console.log(`url`, url);
console.log(`updatedUrl`, updatedUrl);
console.log(`hostname`, hostname);
changeBase.push({
fixture,
url,
baseDomain,
newBaseDomain: hostname,
updatedUrl,
});
return res();
}
execFile('yarn', ['generate-parser', url], (err, stdout) => {
// console.log(`stdout`, stdout);
const dirRe = new RegExp(`(${path.dirname(fixture)}\/\\d+\.html)`);
const newFixture = stdout.match(dirRe)[0];
console.log(`newFixture`, newFixture);
// replace old fixture with new fixture in tests
execFile(
'./scripts/find-and-replace.sh',
[fixture, newFixture, 'src/extractors/custom/**/*.test.js'],
(err, stdout) => {
// remove old fixture
fs.unlinkSync(fixture);
const { branchName, commitMessage } = doTestsPass(baseDomain)
? {
branchName: `chore-update-${baseDomain}-fixture`,
commitMessage: `chore: update ${baseDomain} fixture`,
}
: {
branchName: `fix-update-${baseDomain}-extractor`,
commitMessage: `fix: update ${baseDomain} extractor`,
};
createAndPushBranch({ branchName, commitMessage });
createPR({ branchName, title: commitMessage });
}
);
});
})
.catch(e => {
otherMess.push({ fixture, url, baseDomain, e });
});
}).catch(e => {
otherMess.push({ fixture, url, baseDomain, e });
});
});
});
};
const doTestsPass = (site) => {
const doTestsPass = site => {
try {
execFileSync('yarn', ['test:node', site]);
return true
return true;
} catch (e) {
return false;
}
}
};
const promiseSerial = funcs =>
funcs.reduce((promise, func) =>
promise.then(result => func().then(Array.prototype.concat.bind(result))),
Promise.resolve([]))
funcs.reduce(
(promise, func) =>
promise.then(result => func().then(Array.prototype.concat.bind(result))),
Promise.resolve([])
);
const createAndPushBranch = ({ branchName, commitMessage }) => {
execFileSync('git', ['config', 'user.email', 'adam.pash+postlight-bot@postlight.com']);
execFileSync('git', [
'config',
'user.email',
'adam.pash+postlight-bot@postlight.com',
]);
execFileSync('git', ['config', 'user.name', 'Postlight Bot']);
execFileSync('git', ['checkout', '-b', branchName]);
execFileSync('git', ['add', '.']);
@ -130,24 +162,25 @@ const createAndPushBranch = ({ branchName, commitMessage }) => {
execFileSync('git', [
'push',
'-q',
`https://${process.env.GH_AUTH_TOKEN}@github.com/postlight/mercury-parser.git`
`https://${
process.env.GH_AUTH_TOKEN
}@github.com/postlight/mercury-parser.git`,
]);
}
};
const createPR = ({ branchName, title, body = '' }) => {
octokit.authenticate({
type: 'token',
token: process.env.GH_AUTH_TOKEN
})
token: process.env.GH_AUTH_TOKEN,
});
octokit.pulls.create({
owner: 'postlight',
owner: 'postlight',
repo: 'mercury-parser',
title,
head: branchName,
base: 'master',
body,
maintainer_can_modify: true
})
}
title,
head: branchName,
base: 'master',
body,
maintainer_can_modify: true,
});
};

@ -4,7 +4,5 @@ import { CLEAN_AUTHOR_RE } from './constants';
// Take an author string (like 'By David Smith ') and clean it to
// just the name(s): 'David Smith'.
export default function cleanAuthor(author) {
return normalizeSpaces(
author.replace(CLEAN_AUTHOR_RE, '$2').trim()
);
return normalizeSpaces(author.replace(CLEAN_AUTHOR_RE, '$2').trim());
}

@ -1,6 +1,6 @@
// CLEAN AUTHOR CONSTANTS
export const CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i;
// author = re.sub(r'^\s*(posted |written )?by\s*:?\s*(.*)(?i)',
// author = re.sub(r'^\s*(posted |written )?by\s*:?\s*(.*)(?i)',
// CLEAN DEK CONSTANTS
export const TEXT_LINK_RE = new RegExp('http(s)?://', 'i');
@ -13,17 +13,14 @@ export const TEXT_LINK_RE = new RegExp('http(s)?://', 'i');
// - dc.description
// However, these tags often have SEO-specific junk in them that's not
// header-worthy like a dek is. Excerpt material at best.
export const DEK_META_TAGS = [
];
export const DEK_META_TAGS = [];
// An ordered list of Selectors to find likely article deks. From
// most explicit to least explicit.
//
// Should be more restrictive than not, as a failed dek can be pretty
// detrimental to the aesthetics of an article.
export const DEK_SELECTORS = [
'.entry-summary',
];
export const DEK_SELECTORS = ['.entry-summary'];
// CLEAN DATE PUBLISHED CONSTANTS
export const MS_DATE_STRING = /^\d{13}$/i;
@ -49,8 +46,10 @@ const allMonths = months.join('|');
const timestamp1 = '[0-9]{1,2}:[0-9]{2,2}( ?[ap].?m.?)?';
const timestamp2 = '[0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4}';
const timestamp3 = '-[0-9]{3,4}$';
export const SPLIT_DATE_STRING =
new RegExp(`(${timestamp1})|(${timestamp2})|(${timestamp3})|([0-9]{1,4})|(${allMonths})`, 'ig');
export const SPLIT_DATE_STRING = new RegExp(
`(${timestamp1})|(${timestamp2})|(${timestamp3})|([0-9]{1,4})|(${allMonths})`,
'ig'
);
// 2016-11-22T08:57-500
// Check if datetime string has an offset at the end
@ -61,5 +60,4 @@ export const TIME_WITH_OFFSET_RE = /-\d{3,4}$/;
// title, that usually denote breadcrumbs or something similar.
export const TITLE_SPLITTERS_RE = /(: | - | \| )/g;
export const DOMAIN_ENDINGS_RE =
new RegExp('.com$|.net$|.org$|.co.uk$', 'g');
export const DOMAIN_ENDINGS_RE = new RegExp('.com$|.net$|.org$|.co.uk$', 'g');

@ -14,13 +14,7 @@ import {
// Clean our article content, returning a new, cleaned node.
export default function extractCleanNode(
article,
{
$,
cleanConditionally = true,
title = '',
url = '',
defaultCleaner = true,
}
{ $, cleanConditionally = true, title = '', url = '', defaultCleaner = true }
) {
// Rewrite the tag name to div if it's a top level node like body or
// html to avoid later complications with multiple body tags.

@ -20,10 +20,11 @@ describe('extractCleanNode(article, { $, cleanConditionally, title } })', () =>
const cleanNode = extractCleanNode(bestNode, { $, opts });
const text = $(cleanNode).text()
.replace(/\n/g, '')
.replace(/\s+/g, ' ')
.trim();
const text = $(cleanNode)
.text()
.replace(/\n/g, '')
.replace(/\s+/g, ' ')
.trim();
assert.equal(text.length === 2656 || text.length === 2657, true);
});
});

@ -16,11 +16,11 @@ import {
export function cleanDateString(dateString) {
return (dateString.match(SPLIT_DATE_STRING) || [])
.join(' ')
.replace(TIME_MERIDIAN_DOTS_RE, 'm')
.replace(TIME_MERIDIAN_SPACE_RE, '$1 $2 $3')
.replace(CLEAN_DATE_STRING_RE, '$1')
.trim();
.join(' ')
.replace(TIME_MERIDIAN_DOTS_RE, 'm')
.replace(TIME_MERIDIAN_SPACE_RE, '$1 $2 $3')
.replace(CLEAN_DATE_STRING_RE, '$1')
.trim();
}
export function createDate(dateString, timezone, format) {
@ -28,14 +28,17 @@ export function createDate(dateString, timezone, format) {
return moment(new Date(dateString));
}
return timezone ?
moment.tz(dateString, format || parseFormat(dateString), timezone) :
moment(dateString, format || parseFormat(dateString));
return timezone
? moment.tz(dateString, format || parseFormat(dateString), timezone)
: moment(dateString, format || parseFormat(dateString));
}
// Take a date published string, and hopefully return a date out of
// it. Return none if we fail.
export default function cleanDatePublished(dateString, { timezone, format } = {}) {
export default function cleanDatePublished(
dateString,
{ timezone, format } = {}
) {
// If string is in milliseconds or seconds, convert to int and return
if (MS_DATE_STRING.test(dateString) || SEC_DATE_STRING.test(dateString)) {
return new Date(parseInt(dateString, 10)).toISOString();

@ -26,16 +26,19 @@ describe('cleanDatePublished(dateString)', () => {
it('handles timezones', () => {
// The JS date parser is forgiving, but
// it needs am/pm separated from a time
const datePublished =
cleanDatePublished('November 29, 2016: 8:18 AM ET', { timezone: 'America/New_York' });
const datePublished = cleanDatePublished('November 29, 2016: 8:18 AM ET', {
timezone: 'America/New_York',
});
assert.equal(datePublished, '2016-11-29T13:18:00.000Z');
});
it('accepts a custom date format', () => {
// The JS date parser is forgiving, but
// it needs am/pm separated from a time
const datePublished =
cleanDatePublished('Mon Aug 03 12:45:00 EDT 2015', { timezone: 'America/New_York', format: 'ddd MMM DD HH:mm:ss zz YYYY' });
const datePublished = cleanDatePublished('Mon Aug 03 12:45:00 EDT 2015', {
timezone: 'America/New_York',
format: 'ddd MMM DD HH:mm:ss zz YYYY',
});
assert.equal(datePublished, '2015-08-03T16:45:00.000Z');
});
});
@ -73,7 +76,9 @@ describe('cleanDateString(dateString)', () => {
it('can handle some tough timestamps', () => {
// The JS date parser is forgiving, but
// it needs am/pm separated from a time
const date1 = cleanDateString('This page was last modified on 15 April 2016, at 10:59.');
const date1 = cleanDateString(
'This page was last modified on 15 April 2016, at 10:59.'
);
assert.equal(date1, '15 Apr 2016 10:59');
});

@ -1,8 +1,5 @@
import { stripTags } from 'utils/dom';
import {
excerptContent,
normalizeSpaces,
} from 'utils/text';
import { excerptContent, normalizeSpaces } from 'utils/text';
import { TEXT_LINK_RE } from './constants';
@ -13,7 +10,8 @@ export default function cleanDek(dek, { $, excerpt }) {
if (dek.length > 1000 || dek.length < 5) return null;
// Check that dek isn't the same as excerpt
if (excerpt && excerptContent(excerpt, 10) === excerptContent(dek, 10)) return null;
if (excerpt && excerptContent(excerpt, 10) === excerptContent(dek, 10))
return null;
const dekText = stripTags(dek, $);

@ -13,7 +13,7 @@ describe('cleanDek(dekString, { $ })', () => {
const $ = cheerio.load('<div></div>');
const longDek =
// generate a string that is 1,280 chars
[0, 1, 2, 3, 4, 5, 6].reduce((acc) => {
[0, 1, 2, 3, 4, 5, 6].reduce(acc => {
acc += acc;
return acc;
}, '0123456789');

@ -1,10 +1,7 @@
import URL from 'url';
import wuzzy from 'wuzzy';
import {
TITLE_SPLITTERS_RE,
DOMAIN_ENDINGS_RE,
} from './constants';
import { TITLE_SPLITTERS_RE, DOMAIN_ENDINGS_RE } from './constants';
function extractBreadcrumbTitle(splitTitle, text) {
// This must be a very breadcrumbed title, like:
@ -19,15 +16,16 @@ function extractBreadcrumbTitle(splitTitle, text) {
return acc;
}, {});
const [maxTerm, termCount] =
Reflect.ownKeys(termCounts)
.reduce((acc, key) => {
if (acc[1] < termCounts[key]) {
return [key, termCounts[key]];
}
const [maxTerm, termCount] = Reflect.ownKeys(termCounts).reduce(
(acc, key) => {
if (acc[1] < termCounts[key]) {
return [key, termCounts[key]];
}
return acc;
}, [0, 0]);
return acc;
},
[0, 0]
);
// We found a splitter that was used more than once, so it
// is probably the breadcrumber. Split our title on that instead.
@ -38,7 +36,10 @@ function extractBreadcrumbTitle(splitTitle, text) {
}
const splitEnds = [splitTitle[0], splitTitle.slice(-1)];
const longestEnd = splitEnds.reduce((acc, end) => acc.length > end.length ? acc : end, '');
const longestEnd = splitEnds.reduce(
(acc, end) => (acc.length > end.length ? acc : end),
''
);
if (longestEnd.length > 10) {
return longestEnd;
@ -67,7 +68,10 @@ function cleanDomainFromTitle(splitTitle, url) {
return splitTitle.slice(2).join('');
}
const endSlug = splitTitle.slice(-1)[0].toLowerCase().replace(' ', '');
const endSlug = splitTitle
.slice(-1)[0]
.toLowerCase()
.replace(' ', '');
const endSlugRatio = wuzzy.levenshtein(endSlug, nakedDomain);
if (endSlugRatio > 0.4 && endSlug.length >= 5) {

@ -3,18 +3,16 @@ import RootExtractor from 'extractors/root-extractor';
import GenericExtractor from 'extractors/generic';
import Resource from 'resource';
export default async function collectAllPages(
{
next_page_url,
html,
$,
metaCache,
result,
Extractor,
title,
url,
}
) {
export default async function collectAllPages({
next_page_url,
html,
$,
metaCache,
result,
Extractor,
title,
url,
}) {
// At this point, we've fetched just the first page
let pages = 1;
const previousUrls = [removeAnchor(url)];
@ -41,13 +39,17 @@ export default async function collectAllPages(
previousUrls.push(next_page_url);
result = {
...result,
content: `${result.content}<hr><h4>Page ${pages}</h4>${nextPageResult.content}`,
content: `${result.content}<hr><h4>Page ${pages}</h4>${
nextPageResult.content
}`,
};
next_page_url = nextPageResult.next_page_url;
}
const word_count = GenericExtractor.word_count({ content: `<div>${result.content}</div>` });
const word_count = GenericExtractor.word_count({
content: `<div>${result.content}</div>`,
});
return {
...result,
total_pages: pages,

@ -6,7 +6,7 @@
//
describe('collectAllPages(opts)', () => {
it('fetches additional pages', () => {
// const html = fs.readFileSync('./fixtures/ars.html');
// const $ = cheerio.load(html);
// const html = fs.readFileSync('./fixtures/ars.html');
// const $ = cheerio.load(html);
});
});

@ -2,45 +2,31 @@ export const twofortysevensportsComExtractor = {
domain: '247sports.com',
title: {
selectors: [
'title',
'article header h1',
],
selectors: ['title', 'article header h1'],
},
author: {
selectors: [
'.author',
],
selectors: ['.author'],
},
date_published: {
selectors: [
['time[data-published]', 'data-published'],
],
selectors: [['time[data-published]', 'data-published']],
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
content: {
selectors: [
'section.body.article',
],
selectors: ['section.body.article'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
},
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
],
clean: [],
},
};

@ -14,10 +14,10 @@ describe('twofortysevensportsComExtractor', () => {
beforeAll(() => {
url =
'http://247sports.com/Bolt/Breaking-Houston-hires-Major-Applewhite-as-head-coach-49676932';
const html =
fs.readFileSync('./fixtures/247sports.com/1481309665090.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync(
'./fixtures/247sports.com/1481309665090.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -29,43 +29,49 @@ describe('twofortysevensportsComExtractor', () => {
});
it('returns the title', async () => {
// To pass this test, fill out the title selector
// in ./src/extractors/custom/247sports.com/index.js.
// To pass this test, fill out the title selector
// in ./src/extractors/custom/247sports.com/index.js.
const { title } = await result;
// Update these values with the expected values from
// the article.
assert.equal(title, 'Breaking: Houston hires Major Applewhite as head coach');
// Update these values with the expected values from
// the article.
assert.equal(
title,
'Breaking: Houston hires Major Applewhite as head coach'
);
});
it('returns the author', async () => {
// To pass this test, fill out the author selector
// in ./src/extractors/custom/247sports.com/index.js.
// To pass this test, fill out the author selector
// in ./src/extractors/custom/247sports.com/index.js.
const { author } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(author, 'Travis Haney');
});
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/247sports.com/index.js.
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/247sports.com/index.js.
const { date_published } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-09T09:13:00.000Z');
});
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/247sports.com/index.js.
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/247sports.com/index.js.
const { lead_image_url } = await result;
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'https://s3media.247sports.com/Uploads/Assets/149/971/26_4971149.jpg');
// Update these values with the expected values from
// the article.
assert.equal(
lead_image_url,
'https://s3media.247sports.com/Uploads/Assets/149/971/26_4971149.jpg'
);
});
it('returns the content', async () => {
@ -77,11 +83,19 @@ describe('twofortysevensportsComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, '(Photo: Brett Davis, USA TODAY Sports) Houston has promoted offensive coordinator Major Applewhite');
assert.equal(
first13,
'(Photo: Brett Davis, USA TODAY Sports) Houston has promoted offensive coordinator Major Applewhite'
);
});
});
});

@ -2,50 +2,33 @@ export const AbcnewsGoComExtractor = {
domain: 'abcnews.go.com',
title: {
selectors: [
'.article-header h1',
],
selectors: ['.article-header h1'],
},
author: {
selectors: [
'.authors',
],
clean: [
'.author-overlay',
'.by-text',
],
selectors: ['.authors'],
clean: ['.author-overlay', '.by-text'],
},
date_published: {
selectors: [
'.timestamp',
],
selectors: ['.timestamp'],
timezone: 'America/New_York',
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
content: {
selectors: [
'.article-copy',
],
selectors: ['.article-copy'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
},
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
],
clean: [],
},
};

@ -14,10 +14,10 @@ describe('AbcnewsGoComExtractor', () => {
beforeAll(() => {
url =
'http://abcnews.go.com/US/hillary-clinton-putins-alleged-involvement-democratic-hack-stems/story?id=44233864&cid=clicksource_4380645_2_three_posts_vert_hed';
const html =
fs.readFileSync('./fixtures/abcnews.go.com/1481922563840.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync(
'./fixtures/abcnews.go.com/1481922563840.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -29,43 +29,49 @@ describe('AbcnewsGoComExtractor', () => {
});
it('returns the title', async () => {
// To pass this test, fill out the title selector
// in ./src/extractors/custom/abcnews.go.com/index.js.
// To pass this test, fill out the title selector
// in ./src/extractors/custom/abcnews.go.com/index.js.
const { title } = await result;
// Update these values with the expected values from
// the article.
assert.equal(title, 'Hillary Clinton: Putin\'s Alleged Involvement in Democratic Hack Stems From Longtime Grudge');
// Update these values with the expected values from
// the article.
assert.equal(
title,
"Hillary Clinton: Putin's Alleged Involvement in Democratic Hack Stems From Longtime Grudge"
);
});
it('returns the author', async () => {
// To pass this test, fill out the author selector
// in ./src/extractors/custom/abcnews.go.com/index.js.
// To pass this test, fill out the author selector
// in ./src/extractors/custom/abcnews.go.com/index.js.
const { author } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(author, 'Josh Haskell David Caplan PATRICK REEVELL');
});
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/abcnews.go.com/index.js.
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/abcnews.go.com/index.js.
const { date_published } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-16T17:37:00.000Z');
});
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/abcnews.go.com/index.js.
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/abcnews.go.com/index.js.
const { lead_image_url } = await result;
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'http://a.abcnews.com/images/Politics/AP-hillary-clinton-01-as-161216_16x9_992.jpg');
// Update these values with the expected values from
// the article.
assert.equal(
lead_image_url,
'http://a.abcnews.com/images/Politics/AP-hillary-clinton-01-as-161216_16x9_992.jpg'
);
});
it('returns the content', async () => {
@ -77,11 +83,19 @@ describe('AbcnewsGoComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'Hillary Clinton has an explanation for Vladimir Putin\'s alleged involvement in the hacking');
assert.equal(
first13,
"Hillary Clinton has an explanation for Vladimir Putin's alleged involvement in the hacking"
);
});
});
});

@ -4,13 +4,10 @@ export const BloggerExtractor = {
// Blogger is insane and does not load its content
// initially in the page, but it's all there
// in noscript
selectors: [
'.post-content noscript',
],
selectors: ['.post-content noscript'],
// Selectors to remove from the extracted content
clean: [
],
clean: [],
// Convert the noscript tag to a div
transforms: {
@ -19,20 +16,14 @@ export const BloggerExtractor = {
},
author: {
selectors: [
'.post-author-name',
],
selectors: ['.post-author-name'],
},
title: {
selectors: [
'.post h2.title',
],
selectors: ['.post h2.title'],
},
date_published: {
selectors: [
'span.publishdate',
],
selectors: ['span.publishdate'],
},
};

@ -11,27 +11,20 @@ export const DeadspinExtractor = {
],
title: {
selectors: [
'h1.headline',
],
selectors: ['h1.headline'],
},
author: {
selectors: [
'.author',
],
selectors: ['.author'],
},
content: {
selectors: [
'.post-content',
'.entry-content',
],
selectors: ['.post-content', '.entry-content'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
'iframe.lazyload[data-recommend-id^="youtube://"]': ($node) => {
'iframe.lazyload[data-recommend-id^="youtube://"]': $node => {
const youtubeId = $node.attr('id').split('youtube-')[1];
$node.attr('src', `https://www.youtube.com/embed/${youtubeId}`);
},
@ -40,22 +33,15 @@ export const DeadspinExtractor = {
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
'.magnifier',
'.lightbox',
],
clean: ['.magnifier', '.lightbox'],
},
date_published: {
selectors: [
['time.updated[datetime]', 'datetime'],
],
selectors: [['time.updated[datetime]', 'datetime']],
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
dek: {

@ -14,10 +14,10 @@ describe('DeadspinExtractor', () => {
beforeAll(() => {
url =
'http://deadspin.com/the-nationals-are-stuck-with-danny-espinosa-tonight-un-1787706769';
const html =
fs.readFileSync('./fixtures/deadspin.com/1476389931786.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync(
'./fixtures/deadspin.com/1476389931786.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', async () => {
@ -31,7 +31,10 @@ describe('DeadspinExtractor', () => {
// Update these values with the expected values from
// the article.
const { title } = await result;
assert.equal(title, 'The Nationals Are Stuck With Danny Espinosa Tonight, Unless They Opt For The Only Thing Worse');
assert.equal(
title,
'The Nationals Are Stuck With Danny Espinosa Tonight, Unless They Opt For The Only Thing Worse'
);
});
it('returns the author', async () => {
@ -61,7 +64,10 @@ describe('DeadspinExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'https://i.kinja-img.com/gawker-media/image/upload/s--SUEXWZgf--/c_fill,fl_progressive,g_center,h_450,q_80,w_800/vmeayd7lteyycwzcdlju.jpg');
assert.equal(
lead_image_url,
'https://i.kinja-img.com/gawker-media/image/upload/s--SUEXWZgf--/c_fill,fl_progressive,g_center,h_450,q_80,w_800/vmeayd7lteyycwzcdlju.jpg'
);
});
it('returns the content', async () => {
@ -73,11 +79,19 @@ describe('DeadspinExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'Photo credit: Rob Carr/Getty Washingtons Danny Espinosa problem is inextricably linked to its');
assert.equal(
first13,
'Photo credit: Rob Carr/Getty Washingtons Danny Espinosa problem is inextricably linked to its'
);
});
});
@ -86,13 +100,11 @@ describe('DeadspinExtractor', () => {
// in ./src/extractors/custom/deadspin.com/index.js.
// You may also want to make use of the clean and transform
// options.
const html =
fs.readFileSync('./fixtures/deadspin.com/1477505848605.html');
const html = fs.readFileSync('./fixtures/deadspin.com/1477505848605.html');
const url =
'http://deadspin.com/remember-when-donald-trump-got-booed-for-butchering-ta-1788216229';
const { content } =
await Mercury.parse(url, html, { fallback: false });
const { content } = await Mercury.parse(url, html, { fallback: false });
const $ = cheerio.load(content || '');

@ -12,7 +12,8 @@ export const WikiaExtractor = {
author: {
selectors: [
'.author vcard', '.fn',
'.author vcard',
'.fn',
// enter author selectors
],
},
@ -26,32 +27,24 @@ export const WikiaExtractor = {
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: [
],
transforms: [],
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
],
clean: [],
},
date_published: {
selectors: [
['meta[name="article:published_time"]', 'value'],
],
selectors: [['meta[name="article:published_time"]', 'value']],
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
dek: {
selectors: [
],
selectors: [],
},
next_page_url: null,

@ -13,12 +13,11 @@ describe('WikiaExtractor', () => {
let result;
let url;
beforeAll(() => {
url =
'http://fandom.wikia.com/articles/box-office-good-peculiar';
const html =
fs.readFileSync('./fixtures/fandom.wikia.com/1475595373938.html');
result =
Mercury.parse(url, html, { fallback: false });
url = 'http://fandom.wikia.com/articles/box-office-good-peculiar';
const html = fs.readFileSync(
'./fixtures/fandom.wikia.com/1475595373938.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', async () => {
@ -68,7 +67,10 @@ describe('WikiaExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'http://fandom.wikia.com/wp-content/uploads/2016/10/box-office-peculiar-feature-hero.jpg');
assert.equal(
lead_image_url,
'http://fandom.wikia.com/wp-content/uploads/2016/10/box-office-peculiar-feature-hero.jpg'
);
});
it('returns the content', async () => {
@ -80,11 +82,19 @@ describe('WikiaExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'Tim Burton once again claimed the top spot at the box office. Miss');
assert.equal(
first13,
'Tim Burton once again claimed the top spot at the box office. Miss'
);
});
});
});

@ -2,47 +2,33 @@ export const FortuneComExtractor = {
domain: 'fortune.com',
title: {
selectors: [
'h1',
],
selectors: ['h1'],
},
author: {
selectors: [
['meta[name="author"]', 'value'],
],
selectors: [['meta[name="author"]', 'value']],
},
date_published: {
selectors: [
'.MblGHNMJ',
],
selectors: ['.MblGHNMJ'],
timezone: 'UTC',
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
content: {
selectors: [
['picture', 'article.row'],
'article.row',
],
selectors: [['picture', 'article.row'], 'article.row'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
},
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
],
clean: [],
},
};

@ -12,12 +12,9 @@ describe('FortuneComExtractor', () => {
let result;
let url;
beforeAll(() => {
url =
'http://fortune.com/2016/12/15/amazon-alexa-gadgets/';
const html =
fs.readFileSync('./fixtures/fortune.com/1485216994169.html');
result =
Mercury.parse(url, html, { fallback: false });
url = 'http://fortune.com/2016/12/15/amazon-alexa-gadgets/';
const html = fs.readFileSync('./fixtures/fortune.com/1485216994169.html');
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -29,43 +26,49 @@ describe('FortuneComExtractor', () => {
});
it('returns the title', async () => {
// To pass this test, fill out the title selector
// in ./src/extractors/custom/fortune.com/index.js.
// To pass this test, fill out the title selector
// in ./src/extractors/custom/fortune.com/index.js.
const { title } = await result;
// Update these values with the expected values from
// the article.
assert.equal(title, 'Here Are 6 Cant-Miss Gadgets Powered by Amazons Alexa Assistant');
// Update these values with the expected values from
// the article.
assert.equal(
title,
'Here Are 6 Cant-Miss Gadgets Powered by Amazons Alexa Assistant'
);
});
it('returns the author', async () => {
// To pass this test, fill out the author selector
// in ./src/extractors/custom/fortune.com/index.js.
// To pass this test, fill out the author selector
// in ./src/extractors/custom/fortune.com/index.js.
const { author } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(author, 'John Patrick Pullen, TIME');
});
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/fortune.com/index.js.
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/fortune.com/index.js.
const { date_published } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-15T14:57:00.000Z');
});
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/fortune.com/index.js.
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/fortune.com/index.js.
const { lead_image_url } = await result;
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'https://fortunedotcom.files.wordpress.com/2016/05/amazon-echo-2.jpg?w=720');
// Update these values with the expected values from
// the article.
assert.equal(
lead_image_url,
'https://fortunedotcom.files.wordpress.com/2016/05/amazon-echo-2.jpg?w=720'
);
});
it('returns the content', async () => {
@ -77,11 +80,19 @@ describe('FortuneComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'The Amazon Echo Dot, the retailers puck-shaped smart speaker, is is killing it');
assert.equal(
first13,
'The Amazon Echo Dot, the retailers puck-shaped smart speaker, is is killing it'
);
});
});
});

@ -2,22 +2,15 @@ export const ForwardComExtractor = {
domain: 'forward.com',
title: {
selectors: [
['meta[name="og:title"]', 'value'],
],
selectors: [['meta[name="og:title"]', 'value']],
},
author: {
selectors: [
'.author-name',
['meta[name="sailthru.author"]', 'value'],
],
selectors: ['.author-name', ['meta[name="sailthru.author"]', 'value']],
},
date_published: {
selectors: [
['meta[name="date"]', 'value'],
],
selectors: [['meta[name="date"]', 'value']],
},
dek: {
@ -27,28 +20,19 @@ export const ForwardComExtractor = {
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
content: {
selectors: [
['.post-item-media-wrap', '.post-item p'],
],
selectors: [['.post-item-media-wrap', '.post-item p']],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
},
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
'.donate-box',
'.message',
'.subtitle',
],
clean: ['.donate-box', '.message', '.subtitle'],
},
};

@ -14,10 +14,8 @@ describe('ForwardComExtractor', () => {
beforeAll(() => {
url =
'http://forward.com/schmooze/358592/why-does-slack-want-me-to-say-hummus/';
const html =
fs.readFileSync('./fixtures/forward.com/1488392273490.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync('./fixtures/forward.com/1488392273490.html');
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -29,32 +27,35 @@ describe('ForwardComExtractor', () => {
});
it('returns the title', async () => {
// To pass this test, fill out the title selector
// in ./src/extractors/custom/forward.com/index.js.
// To pass this test, fill out the title selector
// in ./src/extractors/custom/forward.com/index.js.
const { title } = await result;
// Update these values with the expected values from
// the article.
assert.equal(title, 'The Adorable Reason You Can Set Your Alert Sound to “Hummus” on Slack');
// Update these values with the expected values from
// the article.
assert.equal(
title,
'The Adorable Reason You Can Set Your Alert Sound to “Hummus” on Slack'
);
});
it('returns the author', async () => {
// To pass this test, fill out the author selector
// in ./src/extractors/custom/forward.com/index.js.
// To pass this test, fill out the author selector
// in ./src/extractors/custom/forward.com/index.js.
const { author } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(author, 'Laura E. Adkins');
});
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/forward.com/index.js.
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/forward.com/index.js.
const { date_published } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-28T20:32:00.000Z');
});
@ -69,13 +70,16 @@ describe('ForwardComExtractor', () => {
// });
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/forward.com/index.js.
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/forward.com/index.js.
const { lead_image_url } = await result;
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'https://s3.amazonaws.com/assets.forward.com/images/cropped/gettyimages-457536286-1482958420.jpg');
// Update these values with the expected values from
// the article.
assert.equal(
lead_image_url,
'https://s3.amazonaws.com/assets.forward.com/images/cropped/gettyimages-457536286-1482958420.jpg'
);
});
it('returns the content', async () => {
@ -87,11 +91,19 @@ describe('ForwardComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'Getty Images Sometimes (okay, a lot of the time), if Im in the');
assert.equal(
first13,
'Getty Images Sometimes (okay, a lot of the time), if Im in the'
);
});
});
});

@ -2,23 +2,15 @@ export const FusionNetExtractor = {
domain: 'fusion.net',
title: {
selectors: [
'.post-title',
'.single-title',
'.headline',
],
selectors: ['.post-title', '.single-title', '.headline'],
},
author: {
selectors: [
'.show-for-medium .byline',
],
selectors: ['.show-for-medium .byline'],
},
date_published: {
selectors: [
['time.local-time', 'datetime'],
],
selectors: [['time.local-time', 'datetime']],
},
dek: {
@ -28,9 +20,7 @@ export const FusionNetExtractor = {
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
content: {
@ -48,8 +38,6 @@ export const FusionNetExtractor = {
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
],
clean: [],
},
};

@ -14,10 +14,8 @@ describe('FusionNetExtractor', () => {
beforeAll(() => {
url =
'http://fusion.net/story/377467/la-la-land-oscar-hollywood-musicals-race/';
const html =
fs.readFileSync('./fixtures/fusion.net/1482529202024.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync('./fixtures/fusion.net/1482529202024.html');
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -29,43 +27,49 @@ describe('FusionNetExtractor', () => {
});
it('returns the title', async () => {
// To pass this test, fill out the title selector
// in ./src/extractors/custom/fusion.net/index.js.
// To pass this test, fill out the title selector
// in ./src/extractors/custom/fusion.net/index.js.
const { title } = await result;
// Update these values with the expected values from
// the article.
assert.equal(title, 'La La Land might win an Oscar but its got some bizarre racial politics');
// Update these values with the expected values from
// the article.
assert.equal(
title,
'La La Land might win an Oscar but its got some bizarre racial politics'
);
});
it('returns the author', async () => {
// To pass this test, fill out the author selector
// in ./src/extractors/custom/fusion.net/index.js.
// To pass this test, fill out the author selector
// in ./src/extractors/custom/fusion.net/index.js.
const { author } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(author, 'Jack Mirkinson');
});
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/fusion.net/index.js.
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/fusion.net/index.js.
const { date_published } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-23T16:14:25.000Z');
});
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/fusion.net/index.js.
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/fusion.net/index.js.
const { lead_image_url } = await result;
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'https://i0.wp.com/fusion.net/wp-content/uploads/2016/12/screen-shot-2016-12-23-at-8-22-27-am.png?resize=1200%2C630&quality=80&strip=all');
// Update these values with the expected values from
// the article.
assert.equal(
lead_image_url,
'https://i0.wp.com/fusion.net/wp-content/uploads/2016/12/screen-shot-2016-12-23-at-8-22-27-am.png?resize=1200%2C630&quality=80&strip=all'
);
});
it('returns the content', async () => {
@ -77,11 +81,19 @@ describe('FusionNetExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'Warning: this piece contains spoilers. I love musicals. Like, love them love them.');
assert.equal(
first13,
'Warning: this piece contains spoilers. I love musicals. Like, love them love them.'
);
});
});
});

@ -10,43 +10,29 @@ export const GothamistComExtractor = {
],
title: {
selectors: [
'h1',
'.entry-header h1',
],
selectors: ['h1', '.entry-header h1'],
},
author: {
selectors: [
'.author',
],
selectors: ['.author'],
},
date_published: {
selectors: [
'abbr',
'abbr.published',
],
selectors: ['abbr', 'abbr.published'],
timezone: 'America/New_York',
},
dek: {
selectors: [
null,
],
selectors: [null],
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
content: {
selectors: [
'.entry-body',
],
selectors: ['.entry-body'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images

@ -12,12 +12,11 @@ describe('GothamistComExtractor', () => {
let result;
let url;
beforeAll(() => {
url =
'http://gothamist.com/2017/03/09/fallout_shelters_nyc.php';
const html =
fs.readFileSync('./fixtures/gothamist.com/1489073770258.html');
result =
Mercury.parse(url, html, { fallback: false });
url = 'http://gothamist.com/2017/03/09/fallout_shelters_nyc.php';
const html = fs.readFileSync(
'./fixtures/gothamist.com/1489073770258.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -29,53 +28,59 @@ describe('GothamistComExtractor', () => {
});
it('returns the title', async () => {
// To pass this test, fill out the title selector
// in ./src/extractors/custom/gothamist.com/index.js.
// To pass this test, fill out the title selector
// in ./src/extractors/custom/gothamist.com/index.js.
const { title } = await result;
// Update these values with the expected values from
// the article.
assert.equal(title, 'Helter Shelter: NYC\'s Fallout Shelters Basically Don\'t Exist Anymore');
// Update these values with the expected values from
// the article.
assert.equal(
title,
"Helter Shelter: NYC's Fallout Shelters Basically Don't Exist Anymore"
);
});
it('returns the author', async () => {
// To pass this test, fill out the author selector
// in ./src/extractors/custom/gothamist.com/index.js.
// To pass this test, fill out the author selector
// in ./src/extractors/custom/gothamist.com/index.js.
const { author } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(author, 'Nathan Tempey');
});
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/gothamist.com/index.js.
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/gothamist.com/index.js.
const { date_published } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2017-03-09T15:15:00.000Z');
});
it('returns the dek', async () => {
// To pass this test, fill out the dek selector
// in ./src/extractors/custom/gothamist.com/index.js.
// To pass this test, fill out the dek selector
// in ./src/extractors/custom/gothamist.com/index.js.
const { dek } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(dek, null);
});
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/gothamist.com/index.js.
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/gothamist.com/index.js.
const { lead_image_url } = await result;
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'http://gothamist.com/assets_c/2017/03/030717FalloutShelter80NY-5-thumb-640xauto-989222.jpg');
// Update these values with the expected values from
// the article.
assert.equal(
lead_image_url,
'http://gothamist.com/assets_c/2017/03/030717FalloutShelter80NY-5-thumb-640xauto-989222.jpg'
);
});
it('returns the content', async () => {
@ -87,11 +92,19 @@ describe('GothamistComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'The basement at 80 New York Avenue in Crown Heights is one of');
assert.equal(
first13,
'The basement at 80 New York Avenue in Crown Heights is one of'
);
});
});
});

@ -2,44 +2,31 @@ export const HellogigglesComExtractor = {
domain: 'hellogiggles.com',
title: {
selectors: [
'.title',
],
selectors: ['.title'],
},
author: {
selectors: [
'.author-link',
],
selectors: ['.author-link'],
},
date_published: {
selectors: [
['meta[name="article:published_time"]', 'value'],
],
selectors: [['meta[name="article:published_time"]', 'value']],
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
content: {
selectors: [
'.entry-content',
],
selectors: ['.entry-content'],
// Is there anything in the content you selected that needs to be transformed
// before it's consumable content? E.g., unusual lazy-loaded images
transforms: {
},
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
],
clean: [],
},
};

@ -12,12 +12,11 @@ describe('HellogigglesComExtractor', () => {
let result;
let url;
beforeAll(() => {
url =
'http://hellogiggles.com/comfy-chic-holiday-outfit-illustrated/';
const html =
fs.readFileSync('./fixtures/hellogiggles.com/1482437663500.html');
result =
Mercury.parse(url, html, { fallback: false });
url = 'http://hellogiggles.com/comfy-chic-holiday-outfit-illustrated/';
const html = fs.readFileSync(
'./fixtures/hellogiggles.com/1482437663500.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -29,43 +28,46 @@ describe('HellogigglesComExtractor', () => {
});
it('returns the title', async () => {
// To pass this test, fill out the title selector
// in ./src/extractors/custom/hellogiggles.com/index.js.
// To pass this test, fill out the title selector
// in ./src/extractors/custom/hellogiggles.com/index.js.
const { title } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(title, 'Your comfy-chic holiday outfit, illustrated');
});
it('returns the author', async () => {
// To pass this test, fill out the author selector
// in ./src/extractors/custom/hellogiggles.com/index.js.
// To pass this test, fill out the author selector
// in ./src/extractors/custom/hellogiggles.com/index.js.
const { author } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(author, 'Cindy Mangomini');
});
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/hellogiggles.com/index.js.
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/hellogiggles.com/index.js.
const { date_published } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-22T00:05:23.000Z');
});
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/hellogiggles.com/index.js.
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/hellogiggles.com/index.js.
const { lead_image_url } = await result;
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'http://images.hellogiggles.com/uploads/2016/12/21073729/HG-Xmas-NY-Cindy-Mangomini1.jpg');
// Update these values with the expected values from
// the article.
assert.equal(
lead_image_url,
'http://images.hellogiggles.com/uploads/2016/12/21073729/HG-Xmas-NY-Cindy-Mangomini1.jpg'
);
});
it('returns the content', async () => {
@ -77,11 +79,19 @@ describe('HellogigglesComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'Its almost that time! The time to quietly panic and throw all the');
assert.equal(
first13,
'Its almost that time! The time to quietly panic and throw all the'
);
});
});
});

@ -2,52 +2,37 @@ export const IciRadioCanadaCaExtractor = {
domain: 'ici.radio-canada.ca',
title: {
selectors: [
'h1',
],
selectors: ['h1'],
},
author: {
selectors: [
['meta[name="dc.creator"]', 'value'],
],
selectors: [['meta[name="dc.creator"]', 'value']],
},
date_published: {
selectors: [
['meta[name="dc.date.created"]', 'value'],
],
selectors: [['meta[name="dc.date.created"]', 'value']],
timezone: 'America/New_York',
},
dek: {
selectors: [
'.bunker-component.lead',
],
selectors: ['.bunker-component.lead'],
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
content: {
selectors: [
['.main-multimedia-item', '.news-story-content'],
],
selectors: [['.main-multimedia-item', '.news-story-content']],
// Is there anything in the content you selected that needs to be transformed
// before it's consumable content? E.g., unusual lazy-loaded images
transforms: {
},
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
],
clean: [],
},
};

@ -14,10 +14,10 @@ describe('IciRadioCanadaCaExtractor', () => {
beforeAll(() => {
url =
'http://ici.radio-canada.ca/nouvelle/1022038/kpmg-comptables-fiscalite-impots-paradis-fiscaux-juge-bocock-cocktail';
const html =
fs.readFileSync('./fixtures/ici.radio-canada.ca/1489433621634.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync(
'./fixtures/ici.radio-canada.ca/1489433621634.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -29,53 +29,62 @@ describe('IciRadioCanadaCaExtractor', () => {
});
it('returns the title', async () => {
// To pass this test, fill out the title selector
// in ./src/extractors/custom/ici.radio-canada.ca/index.js.
// To pass this test, fill out the title selector
// in ./src/extractors/custom/ici.radio-canada.ca/index.js.
const { title } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(title, 'Affaire KPMG: un juge se récuse');
});
it('returns the author', async () => {
// To pass this test, fill out the author selector
// in ./src/extractors/custom/ici.radio-canada.ca/index.js.
// To pass this test, fill out the author selector
// in ./src/extractors/custom/ici.radio-canada.ca/index.js.
const { author } = await result;
// Update these values with the expected values from
// the article.
assert.equal(author, 'Zone Justice et faits divers - ICI.Radio-Canada.ca');
// Update these values with the expected values from
// the article.
assert.equal(
author,
'Zone Justice et faits divers - ICI.Radio-Canada.ca'
);
});
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/ici.radio-canada.ca/index.js.
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/ici.radio-canada.ca/index.js.
const { date_published } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2017-03-13T23:18:00.000Z');
});
it('returns the dek', async () => {
// To pass this test, fill out the dek selector
// in ./src/extractors/custom/ici.radio-canada.ca/index.js.
// To pass this test, fill out the dek selector
// in ./src/extractors/custom/ici.radio-canada.ca/index.js.
const { dek } = await result;
// Update these values with the expected values from
// the article.
assert.equal(dek, 'Un juge de la cour de l\'impôt se récuse d\'un dossier mettant en cause un stratagème du cabinet comptable KPMG. Selon les émissions Enquête et the fifth estate, le juge Bocock avait participé à une soirée cocktail organisée par un cabinet d\'avocats lié à l\'affaire.');
// Update these values with the expected values from
// the article.
assert.equal(
dek,
"Un juge de la cour de l'impôt se récuse d'un dossier mettant en cause un stratagème du cabinet comptable KPMG. Selon les émissions Enquête et the fifth estate, le juge Bocock avait participé à une soirée cocktail organisée par un cabinet d'avocats lié à l'affaire."
);
});
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/ici.radio-canada.ca/index.js.
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/ici.radio-canada.ca/index.js.
const { lead_image_url } = await result;
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'https://images.radio-canada.ca/w_635,h_357/v1/ici-info/16x9/randall-bocock-juge.jpg');
// Update these values with the expected values from
// the article.
assert.equal(
lead_image_url,
'https://images.radio-canada.ca/w_635,h_357/v1/ici-info/16x9/randall-bocock-juge.jpg'
);
});
it('returns the content', async () => {
@ -87,11 +96,19 @@ describe('IciRadioCanadaCaExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'Le juge Randall Bocock se retire d\'une cause liée à KPMG Photo :');
assert.equal(
first13,
"Le juge Randall Bocock se retire d'une cause liée à KPMG Photo :"
);
});
});
});

@ -2,33 +2,23 @@ export const MashableComExtractor = {
domain: 'mashable.com',
title: {
selectors: [
'h1.title',
],
selectors: ['h1.title'],
},
author: {
selectors: [
'span.author_name a',
],
selectors: ['span.author_name a'],
},
date_published: {
selectors: [
['meta[name="og:article:published_time"]', 'value'],
],
selectors: [['meta[name="og:article:published_time"]', 'value']],
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
content: {
selectors: [
'section.article-content.blueprint',
],
selectors: ['section.article-content.blueprint'],
// Is there anything in the content you selected that needs to be transformed
// before it's consumable content? E.g., unusual lazy-loaded images
@ -39,8 +29,6 @@ export const MashableComExtractor = {
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
],
clean: [],
},
};

@ -14,10 +14,10 @@ describe('MashableComExtractor', () => {
beforeAll(() => {
url =
'http://mashable.com/2016/12/13/mysterious-plane-flying-over-new-york/?utm_cid=hp-n-1#sxBI1HiPKsqG';
const html =
fs.readFileSync('./fixtures/mashable.com/1481670648585.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync(
'./fixtures/mashable.com/1481670648585.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -29,43 +29,49 @@ describe('MashableComExtractor', () => {
});
it('returns the title', async () => {
// To pass this test, fill out the title selector
// in ./src/extractors/custom/mashable.com/index.js.
// To pass this test, fill out the title selector
// in ./src/extractors/custom/mashable.com/index.js.
const { title } = await result;
// Update these values with the expected values from
// the article.
assert.equal(title, 'Mysterious plane circling Manhattan sparks concern and intrigue');
// Update these values with the expected values from
// the article.
assert.equal(
title,
'Mysterious plane circling Manhattan sparks concern and intrigue'
);
});
it('returns the author', async () => {
// To pass this test, fill out the author selector
// in ./src/extractors/custom/mashable.com/index.js.
// To pass this test, fill out the author selector
// in ./src/extractors/custom/mashable.com/index.js.
const { author } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(author, 'Nicole Gallucci');
});
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/mashable.com/index.js.
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/mashable.com/index.js.
const { date_published } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-13T22:33:06.000Z');
});
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/mashable.com/index.js.
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/mashable.com/index.js.
const { lead_image_url } = await result;
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'http://a.amz.mshcdn.com/media/ZgkyMDE2LzEyLzEzL2UxL2xpbGlzYW1zbWFzaGFibGU1XzcyMC4wMWZkOS5qcGcKcAl0aHVtYgkxMjAweDYzMAplCWpwZw/29e123a7/0e0/lili-sams-mashable-5_720.jpg');
// Update these values with the expected values from
// the article.
assert.equal(
lead_image_url,
'http://a.amz.mshcdn.com/media/ZgkyMDE2LzEyLzEzL2UxL2xpbGlzYW1zbWFzaGFibGU1XzcyMC4wMWZkOS5qcGcKcAl0aHVtYgkxMjAweDYzMAplCWpwZw/29e123a7/0e0/lili-sams-mashable-5_720.jpg'
);
});
it('returns the content', async () => {
@ -77,11 +83,19 @@ describe('MashableComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'A large military-style plane, which looked remarkably like a C-130, circled Manhattan for');
assert.equal(
first13,
'A large military-style plane, which looked remarkably like a C-130, circled Manhattan for'
);
});
});
});

@ -1,20 +1,14 @@
export const MediumExtractor = {
domain: 'medium.com',
supportedDomains: [
'trackchanges.postlight.com',
],
supportedDomains: ['trackchanges.postlight.com'],
title: {
selectors: [
'h1',
],
selectors: ['h1'],
},
author: {
selectors: [
['meta[name="author"]', 'value'],
],
selectors: [['meta[name="author"]', 'value']],
},
content: {
@ -28,13 +22,12 @@ export const MediumExtractor = {
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
// Re-write lazy-loaded youtube videos
iframe: ($node) => {
const ytRe =
/https:\/\/i.embed.ly\/.+url=https:\/\/i\.ytimg\.com\/vi\/(\w+)\//;
iframe: $node => {
const ytRe = /https:\/\/i.embed.ly\/.+url=https:\/\/i\.ytimg\.com\/vi\/(\w+)\//;
const thumb = decodeURIComponent($node.attr('data-thumbnail'));
if (ytRe.test(thumb)) {
const [_, youtubeId] = thumb.match(ytRe) // eslint-disable-line
const [_, youtubeId] = thumb.match(ytRe); // eslint-disable-line
$node.attr('src', `https://www.youtube.com/embed/${youtubeId}`);
const $parent = $node.parents('figure');
const $caption = $parent.find('figcaption');
@ -43,7 +36,7 @@ export const MediumExtractor = {
},
// rewrite figures to pull out image and caption, remove rest
figure: ($node) => {
figure: $node => {
// ignore if figure has an iframe
if ($node.find('iframe').length > 0) return;
@ -56,21 +49,15 @@ export const MediumExtractor = {
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
],
clean: [],
},
date_published: {
selectors: [
['time[datetime]', 'datetime'],
],
selectors: [['time[datetime]', 'datetime']],
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
dek: {

@ -14,10 +14,8 @@ describe('MediumExtractor', () => {
beforeAll(() => {
url =
'https://medium.com/the-wtf-economy/wtf-whats-the-future-e52ab9515573#.ilwrgwsks';
const html =
fs.readFileSync('./fixtures/medium.com/1477523363921.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync('./fixtures/medium.com/1477523363921.html');
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', async () => {
@ -39,7 +37,7 @@ describe('MediumExtractor', () => {
it('returns the author', async () => {
const { author } = await result;
assert.equal(author, 'Tim O\'Reilly');
assert.equal(author, "Tim O'Reilly");
});
it('returns the date_published', async () => {
@ -61,7 +59,10 @@ describe('MediumExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'https://cdn-images-1.medium.com/max/1200/1*3Gzaug9mRc8vvx1cuQWkog.png');
assert.equal(
lead_image_url,
'https://cdn-images-1.medium.com/max/1200/1*3Gzaug9mRc8vvx1cuQWkog.png'
);
});
it('returns the content', async () => {
@ -69,12 +70,20 @@ describe('MediumExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// testing that youtube video transform is working
assert.equal(/IAoy3ia2ivI/.test(content), true);
assert.equal(first13, 'Video of WTF? My talk at the White House Frontiers ConferenceLast Thursday, I');
assert.equal(
first13,
'Video of WTF? My talk at the White House Frontiers ConferenceLast Thursday, I'
);
});
});
@ -85,8 +94,7 @@ describe('MediumExtractor', () => {
url =
'https://medium.com/@JakobUlbrich/flag-attributes-in-android-how-to-use-them-ac4ec8aee7d1#.h949wjmyw';
const html = fs.readFileSync('./fixtures/medium.com/1485902752952.html');
result =
Mercury.parse(url, html, { fallback: false });
result = Mercury.parse(url, html, { fallback: false });
});
it('returns the content', async () => {
@ -96,7 +104,10 @@ describe('MediumExtractor', () => {
const first13 = excerptContent($.text(), 13);
assert.equal(first13, 'Im sure you have seen something like the following line very often while');
assert.equal(
first13,
'Im sure you have seen something like the following line very often while'
);
});
});
});

@ -2,52 +2,37 @@ export const MoneyCnnComExtractor = {
domain: 'money.cnn.com',
title: {
selectors: [
'.article-title',
],
selectors: ['.article-title'],
},
author: {
selectors: [
'.byline a',
],
selectors: ['.byline a'],
},
date_published: {
selectors: [
['meta[name="date"]', 'value'],
],
selectors: [['meta[name="date"]', 'value']],
timezone: 'GMT',
},
dek: {
selectors: [
'#storytext h2',
],
selectors: ['#storytext h2'],
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
content: {
selectors: [
'#storytext',
],
selectors: ['#storytext'],
// Is there anything in the content you selected that needs to be transformed
// before it's consumable content? E.g., unusual lazy-loaded images
transforms: {
},
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
'.inStoryHeading',
],
clean: ['.inStoryHeading'],
},
};

@ -14,10 +14,10 @@ describe('MoneyCnnComExtractor', () => {
beforeAll(() => {
url =
'http://money.cnn.com/2016/11/29/news/ohare-workers-strike/index.html';
const html =
fs.readFileSync('./fixtures/money.cnn.com/1480437611330.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync(
'./fixtures/money.cnn.com/1480437611330.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', async () => {
// This test should be passing by default.
@ -34,7 +34,10 @@ describe('MoneyCnnComExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(title, 'Hundreds of Chicago O\'Hare airport workers go on strike');
assert.equal(
title,
"Hundreds of Chicago O'Hare airport workers go on strike"
);
});
it('returns the author', async () => {
@ -64,7 +67,10 @@ describe('MoneyCnnComExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(dek, 'Heads up, travelers: Hundreds of workers are striking at Chicago O\'Hare International Airport on Tuesday.');
assert.equal(
dek,
"Heads up, travelers: Hundreds of workers are striking at Chicago O'Hare International Airport on Tuesday."
);
});
it('returns the lead_image_url', async () => {
@ -74,7 +80,10 @@ describe('MoneyCnnComExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'http://i2.cdn.turner.com/money/dam/assets/161118102423-ohare-airport-strike-780x439.jpg');
assert.equal(
lead_image_url,
'http://i2.cdn.turner.com/money/dam/assets/161118102423-ohare-airport-strike-780x439.jpg'
);
});
it('returns the content', async () => {
@ -86,11 +95,19 @@ describe('MoneyCnnComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'Janitors, baggage handlers, cabin cleaners and wheelchair attendants are asking for a $15');
assert.equal(
first13,
'Janitors, baggage handlers, cabin cleaners and wheelchair attendants are asking for a $15'
);
});
});
});

@ -2,37 +2,25 @@ export const NewrepublicComExtractor = {
domain: 'newrepublic.com',
title: {
selectors: [
'h1.article-headline',
'.minutes-primary h1.minute-title',
],
selectors: ['h1.article-headline', '.minutes-primary h1.minute-title'],
},
author: {
selectors: [
'div.author-list',
'.minutes-primary h3.minute-byline',
],
selectors: ['div.author-list', '.minutes-primary h3.minute-byline'],
},
date_published: {
selectors: [
['meta[name="article:published_time"]', 'value'],
],
selectors: [['meta[name="article:published_time"]', 'value']],
timezone: 'America/New_York',
},
dek: {
selectors: [
'h2.article-subhead',
],
selectors: ['h2.article-subhead'],
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
content: {
@ -43,14 +31,11 @@ export const NewrepublicComExtractor = {
// Is there anything in the content you selected that needs to be transformed
// before it's consumable content? E.g., unusual lazy-loaded images
transforms: {
},
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
'aside',
],
clean: ['aside'],
},
};

@ -14,10 +14,10 @@ describe('NewrepublicComExtractor', () => {
beforeAll(() => {
url =
'https://newrepublic.com/article/138859/fantastic-beasts-nice-place-visit';
const html =
fs.readFileSync('./fixtures/newrepublic.com/1480434805231.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync(
'./fixtures/newrepublic.com/1480434805231.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', async () => {
// This test should be passing by default.
@ -64,7 +64,10 @@ describe('NewrepublicComExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(dek, 'The glorious world-building in the first Harry Potter spin-off isn\'t enough to keep viewers coming back.');
assert.equal(
dek,
"The glorious world-building in the first Harry Potter spin-off isn't enough to keep viewers coming back."
);
});
it('returns the lead_image_url', async () => {
@ -74,7 +77,10 @@ describe('NewrepublicComExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'https://images.newrepublic.com/29020c1e6b108813cf65b54487ad2b5a65aa6079.jpeg?w=1109&h=577&crop=faces&fit=crop&fm=jpg');
assert.equal(
lead_image_url,
'https://images.newrepublic.com/29020c1e6b108813cf65b54487ad2b5a65aa6079.jpeg?w=1109&h=577&crop=faces&fit=crop&fm=jpg'
);
});
it('article returns the content', async () => {
@ -86,11 +92,19 @@ describe('NewrepublicComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'The eight Harry Potter films, which stretched out over nearly a decade, had');
assert.equal(
first13,
'The eight Harry Potter films, which stretched out over nearly a decade, had'
);
});
});
@ -100,8 +114,9 @@ describe('NewrepublicComExtractor', () => {
beforeAll(async () => {
url =
'https://newrepublic.com/minutes/139022/maybe-donald-trumps-twitter-account-just-smoke-screen';
const html =
fs.readFileSync('./fixtures/newrepublic.com/1480446502259.html');
const html = fs.readFileSync(
'./fixtures/newrepublic.com/1480446502259.html'
);
result = await Mercury.parse(url, html, { fallback: false });
});
@ -112,7 +127,10 @@ describe('NewrepublicComExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(title, 'Maybe Donald Trumps Twitter account is more than just a smoke screen.');
assert.equal(
title,
'Maybe Donald Trumps Twitter account is more than just a smoke screen.'
);
});
it('article returns the author', async () => {
@ -134,11 +152,19 @@ describe('NewrepublicComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'Its been one of the most persistent narratives of the last year: Whenever');
assert.equal(
first13,
'Its been one of the most persistent narratives of the last year: Whenever'
);
});
});
});

@ -2,52 +2,39 @@ export const NewsNationalgeographicComExtractor = {
domain: 'news.nationalgeographic.com',
title: {
selectors: [
'h1',
'h1.main-title',
],
selectors: ['h1', 'h1.main-title'],
},
author: {
selectors: [
'.byline-component__contributors b span',
],
selectors: ['.byline-component__contributors b span'],
},
date_published: {
selectors: [
['meta[name="article:published_time"]', 'value'],
],
selectors: [['meta[name="article:published_time"]', 'value']],
format: 'ddd MMM DD HH:mm:ss zz YYYY',
timezone: 'EST',
},
dek: {
selectors: [
'.article__deck',
],
selectors: ['.article__deck'],
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
content: {
selectors: [
['.parsys.content', '.__image-lead__'],
'.content',
],
selectors: [['.parsys.content', '.__image-lead__'], '.content'],
// Is there anything in the content you selected that needs to be transformed
// before it's consumable content? E.g., unusual lazy-loaded images
transforms: {
'.parsys.content': ($node, $) => {
const $imgSrc = $node.find('.image.parbase.section')
.find('.picturefill')
.first()
.data('platform-src');
const $imgSrc = $node
.find('.image.parbase.section')
.find('.picturefill')
.first()
.data('platform-src');
if ($imgSrc) {
$node.prepend($(`<img class="__image-lead__" src="${$imgSrc}"/>`));
}
@ -57,8 +44,6 @@ export const NewsNationalgeographicComExtractor = {
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
'.pull-quote.pull-quote--large',
],
clean: ['.pull-quote.pull-quote--large'],
},
};

@ -14,10 +14,10 @@ describe('NewsNationalgeographicComExtractor', () => {
beforeAll(() => {
url =
'http://news.nationalgeographic.com/energy/2015/08/150803-gender-bias-affects-office-heating-cooling-temperatures/';
const html =
fs.readFileSync('./fixtures/news.nationalgeographic.com/1481919545107.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync(
'./fixtures/news.nationalgeographic.com/1481919545107.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -29,43 +29,52 @@ describe('NewsNationalgeographicComExtractor', () => {
});
it('returns the title', async () => {
// To pass this test, fill out the title selector
// in ./src/extractors/custom/news.nationalgeographic.com/index.js.
// To pass this test, fill out the title selector
// in ./src/extractors/custom/news.nationalgeographic.com/index.js.
const { title } = await result;
// Update these values with the expected values from
// the article.
assert.equal(title, 'Heres Why Your Office May Be Too Hot or Cold: Gender Bias');
// Update these values with the expected values from
// the article.
assert.equal(
title,
'Heres Why Your Office May Be Too Hot or Cold: Gender Bias'
);
});
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/news.nationalgeographic.com/index.js.
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/news.nationalgeographic.com/index.js.
const { date_published } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2015-08-03T17:45:00.000Z');
});
it('returns the dek', async () => {
// To pass this test, fill out the dek selector
// in ./src/extractors/custom/news.nationalgeographic.com/index.js.
// To pass this test, fill out the dek selector
// in ./src/extractors/custom/news.nationalgeographic.com/index.js.
const { dek } = await result;
// Update these values with the expected values from
// the article.
assert.equal(dek, 'Do you argue about the temperature in your office or home? Find out what often decides it, and tell us your preference.');
// Update these values with the expected values from
// the article.
assert.equal(
dek,
'Do you argue about the temperature in your office or home? Find out what often decides it, and tell us your preference.'
);
});
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/news.nationalgeographic.com/index.js.
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/news.nationalgeographic.com/index.js.
const { lead_image_url } = await result;
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'http://news.nationalgeographic.com/content/dam/news/2015/08/03/temperaturegenderbias/02tempgenderbias.ngsversion.1438795800319.jpg');
// Update these values with the expected values from
// the article.
assert.equal(
lead_image_url,
'http://news.nationalgeographic.com/content/dam/news/2015/08/03/temperaturegenderbias/02tempgenderbias.ngsversion.1438795800319.jpg'
);
});
it('returns the content', async () => {
@ -77,11 +86,19 @@ describe('NewsNationalgeographicComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'Many couples fight about it at home. No, its not money, sex, or');
assert.equal(
first13,
'Many couples fight about it at home. No, its not money, sex, or'
);
});
});
});

@ -2,17 +2,10 @@ export const NYMagExtractor = {
domain: 'nymag.com',
content: {
// Order by most likely. Extractor will stop on first occurrence
selectors: [
'div.article-content',
'section.body',
'article.article',
],
selectors: ['div.article-content', 'section.body', 'article.article'],
// Selectors to remove from the extracted content
clean: [
'.ad',
'.single-related-story',
],
clean: ['.ad', '.single-related-story'],
// Object of transformations to make on matched elements
// Each key is the selector, each value is the tag to
@ -27,8 +20,11 @@ export const NYMagExtractor = {
// Convert lazy-loaded noscript images to figures
noscript: ($node, $) => {
const $children = $.browser ? $($node.text()) : $node.children();
if ($children.length === 1 && $children.get(0) !== undefined &&
$children.get(0).tagName.toLowerCase() === 'img') {
if (
$children.length === 1 &&
$children.get(0) !== undefined &&
$children.get(0).tagName.toLowerCase() === 'img'
) {
return 'figure';
}
@ -38,24 +34,15 @@ export const NYMagExtractor = {
},
title: {
selectors: [
'h1.lede-feature-title',
'h1.headline-primary',
'h1',
],
selectors: ['h1.lede-feature-title', 'h1.headline-primary', 'h1'],
},
author: {
selectors: [
'.by-authors',
'.lede-feature-author',
],
selectors: ['.by-authors', '.lede-feature-author'],
},
dek: {
selectors: [
'.lede-feature-teaser',
],
selectors: ['.lede-feature-teaser'],
},
date_published: {

@ -6,10 +6,12 @@ import Mercury from 'mercury';
describe('NYMagExtractor', () => {
it('works with a feature story', async () => {
const html = fs.readFileSync('./fixtures/nymag.com/ailes.html');
const uri = 'http://nymag.com/daily/intelligencer/2016/09/how-fox-news-women-took-down-roger-ailes.html';
const uri =
'http://nymag.com/daily/intelligencer/2016/09/how-fox-news-women-took-down-roger-ailes.html';
const { dek, title, author } = await Mercury.parse(uri, html);
const actualDek = 'How Fox News women took down the most powerful, and predatory, man in media.';
const actualDek =
'How Fox News women took down the most powerful, and predatory, man in media.';
assert.equal(dek, actualDek);
assert.equal(title, 'The Revenge of Rogers Angels');

@ -1,61 +1,40 @@
/**
 * Extractor configuration for obamawhitehouse.archives.gov, also declared
 * for whitehouse.gov through `supportedDomains`.
 *
 * Each field holds a `selectors` list. An entry is either a CSS selector
 * string or a two-item array; the array form presumably pairs a selector
 * with the attribute to read from the matched element.
 * NOTE(review): confirm the array semantics against the extractor runner.
 *
 * Every key is declared exactly once so the effective value of a field is
 * never decided by "last duplicate wins".
 */
export const ObamawhitehouseArchivesGovExtractor = {
  domain: 'obamawhitehouse.archives.gov',

  supportedDomains: ['whitehouse.gov'],

  title: {
    selectors: ['h1', '.pane-node-title'],
  },

  author: {
    selectors: ['.blog-author-link', '.node-person-name-link'],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  dek: {
    selectors: ['.field-name-field-forall-summary'],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    // NOTE(review): presumably switches off the default cleaning pass for
    // this site — confirm against the content extractor.
    defaultCleaner: false,

    selectors: ['div#content-start', '.pane-node-field-forall-body'],

    // Is there anything in the content you selected that needs to be
    // transformed before it's consumable content? E.g., unusual lazy
    // loaded images
    transforms: {},

    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.pane-node-title', '.pane-custom.pane-1'],
  },
};

@ -14,10 +14,10 @@ describe('ObamawhitehouseArchivesGovExtractor', () => {
beforeAll(() => {
url =
'https://obamawhitehouse.archives.gov/blog/2017/01/17/obama-administration-digital-transition-moving-forward';
const html =
fs.readFileSync('./fixtures/obamawhitehouse.archives.gov/1485905445365.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync(
'./fixtures/obamawhitehouse.archives.gov/1485905445365.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -29,53 +29,65 @@ describe('ObamawhitehouseArchivesGovExtractor', () => {
});
it('returns the title', async () => {
// To pass this test, fill out the title selector
// in ./src/extractors/custom/obamawhitehouse.archives.gov/index.js.
// To pass this test, fill out the title selector
// in ./src/extractors/custom/obamawhitehouse.archives.gov/index.js.
const { title } = await result;
// Update these values with the expected values from
// the article.
assert.equal(title, 'The Obama Administration Digital Transition: Moving Forward');
// Update these values with the expected values from
// the article.
assert.equal(
title,
'The Obama Administration Digital Transition: Moving Forward'
);
});
it('returns the author', async () => {
// To pass this test, fill out the author selector
// in ./src/extractors/custom/obamawhitehouse.archives.gov/index.js.
// To pass this test, fill out the author selector
// in ./src/extractors/custom/obamawhitehouse.archives.gov/index.js.
const { author } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(author, 'Kori Schulman');
});
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/obamawhitehouse.archives.gov/index.js.
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/obamawhitehouse.archives.gov/index.js.
const { date_published } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2017-01-17T23:08:47.000Z');
});
it('returns the dek', async () => {
// To pass this test, fill out the dek selector
// in ./src/extractors/custom/obamawhitehouse.archives.gov/index.js.
// To pass this test, fill out the dek selector
// in ./src/extractors/custom/obamawhitehouse.archives.gov/index.js.
const { dek } = await result;
// Update these values with the expected values from
// the article.
assert.equal(dek.split(/\s/).slice(0, 4).join(' '), 'Summary: Here\'s the latest');
// Update these values with the expected values from
// the article.
assert.equal(
dek
.split(/\s/)
.slice(0, 4)
.join(' '),
"Summary: Here's the latest"
);
});
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/obamawhitehouse.archives.gov/index.js.
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/obamawhitehouse.archives.gov/index.js.
const { lead_image_url } = await result;
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'https://obamawhitehouse.archives.gov/sites/obamawhitehouse.archives.gov/files/digitaltransition.jpeg');
// Update these values with the expected values from
// the article.
assert.equal(
lead_image_url,
'https://obamawhitehouse.archives.gov/sites/obamawhitehouse.archives.gov/files/digitaltransition.jpeg'
);
});
it('returns the content', async () => {
@ -87,11 +99,19 @@ describe('ObamawhitehouseArchivesGovExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'Over the past eight years, the President, the First Lady, and the Obama');
assert.equal(
first13,
'Over the past eight years, the President, the First Lady, and the Obama'
);
});
});
@ -101,10 +121,10 @@ describe('ObamawhitehouseArchivesGovExtractor', () => {
beforeAll(() => {
url =
'https://obamawhitehouse.archives.gov/the-press-office/2015/04/11/weekly-address-tuition-free-community-college';
const html =
fs.readFileSync('./fixtures/obamawhitehouse.archives.gov/1490209983872.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync(
'./fixtures/obamawhitehouse.archives.gov/1490209983872.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('includes this youtube video', async () => {
@ -122,10 +142,10 @@ describe('ObamawhitehouseArchivesGovExtractor', () => {
beforeAll(() => {
url =
'https://obamawhitehouse.archives.gov/the-press-office/2016/12/24/weekly-address-merry-christmas-president-and-first-lady';
const html =
fs.readFileSync('./fixtures/obamawhitehouse.archives.gov/1490227791307.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync(
'./fixtures/obamawhitehouse.archives.gov/1490227791307.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('gets the words and video', async () => {
@ -133,11 +153,19 @@ describe('ObamawhitehouseArchivesGovExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'In this weeks address, the President and the First Lady wished all Americans');
assert.equal(
first13,
'In this weeks address, the President and the First Lady wished all Americans'
);
assert.equal($('iframe[src*="youtube"]').length, 1);
});
});
@ -148,10 +176,10 @@ describe('ObamawhitehouseArchivesGovExtractor', () => {
beforeAll(() => {
url =
'https://obamawhitehouse.archives.gov/blog/2011/09/10/serve-and-remember';
const html =
fs.readFileSync('./fixtures/obamawhitehouse.archives.gov/1490375478954.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync(
'./fixtures/obamawhitehouse.archives.gov/1490375478954.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('gets the content', async () => {
@ -159,9 +187,17 @@ describe('ObamawhitehouseArchivesGovExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
assert.equal(first13, 'September 11th has been designated as a National Day of Service and Remembrance.');
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
assert.equal(
first13,
'September 11th has been designated as a National Day of Service and Remembrance.'
);
});
});
});

@ -2,51 +2,35 @@ export const ObserverComExtractor = {
domain: 'observer.com',
title: {
selectors: [
'h1.entry-title',
],
selectors: ['h1.entry-title'],
},
author: {
selectors: [
'.author',
'.vcard',
],
selectors: ['.author', '.vcard'],
},
date_published: {
selectors: [
['meta[name="article:published_time"]', 'value'],
],
selectors: [['meta[name="article:published_time"]', 'value']],
},
dek: {
selectors: [
'h2.dek',
],
selectors: ['h2.dek'],
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
content: {
selectors: [
'div.entry-content',
],
selectors: ['div.entry-content'],
// Is there anything in the content you selected that needs to be transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
},
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
],
clean: [],
},
};

@ -14,10 +14,10 @@ describe('ObserverComExtractor', () => {
beforeAll(() => {
url =
'http://observer.com/2016/12/archaeologists-just-discovered-a-2500-year-old-lost-city-atop-a-greek-mountain-peak/';
const html =
fs.readFileSync('./fixtures/observer.com/1481925269939.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync(
'./fixtures/observer.com/1481925269939.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -29,53 +29,62 @@ describe('ObserverComExtractor', () => {
});
it('returns the title', async () => {
// To pass this test, fill out the title selector
// in ./src/extractors/custom/observer.com/index.js.
// To pass this test, fill out the title selector
// in ./src/extractors/custom/observer.com/index.js.
const { title } = await result;
// Update these values with the expected values from
// the article.
assert.equal(title, 'Archaeologists Just Discovered a 2,500-Year-Old Lost City Atop a Greek Mountain Peak');
// Update these values with the expected values from
// the article.
assert.equal(
title,
'Archaeologists Just Discovered a 2,500-Year-Old Lost City Atop a Greek Mountain Peak'
);
});
it('returns the author', async () => {
// To pass this test, fill out the author selector
// in ./src/extractors/custom/observer.com/index.js.
// To pass this test, fill out the author selector
// in ./src/extractors/custom/observer.com/index.js.
const { author } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(author, 'Sage Lazzaro');
});
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/observer.com/index.js.
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/observer.com/index.js.
const { date_published } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-16T17:21:02.000Z');
});
it('returns the dek', async () => {
// To pass this test, fill out the dek selector
// in ./src/extractors/custom/observer.com/index.js.
// To pass this test, fill out the dek selector
// in ./src/extractors/custom/observer.com/index.js.
const { dek } = await result;
// Update these values with the expected values from
// the article.
assert.equal(dek, '\'The fact that nobody has never explored the hill before is a mystery\'');
// Update these values with the expected values from
// the article.
assert.equal(
dek,
"'The fact that nobody has never explored the hill before is a mystery'"
);
});
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/observer.com/index.js.
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/observer.com/index.js.
const { lead_image_url } = await result;
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'http://i0.wp.com/nyoobserver.files.wordpress.com/2016/12/extra_large-1481648730-cover-image-2.jpg?quality=80&strip&ssl=1');
// Update these values with the expected values from
// the article.
assert.equal(
lead_image_url,
'http://i0.wp.com/nyoobserver.files.wordpress.com/2016/12/extra_large-1481648730-cover-image-2.jpg?quality=80&strip&ssl=1'
);
});
it('returns the content', async () => {
@ -87,11 +96,19 @@ describe('ObserverComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'The citys acropolis is barely visible during a cloudy day on the Thessalian');
assert.equal(
first13,
'The citys acropolis is barely visible during a cloudy day on the Thessalian'
);
});
});
});

@ -1,38 +1,26 @@
export const PagesixComExtractor = {
domain: 'pagesix.com',
supportedDomains: [
'nypost.com',
],
supportedDomains: ['nypost.com'],
title: {
selectors: [
'h1 a',
],
selectors: ['h1 a'],
},
author: {
selectors: [
'.byline',
],
selectors: ['.byline'],
},
date_published: {
selectors: [
['meta[name="article:published_time"]', 'value'],
],
selectors: [['meta[name="article:published_time"]', 'value']],
},
dek: {
selectors: [
['meta[name="description"]', 'value'],
],
selectors: [['meta[name="description"]', 'value']],
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
content: {
@ -51,8 +39,6 @@ export const PagesixComExtractor = {
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
'.modal-trigger',
],
clean: ['.modal-trigger'],
},
};

@ -14,10 +14,8 @@ describe('PagesixComExtractor', () => {
beforeAll(() => {
url =
'http://pagesix.com/2016/12/19/sofia-vergara-and-nick-loebs-embryo-drama-taking-a-detour/';
const html =
fs.readFileSync('./fixtures/pagesix.com/1482254007534.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync('./fixtures/pagesix.com/1482254007534.html');
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -29,43 +27,49 @@ describe('PagesixComExtractor', () => {
});
it('returns the title', async () => {
// To pass this test, fill out the title selector
// in ./src/extractors/custom/pagesix.com/index.js.
// To pass this test, fill out the title selector
// in ./src/extractors/custom/pagesix.com/index.js.
const { title } = await result;
// Update these values with the expected values from
// the article.
assert.equal(title, 'Sofía Vergara and Nick Loebs embryo drama taking a detour');
// Update these values with the expected values from
// the article.
assert.equal(
title,
'Sofía Vergara and Nick Loebs embryo drama taking a detour'
);
});
it('returns the author', async () => {
// To pass this test, fill out the author selector
// in ./src/extractors/custom/pagesix.com/index.js.
// To pass this test, fill out the author selector
// in ./src/extractors/custom/pagesix.com/index.js.
const { author } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(author, 'Oli Coleman');
});
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/pagesix.com/index.js.
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/pagesix.com/index.js.
const { date_published } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-20T00:08:44.000Z');
});
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/pagesix.com/index.js.
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/pagesix.com/index.js.
const { lead_image_url } = await result;
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'https://nyppagesix.files.wordpress.com/2016/12/sofia-vergara4.jpg?quality=90&strip=all&w=1200');
// Update these values with the expected values from
// the article.
assert.equal(
lead_image_url,
'https://nyppagesix.files.wordpress.com/2016/12/sofia-vergara4.jpg?quality=90&strip=all&w=1200'
);
});
it('returns the content', async () => {
@ -77,12 +81,20 @@ describe('PagesixComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'Nick Loeb and Sofia Vergara Shutterstock / Getty Images (Composite) In the latest');
assert.equal(
first13,
'Nick Loeb and Sofia Vergara Shutterstock / Getty Images (Composite) In the latest'
);
});
});
});

@ -2,44 +2,31 @@ export const PeopleComExtractor = {
domain: 'people.com',
title: {
selectors: [
['meta[name="og:title"]', 'value'],
],
selectors: [['meta[name="og:title"]', 'value']],
},
author: {
selectors: [
'a.author.url.fn',
],
selectors: ['a.author.url.fn'],
},
date_published: {
selectors: [
['meta[name="article:published_time"]', 'value'],
],
selectors: [['meta[name="article:published_time"]', 'value']],
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
content: {
selectors: [
'div.article-body__inner',
],
selectors: ['div.article-body__inner'],
// Is there anything in the content you selected that needs to be transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
},
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
],
clean: [],
},
};

@ -12,12 +12,9 @@ describe('PeopleComExtractor', () => {
let result;
let url;
beforeAll(() => {
url =
'http://people.com/style/jennifer-aniston-coat-tags-jimmy-kimmel/';
const html =
fs.readFileSync('./fixtures/people.com/1481580462922.html');
result =
Mercury.parse(url, html, { fallback: false });
url = 'http://people.com/style/jennifer-aniston-coat-tags-jimmy-kimmel/';
const html = fs.readFileSync('./fixtures/people.com/1481580462922.html');
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -29,43 +26,49 @@ describe('PeopleComExtractor', () => {
});
it('returns the title', async () => {
// To pass this test, fill out the title selector
// in ./src/extractors/custom/people.com/index.js.
// To pass this test, fill out the title selector
// in ./src/extractors/custom/people.com/index.js.
const { title } = await result;
// Update these values with the expected values from
// the article.
assert.equal(title, 'Jennifer Aniston on Leaving the Tags on Her Celine Coat: I Wore It Four Times and Didnt Notice!');
// Update these values with the expected values from
// the article.
assert.equal(
title,
'Jennifer Aniston on Leaving the Tags on Her Celine Coat: I Wore It Four Times and Didnt Notice!'
);
});
it('returns the author', async () => {
// To pass this test, fill out the author selector
// in ./src/extractors/custom/people.com/index.js.
// To pass this test, fill out the author selector
// in ./src/extractors/custom/people.com/index.js.
const { author } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(author, 'Emily Kirkpatrick');
});
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/people.com/index.js.
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/people.com/index.js.
const { date_published } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-09T15:40:20.000Z');
});
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/people.com/index.js.
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/people.com/index.js.
const { lead_image_url } = await result;
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'http://i2.wp.com/peopledotcom.files.wordpress.com/2016/12/jennifer-aniston5.jpg?crop=0px%2C0px%2C1500px%2C1125px&resize=660%2C495&ssl=1');
// Update these values with the expected values from
// the article.
assert.equal(
lead_image_url,
'http://i2.wp.com/peopledotcom.files.wordpress.com/2016/12/jennifer-aniston5.jpg?crop=0px%2C0px%2C1500px%2C1125px&resize=660%2C495&ssl=1'
);
});
it('returns the content', async () => {
@ -77,11 +80,19 @@ describe('PeopleComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'People who dont chronicle every item of clothing worn by the rich and');
assert.equal(
first13,
'People who dont chronicle every item of clothing worn by the rich and'
);
});
});
});

@ -2,46 +2,31 @@ export const QzComExtractor = {
domain: 'qz.com',
title: {
selectors: [
'header.item-header.content-width-responsive',
],
selectors: ['header.item-header.content-width-responsive'],
},
author: {
selectors: [
['meta[name="author"]', 'value'],
],
selectors: [['meta[name="author"]', 'value']],
},
date_published: {
selectors: [
'.timestamp',
],
selectors: ['.timestamp'],
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
content: {
selectors: [
['figure.featured-image', '.item-body'],
'.item-body',
],
selectors: [['figure.featured-image', '.item-body'], '.item-body'],
// Is there anything in the content you selected that needs to be transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
},
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
'.article-aside',
'.progressive-image-thumbnail',
],
clean: ['.article-aside', '.progressive-image-thumbnail'],
},
};

@ -14,10 +14,8 @@ describe('QzComExtractor', () => {
beforeAll(() => {
url =
'http://qz.com/863015/uber-is-rolling-out-self-driving-cars-in-san-francisco-in-open-defiance-of-california-dmv/';
const html =
fs.readFileSync('./fixtures/qz.com/1481758330660.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync('./fixtures/qz.com/1481758330660.html');
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -29,45 +27,51 @@ describe('QzComExtractor', () => {
});
it('returns the title', async () => {
// To pass this test, fill out the title selector
// in ./src/extractors/custom/qz.com/index.js.
// To pass this test, fill out the title selector
// in ./src/extractors/custom/qz.com/index.js.
const { title } = await result;
// Update these values with the expected values from
// the article.
assert.equal(title, 'Uber is launching self-driving cars in San Francisco the same way it does everything—without permission');
// Update these values with the expected values from
// the article.
assert.equal(
title,
'Uber is launching self-driving cars in San Francisco the same way it does everything—without permission'
);
});
it('returns the author', async () => {
// To pass this test, fill out the author selector
// in ./src/extractors/custom/qz.com/index.js.
// To pass this test, fill out the author selector
// in ./src/extractors/custom/qz.com/index.js.
const { author } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(author, 'Alison Griswold');
});
// qz doesn't appear to pass the date from the server,
// so the date is unfortunately null
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/qz.com/index.js.
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/qz.com/index.js.
const { date_published } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(date_published, null);
});
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/qz.com/index.js.
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/qz.com/index.js.
const { lead_image_url } = await result;
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'https://qzprod.files.wordpress.com/2016/12/uber-self-driving-volvo-sf-e1481735810897.jpg?quality=80&strip=all&w=1600');
// Update these values with the expected values from
// the article.
assert.equal(
lead_image_url,
'https://qzprod.files.wordpress.com/2016/12/uber-self-driving-volvo-sf-e1481735810897.jpg?quality=80&strip=all&w=1600'
);
});
it('returns the content', async () => {
@ -79,11 +83,19 @@ describe('QzComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'One of the Volvo XC90s Uber is debuting in San Francisco. (Uber) No');
assert.equal(
first13,
'One of the Volvo XC90s Uber is debuting in San Francisco. (Uber) No'
);
});
});
});

@ -2,24 +2,15 @@ export const ScienceflyComExtractor = {
domain: 'sciencefly.com',
title: {
selectors: [
'.entry-title',
'.cb-entry-title',
'.cb-single-title',
],
selectors: ['.entry-title', '.cb-entry-title', '.cb-single-title'],
},
author: {
selectors: [
'div.cb-author',
'div.cb-author-title',
],
selectors: ['div.cb-author', 'div.cb-author-title'],
},
date_published: {
selectors: [
['meta[name="article:published_time"]', 'value'],
],
selectors: [['meta[name="article:published_time"]', 'value']],
},
dek: {
@ -29,26 +20,19 @@ export const ScienceflyComExtractor = {
},
lead_image_url: {
selectors: [
['div.theiaPostSlider_slides img', 'src'],
],
selectors: [['div.theiaPostSlider_slides img', 'src']],
},
content: {
selectors: [
'div.theiaPostSlider_slides',
],
selectors: ['div.theiaPostSlider_slides'],
// Is there anything in the content you selected that needs to be transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
},
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
],
clean: [],
},
};

@ -14,10 +14,10 @@ describe('ScienceflyComExtractor', () => {
beforeAll(() => {
url =
'http://sciencefly.com/video-shows-false-killer-whale-snagging-tuna-bait/';
const html =
fs.readFileSync('./fixtures/sciencefly.com/1482530492413.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync(
'./fixtures/sciencefly.com/1482530492413.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -29,43 +29,46 @@ describe('ScienceflyComExtractor', () => {
});
it('returns the title', async () => {
// To pass this test, fill out the title selector
// in ./src/extractors/custom/sciencefly.com/index.js.
// To pass this test, fill out the title selector
// in ./src/extractors/custom/sciencefly.com/index.js.
const { title } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(title, 'Video shows false killer whale snagging tuna bait');
});
it('returns the author', async () => {
// To pass this test, fill out the author selector
// in ./src/extractors/custom/sciencefly.com/index.js.
// To pass this test, fill out the author selector
// in ./src/extractors/custom/sciencefly.com/index.js.
const { author } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(author, 'Paul');
});
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/sciencefly.com/index.js.
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/sciencefly.com/index.js.
const { date_published } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-23T16:44:36.000Z');
});
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/sciencefly.com/index.js.
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/sciencefly.com/index.js.
const { lead_image_url } = await result;
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'http://cdnph.upi.com/rss/i/14825077851993/Video-shows-false-killer-whale-snagging-tuna-bait_f.jpg');
// Update these values with the expected values from
// the article.
assert.equal(
lead_image_url,
'http://cdnph.upi.com/rss/i/14825077851993/Video-shows-false-killer-whale-snagging-tuna-bait_f.jpg'
);
});
it('returns the content', async () => {
@ -77,11 +80,19 @@ describe('ScienceflyComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'HONOLULU, Dec. 23 (UPI) — Researchers trying to cut down on the number');
assert.equal(
first13,
'HONOLULU, Dec. 23 (UPI) — Researchers trying to cut down on the number'
);
});
});
});

@ -2,44 +2,31 @@ export const ThefederalistpapersOrgExtractor = {
domain: 'thefederalistpapers.org',
title: {
selectors: [
'h1.entry-title',
],
selectors: ['h1.entry-title'],
},
author: {
selectors: [
'main span.entry-author-name',
],
selectors: ['main span.entry-author-name'],
},
date_published: {
selectors: [
['meta[name="article:published_time"]', 'value'],
],
selectors: [['meta[name="article:published_time"]', 'value']],
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
content: {
selectors: [
'.entry-content',
],
selectors: ['.entry-content'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
},
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
['p[style]'],
],
clean: [['p[style]']],
},
};

@ -14,10 +14,10 @@ describe('ThefederalistpapersOrgExtractor', () => {
beforeAll(() => {
url =
'http://thefederalistpapers.org/education-2/the-failure-of-public-schooling-in-one-chart';
const html =
fs.readFileSync('./fixtures/thefederalistpapers.org/1482344359572.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync(
'./fixtures/thefederalistpapers.org/1482344359572.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -29,43 +29,46 @@ describe('ThefederalistpapersOrgExtractor', () => {
});
it('returns the title', async () => {
// To pass this test, fill out the title selector
// in ./src/extractors/custom/thefederalistpapers.org/index.js.
// To pass this test, fill out the title selector
// in ./src/extractors/custom/thefederalistpapers.org/index.js.
const { title } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(title, 'The FAILURE Of Public Schooling In One Chart');
});
it('returns the author', async () => {
// To pass this test, fill out the author selector
// in ./src/extractors/custom/thefederalistpapers.org/index.js.
// To pass this test, fill out the author selector
// in ./src/extractors/custom/thefederalistpapers.org/index.js.
const { author } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(author, 'Robert Gehl');
});
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/thefederalistpapers.org/index.js.
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/thefederalistpapers.org/index.js.
const { date_published } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-21T08:31:34.000Z');
});
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/thefederalistpapers.org/index.js.
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/thefederalistpapers.org/index.js.
const { lead_image_url } = await result;
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'http://thefederalistpapers.integratedmarket.netdna-cdn.com/wp-content/uploads/2016/12/trends-in-public-schooling-1.jpg');
// Update these values with the expected values from
// the article.
assert.equal(
lead_image_url,
'http://thefederalistpapers.integratedmarket.netdna-cdn.com/wp-content/uploads/2016/12/trends-in-public-schooling-1.jpg'
);
});
it('returns the content', async () => {
@ -77,11 +80,19 @@ describe('ThefederalistpapersOrgExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'Americas public schools are failing… miserably. We pump more and more money into');
assert.equal(
first13,
'Americas public schools are failing… miserably. We pump more and more money into'
);
});
});
});

@ -2,10 +2,7 @@ export const ThoughtcatalogComExtractor = {
domain: 'thoughtcatalog.com',
title: {
selectors: [
'h1.title',
['meta[name="og:title"]', 'value'],
],
selectors: ['h1.title', ['meta[name="og:title"]', 'value']],
},
author: {
@ -16,32 +13,23 @@ export const ThoughtcatalogComExtractor = {
},
date_published: {
selectors: [
['meta[name="article:published_time"]', 'value'],
],
selectors: [['meta[name="article:published_time"]', 'value']],
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
content: {
selectors: [
'.entry.post',
],
selectors: ['.entry.post'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
},
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
'.tc_mark',
],
clean: ['.tc_mark'],
},
};

@ -14,10 +14,10 @@ describe('ThoughtcatalogComExtractor', () => {
beforeAll(() => {
url =
'http://thoughtcatalog.com/lauren-jarvis-gibson/2016/12/one-day-you-will-meet-someone-who-will-change-your-life-for-good/';
const html =
fs.readFileSync('./fixtures/thoughtcatalog.com/1482426075702.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync(
'./fixtures/thoughtcatalog.com/1482426075702.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -29,43 +29,49 @@ describe('ThoughtcatalogComExtractor', () => {
});
it('returns the title', async () => {
// To pass this test, fill out the title selector
// in ./src/extractors/custom/thoughtcatalog.com/index.js.
// To pass this test, fill out the title selector
// in ./src/extractors/custom/thoughtcatalog.com/index.js.
const { title } = await result;
// Update these values with the expected values from
// the article.
assert.equal(title, 'One Day You Will Meet Someone Who Will Change Your Life For Good');
// Update these values with the expected values from
// the article.
assert.equal(
title,
'One Day You Will Meet Someone Who Will Change Your Life For Good'
);
});
it('returns the author', async () => {
// To pass this test, fill out the author selector
// in ./src/extractors/custom/thoughtcatalog.com/index.js.
// To pass this test, fill out the author selector
// in ./src/extractors/custom/thoughtcatalog.com/index.js.
const { author } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(author, 'Lauren Jarvis-Gibson');
});
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/thoughtcatalog.com/index.js.
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/thoughtcatalog.com/index.js.
const { date_published } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-16T18:05:39.000Z');
});
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/thoughtcatalog.com/index.js.
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/thoughtcatalog.com/index.js.
const { lead_image_url } = await result;
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'https://thoughtcatalog.files.wordpress.com/2016/12/31565018766_4494e5f335_o.jpg');
// Update these values with the expected values from
// the article.
assert.equal(
lead_image_url,
'https://thoughtcatalog.files.wordpress.com/2016/12/31565018766_4494e5f335_o.jpg'
);
});
it('returns the content', async () => {
@ -77,11 +83,19 @@ describe('ThoughtcatalogComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'herzblut One day you are going to meet someone in your life, that');
assert.equal(
first13,
'herzblut One day you are going to meet someone in your life, that'
);
});
});
});

@ -19,23 +19,15 @@ export const TwitterExtractor = {
s: 'span',
},
selectors: [
'.permalink[role=main]',
],
selectors: ['.permalink[role=main]'],
defaultCleaner: false,
clean: [
'.stream-item-footer',
'button',
'.tweet-details-fixer',
],
clean: ['.stream-item-footer', 'button', '.tweet-details-fixer'],
},
author: {
selectors: [
'.tweet.permalink-tweet .username',
],
selectors: ['.tweet.permalink-tweet .username'],
},
date_published: {
@ -44,5 +36,4 @@ export const TwitterExtractor = {
// '.tweet.permalink-tweet .metadata',
],
},
};

@ -2,33 +2,23 @@ export const UproxxComExtractor = {
domain: 'uproxx.com',
title: {
selectors: [
'div.post-top h1',
],
selectors: ['div.post-top h1'],
},
author: {
selectors: [
'.post-top .authorname',
],
selectors: ['.post-top .authorname'],
},
date_published: {
selectors: [
['meta[name="article:published_time"]', 'value'],
],
selectors: [['meta[name="article:published_time"]', 'value']],
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
content: {
selectors: [
'.post-body',
],
selectors: ['.post-body'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
@ -40,8 +30,6 @@ export const UproxxComExtractor = {
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
],
clean: [],
},
};

@ -12,12 +12,9 @@ describe('UproxxComExtractor', () => {
let result;
let url;
beforeAll(() => {
url =
'http://uproxx.com/news/rudy-giuliani-not-secretary-of-state/';
const html =
fs.readFileSync('./fixtures/uproxx.com/1481324633976.html');
result =
Mercury.parse(url, html, { fallback: false });
url = 'http://uproxx.com/news/rudy-giuliani-not-secretary-of-state/';
const html = fs.readFileSync('./fixtures/uproxx.com/1481324633976.html');
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -29,43 +26,49 @@ describe('UproxxComExtractor', () => {
});
it('returns the title', async () => {
// To pass this test, fill out the title selector
// in ./src/extractors/custom/uproxx.com/index.js.
// To pass this test, fill out the title selector
// in ./src/extractors/custom/uproxx.com/index.js.
const { title } = await result;
// Update these values with the expected values from
// the article.
assert.equal(title, 'Rudy Giuliani Is No Longer In The Running To Become Donald Trumps Secretary Of State');
// Update these values with the expected values from
// the article.
assert.equal(
title,
'Rudy Giuliani Is No Longer In The Running To Become Donald Trumps Secretary Of State'
);
});
it('returns the author', async () => {
// To pass this test, fill out the author selector
// in ./src/extractors/custom/uproxx.com/index.js.
// To pass this test, fill out the author selector
// in ./src/extractors/custom/uproxx.com/index.js.
const { author } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(author, 'Andrew Husband');
});
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/uproxx.com/index.js.
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/uproxx.com/index.js.
const { date_published } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-09T22:51:26.000Z');
});
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/uproxx.com/index.js.
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/uproxx.com/index.js.
const { lead_image_url } = await result;
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'https://uproxx.files.wordpress.com/2016/12/rudy-giuliani-not-secretary-of-state.jpg?quality=90&w=650');
// Update these values with the expected values from
// the article.
assert.equal(
lead_image_url,
'https://uproxx.files.wordpress.com/2016/12/rudy-giuliani-not-secretary-of-state.jpg?quality=90&w=650'
);
});
it('returns the content', async () => {
@ -77,11 +80,19 @@ describe('UproxxComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'Getty Image Longtime Donald Trump advocate Rudy Giuliani has reportedly removed himself from');
assert.equal(
first13,
'Getty Image Longtime Donald Trump advocate Rudy Giuliani has reportedly removed himself from'
);
});
});
});

@ -1,15 +1,13 @@
export const WikipediaExtractor = {
domain: 'wikipedia.org',
content: {
selectors: [
'#mw-content-text',
],
selectors: ['#mw-content-text'],
defaultCleaner: false,
// transform top infobox to an image with caption
transforms: {
'.infobox img': ($node) => {
'.infobox img': $node => {
const $parent = $node.parents('.infobox');
// Only prepend the first image in .infobox
if ($parent.children('img').length === 0) {
@ -27,21 +25,15 @@ export const WikipediaExtractor = {
'#toc',
'.navbox',
],
},
author: 'Wikipedia Contributors',
title: {
selectors: [
'h2.title',
],
selectors: ['h2.title'],
},
date_published: {
selectors: [
'#footer-info-lastmod',
],
selectors: ['#footer-info-lastmod'],
},
};

@ -2,45 +2,32 @@ export const WwwAlComExtractor = {
domain: 'www.al.com',
title: {
selectors: [
['meta[name="title"]', 'value'],
],
selectors: [['meta[name="title"]', 'value']],
},
author: {
selectors: [
['meta[name="article_author"]', 'value'],
],
selectors: [['meta[name="article_author"]', 'value']],
},
date_published: {
selectors: [
['meta[name="article_date_original"]', 'value'],
],
selectors: [['meta[name="article_date_original"]', 'value']],
timezone: 'EST',
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
content: {
selectors: [
'.entry-content',
],
selectors: ['.entry-content'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
},
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
],
clean: [],
},
};

@ -14,10 +14,8 @@ describe('WwwAlComExtractor', () => {
beforeAll(() => {
url =
'http://www.al.com/news/birmingham/index.ssf/2016/12/two_arrested_in_multi-state_de.html#incart_river_home';
const html =
fs.readFileSync('./fixtures/www.al.com/1482445422101.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync('./fixtures/www.al.com/1482445422101.html');
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -29,43 +27,49 @@ describe('WwwAlComExtractor', () => {
});
it('returns the title', async () => {
// To pass this test, fill out the title selector
// in ./src/extractors/custom/www.al.com/index.js.
// To pass this test, fill out the title selector
// in ./src/extractors/custom/www.al.com/index.js.
const { title } = await result;
// Update these values with the expected values from
// the article.
assert.equal(title, 'Two arrested in multi-state debit card skimming scheme');
// Update these values with the expected values from
// the article.
assert.equal(
title,
'Two arrested in multi-state debit card skimming scheme'
);
});
it('returns the author', async () => {
// To pass this test, fill out the author selector
// in ./src/extractors/custom/www.al.com/index.js.
// To pass this test, fill out the author selector
// in ./src/extractors/custom/www.al.com/index.js.
const { author } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(author, 'Mike Cason | mcason@al.com');
});
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/www.al.com/index.js.
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/www.al.com/index.js.
const { date_published } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-22T19:47:00.000Z');
});
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/www.al.com/index.js.
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/www.al.com/index.js.
const { lead_image_url } = await result;
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'http://image.al.com/home/bama-media/width620/img/news_birmingham_impact/photo/21753198-standard.jpg');
// Update these values with the expected values from
// the article.
assert.equal(
lead_image_url,
'http://image.al.com/home/bama-media/width620/img/news_birmingham_impact/photo/21753198-standard.jpg'
);
});
it('returns the content', async () => {
@ -77,11 +81,19 @@ describe('WwwAlComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'Jake Frith, special agent with the Alabama Attorney General\'s office, talks about an');
assert.equal(
first13,
"Jake Frith, special agent with the Alabama Attorney General's office, talks about an"
);
});
});
});

@ -2,22 +2,15 @@ export const WwwAmericanowComExtractor = {
domain: 'www.americanow.com',
title: {
selectors: [
'.title',
['meta[name="title"]', 'value'],
],
selectors: ['.title', ['meta[name="title"]', 'value']],
},
author: {
selectors: [
'.byline',
],
selectors: ['.byline'],
},
date_published: {
selectors: [
['meta[name="publish_date"]', 'value'],
],
selectors: [['meta[name="publish_date"]', 'value']],
},
dek: {
@ -27,28 +20,19 @@ export const WwwAmericanowComExtractor = {
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
content: {
selectors: [
['.article-content', '.image', '.body'],
'.body',
],
selectors: [['.article-content', '.image', '.body'], '.body'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
},
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
'.article-video-wrapper',
'.show-for-small-only',
],
clean: ['.article-video-wrapper', '.show-for-small-only'],
},
};

@ -14,10 +14,10 @@ describe('WwwAmericanowComExtractor', () => {
beforeAll(() => {
url =
'http://www.americanow.com/story/politics/2016/12/22/trump-names-kellyanne-conway-counselor-president';
const html =
fs.readFileSync('./fixtures/www.americanow.com/1482528557836.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync(
'./fixtures/www.americanow.com/1482528557836.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -29,43 +29,46 @@ describe('WwwAmericanowComExtractor', () => {
});
it('returns the title', async () => {
// To pass this test, fill out the title selector
// in ./src/extractors/custom/www.americanow.com/index.js.
// To pass this test, fill out the title selector
// in ./src/extractors/custom/www.americanow.com/index.js.
const { title } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(title, 'Kellyanne Conway Named Counselor To Trump');
});
it('returns the author', async () => {
// To pass this test, fill out the author selector
// in ./src/extractors/custom/www.americanow.com/index.js.
// To pass this test, fill out the author selector
// in ./src/extractors/custom/www.americanow.com/index.js.
const { author } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(author, 'Ray Brown');
});
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/www.americanow.com/index.js.
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/www.americanow.com/index.js.
const { date_published } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-22T14:22:19.000Z');
});
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/www.americanow.com/index.js.
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/www.americanow.com/index.js.
const { lead_image_url } = await result;
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'http://images.americanow.com:8080/ovi/catalog/downloads/preview/rndr_670x377//2016/12/conway-1482422231.JPG/rndr_670x377.jpg');
// Update these values with the expected values from
// the article.
assert.equal(
lead_image_url,
'http://images.americanow.com:8080/ovi/catalog/downloads/preview/rndr_670x377//2016/12/conway-1482422231.JPG/rndr_670x377.jpg'
);
});
it('returns the content', async () => {
@ -77,11 +80,19 @@ describe('WwwAmericanowComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'President-elect Donald Trump has named Kellyanne Conway, the campaign manager who helped him');
assert.equal(
first13,
'President-elect Donald Trump has named Kellyanne Conway, the campaign manager who helped him'
);
});
});
});

@ -2,52 +2,35 @@ export const WwwAndroidcentralComExtractor = {
domain: 'www.androidcentral.com',
title: {
selectors: [
'h1',
'h1.main-title',
],
selectors: ['h1', 'h1.main-title'],
},
author: {
selectors: [
'.meta-by',
],
selectors: ['.meta-by'],
},
date_published: {
selectors: [
['meta[name="article:published_time"]', 'value'],
],
selectors: [['meta[name="article:published_time"]', 'value']],
},
dek: {
selectors: [
['meta[name="og:description"]', 'value'],
],
selectors: [['meta[name="og:description"]', 'value']],
},
lead_image_url: {
selectors: [
['.image-large', 'src'],
],
selectors: [['.image-large', 'src']],
},
content: {
selectors: [
'.article-body',
],
selectors: ['.article-body'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
},
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
'.intro',
'blockquote',
],
clean: ['.intro', 'blockquote'],
},
};

@ -14,10 +14,10 @@ describe('WwwAndroidcentralComExtractor', () => {
beforeAll(() => {
url =
'http://www.androidcentral.com/motorola-launch-new-phone-mobile-world-congress';
const html =
fs.readFileSync('./fixtures/www.androidcentral.com/1484345154702.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync(
'./fixtures/www.androidcentral.com/1484345154702.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -29,53 +29,59 @@ describe('WwwAndroidcentralComExtractor', () => {
});
it('returns the title', async () => {
// To pass this test, fill out the title selector
// in ./src/extractors/custom/www.androidcentral.com/index.js.
// To pass this test, fill out the title selector
// in ./src/extractors/custom/www.androidcentral.com/index.js.
const { title } = await result;
// Update these values with the expected values from
// the article.
assert.equal(title, 'Motorola is announcing a new phone at Mobile World Congress in February');
// Update these values with the expected values from
// the article.
assert.equal(
title,
'Motorola is announcing a new phone at Mobile World Congress in February'
);
});
it('returns the author', async () => {
// To pass this test, fill out the author selector
// in ./src/extractors/custom/www.androidcentral.com/index.js.
// To pass this test, fill out the author selector
// in ./src/extractors/custom/www.androidcentral.com/index.js.
const { author } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(author, 'Daniel Bader');
});
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/www.androidcentral.com/index.js.
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/www.androidcentral.com/index.js.
const { date_published } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2017-01-13T21:45:08.000Z');
});
it('returns the dek', async () => {
// To pass this test, fill out the dek selector
// in ./src/extractors/custom/www.androidcentral.com/index.js.
// To pass this test, fill out the dek selector
// in ./src/extractors/custom/www.androidcentral.com/index.js.
const { dek } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(dek, 'Motorola teases a new phone to launch at MWC.');
});
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/www.androidcentral.com/index.js.
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/www.androidcentral.com/index.js.
const { lead_image_url } = await result;
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'https://www.androidcentral.com/sites/androidcentral.com/files/styles/large/public/article_images/2017/01/moto-mwc-2017.jpg?itok=-Cv5scPP');
// Update these values with the expected values from
// the article.
assert.equal(
lead_image_url,
'https://www.androidcentral.com/sites/androidcentral.com/files/styles/large/public/article_images/2017/01/moto-mwc-2017.jpg?itok=-Cv5scPP'
);
});
it('returns the content', async () => {
@ -87,11 +93,19 @@ describe('WwwAndroidcentralComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'Motorola is set to launch a new phone in Barcelona next month prior');
assert.equal(
first13,
'Motorola is set to launch a new phone in Barcelona next month prior'
);
});
});
});

@ -2,21 +2,15 @@ export const WwwAolComExtractor = {
domain: 'www.aol.com',
title: {
selectors: [
'h1.p-article__title',
],
selectors: ['h1.p-article__title'],
},
author: {
selectors: [
['meta[name="author"]', 'value'],
],
selectors: [['meta[name="author"]', 'value']],
},
date_published: {
selectors: [
'.p-article__byline__date',
],
selectors: ['.p-article__byline__date'],
timezone: 'America/New_York',
},
@ -28,26 +22,19 @@ export const WwwAolComExtractor = {
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
content: {
selectors: [
'.article-content',
],
selectors: ['.article-content'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
},
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
],
clean: [],
},
};

@ -14,10 +14,8 @@ describe('WwwAolComExtractor', () => {
beforeAll(() => {
url =
'http://www.aol.com/article/news/2016/12/01/son-of-slain-police-officer-given-teddy-bears-made-from-dads-un/21618553/';
const html =
fs.readFileSync('./fixtures/www.aol.com/1480618816916.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync('./fixtures/www.aol.com/1480618816916.html');
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', async () => {
// This test should be passing by default.
@ -34,7 +32,10 @@ describe('WwwAolComExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(title, 'Son of slain police officer given teddy bears made from dad\'s uniform');
assert.equal(
title,
"Son of slain police officer given teddy bears made from dad's uniform"
);
});
it('returns the author', async () => {
@ -64,7 +65,10 @@ describe('WwwAolComExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'http://o.aolcdn.com/dims-shared/dims3/GLOB/crop/475x312+0+0/resize/1028x675!/format/jpg/quality/85/http%3A%2F%2Fo.aolcdn.com%2Fhss%2Fstorage%2Fmidas%2Fc8242ab14e089c284b031379d025d64%2F204656928%2FScreen%2BShot%2B2016-12-01%2Bat%2B1.15.51%2BPM.png');
assert.equal(
lead_image_url,
'http://o.aolcdn.com/dims-shared/dims3/GLOB/crop/475x312+0+0/resize/1028x675!/format/jpg/quality/85/http%3A%2F%2Fo.aolcdn.com%2Fhss%2Fstorage%2Fmidas%2Fc8242ab14e089c284b031379d025d64%2F204656928%2FScreen%2BShot%2B2016-12-01%2Bat%2B1.15.51%2BPM.png'
);
});
it('returns the content', async () => {
@ -76,11 +80,19 @@ describe('WwwAolComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'ST. LOUIS, MO (KTVI) Amid unimaginable grief, the widow of slain Saint');
assert.equal(
first13,
'ST. LOUIS, MO (KTVI) Amid unimaginable grief, the widow of slain Saint'
);
});
});
});

@ -4,21 +4,15 @@
export const ApartmentTherapyExtractor = {
domain: 'www.apartmenttherapy.com',
title: {
selectors: [
'h1.headline',
],
selectors: ['h1.headline'],
},
author: {
selectors: [
'.PostByline__name',
],
selectors: ['.PostByline__name'],
},
content: {
selectors: [
'div.post__content',
],
selectors: ['div.post__content'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
@ -34,26 +28,19 @@ export const ApartmentTherapyExtractor = {
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
],
clean: [],
},
date_published: {
selectors: [
['.PostByline__timestamp[datetime]', 'datetime'],
],
selectors: [['.PostByline__timestamp[datetime]', 'datetime']],
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
dek: {
selectors: [
],
selectors: [],
},
next_page_url: {

@ -13,12 +13,11 @@ describe('ApartmentTherapyExtractor', () => {
let result;
let url;
beforeAll(() => {
url =
'http://www.apartmenttherapy.com/a-light-filled-la-loft-236564';
const html =
fs.readFileSync('./fixtures/www.apartmenttherapy.com/1476396697639.html');
result =
Mercury.parse(url, html, { fallback: false });
url = 'http://www.apartmenttherapy.com/a-light-filled-la-loft-236564';
const html = fs.readFileSync(
'./fixtures/www.apartmenttherapy.com/1476396697639.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', async () => {
// To pass this test, rename your extractor in
@ -67,7 +66,10 @@ describe('ApartmentTherapyExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'http://atmedia.imgix.net/9332fdca908b1fcc5c9a6891b458820718239950?w=1500&fit=max');
assert.equal(
lead_image_url,
'http://atmedia.imgix.net/9332fdca908b1fcc5c9a6891b458820718239950?w=1500&fit=max'
);
});
it('returns the content', async () => {
@ -79,11 +81,19 @@ describe('ApartmentTherapyExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'Name: Ashley Location: Downtown — Los Angeles, California Welcome to our sunny and');
assert.equal(
first13,
'Name: Ashley Location: Downtown — Los Angeles, California Welcome to our sunny and'
);
});
});
});

@ -37,14 +37,11 @@ export const WwwBloombergComExtractor = {
},
dek: {
selectors: [
],
selectors: [],
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
content: {
@ -60,15 +57,11 @@ export const WwwBloombergComExtractor = {
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
},
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
'.inline-newsletter',
'.page-ad',
],
clean: ['.inline-newsletter', '.page-ad'],
},
};

@ -14,10 +14,10 @@ describe('WwwBloombergComExtractor', () => {
beforeAll(() => {
url =
'http://www.bloomberg.com/politics/articles/2016-12-07/trump-hits-emblem-of-presidential-power-with-air-force-one-tweet';
const html =
fs.readFileSync('./fixtures/www.bloomberg.com/1481135708958.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync(
'./fixtures/www.bloomberg.com/1481135708958.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -29,43 +29,49 @@ describe('WwwBloombergComExtractor', () => {
});
it('returns the title', async () => {
// To pass this test, fill out the title selector
// in ./src/extractors/custom/www.bloomberg.com/index.js.
// To pass this test, fill out the title selector
// in ./src/extractors/custom/www.bloomberg.com/index.js.
const { title } = await result;
// Update these values with the expected values from
// the article.
assert.equal(title, 'Air Force One Costs Billions of Dollars Because Its a Flying White House');
// Update these values with the expected values from
// the article.
assert.equal(
title,
'Air Force One Costs Billions of Dollars Because Its a Flying White House'
);
});
it('returns the author', async () => {
// To pass this test, fill out the author selector
// in ./src/extractors/custom/www.bloomberg.com/index.js.
// To pass this test, fill out the author selector
// in ./src/extractors/custom/www.bloomberg.com/index.js.
const { author } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(author, 'Margaret Talev');
});
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/www.bloomberg.com/index.js.
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/www.bloomberg.com/index.js.
const { date_published } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-07T10:00:00.011Z');
});
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/www.bloomberg.com/index.js.
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/www.bloomberg.com/index.js.
const { lead_image_url } = await result;
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'https://assets.bwbx.io/images/users/iqjWHBFdfxIU/ioUAfA1V2nzk/v0/-1x-1.jpg');
// Update these values with the expected values from
// the article.
assert.equal(
lead_image_url,
'https://assets.bwbx.io/images/users/iqjWHBFdfxIU/ioUAfA1V2nzk/v0/-1x-1.jpg'
);
});
it('returns the content', async () => {
@ -77,11 +83,19 @@ describe('WwwBloombergComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'Donald Trump took aim at one of the most visible emblems of the');
assert.equal(
first13,
'Donald Trump took aim at one of the most visible emblems of the'
);
});
});
@ -89,12 +103,11 @@ describe('WwwBloombergComExtractor', () => {
let result;
let url;
beforeAll(() => {
url =
'https://www.bloomberg.com/graphics/2016-apple-profits/';
const html =
fs.readFileSync('./fixtures/www.bloomberg.com/1481136509532.html');
result =
Mercury.parse(url, html, { fallback: false });
url = 'https://www.bloomberg.com/graphics/2016-apple-profits/';
const html = fs.readFileSync(
'./fixtures/www.bloomberg.com/1481136509532.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -106,43 +119,49 @@ describe('WwwBloombergComExtractor', () => {
});
it('returns the title', async () => {
// To pass this test, fill out the title selector
// in ./src/extractors/custom/www.bloomberg.com/index.js.
// To pass this test, fill out the title selector
// in ./src/extractors/custom/www.bloomberg.com/index.js.
const { title } = await result;
// Update these values with the expected values from
// the article.
assert.equal(title, 'Americans Are Paying Apple Millions to Shelter Overseas Profits');
// Update these values with the expected values from
// the article.
assert.equal(
title,
'Americans Are Paying Apple Millions to Shelter Overseas Profits'
);
});
it('returns the author', async () => {
// To pass this test, fill out the author selector
// in ./src/extractors/custom/www.bloomberg.com/index.js.
// To pass this test, fill out the author selector
// in ./src/extractors/custom/www.bloomberg.com/index.js.
const { author } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(author, 'Andrea Wong');
});
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/www.bloomberg.com/index.js.
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/www.bloomberg.com/index.js.
const { date_published } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-07T10:00:00.000Z');
});
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/www.bloomberg.com/index.js.
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/www.bloomberg.com/index.js.
const { lead_image_url } = await result;
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'https://www.bloomberg.com/graphics/2016-apple-profits/img/2016-apple-profits-facebook.png');
// Update these values with the expected values from
// the article.
assert.equal(
lead_image_url,
'https://www.bloomberg.com/graphics/2016-apple-profits/img/2016-apple-profits-facebook.png'
);
});
it('returns the content', async () => {
@ -154,11 +173,19 @@ describe('WwwBloombergComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'Over the years, Apple Inc. has become the poster child for U.S. multinationals');
assert.equal(
first13,
'Over the years, Apple Inc. has become the poster child for U.S. multinationals'
);
});
});
@ -168,10 +195,10 @@ describe('WwwBloombergComExtractor', () => {
beforeAll(() => {
url =
'https://www.bloomberg.com/news/articles/2016-12-06/stock-rally-extends-into-asia-as-traders-await-rbi-aussie-gdp';
const html =
fs.readFileSync('./fixtures/www.bloomberg.com/1481138014494.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync(
'./fixtures/www.bloomberg.com/1481138014494.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -183,43 +210,49 @@ describe('WwwBloombergComExtractor', () => {
});
it('returns the title', async () => {
// To pass this test, fill out the title selector
// in ./src/extractors/custom/www.bloomberg.com/index.js.
// To pass this test, fill out the title selector
// in ./src/extractors/custom/www.bloomberg.com/index.js.
const { title } = await result;
// Update these values with the expected values from
// the article.
assert.equal(title, 'U.S. Stocks Rise to Records, Bonds Gain on ECB Stimulus Optimism');
// Update these values with the expected values from
// the article.
assert.equal(
title,
'U.S. Stocks Rise to Records, Bonds Gain on ECB Stimulus Optimism'
);
});
it('returns the author', async () => {
// To pass this test, fill out the author selector
// in ./src/extractors/custom/www.bloomberg.com/index.js.
// To pass this test, fill out the author selector
// in ./src/extractors/custom/www.bloomberg.com/index.js.
const { author } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(author, 'Jeremy Herron');
});
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/www.bloomberg.com/index.js.
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/www.bloomberg.com/index.js.
const { date_published } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-06T23:22:22.402Z');
});
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/www.bloomberg.com/index.js.
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/www.bloomberg.com/index.js.
const { lead_image_url } = await result;
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'https://assets.bwbx.io/javelin/public/images/social-markets-3d32d2f713.jpg');
// Update these values with the expected values from
// the article.
assert.equal(
lead_image_url,
'https://assets.bwbx.io/javelin/public/images/social-markets-3d32d2f713.jpg'
);
});
it('returns the content', async () => {
@ -231,11 +264,19 @@ describe('WwwBloombergComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'The Dow Jones Industrial Average rose 220 points as U.S. stock indexes powered');
assert.equal(
first13,
'The Dow Jones Industrial Average rose 220 points as U.S. stock indexes powered'
);
});
});
});

@ -4,50 +4,36 @@
export const BroadwayWorldExtractor = {
domain: 'www.broadwayworld.com',
title: {
selectors: [
'h1.article-title',
],
selectors: ['h1.article-title'],
},
author: {
selectors: [
'span[itemprop=author]',
],
selectors: ['span[itemprop=author]'],
},
content: {
selectors: [
'div[itemprop=articlebody]',
],
selectors: ['div[itemprop=articlebody]'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
},
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
],
clean: [],
},
date_published: {
selectors: [
['meta[itemprop=datePublished]', 'value'],
],
selectors: [['meta[itemprop=datePublished]', 'value']],
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
dek: {
selectors: [
],
selectors: [],
},
next_page_url: {

@ -15,10 +15,10 @@ describe('CustomExtractor', () => {
beforeAll(() => {
url =
'http://www.broadwayworld.com/article/American-Theatre-Wing-Launches-Andrew-Lloyd-Webber-Training-Scholarships-20161013';
const html =
fs.readFileSync('./fixtures/www.broadwayworld.com/1476392567143.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync(
'./fixtures/www.broadwayworld.com/1476392567143.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', async () => {
// To pass this test, rename your extractor in
@ -37,7 +37,10 @@ describe('CustomExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(title, 'American Theatre Wing Launches Andrew Lloyd Webber Training Scholarships');
assert.equal(
title,
'American Theatre Wing Launches Andrew Lloyd Webber Training Scholarships'
);
});
it('returns the author', async () => {
@ -67,7 +70,10 @@ describe('CustomExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'https://images.bwwstatic.com/columnpic7/7B5FD766-A644-E386-19DE07017A3AD79C.jpg');
assert.equal(
lead_image_url,
'https://images.bwwstatic.com/columnpic7/7B5FD766-A644-E386-19DE07017A3AD79C.jpg'
);
});
it('returns the content', async () => {
@ -79,11 +85,19 @@ describe('CustomExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'The American Theatre Wing announced today that their Andrew Lloyd Webber Initiative has');
assert.equal(
first13,
'The American Theatre Wing announced today that their Andrew Lloyd Webber Initiative has'
);
});
});
});

@ -2,44 +2,31 @@ export const WwwBustleComExtractor = {
domain: 'www.bustle.com',
title: {
selectors: [
'h1.post-page__title',
],
selectors: ['h1.post-page__title'],
},
author: {
selectors: [
'div.content-meta__author',
],
selectors: ['div.content-meta__author'],
},
date_published: {
selectors: [
['time.content-meta__published-date[datetime]', 'datetime'],
],
selectors: [['time.content-meta__published-date[datetime]', 'datetime']],
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
content: {
selectors: [
'.post-page__body',
],
selectors: ['.post-page__body'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
},
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
],
clean: [],
},
};

@ -14,10 +14,10 @@ describe('WwwBustleComExtractor', () => {
beforeAll(() => {
url =
'https://www.bustle.com/articles/194709-13-ways-to-compliment-women-in-the-most-empowering-transformative-way-possible';
const html =
fs.readFileSync('./fixtures/www.bustle.com/1481129185239.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync(
'./fixtures/www.bustle.com/1481129185239.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -35,7 +35,10 @@ describe('WwwBustleComExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(title, '13 Ways To Compliment Women In The Most Empowering, Transformative Way Possible');
assert.equal(
title,
'13 Ways To Compliment Women In The Most Empowering, Transformative Way Possible'
);
});
it('returns the author', async () => {
@ -65,7 +68,10 @@ describe('WwwBustleComExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'https://typeset-beta.imgix.net/rehost/2016/12/2/2fa248d4-0035-403f-a18d-3aeca6929b98.jpg?w=1200&h=630&fit=crop&crop=faces&auto=format&q=70');
assert.equal(
lead_image_url,
'https://typeset-beta.imgix.net/rehost/2016/12/2/2fa248d4-0035-403f-a18d-3aeca6929b98.jpg?w=1200&h=630&fit=crop&crop=faces&auto=format&q=70'
);
});
it('returns the content', async () => {
@ -77,11 +83,19 @@ describe('WwwBustleComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'When I log into my Facebook these days, I\'m pretty much prepared for');
assert.equal(
first13,
"When I log into my Facebook these days, I'm pretty much prepared for"
);
});
});
});

@ -12,7 +12,8 @@ export const BuzzfeedExtractor = {
author: {
selectors: [
'a[data-action="user/username"]', 'byline__author',
'a[data-action="user/username"]',
'byline__author',
// enter author selectors
],
},
@ -30,7 +31,7 @@ export const BuzzfeedExtractor = {
transforms: {
h2: 'b',
'div.longform_custom_header_media': ($node) => {
'div.longform_custom_header_media': $node => {
if ($node.has('img') && $node.has('.longform_header_image_source')) {
return 'figure';
}
@ -54,20 +55,15 @@ export const BuzzfeedExtractor = {
},
date_published: {
selectors: [
'.buzz-datetime',
],
selectors: ['.buzz-datetime'],
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
dek: {
selectors: [
],
selectors: [],
},
next_page_url: null,

@ -15,10 +15,10 @@ describe('BuzzfeedExtractor', () => {
beforeAll(() => {
url =
'https://www.buzzfeed.com/ikrd/people-are-calling-out-this-edited-picture-of-demi-lovato-fo';
const html =
fs.readFileSync('./fixtures/www.buzzfeed.com/1475531975121.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync(
'./fixtures/www.buzzfeed.com/1475531975121.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', async () => {
@ -38,7 +38,10 @@ describe('BuzzfeedExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(title, 'People Are Calling Out This Edited Picture Of Demi Lovato For Body-Shaming Her');
assert.equal(
title,
'People Are Calling Out This Edited Picture Of Demi Lovato For Body-Shaming Her'
);
});
it('returns the author', async () => {
@ -58,7 +61,10 @@ describe('BuzzfeedExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'https://img.buzzfeed.com/buzzfeed-static/static/2016-10/3/12/social_promotion/buzzfeed-prod-fastlane01/facebook-social-promotion-17757-1475512210-1.jpg');
assert.equal(
lead_image_url,
'https://img.buzzfeed.com/buzzfeed-static/static/2016-10/3/12/social_promotion/buzzfeed-prod-fastlane01/facebook-social-promotion-17757-1475512210-1.jpg'
);
});
it('returns the content', async () => {
@ -70,11 +76,19 @@ describe('BuzzfeedExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'A few months ago, Vladimir Serbanescu, a 17-year-old artist from Romania, drew this');
assert.equal(
first13,
'A few months ago, Vladimir Serbanescu, a 17-year-old artist from Romania, drew this'
);
});
});
@ -84,10 +98,10 @@ describe('BuzzfeedExtractor', () => {
beforeAll(() => {
url =
'https://www.buzzfeed.com/katiejmbaker/college-trump-supporters-the-new-counterculture?utm_term=.ckb72b58Y#.oxY8ZOWY3';
const html =
fs.readFileSync('./fixtures/www.buzzfeed.com/1480717502688.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync(
'./fixtures/www.buzzfeed.com/1480717502688.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('returns big header images in the content', async () => {
@ -95,9 +109,14 @@ describe('BuzzfeedExtractor', () => {
const $ = cheerio.load(content || '');
const imgSrc = $('img').first().attr('src');
const imgSrc = $('img')
.first()
.attr('src');
assert.equal(imgSrc, 'https://img.buzzfeed.com/buzzfeed-static/static/2016-11/21/10/enhanced/buzzfeed-prod-fastlane03/longform-original-25748-1479741827-5.jpg');
assert.equal(
imgSrc,
'https://img.buzzfeed.com/buzzfeed-static/static/2016-11/21/10/enhanced/buzzfeed-prod-fastlane03/longform-original-25748-1479741827-5.jpg'
);
});
it('transforms the splash image to a figure and caption', async () => {
@ -105,12 +124,19 @@ describe('BuzzfeedExtractor', () => {
const $ = cheerio.load(content || '');
const imgSrc = $('figure img').first().attr('src');
const figcaption = $('figure figcaption').first().text();
const imgSrc = $('figure img')
.first()
.attr('src');
const figcaption = $('figure figcaption')
.first()
.text();
// Update these values with the expected values from
// the article.
assert.equal(imgSrc, 'https://img.buzzfeed.com/buzzfeed-static/static/2016-11/21/10/enhanced/buzzfeed-prod-fastlane03/longform-original-25748-1479741827-5.jpg');
assert.equal(
imgSrc,
'https://img.buzzfeed.com/buzzfeed-static/static/2016-11/21/10/enhanced/buzzfeed-prod-fastlane03/longform-original-25748-1479741827-5.jpg'
);
assert.equal(figcaption, 'Adam Maida for BuzzFeed News');
});
});

@ -2,51 +2,36 @@ export const WwwCbssportsComExtractor = {
domain: 'www.cbssports.com',
title: {
selectors: [
'.article-headline',
],
selectors: ['.article-headline'],
},
author: {
selectors: [
'.author-name',
],
selectors: ['.author-name'],
},
date_published: {
selectors: [
['.date-original-reading-time time', 'datetime'],
],
selectors: [['.date-original-reading-time time', 'datetime']],
timezone: 'UTC',
},
dek: {
selectors: [
'.article-subline',
],
selectors: ['.article-subline'],
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
content: {
selectors: [
'.article',
],
selectors: ['.article'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
},
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
],
clean: [],
},
};

@ -14,10 +14,10 @@ describe('WwwCbssportsComExtractor', () => {
beforeAll(() => {
url =
'http://www.cbssports.com/mlb/news/why-despite-the-complaints-of-many-mlb-players-are-actually-not-overpaid/';
const html =
fs.readFileSync('./fixtures/www.cbssports.com/1482254907948.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync(
'./fixtures/www.cbssports.com/1482254907948.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -29,53 +29,62 @@ describe('WwwCbssportsComExtractor', () => {
});
it('returns the title', async () => {
// To pass this test, fill out the title selector
// in ./src/extractors/custom/www.cbssports.com/index.js.
// To pass this test, fill out the title selector
// in ./src/extractors/custom/www.cbssports.com/index.js.
const { title } = await result;
// Update these values with the expected values from
// the article.
assert.equal(title, 'Why, despite the complaints of many, MLB players are actually not overpaid');
// Update these values with the expected values from
// the article.
assert.equal(
title,
'Why, despite the complaints of many, MLB players are actually not overpaid'
);
});
it('returns the author', async () => {
// To pass this test, fill out the author selector
// in ./src/extractors/custom/www.cbssports.com/index.js.
// To pass this test, fill out the author selector
// in ./src/extractors/custom/www.cbssports.com/index.js.
const { author } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(author, 'Matt Snyder');
});
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/www.cbssports.com/index.js.
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/www.cbssports.com/index.js.
const { date_published } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-19T19:19:00.000Z');
});
it('returns the dek', async () => {
// To pass this test, fill out the dek selector
// in ./src/extractors/custom/www.cbssports.com/index.js.
// To pass this test, fill out the dek selector
// in ./src/extractors/custom/www.cbssports.com/index.js.
const { dek } = await result;
// Update these values with the expected values from
// the article.
assert.equal(dek, 'Fan backlash against league-wide salaries is wholly misguided');
// Update these values with the expected values from
// the article.
assert.equal(
dek,
'Fan backlash against league-wide salaries is wholly misguided'
);
});
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/www.cbssports.com/index.js.
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/www.cbssports.com/index.js.
const { lead_image_url } = await result;
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'http://sportshub.cbsistatic.com/i/r/2016/12/19/4afa0e8e-b3b8-4c44-aca2-688cd11e9b39/thumbnail/770x433/f8a6d661a00ba87cb98847bdef9dfbad/yoenis-cespedes-121916.jpg');
// Update these values with the expected values from
// the article.
assert.equal(
lead_image_url,
'http://sportshub.cbsistatic.com/i/r/2016/12/19/4afa0e8e-b3b8-4c44-aca2-688cd11e9b39/thumbnail/770x433/f8a6d661a00ba87cb98847bdef9dfbad/yoenis-cespedes-121916.jpg'
);
});
it('returns the content', async () => {
@ -87,11 +96,19 @@ describe('WwwCbssportsComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'Once the World Series ends, we know we\'re going to be treated to');
assert.equal(
first13,
"Once the World Series ends, we know we're going to be treated to"
);
});
});
});

@ -2,44 +2,31 @@ export const WwwChicagotribuneComExtractor = {
domain: 'www.chicagotribune.com',
title: {
selectors: [
'h1.trb_ar_hl_t',
],
selectors: ['h1.trb_ar_hl_t'],
},
author: {
selectors: [
'span.trb_ar_by_nm_au',
],
selectors: ['span.trb_ar_by_nm_au'],
},
date_published: {
selectors: [
['meta[itemprop="datePublished"]', 'value'],
],
selectors: [['meta[itemprop="datePublished"]', 'value']],
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
content: {
selectors: [
'div.trb_ar_page',
],
selectors: ['div.trb_ar_page'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
},
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
],
clean: [],
},
};

@ -14,10 +14,10 @@ describe('WwwChicagotribuneComExtractor', () => {
beforeAll(() => {
url =
'http://www.chicagotribune.com/news/nationworld/politics/ct-trump-energy-department-climate-change-request-20161213-story.html';
const html =
fs.readFileSync('./fixtures/www.chicagotribune.com/1481669367099.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync(
'./fixtures/www.chicagotribune.com/1481669367099.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -29,43 +29,49 @@ describe('WwwChicagotribuneComExtractor', () => {
});
it('returns the title', async () => {
// To pass this test, fill out the title selector
// in ./src/extractors/custom/www.chicagotribune.com/index.js.
// To pass this test, fill out the title selector
// in ./src/extractors/custom/www.chicagotribune.com/index.js.
const { title } = await result;
// Update these values with the expected values from
// the article.
assert.equal(title, 'Energy Department rejects Trump\'s request to name climate change workers, who remain worried');
// Update these values with the expected values from
// the article.
assert.equal(
title,
"Energy Department rejects Trump's request to name climate change workers, who remain worried"
);
});
it('returns the author', async () => {
// To pass this test, fill out the author selector
// in ./src/extractors/custom/www.chicagotribune.com/index.js.
// To pass this test, fill out the author selector
// in ./src/extractors/custom/www.chicagotribune.com/index.js.
const { author } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(author, 'Joe Davidson');
});
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/www.chicagotribune.com/index.js.
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/www.chicagotribune.com/index.js.
const { date_published } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-13T21:45:00.000Z');
});
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/www.chicagotribune.com/index.js.
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/www.chicagotribune.com/index.js.
const { lead_image_url } = await result;
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'http://www.trbimg.com/img-58506bcc/turbine/ct-trump-energy-department-climate-change-request-20161213');
// Update these values with the expected values from
// the article.
assert.equal(
lead_image_url,
'http://www.trbimg.com/img-58506bcc/turbine/ct-trump-energy-department-climate-change-request-20161213'
);
});
it('returns the content', async () => {
@ -77,11 +83,19 @@ describe('WwwChicagotribuneComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'Global warming - "it\'s a hoax."Donald Trump has said that more than once.');
assert.equal(
first13,
'Global warming - "it\'s a hoax."Donald Trump has said that more than once.'
);
});
});
});

@ -2,46 +2,33 @@ export const WwwCinemablendComExtractor = {
domain: 'www.cinemablend.com',
title: {
selectors: [
'.story_title',
],
selectors: ['.story_title'],
},
author: {
selectors: [
'.author',
],
selectors: ['.author'],
},
date_published: {
selectors: [
['meta[name="article:published_time"]', 'value'],
],
selectors: [['meta[name="article:published_time"]', 'value']],
timezone: 'EST',
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
content: {
selectors: [
'div#wrap_left_content',
],
selectors: ['div#wrap_left_content'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
},
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
],
clean: [],
},
};

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save