chore: Inline test fixtures (#683)

Not to be confused with extractor fixtures, which are snapshots of a webpage.

This change removes the pattern of keeping test "fixtures" in separate JS files. Those fixtures were simply the input strings and expected-output strings used by the tests. They were inconsistent and disorganized, and generally just served to add indirection to test files. So now all those strings are defined where they are used, in their respective tests.
feat-netease-extractor
John Holdun 2 years ago committed by GitHub
parent 0d2bad544c
commit 112846f74f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1,15 +0,0 @@
// HTML snippets (plus one expected plain-text title) keyed by scenario name.
const plainTitle = 'This Is the Real Title';
const headingMarkup = `<h1>${plainTitle}</h1>`;

const HTML = {
  // A document containing a single <h1>.
  docWithH1: `<div>${headingMarkup}</div>`,
  // Two identical <h1> elements, one tag per line; the string starts and
  // ends with a newline.
  docWith2H1s: ['', '<div>', headingMarkup, headingMarkup, '</div>', ''].join(
    '\n'
  ),
  // An <h1> with inline markup, paired with the same title as plain text.
  docWithTagsInH1: {
    before: '<div><h1>This Is the <em>Real</em> Title</h1></div>',
    after: plainTitle,
  },
};
export default HTML;

@ -9,8 +9,7 @@ describe('clean(leadImageUrl)', () => {
});
it('returns null if the url is not valid', () => {
const url = 'this is not a valid url';
assert.equal(clean(url), null);
assert.equal(clean('this is not a valid url'), null);
});
it('trims whitespace', () => {

@ -1,27 +1,35 @@
import assert from 'assert';
import cheerio from 'cheerio';
import HTML from './fixtures/html';
import { cleanTitle } from './index';
describe('cleanTitle(title, { url, $ })', () => {
it('only uses h1 if there is only one on the page', () => {
const title = 'Too Short';
const $ = cheerio.load(HTML.docWith2H1s);
const $ = cheerio.load(`
<div>
<h1>This Is the Real Title</h1>
<h1>This Is the Real Title</h1>
</div>
`);
assert.equal(cleanTitle(title, { url: '', $ }), title);
});
it('removes HTML tags from titles', () => {
const $ = cheerio.load(HTML.docWithTagsInH1.before);
const $ = cheerio.load(
'<div><h1>This Is the <em>Real</em> Title</h1></div>'
);
const title = $('h1').html();
assert.equal(cleanTitle(title, { url: '', $ }), HTML.docWithTagsInH1.after);
assert.equal(cleanTitle(title, { url: '', $ }), 'This Is the Real Title');
});
it('trims extraneous spaces', () => {
const title = " This Is a Great Title That You'll Love ";
const $ = cheerio.load(HTML.docWithTagsInH1.before);
const $ = cheerio.load(
'<div><h1>This Is the <em>Real</em> Title</h1></div>'
);
assert.equal(cleanTitle(title, { url: '', $ }), title.trim());
});

@ -5,17 +5,15 @@ import detectByHtml from './detect-by-html';
describe('detectByHtml', () => {
it('detects a medium post from the html', () => {
const html = '<head><meta name="al:ios:app_name" value="Medium" /></head>';
const $ = cheerio.load(html);
const $ = cheerio.load(
'<head><meta name="al:ios:app_name" value="Medium" /></head>'
);
assert.equal(detectByHtml($).domain, 'medium.com');
});
it('returns nothing if no match is found', () => {
const html = '<div></div>';
const $ = cheerio.load(html);
const $ = cheerio.load('<div></div>');
assert.equal(detectByHtml($), null);
});

@ -1,39 +1,54 @@
import assert from 'assert';
import cheerio from 'cheerio';
import HTML from './fixtures/html';
import GenericAuthorExtractor from './extractor';
describe('GenericAuthorExtractor', () => {
describe('extract($, cachedMeta)', () => {
it('extracts author from meta tags', () => {
const $ = cheerio.load(HTML.authorMeta.test);
const $ = cheerio.load(`
<html>
<meta name="dc.author" value="Adam" />
</html>
`);
const result = GenericAuthorExtractor.extract({
$,
metaCache: ['dc.author', 'something-else'],
});
assert.equal(result, HTML.authorMeta.result);
assert.equal(result, 'Adam');
});
it('extracts author from author selectors', () => {
const $ = cheerio.load(HTML.authorSelectors.test);
const $ = cheerio.load(`
<div>
<div class="byline">
<a href="/author/adam">Adam</a>
</div>
</div>
`);
const result = GenericAuthorExtractor.extract({
$,
metaCache: ['dc.author', 'something-else'],
});
assert.equal(result, HTML.authorSelectors.result);
assert.equal(result, 'Adam');
});
it('extracts author with regex selectors', () => {
const $ = cheerio.load(HTML.authorRegSelectors.test);
const $ = cheerio.load(`
<div>
<div class="byline">
<span>By Adam</span>
</div>
</div>
`);
const result = GenericAuthorExtractor.extract({
$,
metaCache: ['dc.author', 'something-else'],
});
assert.equal(result, HTML.authorRegSelectors.result);
assert.equal(result, 'Adam');
});
it('returns null if no author found', () => {

@ -1,32 +0,0 @@
// Author fixtures: each entry pairs an HTML snippet (`test`) with the author
// string stored alongside it (`result`).
const expectedAuthor = 'Adam';

// Joins lines with newlines and adds a leading and trailing newline.
const asDocument = lines => ['', ...lines, ''].join('\n');

// Wraps `inner` in the <div> / <div class="byline"> nesting shared by the
// two byline fixtures.
const bylineDocument = inner =>
  asDocument(['<div>', '<div class="byline">', inner, '</div>', '</div>']);

const HTML = {
  // Author carried by a <meta name="dc.author"> tag.
  authorMeta: {
    test: asDocument([
      '<html>',
      `<meta name="dc.author" value="${expectedAuthor}" />`,
      '</html>',
    ]),
    result: expectedAuthor,
  },
  // Author as the text of a link inside a byline container.
  authorSelectors: {
    test: bylineDocument(`<a href="/author/adam">${expectedAuthor}</a>`),
    result: expectedAuthor,
  },
  // Author prefixed with "By " inside a byline container.
  authorRegSelectors: {
    test: bylineDocument(`<span>By ${expectedAuthor}</span>`),
    result: expectedAuthor,
  },
};
export default HTML;

@ -7,14 +7,12 @@ const fs = require('fs');
describe('extractBestNode($, flags)', () => {
it('scores the dom nodes and returns the best option', () => {
const html = fs.readFileSync('./fixtures/latimes.html', 'utf-8');
const opts = {
stripUnlikelyCandidates: true,
weightNodes: true,
};
const $ = cheerio.load(html);
const bestNode = extractBestNode($, opts);
const bestNode = extractBestNode($, {
stripUnlikelyCandidates: true,
weightNodes: true,
});
assert(typeof bestNode, 'object');
});

@ -7,17 +7,15 @@ describe('Scoring utils', () => {
describe('addScore(node, $, amount)', () => {
it("adds the specified amount to a node's score", () => {
const $ = cheerio.load('<p score="25">Foo</p>');
let $node = $('p').first();
$node = addScore($node, $, 25);
const $node = $('p').first();
addScore($node, $, 25);
assert.equal(getScore($node), 50);
});
it('adds score if score not yet set (assumes score is 0)', () => {
const $ = cheerio.load('<p>Foo</p>');
let $node = $('p').first();
$node = addScore($node, $, 25);
const $node = $('p').first();
addScore($node, $, 25);
assert.equal(getScore($node), 25);
});
});

@ -6,11 +6,8 @@ import { addToParent, getScore } from './index';
describe('Scoring utils', () => {
describe('addToParent(node, $, amount)', () => {
it("adds 1/4 of a node's score it its parent", () => {
const html = '<div score="25"><p score="40">Foo</p></div>';
const $ = cheerio.load(html);
let $node = $('p').first();
$node = addToParent($node, $, 40);
const $ = cheerio.load('<div score="25"><p score="40">Foo</p></div>');
const $node = addToParent($('p').first(), $, 40);
assert.equal(getScore($node.parent()), 35);
assert.equal(getScore($node), 40);

@ -1,15 +1,17 @@
import assert from 'assert';
import cheerio from 'cheerio';
import HTML from './fixtures/html';
import { getScore, findTopCandidate, scoreContent } from './index';
const fs = require('fs');
describe('findTopCandidate($)', () => {
it('finds the top candidate from simple case', () => {
const $ = cheerio.load(HTML.findDom1);
const $ = cheerio.load(`
<div score="100">
<p score="1">Lorem ipsum etc</p>
</div>
`);
const $$topCandidate = findTopCandidate($);
@ -17,17 +19,27 @@ describe('findTopCandidate($)', () => {
});
it('finds the top candidate from a nested case', () => {
const $ = cheerio.load(HTML.findDom2);
const $ = cheerio.load(`
<div score="10">
<article score="50">
<p score="1">Lorem ipsum etc</p>
</article>
</div>
`);
const $$topCandidate = findTopCandidate($);
// this is wrapped in a div so checking
// the score of the first child
// this is wrapped in a div so checking the score of the first child
assert.equal(getScore($$topCandidate.first()), 50);
});
it('ignores tags like BR', () => {
const $ = cheerio.load(HTML.findDom3);
const $ = cheerio.load(`
<article score="50">
<p score="1">Lorem ipsum br</p>
<br score="1000" />
</article>
`);
const $topCandidate = findTopCandidate($);
@ -35,13 +47,19 @@ describe('findTopCandidate($)', () => {
});
it('returns BODY if no candidates found', () => {
const $ = cheerio.load(HTML.topBody);
const $ = cheerio.load(`
<body>
<article>
<p>Lorem ipsum etc</p>
<br />
</article>
<body>
`);
const $topCandidate = findTopCandidate($);
// browser won't allow body tag to be placed
// arbitrarily/loaded on the page, so we transform
// it in cheerio-query, so this test would fail.
// browser won't allow body tag to be placed arbitrarily/loaded on the page,
// so we transform it in cheerio-query, so this test would fail.
if (!$.browser) {
assert.equal($topCandidate.get(0).tagName, 'body');
}

@ -1,664 +0,0 @@
// Shared HTML fixture strings, exported as a single object.
//
// Plain string entries are standalone markup snippets. Object entries carry a
// `before` snippet and, in most cases, an `after` snippet -- presumably the
// input and the markup expected once the function named in the section comment
// has run (the consumers are not visible from this file; confirm against the
// tests that import it).
//
// Whitespace inside every template literal is part of the fixture value, so
// comments must stay outside the backticks.
const HTML = {
// getWeight fixtures: the same paragraph wrapped in a <div> whose id/class
// attributes vary per entry.
positiveId: `
<div id="entry">
<p>Ooo good one</p>
</div>
`,
negativeId: `
<div id="adbox">
<p>Ooo good one</p>
</div>
`,
positiveClass: `
<div class="entry">
<p>Ooo good one</p>
</div>
`,
// NOTE(review): despite the name, this markup sets `id`, not `class` --
// confirm whether that is intentional.
negativeClass: `
<div id="comment ad">
<p>Ooo good one</p>
</div>
`,
positiveIdAndClass: `
<div id="article" class="entry">
<p>Ooo good one</p>
</div>
`,
positiveIdNegClass: `
<div id="article" class="adbox">
<p>Ooo good one</p>
</div>
`,
positivePhotoClass: `
<div class="figure">
<p>Ooo good one</p>
</div>
`,
positiveIdAndPhoto: `
<div id="article" class="figure">
<p>Ooo good one</p>
</div>
`,
entryContentAsset: `
<div id="foo" class="entry-content-asset">
<p>Ooo good one</p>
</div>
`,
// stripUnlikelyCandidates
noMatches: `
<div id="foo">
<p>Ooo good one</p>
</div>
`,
// `after` drops the class="header" div and keeps the class="article" div.
whitelistMatch: {
before: `
<div class="header">Stuff</div>
<div class="article">
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article">
<p>Ooo good one</p>
</div>
`,
},
// `before` and `after` are identical: the element carries both "article" and
// "adbox" classes and is kept.
whiteAndBlack: {
before: `
<div class="article adbox">
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<p>Ooo good one</p>
</div>
`,
},
// `after` drops the whole class="adbox" subtree, including the nested
// class="article" div.
whiteInsideBlack: {
before: `
<div>
<div class="adbox">
<div class="article">
<p>Ooo good one</p>
</div>
</div>
<div>Something unrelated</div>
</div>
`,
after: `
<div>
<div>Something unrelated</div>
</div>
`,
},
// brsToPs
// `before` and `after` are identical: a lone <br> is left in place.
singleBr: {
before: `
<div class="article adbox">
<br>
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<br>
<p>Ooo good one</p>
</div>
`,
},
doubleBrs: {
before: `
<div class="article adbox">
<br />
<br />
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<p> </p><p>Ooo good one</p>
</div>
`,
},
// Same `after` as doubleBrs: five consecutive <br> tags yield one <p> </p>.
severalBrs: {
before: `
<div class="article adbox">
<br />
<br />
<br />
<br />
<br />
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<p> </p><p>Ooo good one</p>
</div>
`,
},
brsInP: {
before: `
<p>
Here is some text
<br />
<br />
Here is more text
</p>
`,
after: `
<p>
Here is some text
<p>
Here is more text
</p></p>
`,
},
// Inline <span> after the <br> ends up inside the new <p>.
paragraphize: {
before: `
<p>
Here is some text
<br />
Here is more text
<span>And also this</span>
</p>
`,
after: `
<p>
Here is some text
<p>
Here is more text
<span>And also this</span>
</p></p>
`,
},
// Block-level <div> after the <br> ends up outside the new <p>.
paragraphizeBlock: {
before: `
<p>
Here is some text
<br />
Here is more text
<div>And also this</div>
</p>
`,
after: `
<p>
Here is some text
<p>
Here is more text
</p><div>And also this</div>
</p>
`,
},
// convertToParagraphs
convertToParagraphs: {
before: `
<p>
Here is some text
<span>This should remain in a p</span>
<br />
<br />
This should be wrapped in a p
<div>This should become a p</div>
</p>
<span>This should become a p</span>
`,
after: `
<p>
Here is some text
<span>This should remain in a p</span>
<p>
This should be wrapped in a p
</p><p>This should become a p</p>
</p> <p>This should become a p</p>
`,
},
// linkDensity
// NOTE(review): the numeric suffixes presumably encode the expected density
// (5 -> 0.5, 1 -> 1, 0 -> 0); confirm against the linkDensity tests.
linkDensity5: `
<div><p>Some text!</p><p><a href="">Some text!</a></p> </div>
`,
linkDensity1: `
<div><p><a href="">Some text!</a></p></div>
`,
linkDensity0: `
<div><p><a href=""></a></p></div>
`,
// rewriteTopLevel
// `after` has <div> in place of both the <html> and <body> tags.
rewriteHTMLBody: {
before: `
<html><body><div><p><a href="">Wow how about that</a></p></div></body></html>
`,
after: `
<div><div><div><p><a href="">Wow how about that</a></p></div></div></div>
`,
},
// cleanImages
// `after` keeps only the image that has no height attribute.
cleanSmallImages: {
before: `
<div>
<img width="5" height="5" />
<img width="50" />
</div>
`,
after: `
<div>
<img width="50">
</div>
`,
},
// `after` keeps the image but without its height attribute.
cleanHeight: {
before: `
<div>
<img width="50" height="50" />
</div>
`,
after: `
<div>
<img width="50">
</div>
`,
},
// `after` drops the image whose src ends in spacer.png.
cleanSpacer: {
before: `
<div>
<img src="/foo/bar/baz/spacer.png" />
<img src="/foo/bar/baz/normal.png" />
<p>Some text</p>
</div>
`,
after: `
<div>
<img src="/foo/bar/baz/normal.png">
<p>Some text</p>
</div>
`,
},
// stripJunkTags
// `after` keeps only the <p>; style, title, link, script, noscript and hr
// are all gone.
stripsJunk: {
before: `
<div>
<style>.red { color: 'red'; }</style>
<title>WOW</title>
<link rel="asdflkjawef" />
<p>What an article</p>
<script type="text/javascript">alert('hi!');</script>
<noscript>Don't got it</noscript>
<hr />
</div>
`,
after: `
<div>
<p>What an article</p>
</div>
`,
},
// stripHOnes
// Two <h1> elements in `before`; none remain in `after`.
removeTwoHOnes: {
before: `
<div>
<h1>Look at this!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`,
},
// Three <h1> elements in `before`; all three are <h2> in `after`.
convertThreeHOnes: {
before: `
<div>
<h1>Look at this!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
</div>
`,
after: `
<div>
<h2>Look at this!</h2>
<p>What do you think?</p>
<h2>Can you believe it?!</h2>
<p>What do you think?</p>
<h2>Can you believe it?!</h2>
</div>
`,
},
// cleanAttributes
removeStyle: {
before: `
<div>
<p style="color: red;">What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`,
},
removeAlign: {
before: `
<div>
<p style="color: red;" align="center">What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`,
},
// removeEmpty
removeEmptyP: {
before: `
<div>
<p>What do you think?</p>
<p></p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`,
},
// NOTE(review): no <br> appears in this fixture; it shows an empty <div>
// surviving while the empty <p> is dropped. The name may be stale.
doNotRemoveBr: {
before: `
<div>
<p>What do you think?</p>
<p></p>
<div></div>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<div></div>
<p>What do you think?</p>
</div>
`,
},
// `after` keeps the <p> wrapping an <img> but not the <p> wrapping the
// self-closed <iframe>.
doNotNested: {
before: `
<div>
<p>What do you think?</p>
<p><img src="foo/bar.jpg" /></p>
<p><iframe src="foo/bar.jpg" /></p>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p><img src="foo/bar.jpg" /></p>
<p>What do you think?</p>
</div>
`,
},
// cleanConditionally
// `after` drops the <ul score="-10"> and leaves its now-empty <p> behind.
dropNegativeScore: {
before: `
<div>
<p>What do you think?</p>
<p>
<ul score="-10">
<li>Foo</li>
<li>Bar</li>
</ul>
</p>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>
</p>
<p>What do you think?</p>
</div>
`,
},
// `after` drops the nested <div> that holds the three text inputs.
removeTooManyInputs: {
before: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<div>
<p>What is your name?</p>
<input type="text"></input>
<p>What is your name?</p>
<input type="text"></input>
<p>What is your name?</p>
<input type="text"></input>
</div>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`,
},
// `after` keeps the short <div> that contains an image and drops the one
// that does not.
removeShortNoImg: {
before: `
<div>
<p>What do you think?</p>
<div>
<p>Keep this one</p>
<img src="asdf" />
</div>
<div>
<p>Lose this one</p>
</div>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<div>
<p>Keep this one</p>
<img src="asdf">
</div>
</div>
`,
},
// `after` drops the all-links <ul score="20">; note the outer div's
// score="0" attribute is also absent from `after`.
linkDensityHigh: {
before: `
<div score="0">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
<ul score="20">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
after: `
<div>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
</div>
`,
},
// `after` drops the all-links <ul score="30"> even though its score is
// higher than in linkDensityHigh.
goodScoreTooDense: {
before: `
<div>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
<ul score="30">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
after: `
<div>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
</div>
`,
},
// Only `before` is defined; the paragraph preceding the link list ends in a
// colon.
// NOTE(review): the outer div uses `weight="40"` where the neighbouring
// fixtures use `score` -- confirm this is intentional.
previousEndsInColon: {
before: `
<div weight="40">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<p>Now read these links: </p>
<ul score="30">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
},
// Only `before` is defined; the link list carries class="entry-content-asset".
cleanEntryContentAsset: {
before: `
<div score="100">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul score="20" class="entry-content-asset">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
},
// normalizeSpaces
// Here `after` is a plain text string rather than markup.
normalizeSpaces: {
before: `
<div>
<p>What do you think?</p>
</div>
`,
after: 'What do you think?',
},
// cleanHeaders
// `after` drops the <h2> that comes before any <p> and keeps the later one.
cleanFirstHeds: {
before: `
<div>
<h2>Lose me</h2>
<p>What do you think?</p>
<h2>Keep me</h2>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<h2>Keep me</h2>
<p>What do you think?</p>
</div>
`,
},
// `after` drops the <h2>Title Match</h2>; presumably the test passes
// "Title Match" as the article title -- verify against the caller.
cleanTitleMatch: {
before: `
<div>
<p>What do you think?</p>
<h2>Title Match</h2>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`,
},
// `after` drops the <h2 class="advert">.
dropWithNegativeWeight: {
before: `
<div>
<p>What do you think?</p>
<h2 class="advert">Bad Class, Bad Weight</h2>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`,
},
};
export default HTML;

@ -1,87 +0,0 @@
// HTML fixture strings for the scoring tests, exported as a single object.
//
// Plain string entries are standalone snippets; `{ before, after }` entries
// pair a snippet with the same markup carrying a `score` attribute on the
// outer <div>. Whitespace inside every template literal is part of the fixture
// value, so comments must stay outside the backticks.
//
// NOTE(review): the numeric suffixes in the key names presumably encode the
// score each snippet is expected to receive -- confirm against the tests that
// consume them.
const HTML = {
score1: `
<p>Lorem ipsum dolor sit amet</p>
`,
score3: `
<p>Lorem ipsum, dolor sit, amet</p>
`,
// Long paragraph with no class or id.
score19: `
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
`,
// The comma-separated text from score3 wrapped in other tags.
divScore5: `
<div>Lorem ipsum, dolor sit, amet</div>
`,
blockquoteScore3: `
<blockquote>Lorem ipsum, dolor sit, amet</blockquote>
`,
formScoreNeg3: `
<form><label>Lorem ipsum, dolor sit, amet</label></form>
`,
thScoreNeg5: `
<th>Lorem ipsum, dolor sit, amet</th>
`,
// Same long paragraph as score19, with class="entry" added.
score44: `
<p class="entry">Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
`,
// The score44 paragraph wrapped in a parent <div>.
score44Parent: `
<div>
<p class="entry">Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
</div>
`,
// class="hentry" wrapper around one class="entry-content" paragraph;
// `after` differs from `before` only by score="99" on the wrapper.
hNews: {
before: `
<div class="hentry">
<p class="entry-content">Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
</div>
`,
after: `
<div class="hentry" score="99">
<p class="entry-content">Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
</div>
`,
},
// Empty-class wrapper around two class="entry-content" paragraphs;
// `after` differs from `before` only by score="38" on the wrapper.
nonHNews: {
before: `
<div class="">
<p class="entry-content">Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<p class="entry-content">Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
</div>
`,
after: `
<div class="" score="38">
<p class="entry-content">Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<p class="entry-content">Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
</div>
`,
},
// findTopCandidate: markup with `score` attributes already set on the nodes.
findDom1: `
<div score="100">
<p score="1">Lorem ipsum etc</p>
</div>
`,
// Nested case: the inner <article> has a higher score than the outer <div>.
findDom2: `
<div score="10">
<article score="50">
<p score="1">Lorem ipsum etc</p>
</article>
</div>
`,
// The <br> carries the highest score in the document.
findDom3: `
<article score="50">
<p score="1">Lorem ipsum br</p>
<br score="1000" />
</article>
`,
// No `score` attributes anywhere.
// NOTE(review): the last tag is a second opening <body>, not </body> --
// confirm whether the unclosed body is deliberate.
topBody: `
<body>
<article>
<p>Lorem ipsum etc</p>
<br />
</article>
<body>
`,
};
export default HTML;

@ -1,17 +1,13 @@
import assert from 'assert';
import cheerio from 'cheerio';
import HTML from './fixtures/html';
import { getOrInitScore, getScore } from './index';
describe('getOrInitScore(node, $)', () => {
describe('when score set', () => {
it("returns score if node's score already set", () => {
const html = '<p score="40">Foo</p>';
const $ = cheerio.load(html);
const node = $('p').first();
const score = getOrInitScore(node, $);
const $ = cheerio.load('<p score="40">Foo</p>');
const score = getOrInitScore($('p').first(), $);
assert.equal(score, 40);
});
@ -19,40 +15,40 @@ describe('getOrInitScore(node, $)', () => {
describe('when no score set', () => {
it('returns 0 if no class/id and text < 25 chars', () => {
const html = '<p>Foo</p>';
const $ = cheerio.load(html);
const node = $('p').first();
const score = getOrInitScore(node, $);
const $ = cheerio.load('<p>Foo</p>');
const score = getOrInitScore($('p').first(), $);
assert.equal(score, 0);
});
it('returns score if no class/id and has commas/length', () => {
const $ = cheerio.load(HTML.score19);
const node = $('p').first();
const score = getOrInitScore(node, $);
const $ = cheerio.load(
`<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>`
);
const score = getOrInitScore($('p').first(), $);
assert.equal(score, 19);
});
it('returns greater score if weighted class/id is set', () => {
const $ = cheerio.load(HTML.score44);
const node = $('p').first();
const score = getOrInitScore(node, $);
const $ = cheerio.load(
`<p class="entry">Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>`
);
const score = getOrInitScore($('p').first(), $);
assert.equal(score, 44);
});
it('gives 1/4 of its score to its parent', () => {
const $ = cheerio.load(HTML.score44Parent);
const node = $('p').first();
getOrInitScore(node, $);
assert.equal(getScore(node.parent()), 16);
const $ = cheerio.load(`
<div>
<p class="entry">Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
</div>
`);
const $node = $('p').first();
getOrInitScore($node, $);
assert.equal(getScore($node.parent()), 16);
});
});
});

@ -7,15 +7,14 @@ describe('Scoring utils', () => {
describe('getScore($node)', () => {
it('returns null if the node has no score set', () => {
const $ = cheerio.load('<p>Foo</p>');
const $node = $('p').first();
assert.equal(getScore($node), null);
assert.equal(getScore($('p').first()), null);
});
it('returns 25 if the node has a score attr of 25', () => {
const $ = cheerio.load('<p score="25">Foo</p>');
const $node = $('p').first();
assert.equal(typeof getScore($node), 'number');
assert.equal(getScore($node), 25);
const score = getScore($('p').first());
assert.equal(typeof score, 'number');
assert.equal(score, 25);
});
});
});

@ -1,55 +1,90 @@
import assert from 'assert';
import cheerio from 'cheerio';
import HTML from './fixtures/get-weight';
import { getWeight } from './index';
describe('Generic Extractor Utils', () => {
describe('getWeight(node)', () => {
it('returns a score of 25 if node has positive id', () => {
const $ = cheerio.load(HTML.positiveId);
const $ = cheerio.load(`
<div id="entry">
<p>Ooo good one</p>
</div>
`);
assert.equal(getWeight($('div')), 25);
});
it('returns a score of -25 if node has negative id', () => {
const $ = cheerio.load(HTML.negativeId);
const $ = cheerio.load(`
<div id="adbox">
<p>Ooo good one</p>
</div>
`);
assert.equal(getWeight($('div')), -25);
});
it('returns a score of 25 if node has positive class', () => {
const $ = cheerio.load(HTML.positiveClass);
const $ = cheerio.load(`
<div class="entry">
<p>Ooo good one</p>
</div>
`);
assert.equal(getWeight($('div')), 25);
});
it('returns a score of -25 if node has negative class', () => {
const $ = cheerio.load(HTML.negativeClass);
const $ = cheerio.load(`
<div id="comment ad">
<p>Ooo good one</p>
</div>
`);
assert.equal(getWeight($('div')), -25);
});
it('returns a score of 25 if node has both positive id and class', () => {
const $ = cheerio.load(HTML.positiveIdAndClass);
const $ = cheerio.load(`
<div id="article" class="entry">
<p>Ooo good one</p>
</div>
`);
assert.equal(getWeight($('div')), 25);
});
it('returns a score of 25 if node has pos id and neg class', () => {
// Is this really wanted? Should a node with id="article" class="adbox"
// (positive id, negative class) still get a positive score?
const $ = cheerio.load(HTML.positiveIdNegClass);
const $ = cheerio.load(`
<div id="article" class="adbox">
<p>Ooo good one</p>
</div>
`);
assert.equal(getWeight($('div')), 25);
});
it('returns a score of 10 if node has pos img class', () => {
const $ = cheerio.load(HTML.positivePhotoClass);
const $ = cheerio.load(`
<div class="figure">
<p>Ooo good one</p>
</div>
`);
assert.equal(getWeight($('div')), 10);
});
it('returns a score of 35 if node has pos id pos img class', () => {
const $ = cheerio.load(HTML.positiveIdAndPhoto);
const $ = cheerio.load(`
<div id="article" class="figure">
<p>Ooo good one</p>
</div>
`);
assert.equal(getWeight($('div')), 35);
});
it("adds an add'l 25 (total 50) if node uses entry-content-asset class", () => {
const $ = cheerio.load(HTML.entryContentAsset);
const $ = cheerio.load(`
<div id="foo" class="entry-content-asset">
<p>Ooo good one</p>
</div>
`);
assert.equal(getWeight($('div')), 50);
});
});

@ -1,8 +1,6 @@
import assert from 'assert';
import cheerio from 'cheerio';
import HTML from './fixtures/html';
import { scoreContent, getScore } from './index';
const fs = require('fs');
@ -12,15 +10,24 @@ const fs = require('fs');
// probably missing something when calculating
describe('scoreContent($, weightNodes)', () => {
it('loves hNews content', () => {
const $ = cheerio.load(HTML.hNews.before);
const $ = cheerio.load(`
<div class="hentry">
<p class="entry-content">Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
</div>
`);
scoreContent($);
assert.equal(getScore($('div').first()), 140);
});
it('is so-so about non-hNews content', () => {
const $ = cheerio.load(HTML.nonHNews.before);
scoreContent($).html();
const $ = cheerio.load(`
<div class="">
<p class="entry-content">Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<p class="entry-content">Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
</div>
`);
scoreContent($);
assert.equal(getScore($('div').first()), 65);
});
@ -28,15 +35,14 @@ describe('scoreContent($, weightNodes)', () => {
it('scores this Wired article the same', () => {
const html = fs.readFileSync('./fixtures/wired.html', 'utf-8');
const $ = cheerio.load(html);
scoreContent($).html();
scoreContent($);
assert.equal(getScore($('article').first()), 65.5);
});
it('scores this Vulture article', () => {
const html = fs.readFileSync('./fixtures/vulture.html', 'utf-8');
let $ = cheerio.load(html);
$ = scoreContent($);
const $ = scoreContent(cheerio.load(html));
assert.equal($('p[score]').length, 62);
const itemprop = $('[itemprop=articleBody]').first();
@ -50,31 +56,32 @@ describe('scoreContent($, weightNodes)', () => {
const html = `
<div score="0">
<div score="0">
<p>Lorem Ipsum is simply dummy text of the printing and typesetting industry.
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
when an unknown printer took a galley of type and scrambled it to make a type
<p>Lorem Ipsum is simply dummy text of the printing and typesetting industry.
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
when an unknown printer took a galley of type and scrambled it to make a type
specimen book.
</p>
<p>Lorem Ipsum is simply dummy text of the printing and typesetting industry.
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
when an unknown printer took a galley of type and scrambled it to make a type
<p>Lorem Ipsum is simply dummy text of the printing and typesetting industry.
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
when an unknown printer took a galley of type and scrambled it to make a type
specimen book.
</p>
<p>Lorem Ipsum is simply dummy text of the printing and typesetting industry.
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
when an unknown printer took a galley of type and scrambled it to make a type
<p>Lorem Ipsum is simply dummy text of the printing and typesetting industry.
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
when an unknown printer took a galley of type and scrambled it to make a type
specimen book.
</p>
<p>Lorem Ipsum is simply dummy text of the printing and typesetting industry.
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
when an unknown printer took a galley of type and scrambled it to make a type
<p>Lorem Ipsum is simply dummy text of the printing and typesetting industry.
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
when an unknown printer took a galley of type and scrambled it to make a type
specimen book.
</p>
</div>
</div>
`;
let $ = cheerio.load(html);
$ = scoreContent($);
const $ = cheerio.load(html);
scoreContent($);
assert.equal(
$('p')

@ -1,14 +1,11 @@
import assert from 'assert';
import cheerio from 'cheerio';
import HTML from './fixtures/html';
import { scoreNode, scoreParagraph } from './index';
describe('scoreNode(node)', () => {
it('scores P, LI, SPAN, and PRE using scoreParagraph', () => {
const html = '<p><em>Foo</em> bar</p>';
const $ = cheerio.load(html);
const $ = cheerio.load('<p><em>Foo</em> bar</p>');
const node = $('p').first();
const score = scoreNode(node);
@ -19,7 +16,9 @@ describe('scoreNode(node)', () => {
});
it('scores P, LI, SPAN, and PRE using scoreParagraph', () => {
const $ = cheerio.load(HTML.score1);
const $ = cheerio.load(`
<p>Lorem ipsum dolor sit amet</p>
`);
const node = $('p').first();
const score = scoreNode(node);
@ -30,7 +29,9 @@ describe('scoreNode(node)', () => {
});
it('scores P, LI, SPAN, and PRE using scoreParagraph', () => {
const $ = cheerio.load(HTML.score3);
const $ = cheerio.load(`
<p>Lorem ipsum, dolor sit, amet</p>
`);
const node = $('p').first();
const score = scoreNode(node);
@ -41,7 +42,9 @@ describe('scoreNode(node)', () => {
});
it('scores P, LI, SPAN, and PRE using scoreParagraph', () => {
const $ = cheerio.load(HTML.score19);
const $ = cheerio.load(`
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
`);
const node = $('p').first();
const score = scoreNode(node);
@ -52,7 +55,9 @@ describe('scoreNode(node)', () => {
});
it('scores divs with 5', () => {
const $ = cheerio.load(HTML.divScore5);
const $ = cheerio.load(`
<div>Lorem ipsum, dolor sit, amet</div>
`);
const node = $('div').first();
const score = scoreNode(node);
@ -61,7 +66,9 @@ describe('scoreNode(node)', () => {
});
it('scores the blockquote family with 3', () => {
const $ = cheerio.load(HTML.blockquoteScore3);
const $ = cheerio.load(`
<blockquote>Lorem ipsum, dolor sit, amet</blockquote>
`);
const node = $('blockquote').first();
const score = scoreNode(node);
@ -70,7 +77,9 @@ describe('scoreNode(node)', () => {
});
it('scores a form with negative 3', () => {
const $ = cheerio.load(HTML.formScoreNeg3);
const $ = cheerio.load(`
<form><label>Lorem ipsum, dolor sit, amet</label></form>
`);
const node = $('form').first();
const score = scoreNode(node);
@ -79,7 +88,9 @@ describe('scoreNode(node)', () => {
});
it('scores a TH element with negative 5', () => {
const $ = cheerio.load(HTML.thScoreNeg5);
const $ = cheerio.load(`
<th>Lorem ipsum, dolor sit, amet</th>
`);
const node = $('th').first();
const score = scoreNode(node);

@ -1,44 +1,36 @@
import assert from 'assert';
import cheerio from 'cheerio';
import HTML from './fixtures/html';
import { scoreParagraph } from './index';
describe('Scoring utils', () => {
describe('scoreParagraph(node)', () => {
it('returns 0 if text is less than 25 chars', () => {
const html = '<p><em>Foo</em> bar</p>';
const $ = cheerio.load(html);
const node = $('p').first();
const score = scoreParagraph(node);
const $ = cheerio.load('<p><em>Foo</em> bar</p>');
const score = scoreParagraph($('p').first());
assert.equal(score, 0);
});
it('returns 1 if text is > 25 chars and has 0 commas', () => {
const $ = cheerio.load(HTML.score1);
const node = $('p').first();
const score = scoreParagraph(node);
const $ = cheerio.load('<p>Lorem ipsum dolor sit amet</p>');
const score = scoreParagraph($('p').first());
assert.equal(score, 1);
});
it('returns 3 if text is > 25 chars and has 2 commas', () => {
const $ = cheerio.load(HTML.score3);
const node = $('p').first();
const score = scoreParagraph(node);
const $ = cheerio.load('<p>Lorem ipsum, dolor sit, amet</p>');
const score = scoreParagraph($('p').first());
assert.equal(score, 3);
});
it('returns 19 if text has 15 commas, ~600 chars', () => {
const $ = cheerio.load(HTML.score19);
const node = $('p').first();
const score = scoreParagraph(node);
const $ = cheerio.load(
`<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>`
);
const score = scoreParagraph($('p').first());
assert.equal(score, 19);
});

@ -7,10 +7,9 @@ describe('Scoring utils', () => {
describe('setScore(node, $, amount)', () => {
it("sets the specified amount as the node's score", () => {
const $ = cheerio.load('<p>Foo</p>');
let $node = $('p').first();
const $node = $('p').first();
const newScore = 25;
$node = setScore($node, $, newScore);
setScore($node, $, newScore);
const score = getScore($node);
assert(score, newScore);

@ -2,13 +2,18 @@ import assert from 'assert';
import cheerio from 'cheerio';
import moment from 'moment-timezone';
import HTML from './fixtures/html';
import GenericDatePublishedExtractor from './extractor';
describe('GenericDatePublishedExtractor', () => {
describe('extract($, metaCache)', () => {
it('extracts datePublished from meta tags', () => {
const $ = cheerio.load(HTML.datePublishedMeta.test);
const $ = cheerio.load(`
<html>
<head>
<meta name="displaydate" value="1/1/2020 8:30 (EST)" />
</head>
</html>
`);
const metaCache = ['displaydate', 'something-else'];
const result = GenericDatePublishedExtractor.extract({
$,
@ -16,11 +21,19 @@ describe('GenericDatePublishedExtractor', () => {
metaCache,
});
assert.equal(result, HTML.datePublishedMeta.result.toISOString());
assert.equal(result, new Date('1/1/2020 8:30 (EST)').toISOString());
});
it('extracts datePublished from selectors', () => {
const $ = cheerio.load(HTML.datePublishedSelectors.test);
const $ = cheerio.load(`
<div>
<div class="hentry">
<div class="updated">
1/1/2020 <span class="time">8:30am</span>
</div>
</head>
</div>
`);
const metaCache = [];
const result = GenericDatePublishedExtractor.extract({
$,
@ -28,7 +41,7 @@ describe('GenericDatePublishedExtractor', () => {
metaCache,
});
assert.equal(result, HTML.datePublishedMeta.result.toISOString());
assert.equal(result, new Date('1/1/2020 8:30 (EST)').toISOString());
});
it('extracts from url formatted /2012/08/01/etc', () => {

@ -1,26 +0,0 @@
/**
 * HTML fixtures for the date-published extractor tests.
 *
 * Each entry holds a `test` HTML snippet and a `result` Date built from the
 * date string that appears in that snippet.
 */
const HTML = {
  // Date carried by a <meta name="displaydate"> tag in the document head.
  datePublishedMeta: {
    test: `
<html>
<head>
<meta name="displaydate" value="1/1/2020 8:30 (EST)" />
</head>
</html>
`,
    result: new Date('1/1/2020 8:30 (EST)'),
  },
  // Date carried by an `.updated` element nested inside an `.hentry` element.
  // The `.hentry` div is closed with </div>; it used to be closed with a
  // stray </head>, which left the markup unbalanced.
  datePublishedSelectors: {
    test: `
<div>
<div class="hentry">
<div class="updated">
1/1/2020 <span class="time">8:30am</span>
</div>
</div>
</div>
`,
    result: new Date('1/1/2020 8:30 am (EST)'),
  },
};
export default HTML;

@ -1,14 +1,18 @@
import assert from 'assert';
import cheerio from 'cheerio';
import HTML from './fixtures/html';
import GenericLeadImageUrlExtractor from './extractor';
describe('GenericLeadImageUrlExtractor', () => {
describe('extract({ $, content, metaCache })', () => {
it('returns og:image first', () => {
const $ = cheerio.load(HTML.og.test);
const $ = cheerio.load(`
<html>
<head>
<meta name="og:image" value="http://example.com/lead.jpg">
</head>
</html>
`);
const content = $('*').first();
const metaCache = ['og:image'];
@ -18,11 +22,17 @@ describe('GenericLeadImageUrlExtractor', () => {
metaCache,
});
assert.equal(result, HTML.og.result);
assert.equal(result, 'http://example.com/lead.jpg');
});
it('returns twitter:image', () => {
const $ = cheerio.load(HTML.twitter.test);
const $ = cheerio.load(`
<html>
<head>
<meta name="twitter:image" value="http://example.com/lead.jpg">
</head>
</html>
`);
const content = $('*').first();
const metaCache = ['twitter:image'];
@ -32,11 +42,17 @@ describe('GenericLeadImageUrlExtractor', () => {
metaCache,
});
assert.equal(result, HTML.twitter.result);
assert.equal(result, 'http://example.com/lead.jpg');
});
it('finds images based on scoring', () => {
const $ = cheerio.load(HTML.scoring.test);
const $ = cheerio.load(`
<div>
<img src="http://example.com/sprite/abadpic.jpg" />
<img src="http://example.com/upload/goodpic.jpg" />
<img src="http://example.com/upload/whateverpic.png" />
</div>
`);
const content = $('*').first();
const metaCache = [];
@ -46,11 +62,15 @@ describe('GenericLeadImageUrlExtractor', () => {
metaCache,
});
assert.equal(result, HTML.scoring.result);
assert.equal(result, 'http://example.com/upload/goodpic.jpg');
});
it('returns image based on selectors', () => {
const $ = cheerio.load(HTML.selectors.test);
const $ = cheerio.load(`
<div>
<link rel="image_src" href="http://example.com/upload/goodpic.jpg">
</div>
`);
const content = $('*').first();
const metaCache = [];
@ -60,7 +80,7 @@ describe('GenericLeadImageUrlExtractor', () => {
metaCache,
});
assert.equal(result, HTML.selectors.result);
assert.equal(result, 'http://example.com/upload/goodpic.jpg');
});
});
});

@ -1,42 +0,0 @@
/**
 * HTML fixtures for the lead-image extractor tests.
 *
 * Every entry pairs a `test` HTML snippet with the image URL (`result`)
 * that the snippet contains and the test expects back.
 */
const HTML = (() => {
  const leadImageUrl = 'http://example.com/lead.jpg';
  const goodPicUrl = 'http://example.com/upload/goodpic.jpg';

  // Minimal document whose head holds a single image meta tag.
  const metaImageDoc = metaName => `
<html>
<head>
<meta name="${metaName}" value="${leadImageUrl}">
</head>
</html>
`;

  // Three candidate images: a sprite, the expected pick, and another upload.
  const scoringDoc = `
<div>
<img src="http://example.com/sprite/abadpic.jpg" />
<img src="${goodPicUrl}" />
<img src="http://example.com/upload/whateverpic.png" />
</div>
`;

  // A <link rel="image_src"> pointing at the expected image.
  const selectorsDoc = `
<div>
<link rel="image_src" href="${goodPicUrl}">
</div>
`;

  return {
    og: { test: metaImageDoc('og:image'), result: leadImageUrl },
    twitter: { test: metaImageDoc('twitter:image'), result: leadImageUrl },
    scoring: { test: scoringDoc, result: goodPicUrl },
    selectors: { test: selectorsDoc, result: goodPicUrl },
  };
})();
export default HTML;

@ -66,15 +66,15 @@ describe('scoreAttr($img)', () => {
describe('scoreByParents($img)', () => {
it('gets 25 points if it has a figure parent', () => {
const $ = cheerio.load(
`<div>
<figure>
<div>
<img alt="Wow" />
</div>
</figure>
</div>`
);
const $ = cheerio.load(`
<div>
<figure>
<div>
<img alt="Wow" />
</div>
</figure>
</div>
`);
const $img = $('img').first();
assert.equal(scoreByParents($img), 25);
@ -88,15 +88,15 @@ describe('scoreByParents($img)', () => {
});
it('gets 15 points if parent or gparent has photo hints', () => {
const $ = cheerio.load(
`<div>
<div class="figure">
<div>
<img alt="Wow" />
</div>
const $ = cheerio.load(`
<div>
<div class="figure">
<div>
<img alt="Wow" />
</div>
</div>`
);
</div>
</div>
`);
const $img = $('img').first();
assert.equal(scoreByParents($img), 15);
@ -105,30 +105,28 @@ describe('scoreByParents($img)', () => {
describe('scoreBySibling($img)', () => {
it('gets 25 points if its sibling is figcaption', () => {
const $ = cheerio.load(
`
const $ = cheerio.load(`
<div>
<img />
<figcaption>Wow</figcaption>
</div>
`
);
`);
const $img = $('img').first();
assert.equal(scoreBySibling($img), 25);
});
it('gets 15 points if its sibling has photo hints', () => {
const $ = cheerio.load(
`<div>
<div>
<img alt="Wow" />
<div class="caption">
Wow
</div>
</div>
</div>`
);
const $ = cheerio.load(`
<div>
<div>
<img alt="Wow" />
<div class="caption">
Wow
</div>
</div>
</div>
`);
const $img = $('img').first();
assert.equal(scoreBySibling($img), 15);
@ -137,65 +135,55 @@ describe('scoreBySibling($img)', () => {
describe('scoreByDimensions($img)', () => {
it('penalizes skinny images', () => {
const $ = cheerio.load(
`
const $ = cheerio.load(`
<div>
<img width="10" />
</div>
`
);
`);
const $img = $('img').first();
assert.equal(scoreByDimensions($img), -50);
});
it('penalizes short images', () => {
const $ = cheerio.load(
`
const $ = cheerio.load(`
<div>
<img height="10" />
</div>
`
);
`);
const $img = $('img').first();
assert.equal(scoreByDimensions($img), -50);
});
it('ignores sprites', () => {
const $ = cheerio.load(
`
const $ = cheerio.load(`
<div>
<img src="/sprite/etc/foo.png" width="1000" height="1000" />
</div>
`
);
`);
const $img = $('img').first();
assert.equal(scoreByDimensions($img), 0);
});
it('penalizes images with small areas', () => {
const $ = cheerio.load(
`
const $ = cheerio.load(`
<div>
<img src="/etc/foo.png" width="60" height="60" />
</div>
`
);
`);
const $img = $('img').first();
assert.equal(scoreByDimensions($img), -100);
});
it('prefers the largest images', () => {
const $ = cheerio.load(
`
const $ = cheerio.load(`
<div>
<img src="/etc/foo.png" width="1000" height="1000" />
</div>
`
);
`);
const $img = $('img').first();
assert.equal(scoreByDimensions($img), 1000);
@ -204,8 +192,7 @@ describe('scoreByDimensions($img)', () => {
describe('scoreByPosition($imgs, index)', () => {
it('gives higher scores to images that come first', () => {
const $ = cheerio.load(
`
const $ = cheerio.load(`
<div>
<img width="10" />
<img width="10" />
@ -214,8 +201,7 @@ describe('scoreByPosition($imgs, index)', () => {
<img width="10" />
<img width="10" />
</div>
`
);
`);
const $imgs = $('img');
assert.equal(scoreByPosition($imgs, 0), 3);

@ -8,7 +8,6 @@ const fs = require('fs');
describe('scoreLinks(links)', () => {
it('returns an object of scored links', () => {
const html = fs.readFileSync('./fixtures/ars.html', 'utf8');
const $ = cheerio.load(html);
const links = $('a[href]').toArray();
const url =
@ -25,9 +24,7 @@ describe('scoreLinks(links)', () => {
});
it('returns null if no possible pages', () => {
const html = '<div><p>Hello wow</p></div>';
const $ = cheerio.load(html);
const $ = cheerio.load('<div><p>Hello wow</p></div>');
const links = $('a[href]').toArray();
const url =
'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/';

@ -5,30 +5,26 @@ import scoreByParents from './score-by-parents';
describe('scoreByParents($link)', () => {
it('returns 25 if parent sig looks like a page', () => {
const html = `
const $ = cheerio.load(`
<div>
<div class="next-page">
<a href="blah">Next page</a>
</div>
</div>
`;
const $ = cheerio.load(html);
const $link = $('a').first();
`);
assert.equal(scoreByParents($link), 25);
assert.equal(scoreByParents($('a').first()), 25);
});
it('returns -25 if parent sig looks like a comment', () => {
const html = `
const $ = cheerio.load(`
<div>
<div class="comment">
<a href="blah">Next page</a>
</div>
</div>
`;
const $ = cheerio.load(html);
const $link = $('a').first();
`);
assert.equal(scoreByParents($link), -25);
assert.equal(scoreByParents($('a').first()), -25);
});
});

@ -4,14 +4,10 @@ import scoreCapLinks from './score-cap-links';
describe('scoreCapLinks(linkData)', () => {
it('returns -65 if cap link with next link text', () => {
const linkData = 'foo next Last page';
assert.equal(scoreCapLinks(linkData), -65);
assert.equal(scoreCapLinks('foo next Last page'), -65);
});
it('returns 0 if does not match a cap link', () => {
const linkData = 'foo bar WOW GREAT';
assert.equal(scoreCapLinks(linkData), 0);
assert.equal(scoreCapLinks('foo bar WOW GREAT'), 0);
});
});

@ -4,14 +4,10 @@ import scoreExtraneousLinks from './score-extraneous-links';
describe('scoreExtraneousLinks(href)', () => {
it('returns -25 if link matches extraneous text', () => {
const url = 'http://example.com/email-link';
assert.equal(scoreExtraneousLinks(url), -25);
assert.equal(scoreExtraneousLinks('http://example.com/email-link'), -25);
});
it('returns 0 if does not match extraneous text', () => {
const url = 'http://example.com/asdf';
assert.equal(scoreExtraneousLinks(url), 0);
assert.equal(scoreExtraneousLinks('http://example.com/asdf'), 0);
});
});

@ -4,14 +4,10 @@ import scoreNextLinkText from './score-next-link-text';
describe('scoreNextLinkText(linkData)', () => {
it('returns 50 if contains common next link text', () => {
const linkData = 'foo bar Next page';
assert.equal(scoreNextLinkText(linkData), 50);
assert.equal(scoreNextLinkText('foo bar Next page'), 50);
});
it('returns 0 if does not contain common next link text', () => {
const linkData = 'foo bar WOW GREAT';
assert.equal(scoreNextLinkText(linkData), 0);
assert.equal(scoreNextLinkText('foo bar WOW GREAT'), 0);
});
});

@ -1,5 +1,4 @@
import assert from 'assert';
import scorePageInLink from './score-page-in-link';
describe('scorePageInLink(pageNum, isWp)', () => {

@ -4,14 +4,10 @@ import scorePrevLink from './score-prev-link';
describe('scorePrevLink(linkData)', () => {
it('returns -200 if link matches previous text', () => {
const linkData = 'foo next previous page';
assert.equal(scorePrevLink(linkData), -200);
assert.equal(scorePrevLink('foo next previous page'), -200);
});
it('returns 0 if does not match a prev link', () => {
const linkData = 'foo bar WOW GREAT';
assert.equal(scorePrevLink(linkData), 0);
assert.equal(scorePrevLink('foo bar WOW GREAT'), 0);
});
});

@ -1,49 +1,68 @@
import assert from 'assert';
import cheerio from 'cheerio';
import HTML from './fixtures/html';
import GenericTitleExtractor from './extractor';
describe('GenericTitleExtractor', () => {
describe('extract({ $, url, cachedMeta })', () => {
it('extracts strong meta title tags', () => {
const $ = cheerio.load(HTML.dcTitle.test);
const $ = cheerio.load(`
<html>
<meta name="dc.title" value="This Is the Title Okay" />
<html>
`);
const result = GenericTitleExtractor.extract({
$,
url: '',
metaCache: ['dc.title', 'something-else'],
});
assert.equal(result, HTML.dcTitle.result);
assert.equal(result, 'This Is the Title Okay');
});
it('pulls title from selectors lacking string meta', () => {
const $ = cheerio.load(HTML.strongTitleSelector.test);
const $ = cheerio.load(`
<html>
<article class="hentry">
<h1 class="entry-title">This Is the Title Okay</h1>
</article>
<html>
`);
const result = GenericTitleExtractor.extract({
$,
url: '',
metaCache: ['og:title', 'something-else'],
});
assert.equal(result, HTML.ogTitle.result);
assert.equal(result, 'This Is the Title Okay');
});
it('then falls back to weak meta title tags', () => {
const $ = cheerio.load(HTML.ogTitle.test);
const $ = cheerio.load(`
<html>
<meta name="og:title" value="This Is the Title Okay" />
<html>
`);
const result = GenericTitleExtractor.extract({
$,
url: '',
metaCache: ['og:title', 'something-else'],
});
assert.equal(result, HTML.ogTitle.result);
assert.equal(result, 'This Is the Title Okay');
});
});
it('then falls back to weak selectors', () => {
const $ = cheerio.load(HTML.weakTitleSelector.test);
const $ = cheerio.load(`
<html>
<head>
<title>This Is the Weak Title Okay</title>
</head>
<html>
`);
const result = GenericTitleExtractor.extract({ $, url: '', metaCache: [] });
assert.equal(result, HTML.weakTitleSelector.result);
assert.equal(result, 'This Is the Weak Title Okay');
});
});

@ -1,40 +0,0 @@
/**
 * HTML fixtures for the title extractor tests.
 *
 * Every entry pairs a `test` HTML snippet with the title string (`result`)
 * that the snippet contains. Each snippet ends with a proper </html> end
 * tag; they used to end with a second <html> start tag.
 */
const HTML = {
  // Title carried by a <meta name="dc.title"> tag.
  dcTitle: {
    test: `
<html>
<meta name="dc.title" value="This Is the Title Okay" />
</html>
`,
    result: 'This Is the Title Okay',
  },
  // Title carried by a <meta name="og:title"> tag.
  ogTitle: {
    test: `
<html>
<meta name="og:title" value="This Is the Title Okay" />
</html>
`,
    result: 'This Is the Title Okay',
  },
  // Title carried by an `.entry-title` heading inside an `.hentry` article.
  strongTitleSelector: {
    test: `
<html>
<article class="hentry">
<h1 class="entry-title">This Is the Title Okay</h1>
</article>
</html>
`,
    result: 'This Is the Title Okay',
  },
  // Title carried only by the document's <title> element.
  weakTitleSelector: {
    test: `
<html>
<head>
<title>This Is the Weak Title Okay</title>
</head>
</html>
`,
    result: 'This Is the Weak Title Okay',
  },
};
export default HTML;

@ -44,7 +44,8 @@ describe('cleanBySelectors($content, $, { clean })', () => {
<p>This is some good content</p>
<div class="ad">Advertisement!</div>
</div>
</div>`;
</div>
`;
const $ = cheerio.load(html);
let $content = $('.body');
@ -58,13 +59,13 @@ describe('cleanBySelectors($content, $, { clean })', () => {
describe('transformElements($content, $, { transforms })', () => {
it('performs a simple transformation on matched elements', () => {
const html = `
<div>
<div class="body">
<h1>WOW BIG TITLE</h1>
<p>Here are some words</p>
<h1>WOW BIG TITLE</h1>
<div>
<div class="body">
<h1>WOW BIG TITLE</h1>
<p>Here are some words</p>
<h1>WOW BIG TITLE</h1>
</div>
</div>
</div>
`;
const opts = {
transforms: { h1: 'h2' },
@ -86,17 +87,17 @@ describe('transformElements($content, $, { transforms })', () => {
it('performs a complex transformation on matched elements', () => {
const html = `
<div>
<div class="body">
<noscript>
<img src="/img.jpg" />
</noscript>
<noscript>
Something else
</noscript>
<p>Here are some words</p>
<div>
<div class="body">
<noscript>
<img src="/img.jpg" />
</noscript>
<noscript>
Something else
</noscript>
<p>Here are some words</p>
</div>
</div>
</div>
`;
const opts = {
transforms: {

@ -1,5 +1,4 @@
import assert from 'assert';
import { record } from 'test-helpers';
import Mercury from './mercury';

@ -5,9 +5,7 @@ import convertLazyLoadedImages from './convert-lazy-loaded-images';
describe('convertLazyLoadedImages($)', () => {
it('moves image links to src if placed in another attribute', () => {
const html = '<img data-src="http://example.com/foo.jpg">';
const $ = cheerio.load(html);
const $ = cheerio.load('<img data-src="http://example.com/foo.jpg">');
const result = convertLazyLoadedImages($).html();
assert.equal(
@ -17,9 +15,7 @@ describe('convertLazyLoadedImages($)', () => {
});
it('moves image source candidates to srcset if placed in another attribute', () => {
const html = '<img data-srcset="http://example.com/foo.jpg 2x">';
const $ = cheerio.load(html);
const $ = cheerio.load('<img data-srcset="http://example.com/foo.jpg 2x">');
const result = convertLazyLoadedImages($).html();
assert.equal(
@ -29,10 +25,9 @@ describe('convertLazyLoadedImages($)', () => {
});
it('moves image source candidates containing query strings to srcset if placed in another attribute', () => {
const html =
'<img data-srcset="http://example.com/foo.jpg?w=400 2x, http://example.com/foo.jpg?w=600 3x">';
const $ = cheerio.load(html);
const $ = cheerio.load(
'<img data-srcset="http://example.com/foo.jpg?w=400 2x, http://example.com/foo.jpg?w=600 3x">'
);
const result = convertLazyLoadedImages($).html();
assert.equal(
@ -42,10 +37,9 @@ describe('convertLazyLoadedImages($)', () => {
});
it('properly handles src and srcset attributes', () => {
const html =
'<img data-src="http://example.com/foo.jpg" data-srcset="http://example.com/foo.jpg 2x">';
const $ = cheerio.load(html);
const $ = cheerio.load(
'<img data-src="http://example.com/foo.jpg" data-srcset="http://example.com/foo.jpg 2x">'
);
const result = convertLazyLoadedImages($).html();
assert.equal(
@ -57,37 +51,30 @@ describe('convertLazyLoadedImages($)', () => {
it('does nothing when value is not a link', () => {
// This is far from perfect, since a relative url could
// be perfectly correct.
const html = '<img data-src="foo.jpg">';
const $ = cheerio.load(html);
const $ = cheerio.load('<img data-src="foo.jpg">');
const result = convertLazyLoadedImages($).html();
assert.equal(result, '<img data-src="foo.jpg">');
});
it('does nothing when value is not an image', () => {
const html = '<img data-src="http://example.com">';
const $ = cheerio.load(html);
const $ = cheerio.load('<img data-src="http://example.com">');
const result = convertLazyLoadedImages($).html();
assert.equal(result, '<img data-src="http://example.com">');
});
it('does not change a correct img with src', () => {
const html = '<img src="http://example.com/foo.jpg">';
const $ = cheerio.load(html);
const $ = cheerio.load('<img src="http://example.com/foo.jpg">');
const result = convertLazyLoadedImages($).html();
assert.equal(result, '<img src="http://example.com/foo.jpg">');
});
it('does not replace an img src with srcset value', () => {
const html =
'<img src="http://example.com/foo.jpg" srcset="http://example.com/foo2x.jpg 2x, http://example.com/foo.jpg">';
const $ = cheerio.load(html);
const $ = cheerio.load(
'<img src="http://example.com/foo.jpg" srcset="http://example.com/foo2x.jpg 2x, http://example.com/foo.jpg">'
);
const result = convertLazyLoadedImages($).html();
assert.equal(

@ -5,29 +5,25 @@ import normalizeMetaTags from './normalize-meta-tags';
describe('normalizeMetaTags($)', () => {
it('replaces "content" attributes with "value"', () => {
const html = '<html><meta name="foo" content="bar"></html>';
const test = '<html><meta name="foo" value="bar"></html>';
// browser cheerio/jquery will remove/replace html, so result
// is different
const testBrowser = '<meta name="foo" value="bar">';
const $ = cheerio.load(html);
// browser cheerio/jquery will remove/replace html, so result is different
const test = cheerio.browser
? '<meta name="foo" value="bar">'
: '<html><meta name="foo" value="bar"></html>';
const $ = cheerio.load('<html><meta name="foo" content="bar"></html>');
const result = normalizeMetaTags($).html();
assert.equal(result, cheerio.browser ? testBrowser : test);
assert.equal(result, test);
});
it('replaces "property" attributes with "name"', () => {
const html = '<html><meta property="foo" value="bar"></html>';
const test = '<html><meta value="bar" name="foo"></html>';
const testBrowser = '<meta value="bar" name="foo">';
const $ = cheerio.load(html);
const test = cheerio.browser
? '<meta value="bar" name="foo">'
: '<html><meta value="bar" name="foo"></html>';
const $ = cheerio.load('<html><meta property="foo" value="bar"></html>');
const result = normalizeMetaTags($).html();
assert.equal(result, cheerio.browser ? testBrowser : test);
assert.equal(result, test);
});
});

@ -2,35 +2,95 @@ import assert from 'assert';
import cheerio from 'cheerio';
import { assertClean } from 'test-helpers';
import HTML from './fixtures/html';
import brsToPs from './brs-to-ps';
function assertBeforeAndAfter(key, fn) {
const $ = cheerio.load(HTML[key].before);
assertClean(fn($).html(), HTML[key].after);
}
describe('Generic Extractor Utils', () => {
describe('brsToPs(node)', () => {
it('does nothing when no BRs present', () => {
const $ = cheerio.load(HTML.positiveId);
assert.equal(brsToPs($).html(), HTML.positiveId);
const html = `
<div id="entry">
<p>Ooo good one</p>
</div>
`;
assert.equal(brsToPs(cheerio.load(html)).html(), html);
});
it('does nothing when a single BR is present', () => {
assertBeforeAndAfter('singleBr', brsToPs);
const before = `
<div class="article adbox">
<br>
<p>Ooo good one</p>
</div>
`;
const after = `
<div class="article adbox">
<br>
<p>Ooo good one</p>
</div>
`;
assertClean(brsToPs(cheerio.load(before)).html(), after);
});
it('converts double BR tags to an empty P tag', () => {
assertBeforeAndAfter('doubleBrs', brsToPs);
const before = `
<div class="article adbox">
<br />
<br />
<p>Ooo good one</p>
</div>
`;
const after = `
<div class="article adbox">
<p> </p><p>Ooo good one</p>
</div>
`;
assertClean(brsToPs(cheerio.load(before)).html(), after);
});
it('converts several BR tags to an empty P tag', () => {
assertBeforeAndAfter('severalBrs', brsToPs);
const before = `
<div class="article adbox">
<br />
<br />
<br />
<br />
<br />
<p>Ooo good one</p>
</div>
`;
const after = `
<div class="article adbox">
<p> </p><p>Ooo good one</p>
</div>
`;
assertClean(brsToPs(cheerio.load(before)).html(), after);
});
it('converts BR tags in a P tag into a P containing inline children', () => {
assertBeforeAndAfter('brsInP', brsToPs);
const before = `
<p>
Here is some text
<br />
<br />
Here is more text
</p>
`;
const after = `
<p>
Here is some text
<p>
Here is more text
</p></p>
`;
assertClean(brsToPs(cheerio.load(before)).html(), after);
});
});
});

@ -2,21 +2,42 @@ import cheerio from 'cheerio';
import { assertClean } from 'test-helpers';
import HTML from './fixtures/html';
import { cleanAttributes } from './index';
describe('cleanAttributes($)', () => {
it('removes style attributes from nodes', () => {
const $ = cheerio.load(HTML.removeStyle.before);
const $ = cheerio.load(`
<div>
<p style="color: red;">What do you think?</p>
</div>
`);
const result = cleanAttributes($('*').first(), $);
assertClean($.html(result), HTML.removeStyle.after);
assertClean(
$.html(result),
`
<div>
<p>What do you think?</p>
</div>
`
);
});
it('removes align attributes from nodes', () => {
const $ = cheerio.load(HTML.removeAlign.before);
const $ = cheerio.load(`
<div>
<p style="color: red;" align="center">What do you think?</p>
</div>
`);
const result = cleanAttributes($('*').first(), $);
assertClean($.html(result), HTML.removeAlign.after);
assertClean(
$.html(result),
`
<div>
<p>What do you think?</p>
</div>
`
);
});
});

@ -2,21 +2,52 @@ import cheerio from 'cheerio';
import { assertClean } from 'test-helpers';
import HTML from './fixtures/html';
import { cleanHOnes } from './index';
describe('cleanHOnes($)', () => {
it('removes H1s if there are less than 3 of them', () => {
const $ = cheerio.load(HTML.removeTwoHOnes.before);
const $ = cheerio.load(`
<div>
<h1>Look at this!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
</div>
`);
const result = cleanHOnes($('*').first(), $);
assertClean(result.html(), HTML.removeTwoHOnes.after);
assertClean(
result.html(),
`
<div>
<p>What do you think?</p>
</div>
`
);
});
it('converts H1s to H2s if there are 3 or more of them', () => {
const $ = cheerio.load(HTML.convertThreeHOnes.before);
const $ = cheerio.load(`
<div>
<h1>Look at this!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
</div>
`);
const result = cleanHOnes($('*').first(), $);
assertClean(result.html(), HTML.convertThreeHOnes.after);
assertClean(
result.html(),
`
<div>
<h2>Look at this!</h2>
<p>What do you think?</p>
<h2>Can you believe it?!</h2>
<p>What do you think?</p>
<h2>Can you believe it?!</h2>
</div>
`
);
});
});

@ -2,28 +2,71 @@ import cheerio from 'cheerio';
import { assertClean } from 'test-helpers';
import HTML from './fixtures/html';
import { cleanHeaders } from './index';
describe('cleanHeaders(article, $)', () => {
it('parses html and returns the article', () => {
const $ = cheerio.load(HTML.cleanFirstHeds.before);
const $ = cheerio.load(`
<div>
<h2>Lose me</h2>
<p>What do you think?</p>
<h2>Keep me</h2>
<p>What do you think?</p>
</div>
`);
const result = cleanHeaders($('*').first(), $);
assertClean(result.html(), HTML.cleanFirstHeds.after);
assertClean(
result.html(),
`
<div>
<p>What do you think?</p>
<h2>Keep me</h2>
<p>What do you think?</p>
</div>
`
);
});
it('removes headers when the header text matches the title', () => {
const $ = cheerio.load(HTML.cleanTitleMatch.before);
const $ = cheerio.load(`
<div>
<p>What do you think?</p>
<h2>Title Match</h2>
<p>What do you think?</p>
</div>
`);
const result = cleanHeaders($('*').first(), $, 'Title Match');
assertClean(result.html(), HTML.cleanTitleMatch.after);
assertClean(
result.html(),
`
<div>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`
);
});
it('removes headers with a negative weight', () => {
const $ = cheerio.load(HTML.dropWithNegativeWeight.before);
const $ = cheerio.load(`
<div>
<p>What do you think?</p>
<h2 class="advert">Bad Class, Bad Weight</h2>
<p>What do you think?</p>
</div>
`);
const result = cleanHeaders($('*').first(), $);
assertClean(result.html(), HTML.dropWithNegativeWeight.after);
assertClean(
result.html(),
`
<div>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`
);
});
});

@ -2,28 +2,64 @@ import cheerio from 'cheerio';
import { assertClean } from 'test-helpers';
import HTML from './fixtures/html';
import { cleanImages } from './index';
describe('cleanImages($)', () => {
it('removes images with small heights/widths', () => {
const $ = cheerio.load(HTML.cleanSmallImages.before);
const $ = cheerio.load(`
<div>
<img width="5" height="5" />
<img width="50" />
</div>
`);
const result = cleanImages($('*').first(), $);
assertClean(result.html(), HTML.cleanSmallImages.after);
assertClean(
result.html(),
`
<div>
<img width="50">
</div>
`
);
});
it('removes height attribute from images that remain', () => {
const $ = cheerio.load(HTML.cleanHeight.before);
const $ = cheerio.load(`
<div>
<img width="50" height="50" />
</div>
`);
const result = cleanImages($('*').first(), $);
assertClean(result.html(), HTML.cleanHeight.after);
assertClean(
result.html(),
`
<div>
<img width="50">
</div>
`
);
});
it('removes spacer/transparent images', () => {
const $ = cheerio.load(HTML.cleanSpacer.before);
const $ = cheerio.load(`
<div>
<img src="/foo/bar/baz/spacer.png" />
<img src="/foo/bar/baz/normal.png" />
<p>Some text</p>
</div>
`);
const result = cleanImages($('*').first(), $);
assertClean(result.html(), HTML.cleanSpacer.after);
assertClean(
result.html(),
`
<div>
<img src="/foo/bar/baz/normal.png">
<p>Some text</p>
</div>
`
);
});
});

@ -2,12 +2,22 @@ import cheerio from 'cheerio';
import { assertClean } from 'test-helpers';
import HTML from './fixtures/html';
import { cleanTags } from './index';
describe('cleanTags($)', () => {
it('drops a matching node with a negative score', () => {
const $ = cheerio.load(HTML.dropNegativeScore.before);
const $ = cheerio.load(`
<div score="5">
<p>What do you think?</p>
<p>
<ul score="-10">
<li>Foo</li>
<li>Bar</li>
</ul>
</p>
<p>What do you think?</p>
</div>
`);
const result = cleanTags($('*').first(), $);
// again small adjustments for cheerio vs. jquery implementation quirks
@ -15,58 +25,231 @@ describe('cleanTags($)', () => {
assertClean(
result.html(),
cheerio.browser
? HTML.dropNegativeScore.afterBrowser
: HTML.dropNegativeScore.after
? `
<div score="5">
<p>What do you think?</p>
<p>
</p>
<p></p>
<p>What do you think?</p>
</div>
`
: `
<div score="5">
<p>What do you think?</p>
<p>
</p>
<p>What do you think?</p>
</div>
`
);
});
it('removes a node with too many inputs', () => {
const $ = cheerio.load(HTML.removeTooManyInputs.before);
const $ = cheerio.load(`
<div>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<div>
<p>What is your name?</p>
<input type="text"></input>
<p>What is your name?</p>
<input type="text"></input>
<p>What is your name?</p>
<input type="text"></input>
</div>
<p>What do you think?</p>
</div>
`);
const result = cleanTags($('*').first(), $);
$('[score]').each((i, e) => $(e).removeAttr('score'));
assertClean(result.html(), HTML.removeTooManyInputs.after);
assertClean(
result.html(),
`
<div>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`
);
});
it('removes a div with no images and very little text', () => {
const $ = cheerio.load(HTML.removeShortNoImg.before);
const $ = cheerio.load(`
<div>
<p>What do you think?</p>
<div>
<p>Keep this one</p>
<img src="asdf" />
</div>
<div>
<p>Lose this one</p>
</div>
</div>
`);
const result = cleanTags($('*').first(), $);
$('[score]').each((i, e) => $(e).removeAttr('score'));
assertClean(result.html(), HTML.removeShortNoImg.after);
assertClean(
result.html(),
`
<div>
<p>What do you think?</p>
<div>
<p>Keep this one</p>
<img src="asdf">
</div>
</div>
`
);
});
it('removes a node with a link density that is too high', () => {
const $ = cheerio.load(HTML.linkDensityHigh.before);
const $ = cheerio.load(`
<div score="0">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
<ul score="20">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`);
const result = cleanTags($('*').first(), $);
$('[score]').each((i, e) => $(e).removeAttr('score'));
assertClean(result.html(), HTML.linkDensityHigh.after);
assertClean(
result.html(),
`
<div>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
</div>
`
);
});
it('removes a node with a good score but link density > 0.5', () => {
const $ = cheerio.load(HTML.linkDensityHigh.before);
const $ = cheerio.load(`
<div score="0">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
<ul score="20">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`);
const result = cleanTags($('*').first(), $);
$('[score]').each((i, e) => $(e).removeAttr('score'));
assertClean(result.html(), HTML.linkDensityHigh.after);
assertClean(
result.html(),
`
<div>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
</div>
`
);
});
it('keeps node with a good score but link density > 0.5 if preceding text ends in colon', () => {
const $ = cheerio.load(HTML.previousEndsInColon.before);
const html = `
<div score="40">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<p>Now read these links: </p>
<ul score="30">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`;
const $ = cheerio.load(html);
const result = cleanTags($('*').first(), $);
assertClean(result.html(), HTML.previousEndsInColon.before);
assertClean(result.html(), html);
});
it('keeps anything with a class of entry-content-asset', () => {
const $ = cheerio.load(HTML.cleanEntryContentAsset.before);
const html = `
<div score="100">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul score="20" class="entry-content-asset">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`;
const $ = cheerio.load(html);
const result = cleanTags($('*').first(), $);
assertClean(result.html(), HTML.cleanEntryContentAsset.before);
assertClean(result.html(), html);
});
});

@ -5,8 +5,7 @@ import convertNodeTo from './convert-node-to';
describe('convertNodeTo(node, $)', () => {
it('takes a node and converts it to a diff tag', () => {
const html = '<div>Should become a p</div>';
const $ = cheerio.load(html);
const $ = cheerio.load('<div>Should become a p</div>');
const node = $('div').first();
const result = convertNodeTo(node, $).html();
@ -16,8 +15,9 @@ describe('convertNodeTo(node, $)', () => {
});
it('retains attributes on conversion', () => {
const html = '<span class="foo" score="100">Should keep its attrs</span>';
const $ = cheerio.load(html);
const $ = cheerio.load(
'<span class="foo" score="100">Should keep its attrs</span>'
);
const node = $('span').first();
const result = convertNodeTo(node, $, 'div').html();
@ -42,13 +42,14 @@ describe('convertNodeTo(node, $)', () => {
// transforms on the noscript tag (commonly used for lazy-loading) don't work
// as expected. This test case handles that
it('handles noscript tags in the browser', () => {
const html = '<noscript><img src="http://example.com" /></noscript>';
const resultHtml = '<figure><img src="http://example.com"></figure>';
const $ = cheerio.load(html);
const $ = cheerio.load(
'<noscript><img src="http://example.com" /></noscript>'
);
const node = $('noscript');
const result = convertNodeTo(node, $, 'figure', 'noscript').html();
const resultHtml = '<figure><img src="http://example.com"></figure>';
assert.equal(result, resultHtml);
});
});

@ -1,21 +1,37 @@
import cheerio from 'cheerio';
import { assertClean } from 'test-helpers';
import HTML from './fixtures/html';
import convertToParagraphs from './convert-to-paragraphs';
function assertBeforeAndAfter(key, fn) {
const $ = cheerio.load(HTML[key].before);
assertClean(fn($).html(), HTML[key].after);
}
describe('convertToParagraphs($)', () => {
it('performs simple conversions', () => {
// Skipping this one in the browser. It works, but since the browser wraps
// elements in a div, the last span conversion won't work as expected.
if (!cheerio.browser) {
assertBeforeAndAfter('convertToParagraphs', convertToParagraphs);
const before = `
<p>
Here is some text
<span>This should remain in a p</span>
<br />
<br />
This should be wrapped in a p
<div>This should become a p</div>
</p>
<span>This should become a p</span>
`;
const after = `
<p>
Here is some text
<span>This should remain in a p</span>
<p>
This should be wrapped in a p
</p><p>This should become a p</p>
</p> <p>This should become a p</p>
`;
assertClean(convertToParagraphs(cheerio.load(before)).html(), after);
}
});
@ -29,7 +45,7 @@ describe('convertToParagraphs($)', () => {
</div>
</div>
`;
const $ = cheerio.load(html);
assertClean(convertToParagraphs($).html(), html);
assertClean(convertToParagraphs(cheerio.load(html)).html(), html);
});
});

@ -1,28 +1,41 @@
import assert from 'assert';
import cheerio from 'cheerio';
import HTML from './fixtures/extract-from-selectors';
import { extractFromMeta } from './index';
describe('extractFromMeta($, metaNames, cachedNames, cleanTags)', () => {
it('extracts an arbitrary meta tag by name', () => {
const $ = cheerio.load(HTML.metaFoo.test);
const $ = cheerio.load(`
<html>
<meta name="foo" value="bar" />
</html>
`);
const result = extractFromMeta($, ['foo', 'baz'], ['foo', 'bat']);
assert.equal(result, HTML.metaFoo.result);
assert.equal(result, 'bar');
});
it('returns nothing if a meta name is duplicated', () => {
const $ = cheerio.load(HTML.metaDupes.test);
const $ = cheerio.load(`
<html>
<meta name="foo" value="bar" />
<meta name="foo" value="baz" />
</html>
`);
const result = extractFromMeta($, ['foo', 'baz'], ['foo', 'bat']);
assert.equal(result, HTML.metaDupes.result);
assert.equal(result, null);
});
it('ignores duplicate meta names with empty values', () => {
const $ = cheerio.load(HTML.metaEmptyDupes.test);
const $ = cheerio.load(`
<html>
<meta name="foo" value="bar" />
<meta name="foo" value="" />
</html>
`);
const result = extractFromMeta($, ['foo', 'baz'], ['foo', 'bat']);
assert.equal(result, HTML.metaEmptyDupes.result);
assert.equal(result, 'bar');
});
});

@ -1,35 +1,55 @@
import assert from 'assert';
import cheerio from 'cheerio';
import HTML from './fixtures/extract-from-selectors';
import extractFromSelectors from './extract-from-selectors';
describe('extractFromSelectors($, selectors, maxChildren, textOnly)', () => {
it('extracts an arbitrary node by selector', () => {
const $ = cheerio.load(HTML.simpleSelector.test);
const result = extractFromSelectors($, ['.author']);
const $ = cheerio.load(`
<html>
<div class="author">Adam</div>
</html>
`);
assert.equal(result, HTML.simpleSelector.result);
assert.equal(extractFromSelectors($, ['.author']), 'Adam');
});
it('ignores comments', () => {
const $ = cheerio.load(HTML.insideComment.test);
const result = extractFromSelectors($, ['.author']);
assert.equal(result, HTML.insideComment.result);
const $ = cheerio.load(`
<html>
<div class="comments-section">
<div class="author">Adam</div>
</div>
</html>`);
assert.equal(extractFromSelectors($, ['.author']), null);
});
it('skips a selector if it matches multiple nodes', () => {
const $ = cheerio.load(HTML.multiMatch.test);
const result = extractFromSelectors($, ['.author']);
assert.equal(result, HTML.multiMatch.result);
const $ = cheerio.load(`
<html>
<div>
<div class="author">Adam</div>
<div class="author">Adam</div>
</div>
</html>
`);
assert.equal(extractFromSelectors($, ['.author']), null);
});
it('skips a node with too many children', () => {
const $ = cheerio.load(HTML.manyChildren.test);
const result = extractFromSelectors($, ['.author']);
assert.equal(result, HTML.manyChildren.result);
const $ = cheerio.load(`
<html>
<div>
<div class="author">
<span>Adam</span>
<span>Pash</span>
</div>
</div>
</html>
`);
assert.equal(extractFromSelectors($, ['.author']), null);
});
});

@ -1,75 +0,0 @@
const HTML = {
// extractFromMeta
metaFoo: {
test: `
<html>
<meta name="foo" value="bar" />
</html>`,
result: 'bar',
},
metaDupes: {
test: `
<html>
<meta name="foo" value="bar" />
<meta name="foo" value="baz" />
</html>`,
result: null,
},
metaEmptyDupes: {
test: `
<html>
<meta name="foo" value="bar" />
<meta name="foo" value="" />
</html>`,
result: 'bar',
},
custom: {
test: `
<html>
<meta property="foo" content="bar" />
</html>`,
result: 'bar',
},
// extractFromSelectors
simpleSelector: {
test: `
<html>
<div class="author">Adam</div>
</html>`,
result: 'Adam',
},
insideComment: {
test: `
<html>
<div class="comments-section">
<div class="author">Adam</div>
</div>
</html>`,
result: null,
},
multiMatch: {
test: `
<html>
<div>
<div class="author">Adam</div>
<div class="author">Adam</div>
</div>
</html>`,
result: null,
},
manyChildren: {
test: `
<html>
<div>
<div class="author">
<span>Adam</span>
<span>Pash</span>
</div>
</div>
</html>`,
result: null,
},
};
export default HTML;

@ -1,714 +0,0 @@
const HTML = {
// getWeight fixtures
positiveId: `
<div id="entry">
<p>Ooo good one</p>
</div>
`,
negativeId: `
<div id="adbox">
<p>Ooo good one</p>
</div>
`,
positiveClass: `
<div class="entry">
<p>Ooo good one</p>
</div>
`,
negativeClass: `
<div id="comment ad">
<p>Ooo good one</p>
</div>
`,
positiveIdAndClass: `
<div id="article" class="entry">
<p>Ooo good one</p>
</div>
`,
positiveIdNegClass: `
<div id="article" class="adbox">
<p>Ooo good one</p>
</div>
`,
positivePhotoClass: `
<div class="figure">
<p>Ooo good one</p>
</div>
`,
positiveIdAndPhoto: `
<div id="article" class="figure">
<p>Ooo good one</p>
</div>
`,
entryContentAsset: `
<div id="foo" class="entry-content-asset">
<p>Ooo good one</p>
</div>
`,
// stripUnlikelyCandidates
noMatches: `
<div id="foo">
<p>Ooo good one</p>
</div>
`,
whitelistMatch: {
before: `
<div class="header">Stuff</div>
<div class="article">
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article">
<p>Ooo good one</p>
</div>
`,
},
whiteAndBlack: {
before: `
<div class="article adbox">
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<p>Ooo good one</p>
</div>
`,
},
whiteInsideBlack: {
before: `
<div>
<div class="adbox">
<div class="article">
<p>Ooo good one</p>
</div>
</div>
<div>Something unrelated</div>
</div>
`,
after: `
<div>
<div>Something unrelated</div>
</div>
`,
},
// brsToPs
singleBr: {
before: `
<div class="article adbox">
<br>
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<br>
<p>Ooo good one</p>
</div>
`,
},
doubleBrs: {
before: `
<div class="article adbox">
<br />
<br />
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<p> </p><p>Ooo good one</p>
</div>
`,
},
severalBrs: {
before: `
<div class="article adbox">
<br />
<br />
<br />
<br />
<br />
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<p> </p><p>Ooo good one</p>
</div>
`,
},
brsInP: {
before: `
<p>
Here is some text
<br />
<br />
Here is more text
</p>
`,
after: `
<p>
Here is some text
<p>
Here is more text
</p></p>
`,
},
paragraphize: {
before: `
<p>
Here is some text
<br />
Here is more text
<span>And also this</span>
</p>
`,
after: `
<p>
Here is some text
<p>
Here is more text
<span>And also this</span>
</p></p>
`,
},
paragraphizeBlock: {
before: `
<p>
Here is some text
<br />
Here is more text
<div>And also this</div>
</p>
`,
after: `
<p>
Here is some text
<p>
Here is more text
</p><div>And also this</div>
</p>
`,
},
// convertToParagraphs
convertToParagraphs: {
before: `
<p>
Here is some text
<span>This should remain in a p</span>
<br />
<br />
This should be wrapped in a p
<div>This should become a p</div>
</p>
<span>This should become a p</span>
`,
after: `
<p>
Here is some text
<span>This should remain in a p</span>
<p>
This should be wrapped in a p
</p><p>This should become a p</p>
</p> <p>This should become a p</p>
`,
},
// linkDensity
linkDensity5: `
<div><p>Some text!</p><p><a href="">Some text!</a></p> </div>
`,
linkDensity1: `
<div><p><a href="">Some text!</a></p></div>
`,
linkDensity0: `
<div><p><a href=""></a></p></div>
`,
// rewriteTopLevel
rewriteHTMLBody: {
before: `
<html><body><div><p><a href="">Wow how about that</a></p></div></body></html>
`,
after: `
<div><div><div><p><a href="">Wow how about that</a></p></div></div></div>
`,
},
// cleanImages
cleanSmallImages: {
before: `
<div>
<img width="5" height="5" />
<img width="50" />
</div>
`,
after: `
<div>
<img width="50">
</div>
`,
},
cleanHeight: {
before: `
<div>
<img width="50" height="50" />
</div>
`,
after: `
<div>
<img width="50">
</div>
`,
},
cleanSpacer: {
before: `
<div>
<img src="/foo/bar/baz/spacer.png" />
<img src="/foo/bar/baz/normal.png" />
<p>Some text</p>
</div>
`,
after: `
<div>
<img src="/foo/bar/baz/normal.png">
<p>Some text</p>
</div>
`,
},
// stripJunkTags
stripsJunk: {
before: `
<div>
<style>.red { color: 'red'; }</style>
<title>WOW</title>
<link rel="asdflkjawef" />
<p>What an article</p>
<script type="text/javascript">alert('hi!');</script>
<noscript>Don't got it</noscript>
<hr />
</div>
`,
after: `
<div>
<p>What an article</p>
</div>
`,
},
ignoresKeepable: {
before: `
<div>
<style>.red { color: 'red'; }</style>
<title>WOW</title>
<link rel="asdflkjawef" />
<p>What an article</p>
<iframe class="mercury-parser-keep" src="https://www.youtube.com/embed/_2AqQV8wDvY" frameborder="0" allowfullscreen></iframe>
<hr />
</div>
`,
after: `
<div>
<p>What an article</p>
<iframe class="" src="https://www.youtube.com/embed/_2AqQV8wDvY" frameborder="0" allowfullscreen></iframe>
</div>
`,
},
// markToKeep
marksYouTube: {
before: `
<div>
<p>What an article</p>
<iframe src="https://www.youtube.com/embed/_2AqQV8wDvY" frameborder="0" allowfullscreen></iframe>
<iframe src="foo" frameborder="0" allowfullscreen></iframe>
<iframe src="https://player.vimeo.com/video/57712615"></iframe>
</div>
`,
after: `
<div>
<p>What an article</p>
<iframe src="https://www.youtube.com/embed/_2AqQV8wDvY" frameborder="0" allowfullscreen class="mercury-parser-keep"></iframe>
<iframe src="foo" frameborder="0" allowfullscreen></iframe>
<iframe src="https://player.vimeo.com/video/57712615" class="mercury-parser-keep"></iframe>
</div>
`,
},
// stripHOnes
removeTwoHOnes: {
before: `
<div>
<h1>Look at this!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`,
},
convertThreeHOnes: {
before: `
<div>
<h1>Look at this!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
</div>
`,
after: `
<div>
<h2>Look at this!</h2>
<p>What do you think?</p>
<h2>Can you believe it?!</h2>
<p>What do you think?</p>
<h2>Can you believe it?!</h2>
</div>
`,
},
// cleanAttributes
removeStyle: {
before: `
<div>
<p style="color: red;">What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`,
},
removeAlign: {
before: `
<div>
<p style="color: red;" align="center">What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`,
},
// removeEmpty
removeEmptyP: {
before: `
<div>
<p>What do you think?</p>
<p></p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`,
},
doNotRemoveBr: {
before: `
<div>
<p>What do you think?</p>
<p></p>
<div></div>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<div></div>
<p>What do you think?</p>
</div>
`,
},
doNotNested: {
before: `
<div>
<p>What do you think?</p>
<p><img src="foo/bar.jpg" /></p>
<p><iframe src="foo/bar.jpg" /></p>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p><img src="foo/bar.jpg" /></p>
<p>What do you think?</p>
</div>
`,
},
// cleanConditionally
dropNegativeScore: {
before: `
<div score="5">
<p>What do you think?</p>
<p>
<ul score="-10">
<li>Foo</li>
<li>Bar</li>
</ul>
</p>
<p>What do you think?</p>
</div>
`,
after: `
<div score="5">
<p>What do you think?</p>
<p>
</p>
<p>What do you think?</p>
</div>
`,
afterBrowser: `
<div score="5">
<p>What do you think?</p>
<p>
</p>
<p></p>
<p>What do you think?</p>
</div>
`,
},
removeTooManyInputs: {
before: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<div>
<p>What is your name?</p>
<input type="text"></input>
<p>What is your name?</p>
<input type="text"></input>
<p>What is your name?</p>
<input type="text"></input>
</div>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`,
},
removeShortNoImg: {
before: `
<div>
<p>What do you think?</p>
<div>
<p>Keep this one</p>
<img src="asdf" />
</div>
<div>
<p>Lose this one</p>
</div>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<div>
<p>Keep this one</p>
<img src="asdf">
</div>
</div>
`,
},
linkDensityHigh: {
before: `
<div score="0">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
<ul score="20">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
after: `
<div>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
</div>
`,
},
goodScoreTooDense: {
before: `
<div>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
<ul score="30">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
after: `
<div>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
</div>
`,
},
previousEndsInColon: {
before: `
<div score="40">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<p>Now read these links: </p>
<ul score="30">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
},
cleanEntryContentAsset: {
before: `
<div score="100">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul score="20" class="entry-content-asset">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
},
// normalizeSpaces
normalizeSpaces: {
before: `
<div>
<p>What do you think?</p>
</div>
`,
after: 'What do you think?',
},
// cleanHeaders
cleanFirstHeds: {
before: `
<div>
<h2>Lose me</h2>
<p>What do you think?</p>
<h2>Keep me</h2>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<h2>Keep me</h2>
<p>What do you think?</p>
</div>
`,
},
cleanTitleMatch: {
before: `
<div>
<p>What do you think?</p>
<h2>Title Match</h2>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`,
},
dropWithNegativeWeight: {
before: `
<div>
<p>What do you think?</p>
<h2 class="advert">Bad Class, Bad Weight</h2>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`,
},
};
export default HTML;

@ -1,16 +0,0 @@
// HTML snippets consumed by the nodeIsSufficient tests (read there as
// HTML.tooShort and HTML.longEnough).
const HTML = {
// A single short paragraph; the test expects nodeIsSufficient to return false for it.
tooShort: `
<div class="foo bar">
<p>This is too short</p>
</div>
`,
// A paragraph whose sentence is 100 characters long (not counting the line
// breaks around it); the test expects nodeIsSufficient to return true for it.
// NOTE(review): how surrounding whitespace is counted depends on
// nodeIsSufficient itself, which is not shown here.
longEnough: `
<div class="foo bar">
<p>
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean m
</p>
</div>
`,
};
export default HTML;

@ -1,13 +1,13 @@
import assert from 'assert';
import cheerio from 'cheerio';
import HTML from './fixtures/html';
import { linkDensity } from './index';
describe('linkDensity($)', () => {
it('returns 0.5 if half of the text is a link', () => {
const $ = cheerio.load(HTML.linkDensity5);
const $ = cheerio.load(`
<div><p>Some text!</p><p><a href="">Some text!</a></p> </div>
`);
const density = linkDensity($('div').first(), $);
@ -15,7 +15,9 @@ describe('linkDensity($)', () => {
});
it('returns 1 if all of the text is a link', () => {
const $ = cheerio.load(HTML.linkDensity1);
const $ = cheerio.load(`
<div><p><a href="">Some text!</a></p></div>
`);
const density = linkDensity($('div').first(), $);
@ -23,7 +25,9 @@ describe('linkDensity($)', () => {
});
it("returns 0 if there's no text", () => {
const $ = cheerio.load(HTML.linkDensity0);
const $ = cheerio.load(`
<div><p><a href=""></a></p></div>
`);
const density = linkDensity($('div').first());

@ -1,60 +1,53 @@
import assert from 'assert';
import cheerio from 'cheerio';
import { assertClean } from 'test-helpers';
import makeLinksAbsolute from './make-links-absolute';
describe('makeLinksAbsolute($)', () => {
it('makes relative #hrefs absolute', () => {
const html = '<div><a href="#foo">bar</a></div>';
const $ = cheerio.load(html);
const $ = cheerio.load('<div><a href="#foo">bar</a></div>');
const $content = $('*').first();
const result = $.html(makeLinksAbsolute($content, $, 'http://example.com'));
assert.equal(
result,
'<div><a href="http://example.com/#foo">bar</a></div>'
);
assertClean(result, '<div><a href="http://example.com/#foo">bar</a></div>');
});
it('makes relative ./relative paths absolute', () => {
const html = '<div><a href="foo/bar">bar</a></div>';
const $ = cheerio.load(html);
const $ = cheerio.load('<div><a href="foo/bar">bar</a></div>');
const $content = $('*').first();
const result = $.html(
makeLinksAbsolute($content, $, 'http://example.com/baz/bat')
);
assert.equal(
assertClean(
result,
'<div><a href="http://example.com/baz/foo/bar">bar</a></div>'
);
});
it('makes relative /root/paths absolute', () => {
const html = '<div><a href="/foo/bar">bar</a></div>';
const $ = cheerio.load(html);
const $ = cheerio.load('<div><a href="/foo/bar">bar</a></div>');
const $content = $('*').first();
const result = $.html(
makeLinksAbsolute($content, $, 'http://example.com/baz/bat')
);
assert.equal(
assertClean(
result,
'<div><a href="http://example.com/foo/bar">bar</a></div>'
);
});
it('makes relative srcs absolute', () => {
const html = '<div><img src="#foo"></div>';
const $ = cheerio.load(html);
const $ = cheerio.load('<div><img src="#foo"></div>');
const $content = $('*').first();
const result = $.html(makeLinksAbsolute($content, $, 'http://example.com'));
assert.equal(result, '<div><img src="http://example.com/#foo"></div>');
assertClean(result, '<div><img src="http://example.com/#foo"></div>');
});
describe('makes relative srcsets absolute', () => {
@ -80,50 +73,54 @@ describe('makeLinksAbsolute($)', () => {
* assets/images/rhythm/240.jpg,assets/images/rhythm/240@2x.jpg 2x,
* assets/images/rhythm/240@3x.jpg 3x
*/
const html = `<div>
<picture>
<source srcset="assets/images/rhythm/076.jpg,assets/images/rhythm/076@2x.jpg 2x" media="(max-width: 450px)">
<source srcset="assets/images/rhythm/120@2x.jpg 2x, assets/images/rhythm/120.jpg,assets/images/rhythm/120@3x.jpg 3x" media="(max-width: 900px)">
<source srcset="assets/images/rhythm/240.jpg,assets/images/rhythm/240@2x.jpg 2x,assets/images/rhythm/240@3x.jpg 3x" media="(min-width: 901px)">
<img src="assets/images/rhythm/120.jpg" alt="Vertical and horizontal rhythm">
</picture>
</div>`;
const $ = cheerio.load(html);
const $ = cheerio.load(`
<div>
<picture>
<source srcset="assets/images/rhythm/076.jpg,assets/images/rhythm/076@2x.jpg 2x" media="(max-width: 450px)">
<source srcset="assets/images/rhythm/120@2x.jpg 2x, assets/images/rhythm/120.jpg,assets/images/rhythm/120@3x.jpg 3x" media="(max-width: 900px)">
<source srcset="assets/images/rhythm/240.jpg,assets/images/rhythm/240@2x.jpg 2x,assets/images/rhythm/240@3x.jpg 3x" media="(min-width: 901px)">
<img src="assets/images/rhythm/120.jpg" alt="Vertical and horizontal rhythm">
</picture>
</div>
`);
const $content = $('*').first();
const result = $.html(
makeLinksAbsolute($content, $, 'http://example.com')
);
assert.equal(
assertClean(
result,
`<div>
<picture>
<source srcset="http://example.com/assets/images/rhythm/076.jpg,assets/images/rhythm/076@2x.jpg 2x" media="(max-width: 450px)">
<source srcset="http://example.com/assets/images/rhythm/120@2x.jpg 2x, http://example.com/assets/images/rhythm/120.jpg,assets/images/rhythm/120@3x.jpg 3x" media="(max-width: 900px)">
<source srcset="http://example.com/assets/images/rhythm/240.jpg,assets/images/rhythm/240@2x.jpg 2x, http://example.com/assets/images/rhythm/240@3x.jpg 3x" media="(min-width: 901px)">
<img src="http://example.com/assets/images/rhythm/120.jpg" alt="Vertical and horizontal rhythm">
</picture>
</div>`
`
<div>
<picture>
<source srcset="http://example.com/assets/images/rhythm/076.jpg,assets/images/rhythm/076@2x.jpg 2x" media="(max-width: 450px)">
<source srcset="http://example.com/assets/images/rhythm/120@2x.jpg 2x, http://example.com/assets/images/rhythm/120.jpg,assets/images/rhythm/120@3x.jpg 3x" media="(max-width: 900px)">
<source srcset="http://example.com/assets/images/rhythm/240.jpg,assets/images/rhythm/240@2x.jpg 2x, http://example.com/assets/images/rhythm/240@3x.jpg 3x" media="(min-width: 901px)">
<img src="http://example.com/assets/images/rhythm/120.jpg" alt="Vertical and horizontal rhythm">
</picture>
</div>
`
);
});
it('does nothing when the srcset is empty or just whitespace', () => {
const html = `<div>
<picture>
<source srcset="" media="(max-width: 450px)">
<source srcset=" ">
<img src="http://example.com/assets/images/rhythm/076.jpg" alt="Vertical and horizontal rhythm">
</picture>
</div>`;
const $ = cheerio.load(html);
const $ = cheerio.load(`
<div>
<picture>
<source srcset="" media="(max-width: 450px)">
<source srcset=" ">
<img src="http://example.com/assets/images/rhythm/076.jpg" alt="Vertical and horizontal rhythm">
</picture>
</div>
`);
const $content = $('*').first();
const result = $.html(
makeLinksAbsolute($content, $, 'http://example.com')
);
assert.equal(
assertClean(
result,
`<div>
<picture>
@ -136,21 +133,22 @@ describe('makeLinksAbsolute($)', () => {
});
it('handles comma separated (with whitespace) srcset files with device-pixel-ratio descriptors', () => {
const html = `<div>
<picture>
<source srcset="assets/images/rhythm/076.jpg 2x, assets/images/rhythm/076.jpg" media="(max-width: 450px)">
<source srcset="assets/images/rhythm/076@2x.jpg 2x, assets/images/rhythm/076.jpg">
<img src="http://example.com/assets/images/rhythm/076.jpg" alt="Vertical and horizontal rhythm">
</picture>
</div>`;
const $ = cheerio.load(html);
const $ = cheerio.load(`
<div>
<picture>
<source srcset="assets/images/rhythm/076.jpg 2x, assets/images/rhythm/076.jpg" media="(max-width: 450px)">
<source srcset="assets/images/rhythm/076@2x.jpg 2x, assets/images/rhythm/076.jpg">
<img src="http://example.com/assets/images/rhythm/076.jpg" alt="Vertical and horizontal rhythm">
</picture>
</div>
`);
const $content = $('*').first();
const result = $.html(
makeLinksAbsolute($content, $, 'http://example.com')
);
assert.equal(
assertClean(
result,
`<div>
<picture>
@ -163,88 +161,100 @@ describe('makeLinksAbsolute($)', () => {
});
it('handles comma separated (without whitespace) srcset files with device-pixel-ratio descriptors', () => {
const html = `<div>
<picture>
<source srcset="assets/images/rhythm/076.jpg 2x,assets/images/rhythm/076.jpg" media="(max-width: 450px)">
<source srcset="assets/images/rhythm/076@2x.jpg 2x,assets/images/rhythm/076.jpg">
<img src="http://example.com/assets/images/rhythm/076.jpg" alt="Vertical and horizontal rhythm">
</picture>
</div>`;
const $ = cheerio.load(html);
const $ = cheerio.load(`
<div>
<picture>
<source srcset="assets/images/rhythm/076.jpg 2x,assets/images/rhythm/076.jpg" media="(max-width: 450px)">
<source srcset="assets/images/rhythm/076@2x.jpg 2x,assets/images/rhythm/076.jpg">
<img src="http://example.com/assets/images/rhythm/076.jpg" alt="Vertical and horizontal rhythm">
</picture>
</div>
`);
const $content = $('*').first();
const result = $.html(
makeLinksAbsolute($content, $, 'http://example.com')
);
assert.equal(
assertClean(
result,
`<div>
<picture>
<source srcset="http://example.com/assets/images/rhythm/076.jpg 2x, http://example.com/assets/images/rhythm/076.jpg" media="(max-width: 450px)">
<source srcset="http://example.com/assets/images/rhythm/076@2x.jpg 2x, http://example.com/assets/images/rhythm/076.jpg">
<img src="http://example.com/assets/images/rhythm/076.jpg" alt="Vertical and horizontal rhythm">
</picture>
</div>`
`
<div>
<picture>
<source srcset="http://example.com/assets/images/rhythm/076.jpg 2x, http://example.com/assets/images/rhythm/076.jpg" media="(max-width: 450px)">
<source srcset="http://example.com/assets/images/rhythm/076@2x.jpg 2x, http://example.com/assets/images/rhythm/076.jpg">
<img src="http://example.com/assets/images/rhythm/076.jpg" alt="Vertical and horizontal rhythm">
</picture>
</div>
`
);
});
it('handles comma separated (with whitespace) srcset files with width descriptors', () => {
const html = `<div>
<img srcset="elva-fairy-320w.jpg 320w, elva-fairy-480w.jpg 480w, elva-fairy-800w.jpg 800w" sizes="(max-width: 320px) 280px, (max-width: 480px) 440px, 800px" src="elva-fairy-800w.jpg" alt="Elva dressed as a fairy">
</div>`;
const $ = cheerio.load(html);
const $ = cheerio.load(`
<div>
<img srcset="elva-fairy-320w.jpg 320w, elva-fairy-480w.jpg 480w, elva-fairy-800w.jpg 800w" sizes="(max-width: 320px) 280px, (max-width: 480px) 440px, 800px" src="elva-fairy-800w.jpg" alt="Elva dressed as a fairy">
</div>
`);
const $content = $('*').first();
const result = $.html(
makeLinksAbsolute($content, $, 'http://example.com')
);
assert.equal(
assertClean(
result,
`<div>
<img srcset="http://example.com/elva-fairy-320w.jpg 320w, http://example.com/elva-fairy-480w.jpg 480w, http://example.com/elva-fairy-800w.jpg 800w" sizes="(max-width: 320px) 280px, (max-width: 480px) 440px, 800px" src="http://example.com/elva-fairy-800w.jpg" alt="Elva dressed as a fairy">
</div>`
`
<div>
<img srcset="http://example.com/elva-fairy-320w.jpg 320w, http://example.com/elva-fairy-480w.jpg 480w, http://example.com/elva-fairy-800w.jpg 800w" sizes="(max-width: 320px) 280px, (max-width: 480px) 440px, 800px" src="http://example.com/elva-fairy-800w.jpg" alt="Elva dressed as a fairy">
</div>
`
);
});
it('handles multiline comma separated srcset files with width descriptors', () => {
const html = `<div>
<img srcset="elva-fairy-320w.jpg 320w,
elva-fairy-480w.jpg 480w,
elva-fairy-800w.jpg 800w" sizes="(max-width: 320px) 280px, (max-width: 480px) 440px, 800px" src="elva-fairy-800w.jpg" alt="Elva dressed as a fairy">
</div>`;
const $ = cheerio.load(html);
const $ = cheerio.load(`
<div>
<img srcset="elva-fairy-320w.jpg 320w,
elva-fairy-480w.jpg 480w,
elva-fairy-800w.jpg 800w" sizes="(max-width: 320px) 280px, (max-width: 480px) 440px, 800px" src="elva-fairy-800w.jpg" alt="Elva dressed as a fairy">
</div>
`);
const $content = $('*').first();
const result = $.html(
makeLinksAbsolute($content, $, 'http://example.com')
);
assert.equal(
assertClean(
result,
`<div>
<img srcset="http://example.com/elva-fairy-320w.jpg 320w, http://example.com/elva-fairy-480w.jpg 480w, http://example.com/elva-fairy-800w.jpg 800w" sizes="(max-width: 320px) 280px, (max-width: 480px) 440px, 800px" src="http://example.com/elva-fairy-800w.jpg" alt="Elva dressed as a fairy">
</div>`
`
<div>
<img srcset="http://example.com/elva-fairy-320w.jpg 320w, http://example.com/elva-fairy-480w.jpg 480w, http://example.com/elva-fairy-800w.jpg 800w" sizes="(max-width: 320px) 280px, (max-width: 480px) 440px, 800px" src="http://example.com/elva-fairy-800w.jpg" alt="Elva dressed as a fairy">
</div>
`
);
});
it('handles URLs that contain a comma', () => {
const html = `<div>
<picture><source media="(min-width: 768px)" srcset="cartoons/5bbfca021e40b62d6cc418ea/master/w_280,c_limit/181022_a22232.jpg, cartoons/5bbfca021e40b62d6cc418ea/master/w_560,c_limit/181022_a22232.jpg 2x"/><source srcset="cartoons/5bbfca021e40b62d6cc418ea/master/w_727,c_limit/181022_a22232.jpg, cartoons/5bbfca021e40b62d6cc418ea/master/w_1454,c_limit/181022_a22232.jpg 2x"/><img src="cartoons/5bbfca021e40b62d6cc418ea/master/w_727,c_limit/181022_a22232.jpg" /></picture>
</div>`;
const $ = cheerio.load(html);
const $ = cheerio.load(`
<div>
<picture><source media="(min-width: 768px)" srcset="cartoons/5bbfca021e40b62d6cc418ea/master/w_280,c_limit/181022_a22232.jpg, cartoons/5bbfca021e40b62d6cc418ea/master/w_560,c_limit/181022_a22232.jpg 2x"/><source srcset="cartoons/5bbfca021e40b62d6cc418ea/master/w_727,c_limit/181022_a22232.jpg, cartoons/5bbfca021e40b62d6cc418ea/master/w_1454,c_limit/181022_a22232.jpg 2x"/><img src="cartoons/5bbfca021e40b62d6cc418ea/master/w_727,c_limit/181022_a22232.jpg" /></picture>
</div>
`);
const $content = $('*').first();
const result = $.html(
makeLinksAbsolute($content, $, 'https://media.newyorker.com/')
);
assert.equal(
assertClean(
result,
`<div>
<picture><source media="(min-width: 768px)" srcset="https://media.newyorker.com/cartoons/5bbfca021e40b62d6cc418ea/master/w_280,c_limit/181022_a22232.jpg, https://media.newyorker.com/cartoons/5bbfca021e40b62d6cc418ea/master/w_560,c_limit/181022_a22232.jpg 2x"><source srcset="https://media.newyorker.com/cartoons/5bbfca021e40b62d6cc418ea/master/w_727,c_limit/181022_a22232.jpg, https://media.newyorker.com/cartoons/5bbfca021e40b62d6cc418ea/master/w_1454,c_limit/181022_a22232.jpg 2x"><img src="https://media.newyorker.com/cartoons/5bbfca021e40b62d6cc418ea/master/w_727,c_limit/181022_a22232.jpg"></picture>
</div>`
`
<div>
<picture><source media="(min-width: 768px)" srcset="https://media.newyorker.com/cartoons/5bbfca021e40b62d6cc418ea/master/w_280,c_limit/181022_a22232.jpg, https://media.newyorker.com/cartoons/5bbfca021e40b62d6cc418ea/master/w_560,c_limit/181022_a22232.jpg 2x"><source srcset="https://media.newyorker.com/cartoons/5bbfca021e40b62d6cc418ea/master/w_727,c_limit/181022_a22232.jpg, https://media.newyorker.com/cartoons/5bbfca021e40b62d6cc418ea/master/w_1454,c_limit/181022_a22232.jpg 2x"><img src="https://media.newyorker.com/cartoons/5bbfca021e40b62d6cc418ea/master/w_727,c_limit/181022_a22232.jpg"></picture>
</div>
`
);
});
});

@ -3,28 +3,43 @@ import assert from 'assert';
import { assertClean } from 'test-helpers';
import HTML from './fixtures/html';
import { markToKeep } from './index';
import { KEEP_CLASS } from './constants';
describe('markToKeep($)', () => {
it('marks elements that should be kept', () => {
const $ = cheerio.load(HTML.marksYouTube.before);
const $ = cheerio.load(`
<div>
<p>What an article</p>
<iframe src="https://www.youtube.com/embed/_2AqQV8wDvY" frameborder="0" allowfullscreen></iframe>
<iframe src="foo" frameborder="0" allowfullscreen></iframe>
<iframe src="https://player.vimeo.com/video/57712615"></iframe>
</div>
`);
const result = markToKeep($('*').first(), $);
assert.equal(result('iframe.mercury-parser-keep').length, 2);
if (!$.browser) {
assertClean(result.html(), HTML.marksYouTube.after);
assertClean(
result.html(),
`
<div>
<p>What an article</p>
<iframe src="https://www.youtube.com/embed/_2AqQV8wDvY" frameborder="0" allowfullscreen class="mercury-parser-keep"></iframe>
<iframe src="foo" frameborder="0" allowfullscreen></iframe>
<iframe src="https://player.vimeo.com/video/57712615" class="mercury-parser-keep"></iframe>
</div>
`
);
}
});
it('marks same-domain elements to keep', () => {
const html =
'<div><iframe src="https://medium.com/foo/bar"></iframe></div>';
const $ = cheerio.load(html);
const $ = cheerio.load(
'<div><iframe src="https://medium.com/foo/bar"></iframe></div>'
);
const result = markToKeep($('*').first(), $, 'https://medium.com/foo');
const keptHtml = `<div><iframe src="https://medium.com/foo/bar" class="${KEEP_CLASS}"></iframe></div>`;

@ -1,21 +1,30 @@
import assert from 'assert';
import cheerio from 'cheerio';
import HTML from './fixtures/node-is-sufficient';
import nodeIsSufficient from './node-is-sufficient';
describe('Utils', () => {
describe('nodeIsSufficient(node)', () => {
it('returns false if node text length < 100 chars', () => {
const $ = cheerio.load(HTML.tooShort);
const sufficient = nodeIsSufficient($.root());
assert.equal(sufficient, false);
const $ = cheerio.load(`
<div class="foo bar">
<p>This is too short</p>
</div>
`);
assert.equal(nodeIsSufficient($.root()), false);
});
it('returns true if node text length > 100 chars', () => {
const $ = cheerio.load(HTML.longEnough);
const sufficient = nodeIsSufficient($.root());
assert.equal(sufficient, true);
const $ = cheerio.load(`
<div class="foo bar">
<p>
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean m
</p>
</div>
`);
assert.equal(nodeIsSufficient($.root()), true);
});
});
});

@ -2,23 +2,46 @@ import assert from 'assert';
import cheerio from 'cheerio';
import { clean } from 'test-helpers';
import HTML from './fixtures/html';
import { paragraphize } from './index';
describe('Generic Extractor Utils', () => {
describe('paragraphize(node)', () => {
it('conversts a BR into P and moves inline contents to P tag after current parent', () => {
const $ = cheerio.load(HTML.paragraphize.before);
const $ = cheerio.load(`
<p>
Here is some text
<br />
Here is more text
<span>And also this</span>
</p>
`);
const node = $('br').get(0);
// note: result here is not valid html; will handle elsewhere
const result = paragraphize(node, $, true).html();
assert.equal(clean(result), clean(HTML.paragraphize.after));
assert.equal(
clean(result),
clean(`
<p>
Here is some text
<p>
Here is more text
<span>And also this</span>
</p></p>
`)
);
});
it('conversts a BR into P and stops when block element hit', () => {
const $ = cheerio.load(HTML.paragraphizeBlock.before);
it('converts a BR into P and stops when block element hit', () => {
const $ = cheerio.load(`
<p>
Here is some text
<br />
Here is more text
<div>And also this</div>
</p>
`);
const node = $('br').get(0);
// note: result here is not valid html; will handle elsewhere
@ -30,7 +53,17 @@ describe('Generic Extractor Utils', () => {
'<p> Here is some text <p> Here is more text </p></p><div>And also this</div> <p></p>';
assert.equal(clean(result), html);
} else {
assert.equal(clean(result), clean(HTML.paragraphizeBlock.after));
assert.equal(
clean(result),
clean(`
<p>
Here is some text
<p>
Here is more text
</p><div>And also this</div>
</p>
`)
);
}
});
});

@ -2,15 +2,26 @@ import cheerio from 'cheerio';
import { assertClean } from 'test-helpers';
import HTML from './fixtures/html';
import { removeEmpty } from './index';
describe('removeEmpty($)', () => {
it('removes empty P tags', () => {
const $ = cheerio.load(HTML.removeEmptyP.before);
const $ = cheerio.load(`
<div>
<p>What do you think?</p>
<p></p>
</div>
`);
const result = removeEmpty($('*').first(), $);
assertClean(result.html(), HTML.removeEmptyP.after);
assertClean(
result.html(),
`
<div>
<p>What do you think?</p>
</div>
`
);
});
it('removes P tags with only space', () => {
@ -22,10 +33,22 @@ describe('removeEmpty($)', () => {
});
it('does not remove empty DIV tags', () => {
const $ = cheerio.load(HTML.removeEmptyP.before);
const $ = cheerio.load(`
<div>
<p>What do you think?</p>
<p></p>
</div>
`);
const result = removeEmpty($('*').first(), $);
assertClean(result.html(), HTML.removeEmptyP.after);
assertClean(
result.html(),
`
<div>
<p>What do you think?</p>
</div>
`
);
});
it('does not remove empty p tags containing an iframe', () => {

@ -3,19 +3,25 @@ import assert from 'assert';
import { assertClean } from 'test-helpers';
import HTML from './fixtures/html';
import rewriteTopLevel from './rewrite-top-level';
describe('rewriteTopLevel(node, $)', () => {
it('turns html and body tags into divs', () => {
const $ = cheerio.load(HTML.rewriteHTMLBody.before);
const $ = cheerio.load(`
<html><body><div><p><a href="">Wow how about that</a></p></div></body></html>
`);
const result = rewriteTopLevel($('html').first(), $);
assert.equal(result('html').length, 0);
assert.equal(result('body').length, 0);
if (!cheerio.browser) {
assertClean(result.html(), HTML.rewriteHTMLBody.after);
assertClean(
result.html(),
`
<div><div><div><p><a href="">Wow how about that</a></p></div></div></div>
`
);
}
});
});

@ -3,19 +3,44 @@ import assert from 'assert';
import { assertClean } from 'test-helpers';
import HTML from './fixtures/html';
import { stripJunkTags } from './index';
describe('stripJunkTags($)', () => {
it('strips script and other junk tags', () => {
const $ = cheerio.load(HTML.stripsJunk.before);
const $ = cheerio.load(`
<div>
<style>.red { color: 'red'; }</style>
<title>WOW</title>
<link rel="asdflkjawef" />
<p>What an article</p>
<script type="text/javascript">alert('hi!');</script>
<noscript>Don't got it</noscript>
<hr />
</div>
`);
const result = stripJunkTags($('*').first(), $);
assertClean(result.html(), HTML.stripsJunk.after);
assertClean(
result.html(),
`
<div>
<p>What an article</p>
</div>
`
);
});
it('keeps youtube embeds', () => {
let $ = cheerio.load(HTML.ignoresKeepable.before);
let $ = cheerio.load(`
<div>
<style>.red { color: 'red'; }</style>
<title>WOW</title>
<link rel="asdflkjawef" />
<p>What an article</p>
<iframe class="mercury-parser-keep" src="https://www.youtube.com/embed/_2AqQV8wDvY" frameborder="0" allowfullscreen></iframe>
<hr />
</div>
`);
$ = stripJunkTags($('*').first(), $);
assert.equal($('iframe[src^="https://www.youtube.com"]').length, 1);

@ -2,32 +2,71 @@ import assert from 'assert';
import cheerio from 'cheerio';
import { assertClean } from 'test-helpers';
import HTML from './fixtures/html';
import stripUnlikelyCandidates from './strip-unlikely-candidates';
// Parses the `before` markup of the fixture named `key`, passes the parsed
// document to `fn`, and compares the HTML of what `fn` returns with the
// fixture's `after` markup via assertClean.
function assertBeforeAndAfter(key, fn) {
const fixture = HTML[key];
const transformed = fn(cheerio.load(fixture.before));
assertClean(transformed.html(), fixture.after);
}
// Tests for stripUnlikelyCandidates. Each test defines its own markup inline
// instead of reading it from the shared `HTML` fixture object; the previous
// fixture-based statements were removed because they duplicated the inline
// assertions and redeclared `const stripped` in the same scope (a SyntaxError).
// Template bodies are flush-left so no code indentation leaks into the strings.
describe('Generic Extractor Utils', () => {
  describe('stripUnlikelyCandidates(node)', () => {
    it('returns original doc if no matches found', () => {
      const html = `
<div id="foo">
<p>Ooo good one</p>
</div>
`;
      const stripped = stripUnlikelyCandidates(cheerio.load(html));
      // Compared with assert.equal (not assertClean): the output is expected
      // to equal the input string as-is.
      assert.equal(stripped.html(), html);
    });

    it('strips unlikely matches from the doc', () => {
      const before = `
<div class="header">Stuff</div>
<div class="article">
<p>Ooo good one</p>
</div>
`;
      const after = `
<div class="article">
<p>Ooo good one</p>
</div>
`;
      assertClean(stripUnlikelyCandidates(cheerio.load(before)).html(), after);
    });

    it('keeps likely matches even when they also match the blacklist', () => {
      const before = `
<div class="article adbox">
<p>Ooo good one</p>
</div>
`;
      // Expected output is the same markup as the input.
      const after = `
<div class="article adbox">
<p>Ooo good one</p>
</div>
`;
      assertClean(stripUnlikelyCandidates(cheerio.load(before)).html(), after);
    });

    it('removed likely matches when inside blacklist node', () => {
      const before = `
<div>
<div class="adbox">
<div class="article">
<p>Ooo good one</p>
</div>
</div>
<div>Something unrelated</div>
</div>
`;
      const after = `
<div>
<div>Something unrelated</div>
</div>
`;
      assertClean(stripUnlikelyCandidates(cheerio.load(before)).html(), after);
    });
  });
});

@ -5,29 +5,35 @@ import withinComment from './within-comment';
// Tests for withinComment. Each case loads a small document and checks the
// result for the `.author` element. Every test previously declared `const $`
// twice in the same scope (a SyntaxError); only the multi-line template form
// is kept. Template bodies are flush-left so no code indentation leaks into
// the markup strings.
describe('withinComment(node)', () => {
  it('returns false if its parent is not a comment', () => {
    const $ = cheerio.load(`
<div>
<div>
<div class="author">Adam</div>
</div>
</div>
`);
    assert.equal(withinComment($('.author').first()), false);
  });

  it('returns true if its parent has a class of comment', () => {
    // The outermost ancestor carries class="comments".
    const $ = cheerio.load(`
<div class="comments">
<div>
<div class="author">Adam</div>
</div>
</div>
`);
    assert.equal(withinComment($('.author').first()), true);
  });

  it('returns true if its parent has an id of comment', () => {
    // The outermost ancestor carries id="comment".
    const $ = cheerio.load(`
<div id="comment">
<div>
<div class="author">Adam</div>
</div>
</div>
`);
    assert.equal(withinComment($('.author').first()), true);
  });
});

@ -1,5 +1,4 @@
import assert from 'assert';
import mergeSupportedDomains from './merge-supported-domains';
describe('mergeSupportedDomains(extractor, domains)', () => {
@ -8,6 +7,7 @@ describe('mergeSupportedDomains(extractor, domains)', () => {
domain: 'foo.com',
supportedDomains: ['example.com'],
};
const expected = {
'foo.com': extractor,
'example.com': extractor,
@ -21,6 +21,7 @@ describe('mergeSupportedDomains(extractor, domains)', () => {
const extractor = {
domain: 'foo.com',
};
const expected = {
'foo.com': extractor,
};

@ -1,674 +0,0 @@
// Shared HTML fixture strings for the DOM utility tests.
//
// Each entry is either a single markup string or an object with a `before`
// string (input markup) and, usually, an `after` string (the markup or text the
// tests compare the result against). The section comments name the utility each
// group of fixtures is labelled for.
const HTML = {
// getWeight fixtures
positiveId: `
<div id="entry">
<p>Ooo good one</p>
</div>
`,
negativeId: `
<div id="adbox">
<p>Ooo good one</p>
</div>
`,
positiveClass: `
<div class="entry">
<p>Ooo good one</p>
</div>
`,
// NOTE(review): despite the name, this markup sets `id`, not `class` — confirm
// that is intended before relying on it as a class-based case.
negativeClass: `
<div id="comment ad">
<p>Ooo good one</p>
</div>
`,
positiveIdAndClass: `
<div id="article" class="entry">
<p>Ooo good one</p>
</div>
`,
positiveIdNegClass: `
<div id="article" class="adbox">
<p>Ooo good one</p>
</div>
`,
positivePhotoClass: `
<div class="figure">
<p>Ooo good one</p>
</div>
`,
positiveIdAndPhoto: `
<div id="article" class="figure">
<p>Ooo good one</p>
</div>
`,
entryContentAsset: `
<div id="foo" class="entry-content-asset">
<p>Ooo good one</p>
</div>
`,
// stripUnlikelyCandidates
noMatches: `
<div id="foo">
<p>Ooo good one</p>
</div>
`,
whitelistMatch: {
before: `
<div class="header">Stuff</div>
<div class="article">
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article">
<p>Ooo good one</p>
</div>
`,
},
// `before` and `after` are identical here: the markup is expected to be kept.
whiteAndBlack: {
before: `
<div class="article adbox">
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<p>Ooo good one</p>
</div>
`,
},
whiteInsideBlack: {
before: `
<div>
<div class="adbox">
<div class="article">
<p>Ooo good one</p>
</div>
</div>
<div>Something unrelated</div>
</div>
`,
after: `
<div>
<div>Something unrelated</div>
</div>
`,
},
// brsToPs
// A lone <br> is expected to survive unchanged (`before` equals `after`).
singleBr: {
before: `
<div class="article adbox">
<br>
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<br>
<p>Ooo good one</p>
</div>
`,
},
doubleBrs: {
before: `
<div class="article adbox">
<br />
<br />
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<p> </p><p>Ooo good one</p>
</div>
`,
},
// Five consecutive <br>s have the same expected output as two (see doubleBrs).
severalBrs: {
before: `
<div class="article adbox">
<br />
<br />
<br />
<br />
<br />
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<p> </p><p>Ooo good one</p>
</div>
`,
},
brsInP: {
before: `
<p>
Here is some text
<br />
<br />
Here is more text
</p>
`,
after: `
<p>
Here is some text
<p>
Here is more text
</p></p>
`,
},
paragraphize: {
before: `
<p>
Here is some text
<br />
Here is more text
<span>And also this</span>
</p>
`,
after: `
<p>
Here is some text
<p>
Here is more text
<span>And also this</span>
</p></p>
`,
},
// Same as `paragraphize` but the trailing element is a block-level <div>,
// which ends up outside the generated <p> in the expected output.
paragraphizeBlock: {
before: `
<p>
Here is some text
<br />
Here is more text
<div>And also this</div>
</p>
`,
after: `
<p>
Here is some text
<p>
Here is more text
</p><div>And also this</div>
</p>
`,
},
// convertToParagraphs
convertToParagraphs: {
before: `
<p>
Here is some text
<span>This should remain in a p</span>
<br />
<br />
This should be wrapped in a p
<div>This should become a p</div>
</p>
<span>This should become a p</span>
`,
after: `
<p>
Here is some text
<span>This should remain in a p</span>
<p>
This should be wrapped in a p
</p><p>This should become a p</p>
</p> <p>This should become a p</p>
`,
},
// linkDensity
// NOTE(review): the numeric suffixes presumably encode the expected density
// (5 -> 0.5, 1 -> 1, 0 -> 0) — confirm against the linkDensity tests.
linkDensity5: `
<div><p>Some text!</p><p><a href="">Some text!</a></p> </div>
`,
linkDensity1: `
<div><p><a href="">Some text!</a></p></div>
`,
linkDensity0: `
<div><p><a href=""></a></p></div>
`,
// rewriteTopLevel
rewriteHTMLBody: {
before: `
<html><body><div><p><a href="">Wow how about that</a></p></div></body></html>
`,
after: `
<div><div><div><p><a href="">Wow how about that</a></p></div></div></div>
`,
},
// cleanImages
cleanSmallImages: {
before: `
<div>
<img width="5" height="5" />
<img width="50" />
</div>
`,
after: `
<div>
<img width="50">
</div>
`,
},
cleanHeight: {
before: `
<div>
<img width="50" height="50" />
</div>
`,
after: `
<div>
<img width="50">
</div>
`,
},
cleanSpacer: {
before: `
<div>
<img src="/foo/bar/baz/spacer.png" />
<img src="/foo/bar/baz/normal.png" />
<p>Some text</p>
</div>
`,
after: `
<div>
<img src="/foo/bar/baz/normal.png">
<p>Some text</p>
</div>
`,
},
// stripJunkTags
stripsJunk: {
before: `
<div>
<style>.red { color: 'red'; }</style>
<title>WOW</title>
<link rel="asdflkjawef" />
<p>What an article</p>
<script type="text/javascript">alert('hi!');</script>
<noscript>Don't got it</noscript>
<hr />
</div>
`,
after: `
<div>
<p>What an article</p>
</div>
`,
},
// stripHOnes
removeTwoHOnes: {
before: `
<div>
<h1>Look at this!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`,
},
convertThreeHOnes: {
before: `
<div>
<h1>Look at this!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
</div>
`,
after: `
<div>
<h2>Look at this!</h2>
<p>What do you think?</p>
<h2>Can you believe it?!</h2>
<p>What do you think?</p>
<h2>Can you believe it?!</h2>
</div>
`,
},
// cleanAttributes
removeStyle: {
before: `
<div>
<p style="color: red;">What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`,
},
removeAlign: {
before: `
<div>
<p style="color: red;" align="center">What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`,
},
// removeEmpty
removeEmptyP: {
before: `
<div>
<p>What do you think?</p>
<p></p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`,
},
// NOTE(review): the name mentions <br>, but the markup contains none; the
// element that survives in `after` is an empty <div>. Confirm the intent.
doNotRemoveBr: {
before: `
<div>
<p>What do you think?</p>
<p></p>
<div></div>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<div></div>
<p>What do you think?</p>
</div>
`,
},
// NOTE(review): `after` keeps the <p> wrapping an <img> but omits the <p>
// wrapping the <iframe> — verify against the test that consumes this fixture.
doNotNested: {
before: `
<div>
<p>What do you think?</p>
<p><img src="foo/bar.jpg" /></p>
<p><iframe src="foo/bar.jpg" /></p>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p><img src="foo/bar.jpg" /></p>
<p>What do you think?</p>
</div>
`,
},
// cleanConditionally
dropNegativeScore: {
before: `
<div>
<p>What do you think?</p>
<p>
<ul score="-10">
<li>Foo</li>
<li>Bar</li>
</ul>
</p>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>
</p>
<p>What do you think?</p>
</div>
`,
},
removeTooManyInputs: {
before: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<div>
<p>What is your name?</p>
<input type="text"></input>
<p>What is your name?</p>
<input type="text"></input>
<p>What is your name?</p>
<input type="text"></input>
</div>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`,
},
removeShortNoImg: {
before: `
<div>
<p>What do you think?</p>
<div>
<p>Keep this one</p>
<img src="asdf" />
</div>
<div>
<p>Lose this one</p>
</div>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<div>
<p>Keep this one</p>
<img src="asdf">
</div>
</div>
`,
},
linkDensityHigh: {
before: `
<div score="0">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
<ul score="20">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
after: `
<div>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
</div>
`,
},
goodScoreTooDense: {
before: `
<div>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
<ul score="30">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
after: `
<div>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
</div>
`,
},
// Only `before` is defined for this entry; there is no `after`.
// NOTE(review): the root uses a `weight` attribute where neighbouring fixtures
// use `score` — confirm which attribute the consuming test relies on.
previousEndsInColon: {
before: `
<div weight="40">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<p>Now read these links: </p>
<ul score="30">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
},
// Only `before` is defined for this entry; there is no `after`.
cleanEntryContentAsset: {
before: `
<div score="100">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul score="20" class="entry-content-asset">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
},
// normalizeSpaces
// Here `after` is plain text rather than markup.
normalizeSpaces: {
before: `
<div>
<p>What do you think?</p>
</div>
`,
after: 'What do you think?',
},
normalizeSpacesPreserve: {
before: `
<div>
<p>What do you think?</p>
<pre> What happens to spaces? </pre>
</div>
`,
after:
'<div> <p>What do you think?</p> <pre> What happens to spaces? </pre> </div>',
},
// cleanHeaders
cleanFirstHeds: {
before: `
<div>
<h2>Lose me</h2>
<p>What do you think?</p>
<h2>Keep me</h2>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<h2>Keep me</h2>
<p>What do you think?</p>
</div>
`,
},
cleanTitleMatch: {
before: `
<div>
<p>What do you think?</p>
<h2>Title Match</h2>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`,
},
dropWithNegativeWeight: {
before: `
<div>
<p>What do you think?</p>
<h2 class="advert">Bad Class, Bad Weight</h2>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`,
},
};
export default HTML;

@ -1,26 +1,36 @@
import assert from 'assert';
import cheerio from 'cheerio';
import HTML from './fixtures/html';
import { normalizeSpaces } from './index';
// Tests for normalizeSpaces. Input markup and expected strings are defined
// inline. The earlier fixture-based statements were dropped: they redeclared
// `const $` in the same scope (a SyntaxError) and repeated each assertion
// against the shared `HTML` fixture object. Template bodies are flush-left so
// no code indentation leaks into the markup strings.
describe('normalizeSpaces(text)', () => {
  it('normalizes spaces from text', () => {
    const $ = cheerio.load(`
<div>
<p>What do you think?</p>
</div>
`);

    // Only the text content of the first matched element is normalized here.
    const result = normalizeSpaces(
      $('*')
        .first()
        .text()
    );

    assert.equal(result, 'What do you think?');
  });

  it('preserves spaces in preformatted text blocks', () => {
    const $ = cheerio.load(`
<div>
<p>What do you think?</p>
<pre> What happens to spaces? </pre>
</div>
`);

    // This case feeds the serialized HTML (not just the text) through
    // normalizeSpaces, so the expected value still contains the tags.
    const result = normalizeSpaces($.html());

    assert.equal(
      result,
      '<div> <p>What do you think?</p> <pre> What happens to spaces? </pre> </div>'
    );
  });
});

@ -4,42 +4,20 @@ import pageNumFromUrl from './page-num-from-url';
// Tests for pageNumFromUrl. Each distinct URL is asserted exactly once via a
// [url, expected] table; the previous body checked most URLs twice (once
// through a throwaway `urlN` constant and once inline).
describe('pageNumFromUrl(url)', () => {
  // Runs pageNumFromUrl on every [url, expected] pair, in order, stopping at
  // the first mismatch. The failure message names the URL because a shared
  // loop no longer identifies the failing case by line number.
  const assertPageNums = cases => {
    cases.forEach(([url, expected]) => {
      const actual = pageNumFromUrl(url);
      assert.equal(
        actual,
        expected,
        `pageNumFromUrl('${url}') returned ${actual}, expected ${expected}`
      );
    });
  };

  it('returns null if there is no page num in the url', () => {
    assertPageNums([
      ['http://example.com', null],
      ['http://example.com/?pg=102', null],
      ['http://example.com/?page:102', null],
    ]);
  });

  it('returns a page num if one matches the url', () => {
    assertPageNums([
      ['http://example.com/foo?page=1', 1],
      ['http://example.com/foo?pg=1', 1],
      ['http://example.com/foo?p=1', 1],
      ['http://example.com/foo?paging=1', 1],
      ['http://example.com/foo?pag=1', 1],
      ['http://example.com/foo?pagination/1', 1],
      // Both two-digit variants that appeared in the original body are kept.
      ['http://example.com/foo?paging/88', 88],
      ['http://example.com/foo?pa/88', 88],
      ['http://example.com/foo?p/88', 88],
      ['http://example.com/foo?paging/99', 99],
      ['http://example.com/foo?pa/99', 99],
      ['http://example.com/foo?p/99', 99],
    ]);
  });
});

Loading…
Cancel
Save