chore: cleanup

Branch: pull/3/head
Author: Adam Pash (8 years ago)
parent b3481a2c45
commit 76df30e303

@@ -1,10 +1,10 @@
 TODO:
 - Complete response:
-- add excerpt
 - add word count
 - Test if .is method is faster than regex methods
 DONE:
+x add excerpt
 x add total pages
 x add rendered pages
 x add canonicalUrl

@@ -1,8 +1,8 @@
 {
-  "name": "js_read",
+  "name": "node-readability",
   "version": "1.0.0",
   "description": "",
-  "main": "index.js",
+  "main": "./dist/iris.js",
   "scripts": {
     "start": "node ./build",
     "lint": "eslint src/** --fix",

@@ -41,6 +41,7 @@
     "babel-polyfill": "^6.13.0",
     "cheerio": "^0.20.0",
     "difflib": "^0.2.4",
+    "ellipsize": "0.0.2",
     "moment": "^2.14.1",
     "request": "^2.74.0",
     "request-promise": "^4.1.1",

@@ -9,6 +9,6 @@ export default {
     babel(babelrc()),
   ],
   format: 'cjs',
-  dest: 'dist/bundle.js', // equivalent to --output
+  dest: 'dist/iris.js', // equivalent to --output
   sourceMap: true,
 }
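With this rename, Rollup's dest agrees with the main entry in package.json above (./dist/iris.js), so the published entry point and the emitted bundle are the same file.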

@@ -3,8 +3,6 @@ import { removeAnchor } from 'utils/text';
-import RootExtractor from 'extractors/root-extractor';
-import Resource from 'resource';
 import Iris from '../iris';

 export default async function collectAllPages(
   {
     nextPageUrl,

@@ -52,7 +50,6 @@ export default async function collectAllPages(
     };
     nextPageUrl = nextPageResult.nextPageUrl;
   }

   return {

@@ -1,21 +1,18 @@
-import assert from 'assert'
-import fs from 'fs'
+import assert from 'assert';
+import fs from 'fs';
-import RootExtractor from 'extractors/root-extractor'
-import Resource from 'extractors/root-extractor'
-import Iris from 'iris'
-import NYMagExtractor from 'extractors/custom/nymag.com'
+import Iris from 'iris';

 describe('NYMagExtractor', () => {
-  it('works with a feature story', async () => {
-    const html = fs.readFileSync('./fixtures/nymag.com/ailes.html')
-    const uri = 'http://nymag.com/daily/intelligencer/2016/09/how-fox-news-women-took-down-roger-ailes.html'
+  it('works with a feature story', async () => {
+    const html = fs.readFileSync('./fixtures/nymag.com/ailes.html');
+    const uri = 'http://nymag.com/daily/intelligencer/2016/09/how-fox-news-women-took-down-roger-ailes.html';

-    const { dek, title, author } = await Iris.parse(uri, html)
-    const actualDek = 'How Fox News women took down the most powerful, and predatory, man in media.'
+    const { dek, title, author } = await Iris.parse(uri, html);
+    const actualDek = 'How Fox News women took down the most powerful, and predatory, man in media.';

-    assert.equal(dek, actualDek)
-    assert.equal(title, 'The Revenge of Rogers Angels')
-    assert.equal(author, 'Gabriel Sherman')
-  })
-})
+    assert.equal(dek, actualDek);
+    assert.equal(title, 'The Revenge of Rogers Angels');
+    assert.equal(author, 'Gabriel Sherman');
+  });
+});
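Dropping the now-unused RootExtractor, Resource, and NYMagExtractor imports means the test drives the nymag.com extractor end-to-end through Iris.parse rather than reaching into extractor internals.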

@@ -1,4 +1,4 @@
-import ellipsize from 'ellipsize'
+import ellipsize from 'ellipsize';

 import {
   extractFromMeta,

@@ -7,9 +7,9 @@ import {
 import { EXCERPT_META_SELECTORS } from './constants';

-export function clean(content, $, maxLength=200) {
-  content = content.replace(/[\s\n]+/g, ' ').trim()
-  return ellipsize(content, 200, { ellipse: '…' })
+export function clean(content, $, maxLength = 200) {
+  content = content.replace(/[\s\n]+/g, ' ').trim();
+  return ellipsize(content, maxLength, { ellipse: '…' });
 }

 const GenericExcerptExtractor = {

@@ -19,10 +19,10 @@ const GenericExcerptExtractor = {
       return clean(stripTags(excerpt, $));
     }
     // Fall back to excerpting from the extracted content
-    const maxLength = 200
-    const shortContent = content.slice(0, maxLength * 5)
-    return clean($(shortContent).text(), $, maxLength)
-  }
-}
+    const maxLength = 200;
+    const shortContent = content.slice(0, maxLength * 5);
+    return clean($(shortContent).text(), $, maxLength);
+  },
+};

-export default GenericExcerptExtractor
+export default GenericExcerptExtractor;
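The substantive fix in clean() above is that maxLength was previously ignored in favor of a hardcoded 200. A quick sketch of the corrected behavior, assuming clean is imported from this module; the input is illustrative:

import { clean } from './extractor';

// The $ argument is unused inside clean() itself, so null is safe here.
const short = clean('word '.repeat(100), null, 50);
// short is now capped near 50 characters plus the trailing '…',
// where before the fix it would have run to 200.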

@@ -1,15 +1,15 @@
-import assert from 'assert'
-import cheerio from 'cheerio'
+import assert from 'assert';
+import cheerio from 'cheerio';

 import {
   default as GenericExcerptExtractor,
   clean,
-} from './extractor'
+} from './extractor';

 describe('GenericExcerptExtractor', () => {
   describe('extract({ $, content, metaCache })', () => {
     it('returns og:description', () => {
-      const actualExcerpt = "Wow this is going to be something good."
+      const actualExcerpt = 'Wow this is going to be something good.';
       const html = `
         <html>
           <head>
@@ -23,10 +23,10 @@ describe('GenericExcerptExtractor', () => {
       const excerpt = GenericExcerptExtractor.extract({ $, content: '', metaCache });

       assert.equal(excerpt, actualExcerpt);
-    })
+    });

     it('returns twitter:description', () => {
-      const actualExcerpt = "Wow this is going to be something good."
+      const actualExcerpt = 'Wow this is going to be something good.';
       const html = `
         <html>
           <head>

@@ -40,7 +40,7 @@ describe('GenericExcerptExtractor', () => {
       const excerpt = GenericExcerptExtractor.extract({ $, content: '', metaCache });

       assert.equal(excerpt, actualExcerpt);
-    })
+    });

     it('falls back to the content', () => {
       const html = `
@@ -50,16 +50,15 @@
         </html>
       `;
       const $ = cheerio.load(html);
-      const content = "<div><p>Wow <b>this</b> is going to be something good.</p></div>"
+      const content = '<div><p>Wow <b>this</b> is going to be something good.</p></div>';
       const metaCache = [];
       const excerpt = GenericExcerptExtractor.extract({ $, content, metaCache });

       assert.equal(excerpt, 'Wow this is going to be something good.');
-    })
-  })
-})
+    });
+  });
+});

 describe('clean(text)', () => {
   it('truncates text longer than 200 chars and trims whitespace', () => {

@@ -70,15 +69,15 @@ describe('clean(text)', () => {
       Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
       fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
       culpa qui officia deserunt mollit anim id est laborum.
-    `
-    const text = clean(longText)
+    `;
+    const text = clean(longText);

     let shouldBe = `
       Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
       incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud
       exercitation ullamco laboris nisi ut&hellip;
-    `
-    shouldBe = shouldBe.replace(/[\s\n]+/g, ' ').trim()
+    `;
+    shouldBe = shouldBe.replace(/[\s\n]+/g, ' ').trim();

-    assert.equal(text, shouldBe)
-  })
-})
+    assert.equal(text, shouldBe);
+  });
+});

@@ -7,7 +7,7 @@ const Iris = {
   async parse(url, html, opts = {}) {
     const { fetchAllPages = true } = opts || true;
     const Extractor = getExtractor(url);
-    console.log(`Using extractor for ${Extractor.domain}`);
+    // console.log(`Using extractor for ${Extractor.domain}`);
     const $ = await Resource.create(url, html);
     html = $.html();

@@ -29,7 +29,6 @@ const Iris = {
       $,
       metaCache,
       result,
-      Extractor,
       title,
       url,
     }

@@ -39,7 +38,7 @@ const Iris = {
       ...result,
       totalPages: 1,
       renderedPages: 1,
-    }
+    };
   }

   return result;
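For reference, a hedged sketch of the parse entry point after these edits; the import path assumes the ./dist/iris.js bundle named in package.json, and the URL is a placeholder:

import Iris from './dist/iris';

async function example() {
  // html may be pre-fetched markup; opts.fetchAllPages defaults to true.
  const result = await Iris.parse('http://example.com/story', null, { fetchAllPages: false });
  // With paging disabled, the single-page fallback above applies:
  // result.totalPages === 1 && result.renderedPages === 1
}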

@@ -41,10 +41,10 @@ describe('Iris', () => {
       { fetchAllPages: true }
     );

-    const { totalPages, pagesRendered } = result
+    const { totalPages, renderedPages } = result;

-    assert.equal(totalPages, 3)
-    assert.equal(pagesRendered, 3)
+    assert.equal(totalPages, 3);
+    assert.equal(renderedPages, 3);

     // console.log(result)
     assert.equal(result.nextPageUrl, `${url}2`);

@@ -18,7 +18,7 @@ export const STRIP_OUTPUT_TAGS = [
 export const REMOVE_ATTRS = ['style', 'align'];
 export const REMOVE_ATTR_SELECTORS = REMOVE_ATTRS.map(selector => `[${selector}]`);
 export const REMOVE_ATTR_LIST = REMOVE_ATTRS.join(',');
-export const WHITELIST_ATTRS = ['src', 'href', 'class', 'id', 'score'];
+export const WHITELIST_ATTRS = ['src', 'href', 'class', 'id'];
 export const WHITELIST_ATTRS_RE = new RegExp(`^(${WHITELIST_ATTRS.join('|')})$`, 'i');

 // removeEmpty
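Because WHITELIST_ATTRS_RE is built straight from the list (effectively /^(src|href|class|id)$/i after this change), removing 'score' means the internal scoring attribute no longer survives attribute whitelisting:

WHITELIST_ATTRS_RE.test('href');  // true, still whitelisted
WHITELIST_ATTRS_RE.test('score'); // false, now stripped like any other non-whitelisted attribute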

@@ -1,4 +1,4 @@
 #!/bin/bash

 # Runs the mocha tests
-mocha --compilers js:babel-register $(find src -name "*.test.js") --require babel-polyfill
+mocha --reporter spec --compilers js:babel-register $(find src -name "*.test.js") --require babel-polyfill
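spec is mocha's default reporter, so the added flag mainly makes the choice explicit and keeps output consistent wherever the script runs; the babel-register compiler, the babel-polyfill require, and the find over src for *.test.js files are unchanged.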
