Fixes an issue with encoding (#154)

* fix: fixes an issue with encoding on the fetch level
pull/155/head
Kevin Ngao 7 years ago committed by GitHub
parent 9b371e51ac
commit f2e3f055c2

194
dist/mercury.js vendored

@ -7,10 +7,10 @@ var _extends = _interopDefault(require('babel-runtime/helpers/extends'));
var _asyncToGenerator = _interopDefault(require('babel-runtime/helpers/asyncToGenerator'));
var URL = _interopDefault(require('url'));
var cheerio = _interopDefault(require('cheerio'));
var _Promise = _interopDefault(require('babel-runtime/core-js/promise'));
var request = _interopDefault(require('request'));
var iconv = _interopDefault(require('iconv-lite'));
var _slicedToArray = _interopDefault(require('babel-runtime/helpers/slicedToArray'));
var _Promise = _interopDefault(require('babel-runtime/core-js/promise'));
var request = _interopDefault(require('request'));
var _Reflect$ownKeys = _interopDefault(require('babel-runtime/core-js/reflect/own-keys'));
var _toConsumableArray = _interopDefault(require('babel-runtime/helpers/toConsumableArray'));
var _defineProperty = _interopDefault(require('babel-runtime/helpers/defineProperty'));
@ -26,50 +26,6 @@ var difflib = _interopDefault(require('difflib'));
var _Array$from = _interopDefault(require('babel-runtime/core-js/array/from'));
var ellipsize = _interopDefault(require('ellipsize'));
// Passes range through _regeneratorRuntime.mark; the marked function is
// handed to _regeneratorRuntime.wrap inside range below.
var _marked = [range].map(_regeneratorRuntime.mark);
// Regenerator-style state machine (presumably compiled from a generator
// function — confirm against the un-compiled source).
// start and end each fall back to 1 when omitted or undefined.
// NOTE(review): every step hands back the already-incremented start, so the
// produced values appear to run from start + 1 through end + 1 — confirm
// that this is intended.
function range() {
var start = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : 1;
var end = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 1;
return _regeneratorRuntime.wrap(function range$(_context) {
while (1) {
switch (_context.prev = _context.next) {
case 0:
// Loop condition: once start exceeds end, jump to the terminating state.
if (!(start <= end)) {
_context.next = 5;
break;
}
// Resume at state 3 after handing back the incremented start.
_context.next = 3;
return start += 1;
case 3:
// Go back to state 0 to re-check the loop condition.
_context.next = 0;
break;
case 5:
case "end":
return _context.stop();
}
}
}, _marked[0], this);
}
/**
 * Minimal first-pass URL check: the input counts as valid when it carries a
 * non-empty hostname.
 *
 * @param {Object} _ref - Object exposing a hostname property (e.g. a parsed URL).
 * @returns {boolean} true when hostname is truthy, false otherwise.
 */
function validateUrl(_ref) {
  return Boolean(_ref.hostname);
}
// Canned error payloads, keyed by failure kind.
var Errors = {
// Payload describing a url parameter that does not look like a valid URL.
badUrl: {
error: true,
messages: 'The url parameter passed does not look like a valid URL. Please check your data and try again.'
}
};
var NORMALIZE_RE = /\s{2,}/g;
function normalizeSpaces(text) {
@ -116,6 +72,7 @@ var IS_ALPHA_RE = /^[a-z]+$/i;
// Matches strings made up solely of ASCII digits.
var IS_DIGIT_RE = /^[0-9]+$/i;
// Captures the charset parameter of a Content-Type style value (capture
// group 1). Media-type parameter names are case-insensitive and the value
// may be a quoted string, so `Charset=UTF-8` and `charset="utf-8"` must
// match as well as the plain lowercase form.
var ENCODING_RE = /charset=["']?([\w-]+)\b/i;
// Encoding used by getEncoding when no usable charset is declared.
var DEFAULT_ENCODING = 'utf-8';
function pageNumFromUrl(url) {
var matches = url.match(PAGE_IN_HREF_RE);
@ -224,13 +181,60 @@ function excerptContent(content) {
/**
 * Picks the character encoding declared in a Content-Type style string.
 *
 * The declared charset is only returned when iconv reports that it can
 * handle it; anything else falls back to DEFAULT_ENCODING so the result can
 * be passed to iconv.decode.
 *
 * @param {string} [str] - Content-Type value, e.g. 'text/html; charset=utf-8'.
 * @returns {string} The declared charset when supported, otherwise
 *   DEFAULT_ENCODING.
 */
function getEncoding(str) {
  // Run the regex once; exec returns null when no charset is declared.
  var matches = ENCODING_RE.exec(str);
  if (matches !== null && iconv.encodingExists(matches[1])) {
    return matches[1];
  }
  return DEFAULT_ENCODING;
}
return null;
// Passes range through _regeneratorRuntime.mark; the marked function is
// handed to _regeneratorRuntime.wrap inside range below.
var _marked = [range].map(_regeneratorRuntime.mark);
// Regenerator-style state machine (presumably compiled from a generator
// function — confirm against the un-compiled source).
// start and end each fall back to 1 when omitted or undefined.
// NOTE(review): every step hands back the already-incremented start, so the
// produced values appear to run from start + 1 through end + 1 — confirm
// that this is intended.
function range() {
var start = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : 1;
var end = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 1;
return _regeneratorRuntime.wrap(function range$(_context) {
while (1) {
switch (_context.prev = _context.next) {
case 0:
// Loop condition: once start exceeds end, jump to the terminating state.
if (!(start <= end)) {
_context.next = 5;
break;
}
// Resume at state 3 after handing back the incremented start.
_context.next = 3;
return start += 1;
case 3:
// Go back to state 0 to re-check the loop condition.
_context.next = 0;
break;
case 5:
case "end":
return _context.stop();
}
}
}, _marked[0], this);
}
/**
 * Minimal first-pass URL check: the input counts as valid when it carries a
 * non-empty hostname.
 *
 * @param {Object} _ref - Object exposing a hostname property (e.g. a parsed URL).
 * @returns {boolean} true when hostname is truthy, false otherwise.
 */
function validateUrl(_ref) {
  return Boolean(_ref.hostname);
}
// Canned error payloads, keyed by failure kind.
var Errors = {
// Payload describing a url parameter that does not look like a valid URL.
badUrl: {
error: true,
messages: 'The url parameter passed does not look like a valid URL. Please check your data and try again.'
}
};
// Browser does not like us setting user agent
var REQUEST_HEADERS = cheerio.browser ? {} : {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
@ -258,21 +262,6 @@ function get(options) {
if (err) {
reject(err);
} else {
var encoding = getEncoding(response.headers['content-type']);
if (iconv.encodingExists(encoding)) {
body = iconv.decode(body, encoding);
}
if (typeof body !== 'string') {
var $ = cheerio.load(iconv.decode(body, 'utf8'));
var contentType = $('meta[http-equiv=content-type]').attr('content');
var properEncoding = getEncoding(contentType);
if (iconv.encodingExists(properEncoding)) {
body = iconv.decode(body, properEncoding);
}
}
resolve({ body: body, response: response });
}
});
@ -343,11 +332,11 @@ var fetchResource$1 = (function () {
url: parsedUrl.href,
headers: _extends({}, REQUEST_HEADERS),
timeout: FETCH_TIMEOUT,
// Don't set encoding; fixes issues
// w/gzipped responses
encoding: null,
// Accept cookies
jar: true,
// Set to null so the response returns as binary and body as buffer
// https://github.com/request/request#requestoptions-callback
encoding: null,
// Accept and decode gzip
gzip: true,
// Follow any redirect
@ -1860,7 +1849,7 @@ var Resource = {
throw new Error('Content does not appear to be text.');
}
var $ = cheerio.load(content);
var $ = this.encodeDoc({ content: content, contentType: contentType });
if ($.root().children().length === 0) {
throw new Error('No children, likely a bad parse.');
@ -1870,6 +1859,26 @@ var Resource = {
$ = convertLazyLoadedImages($);
$ = clean($);
return $;
},
encodeDoc: function encodeDoc(_ref2) {
var content = _ref2.content,
contentType = _ref2.contentType;
var encoding = getEncoding(contentType);
var decodedContent = iconv.decode(content, encoding);
var $ = cheerio.load(decodedContent);
// after first cheerio.load, check to see if encoding matches
var metaContentType = $('meta[http-equiv=content-type]').attr('content');
var properEncoding = getEncoding(metaContentType);
// if encodings in the header/body dont match, use the one in the body
if (properEncoding !== encoding) {
decodedContent = iconv.decode(content, properEncoding);
$ = cheerio.load(decodedContent);
}
return $;
}
};
@ -5300,6 +5309,54 @@ var WwwOpposingviewsComExtractor = {
}
};
// Custom extractor for gothamist.com; supportedDomains lists the additional
// domains registered alongside it.
var GothamistComExtractor = (function buildGothamistComExtractor() {
  // Selector -> replacement tag pairs applied to the selected content before
  // it is treated as consumable: image wrappers become figures and their
  // italic text becomes the caption.
  var imageTransforms = {
    'div.image-none': 'figure',
    '.image-none i': 'figcaption',
    'div.image-left': 'figure',
    '.image-left i': 'figcaption',
    'div.image-right': 'figure',
    '.image-right i': 'figcaption'
  };

  // Anything matching these selectors is removed from the result.
  var unwantedSelectors = ['.image-none br', '.image-left br', '.image-right br', '.galleryEase'];

  var contentRules = {
    selectors: ['.entry-body'],
    transforms: imageTransforms,
    clean: unwantedSelectors
  };

  return {
    domain: 'gothamist.com',
    supportedDomains: ['chicagoist.com', 'laist.com', 'sfist.com', 'shanghaiist.com', 'dcist.com'],
    title: {
      selectors: ['h1', '.entry-header h1']
    },
    author: {
      selectors: ['.author']
    },
    date_published: {
      selectors: ['abbr', 'abbr.published'],
      timezone: 'America/New_York'
    },
    // NOTE(review): [null] looks like an unfilled placeholder — confirm.
    dek: {
      selectors: [null]
    },
    lead_image_url: {
      selectors: [['meta[name="og:image"]', 'value']]
    },
    content: contentRules
  };
})();
var CustomExtractors = Object.freeze({
@ -5386,7 +5443,8 @@ var CustomExtractors = Object.freeze({
FortuneComExtractor: FortuneComExtractor,
WwwLinkedinComExtractor: WwwLinkedinComExtractor,
ObamawhitehouseArchivesGovExtractor: ObamawhitehouseArchivesGovExtractor,
WwwOpposingviewsComExtractor: WwwOpposingviewsComExtractor
WwwOpposingviewsComExtractor: WwwOpposingviewsComExtractor,
GothamistComExtractor: GothamistComExtractor
});
var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {

File diff suppressed because one or more lines are too long

@ -43,9 +43,7 @@ if (process.env.CI) {
assert.equal(article.title, result.title);
done();
}).catch((e) => {
console.log('There was an error', e.message); // eslint-disable-line no-console
console.log('e.fileName', e.fileName);
console.log('e.lineNumber', e.lineNumber);
console.log(e.name, e.message); // eslint-disable-line no-console
assert.equal(true, false);
done();
});

@ -50,6 +50,9 @@ describe('Resource', () => {
});
describe('generateDoc({ body, response })', () => {
// Ideally the body would be a buffer because of potential issues with
// string re-encoding. Since these strings are blank, it should be fine,
// but this is why iconv is throwing warnings.
it('throws an error if the content is not text', () => {
const response = {
headers: {

@ -91,6 +91,9 @@ export default async function fetchResource(url, parsedUrl) {
timeout: FETCH_TIMEOUT,
// Accept cookies
jar: true,
// Set to null so the response returns as binary and body as buffer
// https://github.com/request/request#requestoptions-callback
encoding: null,
// Accept and decode gzip
gzip: true,
// Follow any redirect

@ -21,6 +21,13 @@ describe('fetchResource(url)', () => {
assert.equal(error, true);
});
// Fetches a live page and checks the type of the returned body.
it('returns a buffer as its body', async () => {
const url = 'https://www.washingtonpost.com/news/post-nation/wp/2016/11/05/a-vile-and-disgusting-act-officer-accused-of-giving-fecal-sandwich-to-homeless-man-is-fired/';
const result = await fetchResource(url);
// typeof only shows the body is not a primitive such as a string.
// NOTE(review): any object (or null) also reports 'object'; a stricter
// buffer check may be preferable — confirm it works in every test environment.
assert.equal(typeof result.body, 'object');
});
it('fetches nyt', async () => {
const url = 'http://www.nytimes.com/2016/08/16/upshot/the-state-of-the-clinton-trump-race-is-it-over.html?_r=0';
const { response } = await fetchResource(url);

Loading…
Cancel
Save