You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
mercury-parser/src/resource/index.test.js

146 lines
5.5 KiB
JavaScript

import assert from 'assert';
import cheerio from 'cheerio';
import { getEncoding } from 'utils/text';
import { record } from 'test-helpers';
import Resource from './index';
describe('Resource', () => {
// Create the 'resource-test' recorder and register its hooks with the suite
// lifecycle. NOTE(review): presumably this records/replays HTTP fixtures for
// the network-backed tests below — confirm against test-helpers.
const { before: startRecorder, after: stopRecorder } = record('resource-test');
beforeAll(startRecorder);
afterAll(stopRecorder);
describe('create(url)', () => {
it('fetches the page and returns a cheerio object', async () => {
  // The resolved value is expected to be callable (the `$` selector
  // function), so a typeof check is all this test asserts.
  const $ = await Resource.create('http://theconcourse.deadspin.com/1786177057');
  const resolvedType = typeof $;

  assert.equal(resolvedType, 'function');
});
it('returns an error message if the url is malformed', async () => {
  // NOTE(review): despite the title, this URL is well-formed; what is being
  // checked is the "non-200" rejection message on the resolved value.
  // Resource.create is awaited without try/catch, so the error-like object
  // is expected as the resolved result rather than a thrown exception.
  const result = await Resource.create('http://nytimes.com/500');
  const mentionsNon200 = /instructed to reject non-200/i.test(result.message);

  assert(mentionsNon200);
});
it('fetches with different encoding on body', async () => {
  const pageUrl =
    'http://www.playnation.de/spiele-news/kojima-productions/hideo-kojima-reflektiert-ueber-seinen-werdegang-bei-konami-id68950.html';
  const $ = await Resource.create(pageUrl);

  // The charset is read from the http-equiv meta tag's `value` attribute.
  const declaredCharset = getEncoding(
    $('meta[http-equiv=content-type]').attr('value')
  );
  assert.equal(declaredCharset, 'iso-8859-1');

  // The serialized document must still contain a literal u-umlaut.
  const umlautRe = /ü/g;
  const containsUmlaut = umlautRe.test($.html());
  assert.equal(containsUmlaut, true);

  assert.equal(typeof $, 'function');
});
it('fetches with different encoding and case insensitive regex', async () => {
  const pageUrl =
    'https://www.finam.ru/analysis/newsitem/putin-nagradil-grefa-ordenom-20190208-203615/';
  const $ = await Resource.create(pageUrl);

  // The `i` flag in the attribute selector matches the http-equiv value
  // regardless of its letter case.
  const contentTypeMeta = $('meta[http-equiv=content-type i]');
  const declaredCharset = getEncoding(contentTypeMeta.attr('value'));
  assert.equal(declaredCharset, 'windows-1251');

  // No Unicode replacement character may appear in the serialized document.
  const replacementCharRe = /�/g;
  const hasReplacementChar = replacementCharRe.test($.html());
  assert.equal(hasReplacementChar, false);

  assert.equal(typeof $, 'function');
});
it('fetches with different encoding and HTML5 charset tag', async () => {
  const pageUrl =
    'https://www.idnes.cz/fotbal/prvni-liga/fotbalova-liga-8-kolo-slovan-liberec-slovacko.A170925_173123_fotbal_min';
  const $ = await Resource.create(pageUrl);

  // Here the charset comes from the HTML5-style <meta charset="..."> tag.
  const declaredCharset = getEncoding($('meta[charset]').attr('charset'));
  assert.equal(declaredCharset, 'windows-1250');

  // No Unicode replacement character may appear in the serialized document.
  const replacementCharRe = /�/g;
  const hasReplacementChar = replacementCharRe.test($.html());
  assert.equal(hasReplacementChar, false);

  assert.equal(typeof $, 'function');
});
it('handles special encoding', async () => {
  const url =
    'http://www.elmundo.es/opinion/2016/11/19/582f476846163fc65a8b4578.html';
  const $ = await Resource.create(url);

  // Look for the Unicode replacement character (U+FFFD), as the sibling
  // encoding tests do. The previous pattern matched the literal text "<2F>",
  // which made this assertion effectively vacuous. The character is written
  // as an escape so it survives re-encoding of this file, and the `g` flag is
  // dropped because `.test()` is called once and needs no `lastIndex` state.
  const badEncodingRe = /\uFFFD/;

  assert.equal(badEncodingRe.test($.html()), false);
  assert.equal(typeof $, 'function');
});
it('doesnt mangle non-ascii characters in prefetched response', async () => {
  // The HTML is passed as the second argument to Resource.create alongside
  // the URL. NOTE(review): presumably this bypasses the network fetch —
  // confirm in Resource.create.
  const prefetchedHtml =
    '<!DOCTYPE html><html lang="de"><head><meta charSet="UTF-8"/><meta http-equiv="x-ua-compatible" content="ie=edge"/><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"/></head><body><div id="___gatsby"><h1 class="styles-module--headline--niWjO">Wir gestalten die Digitalisierung</h1><p>Wir Grüne kämpfen für ein offenes, gemeinwohlorientiertes Netz. Wir wollen den digitalen Wandel gerecht gestalten und setzen uns für Verantwortung, Freiheit und Recht im Netz ein. Netzpolitik und Digitalisierung sind zentrale politische Querschnittsaufgaben für eine moderne Gesellschaft. Im Mittelpunkt stehen für uns eine zukunftsfähige digitale Infrastruktur, der freie und gleichberechtigte Zugang zum Netz für alle, der Schutz unserer Privatsphäre und persönlichen Daten, sowie eine modernisierted Verwaltung.</p></div></body></html>';
  const $ = await Resource.create(
    'https://www.gruene.de/themen/digitalisierung',
    prefetchedHtml
  );

  // Serialize once: the u-umlaut must appear as its numeric entity, and the
  // replacement-character entity must not appear at all.
  const serialized = $.html();
  const keepsUmlautEntity = /Gr&#xFC;ne/.test(serialized);
  const hasReplacementEntity = /&#xFFFD;/.test(serialized);

  assert.equal(keepsUmlautEntity, true);
  assert.equal(hasReplacementEntity, false);
});
});
describe('generateDoc({ body, response })', () => {
// Ideally the body would be a Buffer, because re-encoding a string can cause
// problems. These bodies are empty strings, so it should be fine here, but
// passing strings is why iconv emits warnings.
it('throws an error if the content is not text', () => {
  // A Content-Type of 'foo' should be rejected as non-text content.
  const response = { headers: { 'content-type': 'foo' } };
  const generate = () => {
    Resource.generateDoc({ body: '', response });
  };

  assert.throws(generate, /content does not appear to be text/i);
});
it('throws an error if the response has no Content-Type header', () => {
  const input = { body: '', response: { headers: {} } };

  // A predicate is used instead of a bare regex so the test also proves that
  // a real `Error` is thrown, rather than some other thrown value that merely
  // stringifies to a matching message.
  const isNotTextError = err => {
    if (!(err instanceof Error)) {
      return false;
    }
    return /content does not appear to be text/i.test(err);
  };

  assert.throws(() => {
    Resource.generateDoc(input);
  }, isNotTextError);
});
it('throws an error if the content has no children', () => {
  // jQuery's parser does not behave this way and this is an edge case, so the
  // check is skipped entirely when `cheerio.browser` is set.
  if (cheerio.browser) {
    return;
  }

  const response = { headers: { 'content-type': 'html' } };
  const generateFromEmptyBody = () => {
    Resource.generateDoc({ body: '', response });
  };

  assert.throws(generateFromEmptyBody, /no children/i);
});
});
});