You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
mercury-parser/src/cleaners/content.test.js

33 lines
974 B
JavaScript

import assert from 'assert';
import cheerio from 'cheerio';
import fs from 'fs';
feat: implemented extractBestNode functionality Squashed commit of the following: commit 9af554dd975ff1778ed70c71fa9bde667fc5f880 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 15:19:32 2016 -0400 feat: add cleanHeaders commit 0dfea98eedc4f97fcbd78866322595c705e20521 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 14:30:49 2016 -0400 fix: scoring parent nodes recursively commit b6e5897a694adeb81e25a905aba72c0f45a8cc94 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 12:47:24 2016 -0400 feat: extract clean node up and running commit fb652c5db13db6bce7271efd68ba4b20515e9549 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 09:57:21 2016 -0400 chore: added test for p tags with nested tags (e.g., img, iframe) commit 731d0a2e4d89121dfafad195e9d0911805c4f8e4 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 17:50:33 2016 -0400 feat: extact clean node integrates most functions commit 322bc6534d30feb7c1c08d3813132badc6286b40 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:46:04 2016 -0400 feat: removing empty nodes as defined in constants commit f1d38932ea12a865814d2326970031fcb8515baa Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:33:31 2016 -0400 feat: cleaning attributes from nodes commit 0aa73ada6854af0ecd504bfe3d926a9524787ab5 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:09:56 2016 -0400 feat: cleaning h1s from text commit 12d4a309246285c278ce7765e4fbaa8271bb5889 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:52:03 2016 -0400 feat: removing spacer images commit 4e74ff830cc67586560f6fc72e2cfa432a3a2647 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:38:49 2016 -0400 feat: stripping unwanted html from doc commit c774166e90169fd0c1aa89898d3f7a975e82bf0a Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:17:32 2016 -0400 feat: removing small images, height attribute from images commit 3a8642f42cda451669c832482c5e1611b1ff2ea9 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 12:57:45 2016 -0400 feat: rewrite top level commit a1c03e779234b0aea02206d92ec3dcc15758507e Author: Adam Pash <adam.pash@gmail.com> Date: Fri Aug 26 17:34:36 2016 -0400 in a weird place rn
8 years ago
import extractBestNode from 'extractors/generic/content/extract-best-node';
import extractCleanNode from './content';
feat: implemented extractBestNode functionality Squashed commit of the following: commit 9af554dd975ff1778ed70c71fa9bde667fc5f880 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 15:19:32 2016 -0400 feat: add cleanHeaders commit 0dfea98eedc4f97fcbd78866322595c705e20521 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 14:30:49 2016 -0400 fix: scoring parent nodes recursively commit b6e5897a694adeb81e25a905aba72c0f45a8cc94 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 12:47:24 2016 -0400 feat: extract clean node up and running commit fb652c5db13db6bce7271efd68ba4b20515e9549 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 09:57:21 2016 -0400 chore: added test for p tags with nested tags (e.g., img, iframe) commit 731d0a2e4d89121dfafad195e9d0911805c4f8e4 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 17:50:33 2016 -0400 feat: extact clean node integrates most functions commit 322bc6534d30feb7c1c08d3813132badc6286b40 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:46:04 2016 -0400 feat: removing empty nodes as defined in constants commit f1d38932ea12a865814d2326970031fcb8515baa Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:33:31 2016 -0400 feat: cleaning attributes from nodes commit 0aa73ada6854af0ecd504bfe3d926a9524787ab5 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:09:56 2016 -0400 feat: cleaning h1s from text commit 12d4a309246285c278ce7765e4fbaa8271bb5889 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:52:03 2016 -0400 feat: removing spacer images commit 4e74ff830cc67586560f6fc72e2cfa432a3a2647 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:38:49 2016 -0400 feat: stripping unwanted html from doc commit c774166e90169fd0c1aa89898d3f7a975e82bf0a Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:17:32 2016 -0400 feat: removing small images, height attribute from images commit 3a8642f42cda451669c832482c5e1611b1ff2ea9 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 12:57:45 2016 -0400 feat: rewrite top level commit a1c03e779234b0aea02206d92ec3dcc15758507e Author: Adam Pash <adam.pash@gmail.com> Date: Fri Aug 26 17:34:36 2016 -0400 in a weird place rn
8 years ago
describe('extractCleanNode(article, { $, cleanConditionally, title } })', () => {
it('cleans cruft out of a DOM node', () => {
const html = fs.readFileSync('./fixtures/wired.html', 'utf-8');
const $ = cheerio.load(html);
feat: implemented extractBestNode functionality Squashed commit of the following: commit 9af554dd975ff1778ed70c71fa9bde667fc5f880 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 15:19:32 2016 -0400 feat: add cleanHeaders commit 0dfea98eedc4f97fcbd78866322595c705e20521 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 14:30:49 2016 -0400 fix: scoring parent nodes recursively commit b6e5897a694adeb81e25a905aba72c0f45a8cc94 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 12:47:24 2016 -0400 feat: extract clean node up and running commit fb652c5db13db6bce7271efd68ba4b20515e9549 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 09:57:21 2016 -0400 chore: added test for p tags with nested tags (e.g., img, iframe) commit 731d0a2e4d89121dfafad195e9d0911805c4f8e4 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 17:50:33 2016 -0400 feat: extact clean node integrates most functions commit 322bc6534d30feb7c1c08d3813132badc6286b40 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:46:04 2016 -0400 feat: removing empty nodes as defined in constants commit f1d38932ea12a865814d2326970031fcb8515baa Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:33:31 2016 -0400 feat: cleaning attributes from nodes commit 0aa73ada6854af0ecd504bfe3d926a9524787ab5 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:09:56 2016 -0400 feat: cleaning h1s from text commit 12d4a309246285c278ce7765e4fbaa8271bb5889 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:52:03 2016 -0400 feat: removing spacer images commit 4e74ff830cc67586560f6fc72e2cfa432a3a2647 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:38:49 2016 -0400 feat: stripping unwanted html from doc commit c774166e90169fd0c1aa89898d3f7a975e82bf0a Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:17:32 2016 -0400 feat: removing small images, height attribute from images commit 3a8642f42cda451669c832482c5e1611b1ff2ea9 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 12:57:45 2016 -0400 feat: rewrite top level commit a1c03e779234b0aea02206d92ec3dcc15758507e Author: Adam Pash <adam.pash@gmail.com> Date: Fri Aug 26 17:34:36 2016 -0400 in a weird place rn
8 years ago
const opts = {
stripUnlikelyCandidates: true,
weightNodes: true,
cleanConditionally: true,
};
feat: implemented extractBestNode functionality Squashed commit of the following: commit 9af554dd975ff1778ed70c71fa9bde667fc5f880 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 15:19:32 2016 -0400 feat: add cleanHeaders commit 0dfea98eedc4f97fcbd78866322595c705e20521 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 14:30:49 2016 -0400 fix: scoring parent nodes recursively commit b6e5897a694adeb81e25a905aba72c0f45a8cc94 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 12:47:24 2016 -0400 feat: extract clean node up and running commit fb652c5db13db6bce7271efd68ba4b20515e9549 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 09:57:21 2016 -0400 chore: added test for p tags with nested tags (e.g., img, iframe) commit 731d0a2e4d89121dfafad195e9d0911805c4f8e4 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 17:50:33 2016 -0400 feat: extact clean node integrates most functions commit 322bc6534d30feb7c1c08d3813132badc6286b40 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:46:04 2016 -0400 feat: removing empty nodes as defined in constants commit f1d38932ea12a865814d2326970031fcb8515baa Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:33:31 2016 -0400 feat: cleaning attributes from nodes commit 0aa73ada6854af0ecd504bfe3d926a9524787ab5 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:09:56 2016 -0400 feat: cleaning h1s from text commit 12d4a309246285c278ce7765e4fbaa8271bb5889 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:52:03 2016 -0400 feat: removing spacer images commit 4e74ff830cc67586560f6fc72e2cfa432a3a2647 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:38:49 2016 -0400 feat: stripping unwanted html from doc commit c774166e90169fd0c1aa89898d3f7a975e82bf0a Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:17:32 2016 -0400 feat: removing small images, height attribute from images commit 3a8642f42cda451669c832482c5e1611b1ff2ea9 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 12:57:45 2016 -0400 feat: rewrite top level commit a1c03e779234b0aea02206d92ec3dcc15758507e Author: Adam Pash <adam.pash@gmail.com> Date: Fri Aug 26 17:34:36 2016 -0400 in a weird place rn
8 years ago
const bestNode = extractBestNode($, opts);
// let result = $.html(bestNode);
// // console.log(result)
// // console.log(result.length)
const cleanNode = extractCleanNode(bestNode, { $, opts });
// result = $.html(cleanNode);
// // console.log(result.length)
// // console.log(result)
// // console.log(bestNode.html())
feat: implemented extractBestNode functionality Squashed commit of the following: commit 9af554dd975ff1778ed70c71fa9bde667fc5f880 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 15:19:32 2016 -0400 feat: add cleanHeaders commit 0dfea98eedc4f97fcbd78866322595c705e20521 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 14:30:49 2016 -0400 fix: scoring parent nodes recursively commit b6e5897a694adeb81e25a905aba72c0f45a8cc94 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 12:47:24 2016 -0400 feat: extract clean node up and running commit fb652c5db13db6bce7271efd68ba4b20515e9549 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 09:57:21 2016 -0400 chore: added test for p tags with nested tags (e.g., img, iframe) commit 731d0a2e4d89121dfafad195e9d0911805c4f8e4 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 17:50:33 2016 -0400 feat: extact clean node integrates most functions commit 322bc6534d30feb7c1c08d3813132badc6286b40 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:46:04 2016 -0400 feat: removing empty nodes as defined in constants commit f1d38932ea12a865814d2326970031fcb8515baa Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:33:31 2016 -0400 feat: cleaning attributes from nodes commit 0aa73ada6854af0ecd504bfe3d926a9524787ab5 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:09:56 2016 -0400 feat: cleaning h1s from text commit 12d4a309246285c278ce7765e4fbaa8271bb5889 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:52:03 2016 -0400 feat: removing spacer images commit 4e74ff830cc67586560f6fc72e2cfa432a3a2647 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:38:49 2016 -0400 feat: stripping unwanted html from doc commit c774166e90169fd0c1aa89898d3f7a975e82bf0a Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:17:32 2016 -0400 feat: removing small images, height attribute from images commit 3a8642f42cda451669c832482c5e1611b1ff2ea9 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 12:57:45 2016 -0400 feat: rewrite top level commit a1c03e779234b0aea02206d92ec3dcc15758507e Author: Adam Pash <adam.pash@gmail.com> Date: Fri Aug 26 17:34:36 2016 -0400 in a weird place rn
8 years ago
assert.equal($(cleanNode).text().length, 2687);
});
});
feat: implemented extractBestNode functionality Squashed commit of the following: commit 9af554dd975ff1778ed70c71fa9bde667fc5f880 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 15:19:32 2016 -0400 feat: add cleanHeaders commit 0dfea98eedc4f97fcbd78866322595c705e20521 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 14:30:49 2016 -0400 fix: scoring parent nodes recursively commit b6e5897a694adeb81e25a905aba72c0f45a8cc94 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 12:47:24 2016 -0400 feat: extract clean node up and running commit fb652c5db13db6bce7271efd68ba4b20515e9549 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 09:57:21 2016 -0400 chore: added test for p tags with nested tags (e.g., img, iframe) commit 731d0a2e4d89121dfafad195e9d0911805c4f8e4 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 17:50:33 2016 -0400 feat: extact clean node integrates most functions commit 322bc6534d30feb7c1c08d3813132badc6286b40 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:46:04 2016 -0400 feat: removing empty nodes as defined in constants commit f1d38932ea12a865814d2326970031fcb8515baa Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:33:31 2016 -0400 feat: cleaning attributes from nodes commit 0aa73ada6854af0ecd504bfe3d926a9524787ab5 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:09:56 2016 -0400 feat: cleaning h1s from text commit 12d4a309246285c278ce7765e4fbaa8271bb5889 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:52:03 2016 -0400 feat: removing spacer images commit 4e74ff830cc67586560f6fc72e2cfa432a3a2647 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:38:49 2016 -0400 feat: stripping unwanted html from doc commit c774166e90169fd0c1aa89898d3f7a975e82bf0a Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:17:32 2016 -0400 feat: removing small images, height attribute from images commit 3a8642f42cda451669c832482c5e1611b1ff2ea9 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 12:57:45 2016 -0400 feat: rewrite top level commit a1c03e779234b0aea02206d92ec3dcc15758507e Author: Adam Pash <adam.pash@gmail.com> Date: Fri Aug 26 17:34:36 2016 -0400 in a weird place rn
8 years ago