You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
mercury-parser/src/cleaners/content.js

62 lines
1.8 KiB
JavaScript

import {
cleanAttributes,
cleanHeaders,
cleanHOnes,
cleanImages,
cleanTags,
removeEmpty,
rewriteTopLevel,
markToKeep,
stripJunkTags,
makeLinksAbsolute,
} from 'utils/dom';
/**
 * Run the content-cleaning pipeline over an extracted article node.
 *
 * Every helper is invoked with `article` and `$` for its effect on the
 * node; helper return values are ignored. The same `article` reference
 * that was passed in is handed back to the caller.
 *
 * @param {*} article - Article node to clean.
 * @param {Object} opts - Cleaning options.
 * @param {*} opts.$ - DOM handle forwarded to every helper.
 * @param {boolean} [opts.cleanConditionally=true] - Forwarded to `cleanTags`.
 * @param {string} [opts.title=''] - Forwarded to `cleanHeaders`.
 * @param {string} [opts.url=''] - Forwarded to `makeLinksAbsolute` and
 *   `markToKeep`.
 * @param {boolean} [opts.defaultCleaner=true] - When falsy, the
 *   `cleanImages` and `cleanTags` passes are skipped; both can be too
 *   aggressive for some content.
 * @returns {*} The `article` argument.
 */
export default function extractCleanNode(
  article,
  { $, cleanConditionally = true, title = '', url = '', defaultCleaner = true }
) {
  // The passes run strictly in the order listed here.
  const passes = [
    // Top-level nodes such as <body> or <html> are rewritten to <div> so
    // that multiple body tags cannot cause complications later on.
    () => rewriteTopLevel(article, $),

    // Small images and spacer images are dropped -- default cleaner only.
    () => defaultCleaner && cleanImages(article, $),

    // Links are made absolute.
    () => makeLinksAbsolute(article, $, url),

    // Elements that would otherwise be stripped are flagged as keepers.
    // For example, stripJunkTags removes iframes, so YouTube/Vimeo videos
    // are marked beforehand.
    () => markToKeep(article, $, url),

    // Tags like <title> are dropped. This is mostly about cleanliness,
    // not security.
    () => stripJunkTags(article, $),

    // H1s are usually the article title, which is the title extractor's
    // job. Fewer than three of them get stripped; otherwise they are
    // turned into H2s.
    () => cleanHOnes(article, $),

    // Header cleanup.
    () => cleanHeaders(article, $, title),

    // UL/OL cleaning used to happen at this point but removed too many
    // in-article lists; a better way of detecting menus is still needed.
    // This pass can be overly aggressive as well -- default cleaner only.
    () => defaultCleaner && cleanTags(article, $, cleanConditionally),

    // Empty paragraph nodes are removed.
    () => removeEmpty(article, $),

    // Unnecessary attributes are removed.
    () => cleanAttributes(article, $),
  ];

  passes.forEach(pass => pass());

  return article;
}