mercury-parser/dist/generate-custom-parser.js.map

{"version":3,"file":null,"sources":["../src/utils/dom/constants.js","../src/utils/dom/brs-to-ps.js","../src/utils/dom/paragraphize.js","../src/utils/dom/convert-to-paragraphs.js","../src/utils/dom/convert-node-to.js","../src/utils/dom/clean-images.js","../src/utils/dom/strip-junk-tags.js","../src/utils/dom/clean-attributes.js","../src/extractors/generic/content/scoring/constants.js","../src/extractors/generic/content/scoring/get-weight.js","../src/extractors/generic/content/scoring/get-score.js","../src/extractors/generic/content/scoring/score-commas.js","../src/extractors/generic/content/scoring/score-length.js","../src/extractors/generic/content/scoring/score-paragraph.js","../src/extractors/generic/content/scoring/set-score.js","../src/extractors/generic/content/scoring/add-score.js","../src/extractors/generic/content/scoring/add-to-parent.js","../src/extractors/generic/content/scoring/get-or-init-score.js","../src/extractors/generic/content/scoring/score-node.js","../src/extractors/generic/content/scoring/score-content.js","../src/utils/text/normalize-spaces.js","../src/utils/text/extract-from-url.js","../src/utils/text/constants.js","../src/utils/text/article-base-url.js","../src/utils/text/has-sentence-end.js","../src/extractors/generic/content/scoring/merge-siblings.js","../src/extractors/generic/content/scoring/index.js","../src/utils/dom/clean-tags.js","../src/utils/dom/make-links-absolute.js","../src/utils/dom/link-density.js","../src/utils/dom/extract-from-selectors.js","../src/utils/dom/strip-tags.js","../src/utils/dom/within-comment.js","../src/utils/dom/node-is-sufficient.js","../src/utils/dom/get-attrs.js","../src/utils/dom/set-attr.js","../src/utils/dom/set-attrs.js","../src/utils/dom/index.js","mercury.js","../scripts/templates/insert-values.js","../scripts/templates/index.js","../scripts/templates/custom-extractor.js","../scripts/templates/custom-extractor-test.js","../scripts/generate-custom-parser.js"],"sourcesContent":["// Spacer images to be removed\nexport const SPACER_RE = new RegExp('trans|transparent|spacer|blank', 'i');\n\n// The class we will use to mark elements we want to keep\n// but would normally remove\nexport const KEEP_CLASS = 'mercury-parser-keep';\n\nexport const KEEP_SELECTORS = [\n  'iframe[src^=\"https://www.youtube.com\"]',\n  'iframe[src^=\"http://www.youtube.com\"]',\n  'iframe[src^=\"https://player.vimeo\"]',\n  'iframe[src^=\"http://player.vimeo\"]',\n];\n\n// A list of tags to strip from the output if we encounter them.\nexport const STRIP_OUTPUT_TAGS = [\n  'title',\n  'script',\n  'noscript',\n  'link',\n  'style',\n  'hr',\n  'embed',\n  'iframe',\n  'object',\n];\n\n// cleanAttributes\nexport const REMOVE_ATTRS = ['style', 'align'];\nexport const REMOVE_ATTR_SELECTORS = REMOVE_ATTRS.map(selector => `[${selector}]`);\nexport const REMOVE_ATTR_LIST = REMOVE_ATTRS.join(',');\nexport const WHITELIST_ATTRS = ['src', 'srcset', 'href', 'class', 'id', 'alt'];\nexport const WHITELIST_ATTRS_RE = new RegExp(`^(${WHITELIST_ATTRS.join('|')})$`, 'i');\n\n// removeEmpty\nexport const REMOVE_EMPTY_TAGS = ['p'];\nexport const REMOVE_EMPTY_SELECTORS = REMOVE_EMPTY_TAGS.map(tag => `${tag}:empty`).join(',');\n\n// cleanTags\nexport const CLEAN_CONDITIONALLY_TAGS = ['ul', 'ol', 'table', 'div', 'button', 'form'].join(',');\n\n// cleanHeaders\nconst HEADER_TAGS = ['h2', 'h3', 'h4', 'h5', 'h6'];\nexport const HEADER_TAG_LIST = HEADER_TAGS.join(',');\n\n// // CONTENT FETCHING CONSTANTS ////\n\n// A list of strings that can be considered unlikely candidates when\n// extracting content from a resource. These strings are joined together\n// and then tested for existence using re:test, so may contain simple,\n// non-pipe style regular expression queries if necessary.\nexport const UNLIKELY_CANDIDATES_BLACKLIST = [\n  'ad-break',\n  'adbox',\n  'advert',\n  'addthis',\n  'agegate',\n  'aux',\n  'blogger-labels',\n  'combx',\n  'comment',\n  'conversation',\n  'disqus',\n  'entry-unrelated',\n  'extra',\n  'foot',\n  // 'form', // This is too generic, has too many false positives\n  'header',\n  'hid