'use strict' ;
// Unwrap a CommonJS/ES-module interop export: objects carrying a `default`
// key yield that default export; anything else is returned unchanged.
function _interopDefault(ex) {
  const isModuleLike = ex && typeof ex === 'object' && 'default' in ex;
  return isModuleLike ? ex.default : ex;
}
var _slicedToArray = _interopDefault ( require ( '@babel/runtime-corejs2/helpers/slicedToArray' ) ) ;
var _toConsumableArray = _interopDefault ( require ( '@babel/runtime-corejs2/helpers/toConsumableArray' ) ) ;
var fs = _interopDefault ( require ( 'fs' ) ) ;
var URL = _interopDefault ( require ( 'url' ) ) ;
var inquirer = _interopDefault ( require ( 'inquirer' ) ) ;
var ora = _interopDefault ( require ( 'ora' ) ) ;
var child _process = require ( 'child_process' ) ;
var _Reflect$ownKeys = _interopDefault ( require ( '@babel/runtime-corejs2/core-js/reflect/own-keys' ) ) ;
var _parseInt = _interopDefault ( require ( '@babel/runtime-corejs2/core-js/parse-int' ) ) ;
var defineProperty = _interopDefault ( require ( '@babel/runtime-corejs2/helpers/defineProperty' ) ) ;
var objectSpread = _interopDefault ( require ( '@babel/runtime-corejs2/helpers/objectSpread' ) ) ;
var _parseFloat = _interopDefault ( require ( '@babel/runtime-corejs2/core-js/parse-float' ) ) ;
var iconvLite = _interopDefault ( require ( 'iconv-lite' ) ) ;
var set = _interopDefault ( require ( '@babel/runtime-corejs2/core-js/set' ) ) ;
var _typeof = _interopDefault ( require ( '@babel/runtime-corejs2/helpers/typeof' ) ) ;
var _getIterator = _interopDefault ( require ( '@babel/runtime-corejs2/core-js/get-iterator' ) ) ;
var _Object$freeze = _interopDefault ( require ( '@babel/runtime-corejs2/core-js/object/freeze' ) ) ;
var regenerator = _interopDefault ( require ( '@babel/runtime-corejs2/regenerator' ) ) ;
var objectWithoutProperties = _interopDefault ( require ( '@babel/runtime-corejs2/helpers/objectWithoutProperties' ) ) ;
var asyncToGenerator = _interopDefault ( require ( '@babel/runtime-corejs2/helpers/asyncToGenerator' ) ) ;
var cheerio = _interopDefault ( require ( 'cheerio' ) ) ;
var turndown = _interopDefault ( require ( 'turndown' ) ) ;
var promise = _interopDefault ( require ( '@babel/runtime-corejs2/core-js/promise' ) ) ;
var postmanRequest = _interopDefault ( require ( 'postman-request' ) ) ;
var assign = _interopDefault ( require ( '@babel/runtime-corejs2/core-js/object/assign' ) ) ;
var keys = _interopDefault ( require ( '@babel/runtime-corejs2/core-js/object/keys' ) ) ;
var stringDirection = _interopDefault ( require ( 'string-direction' ) ) ;
var validUrl = _interopDefault ( require ( 'valid-url' ) ) ;
var momentTimezone = _interopDefault ( require ( 'moment-timezone' ) ) ;
var momentParseformat = _interopDefault ( require ( 'moment-parseformat' ) ) ;
var wuzzy = _interopDefault ( require ( 'wuzzy' ) ) ;
var difflib = _interopDefault ( require ( 'difflib' ) ) ;
var from = _interopDefault ( require ( '@babel/runtime-corejs2/core-js/array/from' ) ) ;
var ellipsize = _interopDefault ( require ( 'ellipsize' ) ) ;
var isArray = _interopDefault ( require ( '@babel/runtime-corejs2/core-js/array/is-array' ) ) ;
var _taggedTemplateLiteral = _interopDefault ( require ( '@babel/runtime-corejs2/helpers/taggedTemplateLiteral' ) ) ;
// The class we will use to mark elements we want to keep
// but would normally remove.
var KEEP_CLASS = 'mercury-parser-keep';
// A list of tags to strip from the output if we encounter them.
var STRIP_OUTPUT_TAGS = ['title', 'script', 'noscript', 'link', 'style', 'hr', 'embed', 'iframe', 'object']; // cleanAttributes
// Remove junk tags (scripts, styles, iframes, ...) from `article`, sparing
// any element explicitly marked with the mercury-parser-keep class.
//
// :param article: cheerio node to clean
// :param $: cheerio instance for DOM manipulation
// :param tags: optional tag list; defaults to STRIP_OUTPUT_TAGS
function stripJunkTags(article, $, tags = []) {
  const selectors = tags.length === 0 ? STRIP_OUTPUT_TAGS : tags;
  $(selectors.join(','), article)
    .not('.' + KEEP_CLASS)
    .remove();
  return $;
}
// // CONTENT FETCHING CONSTANTS ////
// return 1 for every comma in text
// Given a node type to search for, and a list of regular expressions,
// An expression that looks to try to find the page digit within a URL, if
// Given a string, return True if it appears to have an ending sentence
// Scoring
// Rewrite the given attribute (e.g. href/src) on every element carrying it
// so relative URLs become absolute, resolving against <base href> when
// present, otherwise against rootUrl.
function absolutize($, rootUrl, attr) {
  var baseUrl = $('base').attr('href');
  $('[' + attr + ']').each(function (_, node) {
    var url = getAttrs(node)[attr];
    if (!url) return;
    setAttr(node, attr, URL.resolve(baseUrl || rootUrl, url));
  });
}
// Make every URL inside srcset attributes absolute, preserving any size
// descriptors and deduplicating the resulting candidate list.
function absolutizeSet($, rootUrl, $content) {
  $('[srcset]', $content).each(function (_, node) {
    var urlSet = getAttrs(node).srcset;
    if (!urlSet) return;
    // a comma should be considered part of the candidate URL unless preceded
    // by a descriptor; descriptors can only contain positive numbers followed
    // immediately by either 'w' or 'x'; space characters inside the URL
    // should be encoded (%20 or +)
    var candidates = urlSet.match(/(?:\s*)(\S+(?:\s*[\d.]+[wx])?)(?:\s*,\s*)?/g);
    if (!candidates) return;
    var absoluteCandidates = candidates.map(function (candidate) {
      // a candidate URL cannot start or end with a comma; descriptors are
      // separated from the URLs by unescaped whitespace
      var parts = candidate.trim().replace(/,$/, '').split(/\s+/);
      parts[0] = URL.resolve(rootUrl, parts[0]);
      return parts.join(' ');
    });
    setAttr(node, 'srcset', _toConsumableArray(new set(absoluteCandidates)).join(', '));
  });
}
// Absolutize href, src, and srcset URLs throughout the document; returns
// $content for chaining (mutation happens in place).
function makeLinksAbsolute$$1($content, $, url) {
  absolutize($, url, 'href');
  absolutize($, url, 'src');
  absolutizeSet($, url, $content);
  return $content;
}
// strips all tags from a string of text
// Given a node, determine if it's article-like enough to return
// Return a plain { name: value } map of a node's attributes. cheerio nodes
// expose a plain `attribs` object, which is returned directly; browser DOM
// nodes expose an `attributes` NamedNodeMap, which is normalized here.
function getAttrs(node) {
  var attribs = node.attribs;
  var attributes = node.attributes;
  if (attribs || !attributes) {
    return attribs;
  }
  return _Reflect$ownKeys(attributes).reduce(function (acc, index) {
    var attr = attributes[index];
    // skip entries lacking a name or value (e.g. NamedNodeMap's length key)
    if (attr.name && attr.value) {
      acc[attr.name] = attr.value;
    }
    return acc;
  }, {});
}
// Set an attribute on a node, handling both cheerio nodes (plain `attribs`
// object) and browser DOM nodes (setAttribute). Returns the node.
function setAttr(node, attr, val) {
  if (node.attribs) {
    node.attribs[attr] = val;
    return node;
  }
  if (node.attributes) {
    node.setAttribute(attr, val);
  }
  return node;
}
// DOM manipulation
// Same interop unwrapping as _interopDefault above, but via the transpiled
// _typeof helper (Symbol-aware typeof).
function _interopDefault$1(ex) {
  if (ex && _typeof(ex) === 'object' && 'default' in ex) {
    return ex['default'];
  }
  return ex;
}
var _regeneratorRuntime = _interopDefault$1 ( regenerator ) ;
var _objectSpread = _interopDefault$1 ( objectSpread ) ;
var _objectWithoutProperties = _interopDefault$1 ( objectWithoutProperties ) ;
var _asyncToGenerator = _interopDefault$1 ( asyncToGenerator ) ;
var URL$1 = _interopDefault$1 ( URL ) ;
var cheerio$1 = _interopDefault$1 ( cheerio ) ;
var TurndownService = _interopDefault$1 ( turndown ) ;
var iconv = _interopDefault$1 ( iconvLite ) ;
var _parseInt$1 = _interopDefault$1 ( _parseInt ) ;
var _slicedToArray$1 = _interopDefault$1 ( _slicedToArray ) ;
var _Promise = _interopDefault$1 ( promise ) ;
var request = _interopDefault$1 ( postmanRequest ) ;
var _Reflect$ownKeys$1 = _interopDefault$1 ( _Reflect$ownKeys ) ;
var _toConsumableArray$1 = _interopDefault$1 ( _toConsumableArray ) ;
var _defineProperty = _interopDefault$1 ( defineProperty ) ;
var _parseFloat$1 = _interopDefault$1 ( _parseFloat ) ;
var _Set = _interopDefault$1 ( set ) ;
var _typeof$1 = _interopDefault$1 ( _typeof ) ;
var _getIterator$1 = _interopDefault$1 ( _getIterator ) ;
var _Object$assign = _interopDefault$1 ( assign ) ;
var _Object$keys = _interopDefault$1 ( keys ) ;
var stringDirection$1 = _interopDefault$1 ( stringDirection ) ;
var validUrl$1 = _interopDefault$1 ( validUrl ) ;
var moment = _interopDefault$1 ( momentTimezone ) ;
var parseFormat = _interopDefault$1 ( momentParseformat ) ;
var wuzzy$1 = _interopDefault$1 ( wuzzy ) ;
var difflib$1 = _interopDefault$1 ( difflib ) ;
var _Array$from = _interopDefault$1 ( from ) ;
var ellipsize$1 = _interopDefault$1 ( ellipsize ) ;
var _Array$isArray = _interopDefault$1 ( isArray ) ;
// Runs of 2+ whitespace characters, except inside <pre>, <code>, or
// <textarea> blocks, where whitespace is significant.
var NORMALIZE_RE$1 = /\s{2,}(?![^<>]*<\/(pre|code|textarea)>)/g;

// Collapse repeated whitespace into single spaces and trim the result.
function normalizeSpaces$1(text) {
  var collapsed = text.replace(NORMALIZE_RE$1, ' ');
  return collapsed.trim();
} // Given a node type to search for, and a list of regular expressions,
// look to see if this extraction can be found in the URL. Expects
// that each expression in r_list will return group(1) as the proper
// string to be cleaned.
// Only used for date_published currently.
// Given a url and a list of regular expressions, return the first capture
// group of the first expression that matches, or null when none match.
// Expects each expression to expose the desired value as group(1).
// Only used for date_published currently.
function extractFromUrl$1(url, regexList) {
  var matchRe = regexList.find(function (re) {
    return re.test(url);
  });
  return matchRe ? matchRe.exec(url)[1] : null;
} // An expression that looks to try to find the page digit within a URL, if
// it exists.
// Matches:
// page=1
// pg=1
// p=1
// paging=12
// pag=7
// pagination/1
// paging/88
// pa/83
// p/11
//
// Does not match:
// pg=102
// page:2
// Group 6 captures the 1-3 digit page number (see examples above).
var PAGE_IN_HREF_RE$1 = new RegExp('(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|/)([0-9]{1,3})', 'i');
// At least one ASCII letter anywhere in the string.
var HAS_ALPHA_RE$1 = /[a-z]/i;
// Entirely ASCII letters.
var IS_ALPHA_RE$1 = /^[a-z]+$/i;
// Entirely digits.
var IS_DIGIT_RE$1 = /^[0-9]+$/i;
// Extracts the charset name from a Content-Type style header value.
var ENCODING_RE$1 = /charset=([\w-]+)\b/;
// Fallback when no (recognized) charset is declared.
var DEFAULT_ENCODING$1 = 'utf-8';
// Pull a probable page number (capture group 6 of PAGE_IN_HREF_RE$1) out
// of a URL. Values of 100 or more are assumed not to be page numbers;
// returns null on no match or an over-large number.
function pageNumFromUrl$1(url) {
  var matches = url.match(PAGE_IN_HREF_RE$1);
  if (!matches) return null;
  var pageNum = _parseInt$1(matches[6], 10);
  return pageNum < 100 ? pageNum : null;
}
// Strip a #fragment and any single trailing slash from a URL.
function removeAnchor$1(url) {
  var withoutFragment = url.split('#')[0];
  return withoutFragment.replace(/\/$/, '');
}
// Decide whether a URL path segment should be kept when deriving the
// article base URL. `index` counts from the END of the path (0 is the
// last segment).
//
// NOTE(review): the original code had an additional branch here intended
// to drop short, purely-numeric first/second segments ("it's probably a
// page number. Remove it.") — but it assigned `true`, making it a no-op
// that contradicted its own comment. The dead branch is removed; runtime
// behavior is unchanged. Short numeric segments are still dropped by the
// final check whenever the last segment contains no letters.
//
// :param segment: a single path segment (no slashes)
// :param index: the segment's position, counted from the end of the path
// :param firstSegmentHasLetters: whether the last path segment had alphas
// :return boolean: true if the segment should be kept
function isGoodSegment$1(segment, index, firstSegmentHasLetters) {
  // If this is the first url_segment and it's just "index", remove it
  if (index === 0 && segment.toLowerCase() === 'index') {
    return false;
  }

  // If our first or second url_segment is smaller than 3 characters,
  // and the first url_segment had no alphas, remove it.
  if (index < 2 && segment.length < 3 && !firstSegmentHasLetters) {
    return false;
  }

  return true;
}
// pagination data exists in it. Useful for comparing to other links
// that might have pagination data within them.
// Take a URL, and return the article base of said URL. That is, no
// pagination data exists in it. Useful for comparing to other links
// that might have pagination data within them.
function articleBaseUrl$1(url, parsed) {
  var parsedUrl = parsed || URL$1.parse(url);
  var protocol = parsedUrl.protocol;
  var host = parsedUrl.host;
  var path = parsedUrl.path;

  var firstSegmentHasLetters = false;
  var cleanedSegments = [];

  path.split('/').reverse().forEach(function (rawSegment, index) {
    var segment = rawSegment;

    // Split off and save anything that looks like a file type.
    if (segment.includes('.')) {
      var pieces = segment.split('.');
      if (IS_ALPHA_RE$1.test(pieces[1])) {
        segment = pieces[0];
      }
    }

    // If our first or second segment has anything looking like a page
    // number, remove it.
    if (index < 2 && PAGE_IN_HREF_RE$1.test(segment)) {
      segment = segment.replace(PAGE_IN_HREF_RE$1, '');
    }

    // The first segment here is actually the LAST bit of the URL; whether
    // it contains letters tells us if we're on something like "/2/".
    if (index === 0) {
      firstSegmentHasLetters = HAS_ALPHA_RE$1.test(segment);
    }

    // If it's not marked for deletion, keep it.
    if (isGoodSegment$1(segment, index, firstSegmentHasLetters)) {
      cleanedSegments.push(segment);
    }
  });

  return protocol + '//' + host + cleanedSegments.reverse().join('/');
} // Given a string, return True if it appears to have an ending sentence
// within it, false otherwise.
var SENTENCE _END _RE$1 = new RegExp ( '.( |$)' ) ;
function hasSentenceEnd$1 ( text ) {
return SENTENCE _END _RE$1 . test ( text ) ;
}
// Return the first `words` whitespace-separated tokens of `content`,
// joined by single spaces. Defaults to 10 words.
function excerptContent$1(content, words = 10) {
  var tokens = content.trim().split(/\s+/);
  return tokens.slice(0, words).join(' ');
} // used in our fetchResource function to
// ensure correctly encoded responses
// Derive a supported character encoding from a Content-Type-style string
// (e.g. "text/html; charset=iso-8859-1"). Falls back to utf-8 when the
// candidate isn't recognized by iconv-lite.
function getEncoding$1(str) {
  var candidate = str;
  var matches = ENCODING_RE$1.exec(candidate);
  if (matches !== null) {
    candidate = matches[1];
  }
  return iconv.encodingExists(candidate) ? candidate : DEFAULT_ENCODING$1;
}
// Browser contexts send no custom headers; in Node we spoof a desktop
// Chrome User-Agent.
var REQUEST_HEADERS = cheerio$1.browser ? {} : {
  'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
};
// The number of milliseconds to attempt to fetch a resource before timing out.
var FETCH_TIMEOUT = 10000;
// Content types that we do not extract content from.
var BAD_CONTENT_TYPES = ['audio/mpeg', 'image/gif', 'image/jpeg', 'image/jpg'];
var BAD_CONTENT_TYPES_RE = new RegExp("^(".concat(BAD_CONTENT_TYPES.join('|'), ")$"), 'i');
// Use this setting as the maximum size an article can be
// for us to attempt parsing. Defaults to 5 MB.
var MAX_CONTENT_LENGTH = 5242880; // Turn the global proxy on or off
// Promisified wrapper around postman-request: resolves with
// { body, response }, rejects with the transport error.
function get(options) {
  return new _Promise(function (resolve, reject) {
    request(options, function (err, response, body) {
      if (err) {
        reject(err);
        return;
      }
      resolve({ body: body, response: response });
    });
  });
} // Evaluate a response to ensure it's something we should be keeping.
// This does not validate in the sense of a response being 200 or not.
// Validation here means that we haven't found reason to bail from
// further processing of this url.
// Evaluate a response to ensure it's something we should be keeping.
// This does not validate in the sense of a response being 200 or not.
// Validation here means that we haven't found reason to bail from
// further processing of this url.
//
// Throws on bad status codes (unless parseNon200 is set), blacklisted
// content types, and over-sized payloads; returns true otherwise.
function validateResponse(response, parseNon200 = false) {
  // statusMessage is only unset in nock-driven tests, in which case the
  // statusCode check (only 200 for OK responses in tests) stands in for it.
  var badStatus =
    (response.statusMessage && response.statusMessage !== 'OK') ||
    response.statusCode !== 200;
  if (badStatus) {
    if (!response.statusCode) {
      throw new Error("Unable to fetch content. Original exception was ".concat(response.error));
    }
    if (!parseNon200) {
      throw new Error("Resource returned a response status code of ".concat(response.statusCode, " and resource was instructed to reject non-200 status codes."));
    }
  }

  var contentType = response.headers['content-type'];
  var contentLength = response.headers['content-length'];

  // Check that the content is not in BAD_CONTENT_TYPES
  if (BAD_CONTENT_TYPES_RE.test(contentType)) {
    throw new Error("Content-type for this resource was ".concat(contentType, " and is not allowed."));
  }

  // Check that the content length is below maximum
  if (contentLength > MAX_CONTENT_LENGTH) {
    throw new Error("Content for this resource was too large. Maximum content length is ".concat(MAX_CONTENT_LENGTH, "."));
  }

  return true;
} // Grabs the last two pieces of the URL and joins them back together
// TODO: This should gracefully handle timeouts and raise the
// proper exceptions on the many failure cases of HTTP.
// TODO: Ensure we are not fetching something enormous. Always return
// unicode content for HTML, with charset conversion.
function fetchResource ( _x , _x2 ) {
return _fetchResource . apply ( this , arguments ) ;
}
// Generated (babel async-to-generator + regenerator) body of fetchResource.
// State map: case 0 reads the optional headers argument and builds request
// options; case 5 receives the awaited get() result; cases 8..13 form the
// try/catch around validateResponse — success returns { body, response },
// failure returns a soft { error, message } object instead of throwing.
function _fetchResource() {
  _fetchResource = _asyncToGenerator(
  /*#__PURE__*/
  _regeneratorRuntime.mark(function _callee(url, parsedUrl) {
    var headers, options, _ref2, response, body, _args = arguments;
    return _regeneratorRuntime.wrap(function _callee$(_context) {
      while (1) {
        switch (_context.prev = _context.next) {
          case 0:
            // optional third argument: extra request headers
            headers = _args.length > 2 && _args[2] !== undefined ? _args[2] : {};
            parsedUrl = parsedUrl || URL$1.parse(encodeURI(url));
            options = _objectSpread({
              url: parsedUrl.href,
              headers: _objectSpread({}, REQUEST_HEADERS, headers),
              timeout: FETCH_TIMEOUT,
              // Accept cookies
              jar: true,
              // Set to null so the response returns as binary and body as buffer
              // https://github.com/request/request#requestoptions-callback
              encoding: null,
              // Accept and decode gzip
              gzip: true,
              // Follow any non-GET redirects
              followAllRedirects: true
            }, typeof window !== 'undefined' ? {} : {
              // Follow GET redirects; this option is for Node only
              followRedirect: true
            });
            _context.next = 5;
            return get(options);
          case 5:
            _ref2 = _context.sent;
            response = _ref2.response;
            body = _ref2.body;
            _context.prev = 8; // begin try block
            validateResponse(response);
            return _context.abrupt("return", {
              body: body,
              response: response
            });
          case 13:
            // catch block: validation failed — return a soft error object
            _context.prev = 13;
            _context.t0 = _context["catch"](8);
            return _context.abrupt("return", {
              error: true,
              message: _context.t0.message
            });
          case 16:
          case "end":
            return _context.stop();
        }
      }
    }, _callee, this, [[8, 13]]);
  }));
  return _fetchResource.apply(this, arguments);
}
// Copy the value of attribute `from$$1` into attribute `to` on every meta
// tag carrying it, then drop the original attribute.
function convertMetaProp($, from$$1, to) {
  $('meta[' + from$$1 + ']').each(function (_, node) {
    var $node = $(node);
    $node.attr(to, $node.attr(from$$1));
    $node.removeAttr(from$$1);
  });
  return $;
} // For ease of use in extracting from meta tags,
// replace the "content" attribute on meta tags with the
// "value" attribute.
//
// In addition, normalize 'property' attributes to 'name' for ease of
// querying later. See, e.g., og or twitter meta tags.
// Normalize meta tags: "content" attributes become "value", and
// "property" attributes become "name" (eases querying og/twitter tags).
function normalizeMetaTags($) {
  var $withValues = convertMetaProp($, 'content', 'value');
  return convertMetaProp($withValues, 'property', 'name');
} // Spacer images to be removed
// Spacer images to be removed (transparent shims, spacer gifs, etc.).
var SPACER_RE$1 = new RegExp('transparent|spacer|blank', 'i');
// The class we will use to mark elements we want to keep
// but would normally remove.
var KEEP_CLASS$1 = 'mercury-parser-keep';
// Embeds (video players) that should survive junk-tag stripping.
var KEEP_SELECTORS$1 = ['iframe[src^="https://www.youtube.com"]', 'iframe[src^="https://www.youtube-nocookie.com"]', 'iframe[src^="http://www.youtube.com"]', 'iframe[src^="https://player.vimeo"]', 'iframe[src^="http://player.vimeo"]', 'iframe[src^="https://www.redditmedia.com"]'];
// A list of tags to strip from the output if we encounter them.
var STRIP_OUTPUT_TAGS$1 = ['title', 'script', 'noscript', 'link', 'style', 'hr', 'embed', 'iframe', 'object'];
// cleanAttributes: the only attributes allowed to survive on output elements.
var WHITELIST_ATTRS$1 = ['src', 'srcset', 'sizes', 'type', 'href', 'class', 'id', 'alt', 'xlink:href', 'width', 'height'];
var WHITELIST_ATTRS_RE$1 = new RegExp("^(".concat(WHITELIST_ATTRS$1.join('|'), ")$"), 'i');
// removeEmpty / cleanConditionally candidate container tags.
var CLEAN_CONDITIONALLY_TAGS$1 = ['ul', 'ol', 'table', 'div', 'button', 'form'].join(',');
// cleanHeaders
var HEADER_TAGS$1 = ['h2', 'h3', 'h4', 'h5', 'h6'];
var HEADER_TAG_LIST$1 = HEADER_TAGS$1.join(','); // // CONTENT FETCHING CONSTANTS ////
// A list of strings that can be considered unlikely candidates when
// extracting content from a resource. These strings are joined together
// and then tested for existence using re:test, so may contain simple,
// non-pipe style regular expression queries if necessary.
var UNLIKELY_CANDIDATES_BLACKLIST$2 = ['ad-break', 'adbox', 'advert', 'addthis', 'agegate', 'aux', 'blogger-labels', 'combx', 'comment', 'conversation', 'disqus', 'entry-unrelated', 'extra', 'foot', // 'form', // This is too generic, has too many false positives
'header', 'hidden', 'loader', 'login', // Note: This can hit 'blogindex'.
'menu', 'meta', 'nav', 'outbrain', 'pager', 'pagination', 'predicta', // readwriteweb inline ad box
'presence_control_external', // lifehacker.com container full of false positives
'popup', 'printfriendly', 'related', 'remove', 'remark', 'rss', 'share', 'shoutbox', 'sidebar', 'sociable', 'sponsor', 'taboola', 'tools'];
// A list of strings that can be considered LIKELY candidates when
// extracting content from a resource. Essentially, the inverse of the
// blacklist above - if something matches both blacklist and whitelist,
// it is kept. This is useful, for example, if something has a className
// of "rss-content entry-content". It matched 'rss', so it would normally
// be removed, however, it's also the entry content, so it should be left
// alone.
//
// These strings are joined together and then tested for existence using
// re:test, so may contain simple, non-pipe style regular expression queries
// if necessary.
var UNLIKELY_CANDIDATES_WHITELIST$2 = ['and', 'article', 'body', 'blogindex', 'column', 'content', 'entry-content-asset', 'format', // misuse of form
'hfeed', 'hentry', 'hatom', 'main', 'page', 'posts', 'shadow'];
// A list of tags which, if found inside, should cause a <div /> to NOT
// be turned into a paragraph tag. Shallow div tags without these elements
// should be turned into <p /> tags.
var DIV_TO_P_BLOCK_TAGS$2 = ['a', 'blockquote', 'dl', 'div', 'img', 'p', 'pre', 'table'].join(',');
// Hints that a node's className/id marks it as a likely top candidate for
// an article container.
//
// TODO: Perhaps have these scale based on their odds of being quality?
var POSITIVE_SCORE_HINTS$2 = ['article', 'articlecontent', 'instapaper_body', 'blog', 'body', 'content', 'entry-content-asset', 'entry', 'hentry', 'main', 'Normal', 'page', 'pagination', 'permalink', 'post', 'story', 'text', '[-_]copy', // usatoday
'\\Bcopy'];
// The above list, joined into a matching regular expression.
var POSITIVE_SCORE_RE$2 = new RegExp(POSITIVE_SCORE_HINTS$2.join('|'), 'i');
// Hints that a node's className/id marks it as an unlikely article
// container (Readability publisher-specific guidelines).
//
// TODO: Perhaps have these scale based on their odds of being quality?
var NEGATIVE_SCORE_HINTS$2 = ['adbox', 'advert', 'author', 'bio', 'bookmark', 'bottom', 'byline', 'clear', 'com-', 'combx', 'comment', 'comment\\B', 'contact', 'copy', 'credit', 'crumb', 'date', 'deck', 'excerpt', 'featured', // tnr.com has a featured_content which throws us off
'foot', 'footer', 'footnote', 'graf', 'head', 'info', 'infotext', // newscientist.com copyright
'instapaper_ignore', 'jump', 'linebreak', 'link', 'masthead', 'media', 'meta', 'modal', 'outbrain', // slate.com junk
'promo', 'pr_', // autoblog - press release
'related', 'respond', 'roundcontent', // lifehacker restricted content warning
'scroll', 'secondary', 'share', 'shopping', 'shoutbox', 'side', 'sidebar', 'sponsor', 'stamp', 'sub', 'summary', 'tags', 'tools', 'widget'];
// The above list, joined into a matching regular expression.
var NEGATIVE_SCORE_RE$2 = new RegExp(NEGATIVE_SCORE_HINTS$2.join('|'), 'i');
// Selector to try to determine if a page is wordpress. Not always successful.
var IS_WP_SELECTOR$1 = 'meta[name=generator][value^=WordPress]';
// Matches pagination-ish words: page, paging, paginate(d), ...
var PAGE_RE$1 = new RegExp('pag(e|ing|inat)', 'i');
// Block-level tags; see http://bit.ly/qneNIT
var BLOCK_LEVEL_TAGS$2 = ['article', 'aside', 'blockquote', 'body', 'br', 'button', 'canvas', 'caption', 'col', 'colgroup', 'dd', 'div', 'dl', 'dt', 'embed', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li', 'map', 'object', 'ol', 'output', 'p', 'pre', 'progress', 'section', 'table', 'tbody', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'ul', 'video'];
var BLOCK_LEVEL_TAGS_RE$2 = new RegExp("^(".concat(BLOCK_LEVEL_TAGS$2.join('|'), ")$"), 'i');
// The removal is implemented as a blacklist and whitelist, this test finds
// blacklisted elements that aren't whitelisted. We do this all in one
// expression-both because it's only one pass, and because this skips the
// serialization for whitelisted nodes.
var candidatesBlacklist$2 = UNLIKELY_CANDIDATES_BLACKLIST$2.join('|');
var CANDIDATES_BLACKLIST$2 = new RegExp(candidatesBlacklist$2, 'i');
var candidatesWhitelist$2 = UNLIKELY_CANDIDATES_WHITELIST$2.join('|');
var CANDIDATES_WHITELIST$2 = new RegExp(candidatesWhitelist$2, 'i');
// Loop through the provided document and remove any non-link nodes
// that are unlikely candidates for article content.
//
// Links are ignored because there are very often links to content
// that are identified as non-body-content, but may be inside
// article-like content.
//
// :param $: a cheerio object to strip nodes from
// :return $: the cleaned cheerio object
function stripUnlikelyCandidates$1($) {
  $('*').not('a').each(function (index, node) {
    var $node = $(node);
    var classes = $node.attr('class');
    var id = $node.attr('id');
    if (!id && !classes) return;
    var classAndId = (classes || '') + ' ' + (id || '');
    // Whitelisted nodes are always kept, even when also blacklisted.
    var keep = CANDIDATES_WHITELIST$2.test(classAndId);
    if (!keep && CANDIDATES_BLACKLIST$2.test(classAndId)) {
      $node.remove();
    }
  });
  return $;
} // Another good candidate for refactoring/optimizing.
// Very imperative code, I don't love it. - AP
// Given cheerio object, convert consecutive <br /> tags into
// <p /> tags instead.
//
// :param $: A cheerio object
// Given cheerio object, convert consecutive <br /> tags into
// <p /> tags instead: a run of brs collapses into one paragraph break.
//
// :param $: A cheerio object
function brsToPs$$1($) {
  var collapsing = false;
  $('br').each(function (index, element) {
    var $element = $(element);
    var next = $element.next().get(0);
    if (next && next.tagName.toLowerCase() === 'br') {
      // Middle of a run: drop this br and keep collapsing.
      collapsing = true;
      $element.remove();
    } else if (collapsing) {
      // Last br of a run: turn it into a paragraph.
      collapsing = false;
      paragraphize$1(element, $, true);
    }
  });
  return $;
} // make sure it conforms to the constraints of a P tag (I.E. does
// not contain any other block tags.)
//
// If the node is a <br />, it treats the following inline siblings
// as if they were its children.
//
// :param node: The node to paragraphize; this is a raw node
// :param $: The cheerio object to handle dom manipulation
// :param br: Whether or not the passed node is a br
// Given a node, make sure it conforms to the constraints of a P tag
// (i.e. does not contain any other block tags).
//
// If the node is a <br />, it treats the following inline siblings
// as if they were its children, moving them into a fresh <p>.
//
// :param node: The node to paragraphize; this is a raw node
// :param $: The cheerio object to handle dom manipulation
// :param br: Whether or not the passed node is a br
function paragraphize$1(node, $, br = false) {
  if (!br) {
    return $;
  }
  var $node = $(node);
  var p = $('<p></p>');
  var sibling = node.nextSibling;
  // Pull each following text node / inline element into the new p,
  // stopping at the first block-level element.
  while (sibling && !(sibling.tagName && BLOCK_LEVEL_TAGS_RE$2.test(sibling.tagName))) {
    var next = sibling.nextSibling;
    $(sibling).appendTo(p);
    sibling = next;
  }
  $node.replaceWith(p);
  $node.remove();
  return $;
}
// Convert <div>s containing no block-level children into <p>s.
function convertDivs$1($) {
  $('div').each(function (index, div) {
    var $div = $(div);
    var hasBlockChildren = $div.children(DIV_TO_P_BLOCK_TAGS$2).length > 0;
    if (!hasBlockChildren) {
      convertNodeTo$$1($div, $, 'p');
    }
  });
  return $;
}
// Convert <span>s that are not nested inside p/div/li/figcaption into <p>s.
function convertSpans$2($) {
  $('span').each(function (index, span) {
    var $span = $(span);
    var insideTextContainer = $span.parents('p, div, li, figcaption').length > 0;
    if (!insideTextContainer) {
      convertNodeTo$$1($span, $, 'p');
    }
  });
  return $;
} // Loop through the provided doc, and convert any p-like elements to
// actual paragraph tags.
//
// Things fitting this criteria:
// * Multiple consecutive <br /> tags.
// * <div /> tags without block level elements inside of them
// * <span /> tags who are not children of <p /> or <div /> tags.
//
// :param $: A cheerio object to search
// :return cheerio object with new p elements
// (By-reference mutation, though. Returned just for convenience.)
// Run the three p-conversion passes (brs, divs, spans) in order.
function convertToParagraphs$$1($) {
  return convertSpans$2(convertDivs$1(brsToPs$$1($)));
}
// Replace $node with a `tag` element (default <p>) carrying the same
// attributes and contents.
function convertNodeTo$$1($node, $, tag = 'p') {
  var node = $node.get(0);
  if (!node) {
    return $;
  }
  var attrs = getAttrs$1(node) || {};
  var attribString = _Reflect$ownKeys$1(attrs).map(function (key) {
    return key + '=' + attrs[key];
  }).join(' ');
  var html;
  if ($.browser) {
    // In the browser, the contents of noscript tags aren't rendered, therefore
    // transforms on the noscript tag (commonly used for lazy-loading) don't work
    // as expected. This test case handles that
    html = node.tagName.toLowerCase() === 'noscript' ? $node.text() : $node.html();
  } else {
    html = $node.contents();
  }
  $node.replaceWith('<' + tag + ' ' + attribString + '>' + html + '</' + tag + '>');
  return $;
}
// Remove images that explicitly have very small heights or widths, because
// they are most likely shims or icons, which aren't very useful for
// reading. Otherwise drop any explicit height so images scale with width
// without breaking aspect ratio.
function cleanForHeight$1($img, $) {
  var height = _parseInt$1($img.attr('height'), 10);
  var width = _parseInt$1($img.attr('width'), 10) || 20;
  var effectiveHeight = height || 20;
  if (effectiveHeight < 10 || width < 10) {
    $img.remove();
  } else if (height) {
    // Never keep an explicit height on images we retain.
    $img.removeAttr('height');
  }
  return $;
} // Cleans out images where the source string matches transparent/spacer/etc
// TODO This seems very aggressive - AP
// TODO This seems very aggressive - AP
function removeSpacers$1($img, $) {
  var src = $img.attr('src');
  if (SPACER_RE$1.test(src)) {
    $img.remove();
  }
  return $;
}
// Run the per-image cleanups (tiny shims, spacer sources) over an article.
function cleanImages$1($article, $) {
  $article.find('img').each(function (index, img) {
    var $img = $(img);
    cleanForHeight$1($img, $);
    removeSpacers$1($img, $);
  });
  return $;
}
// Tag elements matching the given selectors (default KEEP_SELECTORS$1,
// plus same-origin iframes when `url` is given) with the keep class so
// later stripping passes leave them alone.
function markToKeep$1(article, $, url, tags = []) {
  var selectors = (tags.length === 0 ? KEEP_SELECTORS$1 : tags).slice();
  if (url) {
    var parsed = URL$1.parse(url);
    selectors.push('iframe[src^="' + parsed.protocol + '//' + parsed.hostname + '"]');
  }
  $(selectors.join(','), article).addClass(KEEP_CLASS$1);
  return $;
}
// Remove junk output tags from `article`, sparing any element marked with
// the mercury-parser-keep class.
function stripJunkTags$1(article, $, tags = []) {
  const selectors = tags.length === 0 ? STRIP_OUTPUT_TAGS$1 : tags;
  $(selectors.join(','), article)
    .not('.' + KEEP_CLASS$1)
    .remove();
  return $;
} // by the title extractor instead. If there's less than 3 of them (<3),
// strip them. Otherwise, turn 'em into H2s.
// H1s are typically the title, handled by the title extractor: fewer than
// three h1s are stripped outright, otherwise all are demoted to h2.
function cleanHOnes$$1(article, $) {
  var $hOnes = $('h1', article);
  var demote = $hOnes.length >= 3;
  $hOnes.each(function (index, node) {
    if (demote) {
      convertNodeTo$$1($(node), $, 'h2');
    } else {
      $(node).remove();
    }
  });
  return $;
}
// Strip every attribute not matching the whitelist regex from every node,
// then drop the internal keep class from the final markup.
function removeAllButWhitelist$1($article, $) {
  $article.find('*').each(function (index, node) {
    var attrs = getAttrs$1(node);
    var whitelisted = {};
    _Reflect$ownKeys$1(attrs).forEach(function (attr) {
      if (WHITELIST_ATTRS_RE$1.test(attr)) {
        whitelisted[attr] = attrs[attr];
      }
    });
    setAttrs$1(node, whitelisted);
  });
  // Remove the mercury-parser-keep class from result
  $(".".concat(KEEP_CLASS$1), $article).removeClass(KEEP_CLASS$1);
  return $article;
} // Remove attributes like style or align
// Whitelist-filter attributes on the article. Operates on the parent when
// one exists, because $article is usually wrapped in a scoring div by now.
function cleanAttributes$$1($article, $) {
  var $target = $article.parent().length ? $article.parent() : $article;
  return removeAllButWhitelist$1($target, $);
}
// Remove paragraphs that contain no text and no embedded media.
function removeEmpty$1($article, $) {
  $article.find('p').each(function (index, p) {
    var $p = $(p);
    var hasMedia = $p.find('iframe, img').length > 0;
    if (!hasMedia && $p.text().trim() === '') {
      $p.remove();
    }
  });
  return $;
} // // CONTENT FETCHING CONSTANTS ////
// for a document.
// Tags that can never serve as the top content candidate.
var NON_TOP_CANDIDATE_TAGS$1$1 = ['br', 'b', 'i', 'label', 'hr', 'area', 'base', 'basefont', 'input', 'img', 'link', 'meta'];
var NON_TOP_CANDIDATE_TAGS_RE$1$1 = new RegExp("^(".concat(NON_TOP_CANDIDATE_TAGS$1$1.join('|'), ")$"), 'i'); // A list of [parent, child] selector pairs that specify, very clearly, either hNews or other
// very content-specific style content, like Blogger templates.
// More examples here: http://microformats.org/wiki/blog-post-formats
var HNEWS_CONTENT_SELECTORS$1$1 = [['.hentry', '.entry-content'], ['entry', '.entry-content'], ['.entry', '.entry_content'], ['.post', '.postbody'], ['.post', '.post_body'], ['.post', '.post-body']];
// Class/id fragments hinting a node wraps a photo or figure.
var PHOTO_HINTS$1$1 = ['figure', 'photo', 'image', 'caption'];
var PHOTO_HINTS_RE$1$1 = new RegExp(PHOTO_HINTS$1$1.join('|'), 'i'); // A list of strings that denote a positive scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
var POSITIVE_SCORE_HINTS$1$1 = ['article', 'articlecontent', 'instapaper_body', 'blog', 'body', 'content', 'entry-content-asset', 'entry', 'hentry', 'main', 'Normal', 'page', 'pagination', 'permalink', 'post', 'story', 'text', '[-_]copy', // usatoday
'\\Bcopy']; // The above list, joined into a matching regular expression
var POSITIVE_SCORE_RE$1$1 = new RegExp(POSITIVE_SCORE_HINTS$1$1.join('|'), 'i'); // Readability publisher-specific guidelines
var READABILITY_ASSET$1$1 = new RegExp('entry-content-asset', 'i'); // A list of strings that denote a negative scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
var NEGATIVE_SCORE_HINTS$1$1 = ['adbox', 'advert', 'author', 'bio', 'bookmark', 'bottom', 'byline', 'clear', 'com-', 'combx', 'comment', 'comment\\B', 'contact', 'copy', 'credit', 'crumb', 'date', 'deck', 'excerpt', 'featured', // tnr.com has a featured_content which throws us off
'foot', 'footer', 'footnote', 'graf', 'head', 'info', 'infotext', // newscientist.com copyright
'instapaper_ignore', 'jump', 'linebreak', 'link', 'masthead', 'media', 'meta', 'modal', 'outbrain', // slate.com junk
'promo', 'pr_', // autoblog - press release
'related', 'respond', 'roundcontent', // lifehacker restricted content warning
'scroll', 'secondary', 'share', 'shopping', 'shoutbox', 'side', 'sidebar', 'sponsor', 'stamp', 'sub', 'summary', 'tags', 'tools', 'widget']; // The above list, joined into a matching regular expression
var NEGATIVE_SCORE_RE$1$1 = new RegExp(NEGATIVE_SCORE_HINTS$1$1.join('|'), 'i');
// Tags whose text content is scored like a paragraph.
var PARAGRAPH_SCORE_TAGS$1$1 = new RegExp('^(p|li|span|pre)$', 'i');
// Container tags that get a small fixed content score.
var CHILD_CONTENT_TAGS$1$1 = new RegExp('^(td|blockquote|ol|ul|dl)$', 'i');
// Tags that get a negative fixed score.
var BAD_TAGS$1$1 = new RegExp('^(address|form)$', 'i');
// Score a node's class/id against the positive and negative hint regexes.
// Photo hints and the Readability asset class add bonuses regardless.
function getWeight$1(node) {
  var classes = node.attr('class');
  var id = node.attr('id');
  var score = 0;
  if (id) {
    // if id exists, try to score on both positive and negative
    if (POSITIVE_SCORE_RE$1$1.test(id)) score += 25;
    if (NEGATIVE_SCORE_RE$1$1.test(id)) score -= 25;
  }
  if (classes) {
    // Only consult classes when the id contributed nothing.
    if (score === 0) {
      if (POSITIVE_SCORE_RE$1$1.test(classes)) score += 25;
      if (NEGATIVE_SCORE_RE$1$1.test(classes)) score -= 25;
    }
    // Photo hints always add a small bonus — "try to keep photos if we can".
    if (PHOTO_HINTS_RE$1$1.test(classes)) score += 10;
    // entry-content-asset is called out in the Readability publisher
    // guidelines (https://www.readability.com/developers/guidelines).
    if (READABILITY_ASSET$1$1.test(classes)) score += 25;
  }
  return score;
} // returns the score of a node based on
// Read the numeric `score` attribute off a node; null when unset/invalid.
function getScore$1($node) {
  var parsed = _parseFloat$1($node.attr('score'));
  return parsed || null;
} // return 1 for every comma in text
// Count the commas in a string (one scoring point each).
function scoreCommas$1(text) {
  var commas = text.match(/,/g);
  return commas === null ? 0 : commas.length;
}
var idkRe$1 = new RegExp('^(p|pre)$', 'i');
// Length bonus: one point per 50 characters, capped at 3.
// p/pre are tamped down harder than other tags.
function scoreLength$1(textLength) {
  var tagName = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 'p';
  var chunks = textLength / 50;
  if (!(chunks > 0)) {
    return 0;
  }
  // No idea why p or pre are being tamped down here
  // but just following the source for now
  var lengthBonus = idkRe$1.test(tagName) ? chunks - 2 : chunks - 1.25;
  return Math.min(Math.max(lengthBonus, 0), 3);
} // commas, etc. Higher is better.
// Score a paragraph-like node by its text: commas and length raise the
// score; very short text scores 0; a trailing colon is penalized.
function scoreParagraph$$1(node) {
  var text = node.text().trim();
  var textLength = text.length;
  // Paragraphs under 25 characters don't count at all.
  if (textLength < 25) {
    return 0;
  }
  var score = 1;
  score += scoreCommas$1(text);
  score += scoreLength$1(textLength);
  // Short setup paragraphs ending in ':' often introduce junk lists;
  // nudge them just below the cutoff threshold.
  if (text.slice(-1) === ':') {
    score -= 1;
  }
  return score;
}
// Persist a score onto the node as its `score` attribute.
function setScore$1($node, $, score) {
  $node.attr('score', score);
  return $node;
}
// Add `amount` to the node's score, initializing it first if needed.
function addScore$$1($node, $, amount) {
  try {
    var updated = getOrInitScore$$1($node, $) + amount;
    setScore$1($node, $, updated);
  } catch (e) {
    // Deliberately ignored; errors originate in scoreNode and are non-fatal.
  }
  return $node;
}
// Propagate a quarter of a node's score up to its parent.
function addToParent$$1(node, $, score) {
  var parent = node.parent();
  if (parent) {
    addScore$$1(parent, $, score * 0.25);
  }
  return node;
} // if not, initializes a score based on
// the node's tag type
// Return the node's existing score, or compute one from its tag (plus
// class/id weight when weightNodes is true), bubbling part to the parent.
function getOrInitScore$$1($node, $) {
  var weightNodes = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : true;
  var existing = getScore$1($node);
  if (existing) {
    return existing;
  }
  var score = scoreNode$$1($node);
  if (weightNodes) {
    score += getWeight$1($node);
  }
  addToParent$$1($node, $, score);
  return score;
} // just scores based on tag.
// Base score for a node determined purely by its tag name.
function scoreNode$$1($node) {
  var tagName = $node.get(0).tagName;
  // TODO: Consider ordering by most likely.
  // E.g., if divs are a more common tag on a page,
  // Could save doing that regex test on every node – AP
  if (PARAGRAPH_SCORE_TAGS$1$1.test(tagName)) {
    return scoreParagraph$$1($node);
  }
  var lowered = tagName.toLowerCase();
  if (lowered === 'div') {
    return 5;
  }
  if (CHILD_CONTENT_TAGS$1$1.test(tagName)) {
    return 3;
  }
  if (BAD_TAGS$1$1.test(tagName)) {
    return -3;
  }
  if (lowered === 'th') {
    return -5;
  }
  return 0;
}
// Convert a span node into a div (no-op for anything else or empty sets).
function convertSpans$1$1($node, $) {
  var node = $node.get(0);
  if (node && node.tagName === 'span') {
    convertNodeTo$$1($node, $, 'div');
  }
}
// Add score to a node if it exists, converting spans to divs first.
function addScoreTo$1($node, $, score) {
  if (!$node) return;
  convertSpans$1$1($node, $);
  addScore$$1($node, $, score);
}
// Score every unscored <p>/<pre>, then push the raw score to the parent
// and half of it to the grandparent.
function scorePs$1($, weightNodes) {
  $('p, pre').not('[score]').each(function (index, node) {
    // Seed this node's own score (initializing it if absent).
    var $node = $(node);
    $node = setScore$1($node, $, getOrInitScore$$1($node, $, weightNodes));
    var rawScore = scoreNode$$1($node);
    var $parent = $node.parent();
    addScoreTo$1($parent, $, rawScore, weightNodes);
    if ($parent) {
      // Add half of the individual content score to the grandparent.
      addScoreTo$1($parent.parent(), $, rawScore / 2, weightNodes);
    }
  });
  return $;
} // score content. Parents get the full value of their children's
// content score, grandparents half
// Score the whole document: hNews-marked containers get a large fixed
// boost, then paragraph scoring runs over the page.
function scoreContent$$1($) {
  var weightNodes = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : true;
  // First, look for special hNews based selectors and give them a big
  // boost, if they exist.
  HNEWS_CONTENT_SELECTORS$1$1.forEach(function (selectorPair) {
    var _selectorPair = _slicedToArray$1(selectorPair, 2),
        parentSelector = _selectorPair[0],
        childSelector = _selectorPair[1];
    $("".concat(parentSelector, " ").concat(childSelector)).each(function (index, node) {
      addScore$$1($(node).parent(parentSelector), $, 80);
    });
  });
  // Run twice on purpose: a single pass lost parent scores (previous
  // fix caused a bug). Not ideal, and should be fixed.
  scorePs$1($, weightNodes);
  scorePs$1($, weightNodes);
  return $;
} // it to see if any of them are decently scored. If they are, they
// may be split parts of the content (Like two divs, a preamble and
// a body.) Example:
// http://articles.latimes.com/2009/oct/14/business/fi-bigtvs14
// Collect the candidate plus any well-scored siblings into a wrapper div.
// Returns the candidate unchanged when it has no parent or nothing merged.
function mergeSiblings$1($candidate, topScore, $) {
  if (!$candidate.parent().length) {
    return $candidate;
  }
  // Siblings must score at least a quarter of the top score (min 10).
  var siblingScoreThreshold = Math.max(10, topScore * 0.25);
  var wrappingDiv = $('<div></div>');
  $candidate.parent().children().each(function (index, sibling) {
    var $sibling = $(sibling); // Ignore tags like BR, HR, etc
    if (NON_TOP_CANDIDATE_TAGS_RE$1$1.test(sibling.tagName)) {
      return null;
    }
    var siblingScore = getScore$1($sibling);
    if (siblingScore) {
      // The candidate itself always goes into the wrapper.
      if ($sibling.get(0) === $candidate.get(0)) {
        wrappingDiv.append($sibling);
      } else {
        var contentBonus = 0;
        var density = linkDensity$1($sibling); // If sibling has a very low link density,
        // give it a small bonus
        if (density < 0.05) {
          contentBonus += 20;
        } // If sibling has a high link density,
        // give it a penalty
        if (density >= 0.5) {
          contentBonus -= 20;
        } // If sibling node has the same class as
        // candidate, give it a bonus
        if ($sibling.attr('class') === $candidate.attr('class')) {
          contentBonus += topScore * 0.2;
        }
        var newScore = siblingScore + contentBonus;
        if (newScore >= siblingScoreThreshold) {
          return wrappingDiv.append($sibling);
        }
        // Paragraphs get a second chance based on their raw text:
        // long low-link text, or short link-free text ending a sentence.
        if (sibling.tagName === 'p') {
          var siblingContent = $sibling.text();
          var siblingContentLength = textLength$1(siblingContent);
          if (siblingContentLength > 80 && density < 0.25) {
            return wrappingDiv.append($sibling);
          }
          if (siblingContentLength <= 80 && density === 0 && hasSentenceEnd$1(siblingContent)) {
            return wrappingDiv.append($sibling);
          }
        }
      }
    }
    return null;
  });
  // If only the candidate itself was collected, skip the wrapper entirely.
  if (wrappingDiv.children().length === 1 && wrappingDiv.children().first().get(0) === $candidate.get(0)) {
    return $candidate;
  }
  return wrappingDiv;
} // candidate nodes we found and find the one with the highest score.
// Pick the highest-scored node on the page (merging in qualifying
// siblings); fall back to <body> when nothing was scored.
function findTopCandidate$$1($) {
  var $best;
  var topScore = 0;
  $('[score]').each(function (index, node) {
    // Ignore tags like BR, HR, etc
    if (NON_TOP_CANDIDATE_TAGS_RE$1$1.test(node.tagName)) {
      return;
    }
    var $node = $(node);
    var score = getScore$1($node);
    if (score > topScore) {
      topScore = score;
      $best = $node;
    }
  });
  // If we don't have a candidate, return the body
  // or whatever the first element is
  if (!$best) {
    return $('body') || $('*').first();
  }
  return mergeSiblings$1($best, topScore, $);
} // Scoring
function removeUnlessContent$1 ( $node , $ , weight ) {
// Explicitly save entry-content-asset tags, which are
// noted as valuable in the Publisher guidelines. For now
// this works everywhere. We may want to consider making
// this less of a sure-thing later.
if ( $node . hasClass ( 'entry-content-asset' ) ) {
return ;
}
var content = normalizeSpaces$1 ( $node . text ( ) ) ;
if ( scoreCommas$1 ( content ) < 10 ) {
var pCount = $ ( 'p' , $node ) . length ;
var inputCount = $ ( 'input' , $node ) . length ; // Looks like a form, too many inputs.
if ( inputCount > pCount / 3 ) {
$node . remove ( ) ;
return ;
}
var contentLength = content . length ;
var imgCount = $ ( 'img' , $node ) . length ; // Content is too short, and there are no images, so
// this is probably junk content.
if ( contentLength < 25 && imgCount === 0 ) {
$node . remove ( ) ;
return ;
}
var density = linkDensity$1 ( $node ) ; // Too high of link density, is probably a menu or
// something similar.
// console.log(weight, density, contentLength)
if ( weight < 25 && density > 0.2 && contentLength > 75 ) {
$node . remove ( ) ;
return ;
} // Too high of a link density, despite the score being
// high.
if ( weight >= 25 && density > 0.5 ) {
// Don't remove the node if it's a list and the
// previous sibling starts with a colon though. That
// means it's probably content.
var tagName = $node . get ( 0 ) . tagName . toLowerCase ( ) ;
var nodeIsList = tagName === 'ol' || tagName === 'ul' ;
if ( nodeIsList ) {
var previousNode = $node . prev ( ) ;
if ( previousNode && normalizeSpaces$1 ( previousNode . text ( ) ) . slice ( - 1 ) === ':' ) {
return ;
}
}
$node . remove ( ) ;
return ;
}
var scriptCount = $ ( 'script' , $node ) . length ; // Too many script tags, not enough content.
if ( scriptCount > 0 && contentLength < 150 ) {
$node . remove ( ) ;
}
}
} // Given an article, clean it of some superfluous content specified by
// tags. Things like forms, ads, etc.
//
// Tags is an array of tag name's to search through. (like div, form,
// etc)
//
// Return this same doc.
// Conditionally clean container tags (divs, forms, etc.): compute a score
// for each, drop negative-weight nodes, and vet the rest as content.
function cleanTags$$1($article, $) {
  $(CLEAN_CONDITIONALLY_TAGS$1, $article).each(function (index, node) {
    var $node = $(node);
    // Nodes flagged to keep — or containing a kept node — are untouchable.
    if ($node.hasClass(KEEP_CLASS$1) || $node.find(".".concat(KEEP_CLASS$1)).length > 0) {
      return;
    }
    var weight = getScore$1($node);
    if (!weight) {
      weight = getOrInitScore$$1($node, $);
      setScore$1($node, $, weight);
    }
    if (weight < 0) {
      // Negative weight: drop the node outright.
      $node.remove();
    } else {
      // Otherwise determine whether the node seems like content.
      removeUnlessContent$1($node, $, weight);
    }
  });
  return $;
}
// Remove header tags that duplicate the title, precede all paragraphs,
// or carry a negative class/id weight.
function cleanHeaders$1($article, $) {
  var title = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : '';
  $(HEADER_TAG_LIST$1, $article).each(function (index, header) {
    var $header = $(header);
    // Headers before every <p> are usually part of the title, a subtitle,
    // a datestamp, or a byline — all handled by other metadata handling.
    if ($($header, $article).prevAll('p').length === 0) {
      return $header.remove();
    }
    // Headers repeating the title exactly are redundant.
    var headerText = normalizeSpaces$1($(header).text());
    if (headerText === title) {
      return $header.remove();
    }
    // Negatively weighted headers are probably junk.
    if (getWeight$1($(header)) < 0) {
      return $header.remove();
    }
    return $header;
  });
  return $;
} // html to avoid later complications with multiple body tags.
// Convert <html> and <body> into plain divs so later processing never
// deals with multiple body tags.
function rewriteTopLevel$$1(article, $) {
  // Not using a context here because it's problematic when converting
  // the top-level/root node - AP
  var $result = convertNodeTo$$1($('html'), $, 'div');
  $result = convertNodeTo$$1($result('body'), $result, 'div');
  return $result;
}
// Rewrite every `attr` (href/src) on the page to an absolute URL,
// honoring a <base href> when present.
function absolutize$1($, rootUrl, attr) {
  var baseUrl = $('base').attr('href');
  var resolveAgainst = baseUrl || rootUrl;
  $("[".concat(attr, "]")).each(function (_, node) {
    var value = getAttrs$1(node)[attr];
    if (!value) return;
    setAttr$1(node, attr, URL$1.resolve(resolveAgainst, value));
  });
}
// Rewrite every srcset candidate URL inside $content to an absolute URL,
// preserving descriptors and de-duplicating candidates.
function absolutizeSet$1($, rootUrl, $content) {
  $('[srcset]', $content).each(function (_, node) {
    var urlSet = getAttrs$1(node).srcset;
    if (!urlSet) return;
    // a comma should be considered part of the candidate URL unless preceded by a descriptor
    // descriptors can only contain positive numbers followed immediately by either 'w' or 'x'
    // space characters inside the URL should be encoded (%20 or +)
    var candidates = urlSet.match(/(?:\s*)(\S+(?:\s*[\d.]+[wx])?)(?:\s*,\s*)?/g);
    if (!candidates) return;
    var resolved = candidates.map(function (candidate) {
      // a candidate URL cannot start or end with a comma
      // descriptors are separated from the URLs by unescaped whitespace
      var parts = candidate.trim().replace(/,$/, '').split(/\s+/);
      parts[0] = URL$1.resolve(rootUrl, parts[0]);
      return parts.join(' ');
    });
    // Dedupe before writing the attribute back.
    setAttr$1(node, 'srcset', _toConsumableArray$1(new _Set(resolved)).join(', '));
  });
}
// Make every href, src, and srcset in the content absolute against `url`.
function makeLinksAbsolute$$1($content, $, url) {
  absolutize$1($, url, 'href');
  absolutize$1($, url, 'src');
  absolutizeSet$1($, url, $content);
  return $content;
}
// Length of the text with surrounding whitespace trimmed and internal
// whitespace runs collapsed to single spaces.
function textLength$1(text) {
  var normalized = text.trim().replace(/\s+/g, ' ');
  return normalized.length;
} // Determines what percentage of the text
// in a node is link text
// Takes a node, returns a float
// Fraction of a node's text that lives inside <a> tags (0..1).
function linkDensity$1($node) {
  var totalLength = textLength$1($node.text());
  var linkLength = textLength$1($node.find('a').text());
  if (totalLength > 0) {
    return linkLength / totalLength;
  }
  // A node with link text but no other text counts as fully linked.
  if (totalLength === 0 && linkLength > 0) {
    return 1;
  }
  return 0;
} // search for, find a meta tag associated.
function extractFromMeta$$1 ( $ , metaNames , cachedNames ) {
var cleanTags = arguments . length > 3 && arguments [ 3 ] !== undefined ? arguments [ 3 ] : true ;
var foundNames = metaNames . filter ( function ( name ) {
return cachedNames . indexOf ( name ) !== - 1 ;
} ) ; // eslint-disable-next-line no-restricted-syntax
var _iteratorNormalCompletion = true ;
var _didIteratorError = false ;
var _iteratorError = undefined ;
try {
var _loop = function _loop ( ) {
var name = _step . value ;
var type = 'name' ;
var value = 'value' ;
var nodes = $ ( "meta[" . concat ( type , "=\"" ) . concat ( name , "\"]" ) ) ; // Get the unique value of every matching node, in case there
// are two meta tags with the same name and value.
// Remove empty values.
var values = nodes . map ( function ( index , node ) {
return $ ( node ) . attr ( value ) ;
} ) . toArray ( ) . filter ( function ( text ) {
return text !== '' ;
} ) ; // If we have more than one value for the same name, we have a
// conflict and can't trust any of them. Skip this name. If we have
// zero, that means our meta tags had no values. Skip this name
// also.
if ( values . length === 1 ) {
var metaValue ; // Meta values that contain HTML should be stripped, as they
// weren't subject to cleaning previously.
if ( cleanTags ) {
metaValue = stripTags$1 ( values [ 0 ] , $ ) ;
} else {
var _values = _slicedToArray$1 ( values , 1 ) ;
metaValue = _values [ 0 ] ;
}
return {
v : metaValue
} ;
}
} ;
for ( var _iterator = _getIterator$1 ( foundNames ) , _step ; ! ( _iteratorNormalCompletion = ( _step = _iterator . next ( ) ) . done ) ; _iteratorNormalCompletion = true ) {
var _ret = _loop ( ) ;
if ( _typeof$1 ( _ret ) === "object" ) return _ret . v ;
} // If nothing is found, return null
} catch ( err ) {
_didIteratorError = true ;
_iteratorError = err ;
} finally {
try {
if ( ! _iteratorNormalCompletion && _iterator . return != null ) {
_iterator . return ( ) ;
}
} finally {
if ( _didIteratorError ) {
throw _iteratorError ;
}
}
}
return null ;
}
// A node is "good" for flat metadata extraction when it is not a busy
// container (too many children) and does not sit inside a comment thread.
function isGoodNode$1($node, maxChildren) {
  return $node.children().length <= maxChildren && !withinComment$$1($node);
} // Given a a list of selectors find content that may
// be extractable from the document. This is for flat
// meta-information, like author, title, date published, etc.
function extractFromSelectors$$1 ( $ , selectors ) {
var maxChildren = arguments . length > 2 && arguments [ 2 ] !== undefined ? arguments [ 2 ] : 1 ;
var textOnly = arguments . length > 3 && arguments [ 3 ] !== undefined ? arguments [ 3 ] : true ; // eslint-disable-next-line no-restricted-syntax
var _iteratorNormalCompletion = true ;
var _didIteratorError = false ;
var _iteratorError = undefined ;
try {
for ( var _iterator = _getIterator$1 ( selectors ) , _step ; ! ( _iteratorNormalCompletion = ( _step = _iterator . next ( ) ) . done ) ; _iteratorNormalCompletion = true ) {
var selector = _step . value ;
var nodes = $ ( selector ) ; // If we didn't get exactly one of this selector, this may be
// a list of articles or comments. Skip it.
if ( nodes . length === 1 ) {
var $node = $ ( nodes [ 0 ] ) ;
if ( isGoodNode$1 ( $node , maxChildren ) ) {
var content = void 0 ;
if ( textOnly ) {
content = $node . text ( ) ;
} else {
content = $node . html ( ) ;
}
if ( content ) {
return content ;
}
}
}
}
} catch ( err ) {
_didIteratorError = true ;
_iteratorError = err ;
} finally {
try {
if ( ! _iteratorNormalCompletion && _iterator . return != null ) {
_iterator . return ( ) ;
}
} finally {
if ( _didIteratorError ) {
throw _iteratorError ;
}
}
}
return null ;
} // strips all tags from a string of text
// Strip all HTML tags from a string.
function stripTags$1(text, $) {
  // Wrapping in a span prevents errors when the text has no html at all.
  var stripped = $("<span>".concat(text, "</span>")).text();
  // If nothing is left, fall back to the raw input.
  return stripped === '' ? text : stripped;
}
// True when any ancestor's class/id contains the substring 'comment'.
function withinComment$$1($node) {
  var looksLikeComment = function looksLikeComment(parent) {
    var attrs = getAttrs$1(parent);
    var nodeClass = attrs.class;
    var id = attrs.id;
    return "".concat(nodeClass, " ").concat(id).includes('comment');
  };
  return $node.parents().toArray().some(looksLikeComment);
} // Given a node, determine if it's article-like enough to return
// param: node (a cheerio node)
// return: boolean
// A node qualifies as article-like when its trimmed text is 100+ chars.
function nodeIsSufficient$1($node) {
  var trimmed = $node.text().trim();
  return trimmed.length >= 100;
}
// Detect a WordPress page by the presence of the WP marker selector.
function isWordpress$1($) {
  var matches = $(IS_WP_SELECTOR$1);
  return matches.length > 0;
}
// Normalize a node's attributes into a plain {name: value} object.
// cheerio nodes expose `attribs`; browser DOM nodes expose `attributes`.
function getAttrs$1(node) {
  var attribs = node.attribs;
  var attributes = node.attributes;
  if (attribs || !attributes) {
    return attribs;
  }
  var attrs = {};
  _Reflect$ownKeys$1(attributes).forEach(function (index) {
    var attr = attributes[index];
    if (!attr.name || !attr.value) return;
    attrs[attr.name] = attr.value;
  });
  return attrs;
}
// Set a single attribute, handling both cheerio (`attribs` map) and
// browser DOM (`setAttribute`) node shapes.
function setAttr$1(node, attr, val) {
  if (node.attribs) {
    node.attribs[attr] = val;
    return node;
  }
  if (node.attributes) {
    node.setAttribute(attr, val);
  }
  return node;
}
// Replace a node's entire attribute set, handling both cheerio and
// browser DOM node shapes.
function setAttrs$1(node, attrs) {
  if (node.attribs) {
    // cheerio: swap the attribute map wholesale.
    node.attribs = attrs;
    return node;
  }
  if (node.attributes) {
    // browser DOM: clear existing attributes, then set the new ones.
    while (node.attributes.length > 0) {
      node.removeAttribute(node.attributes[0].name);
    }
    _Reflect$ownKeys$1(attrs).forEach(function (key) {
      node.setAttribute(key, attrs[key]);
    });
  }
  return node;
} // DOM manipulation
// Matches absolute http(s) URLs.
var IS_LINK = new RegExp('https?://', 'i');
// NOTE: the leading '.' is an unescaped regex dot, so it matches any
// character preceding the extension.
var IMAGE_RE = '.(png|gif|jpe?g)';
var IS_IMAGE = new RegExp("".concat(IMAGE_RE), 'i');
// Image URL with an optional query string followed by a srcset
// width/density descriptor (e.g. "640w", "2x").
var IS_SRCSET = new RegExp("".concat(IMAGE_RE, "(\\?\\S+)?(\\s*[\\d.]+[wx])"), 'i');
var TAGS_TO_REMOVE = ['script', 'style', 'form'].join(','); // lazy loaded images into normal images.
// Many sites will have img tags with no source, or an image tag with a src
// attribute that a is a placeholer. We need to be able to properly fill in
// the src attribute so the images are no longer lazy loaded.
// Promote lazy-load attributes on <img> tags into real src/srcset values
// so the images load without JavaScript.
function convertLazyLoadedImages($) {
  // If the string parses as JSON with a string `src`, return it; else false.
  var srcFromJSON = function srcFromJSON(str) {
    try {
      var parsed = JSON.parse(str);
      if (typeof parsed.src === 'string') return parsed.src;
    } catch (_) {
      return false;
    }
    return false;
  };
  $('img').each(function (_, img) {
    var attrs = getAttrs$1(img);
    _Reflect$ownKeys$1(attrs).forEach(function (attr) {
      var value = attrs[attr];
      if (attr !== 'srcset' && IS_LINK.test(value) && IS_SRCSET.test(value)) {
        // A srcset-shaped value hiding in some other attribute.
        $(img).attr('srcset', value);
      } else if (attr !== 'src' && attr !== 'srcset' && IS_LINK.test(value) && IS_IMAGE.test(value)) {
        // A plain image URL — or a JSON blob wrapping one — in a
        // lazy-load attribute.
        var extracted = srcFromJSON(value);
        $(img).attr('src', extracted || value);
      }
    });
  });
  return $;
}
// Filter predicate: cheerio marks HTML comment nodes with type 'comment'.
function isComment(index, node) {
  return node.type === 'comment';
}
// Remove every HTML comment node from the document.
function cleanComments($) {
  var allContents = $.root().find('*').contents();
  allContents.filter(isComment).remove();
  return $;
}
// Strip scripts, styles, and forms, then drop HTML comments.
function clean($) {
  $(TAGS_TO_REMOVE).remove();
  return cleanComments($);
}
var Resource = {
// Create a Resource.
//
// :param url: The URL for the document we should retrieve.
// :param response: If set, use as the response rather than
// attempting to fetch it ourselves. Expects a
// string.
// :param headers: Custom headers to be included in the request
create : function ( ) {
var _create = _asyncToGenerator (
/*#__PURE__*/
_regeneratorRuntime . mark ( function _callee ( url , preparedResponse , parsedUrl ) {
var headers ,
result ,
validResponse ,
_args = arguments ;
return _regeneratorRuntime . wrap ( function _callee$ ( _context ) {
while ( 1 ) {
switch ( _context . prev = _context . next ) {
case 0 :
headers = _args . length > 3 && _args [ 3 ] !== undefined ? _args [ 3 ] : { } ;
if ( ! preparedResponse ) {
_context . next = 6 ;
break ;
}
validResponse = {
statusMessage : 'OK' ,
statusCode : 200 ,
headers : {
'content-type' : 'text/html' ,
'content-length' : 500
}
} ;
result = {
body : preparedResponse ,
response : validResponse ,
alreadyDecoded : true
} ;
_context . next = 9 ;
break ;
case 6 :
_context . next = 8 ;
return fetchResource ( url , parsedUrl , headers ) ;
case 8 :
result = _context . sent ;
case 9 :
if ( ! result . error ) {
_context . next = 12 ;
break ;
}
result . failed = true ;
return _context . abrupt ( "return" , result ) ;
case 12 :
return _context . abrupt ( "return" , this . generateDoc ( result ) ) ;
case 13 :
case "end" :
return _context . stop ( ) ;
}
}
} , _callee , this ) ;
} ) ) ;
function create ( _x , _x2 , _x3 ) {
return _create . apply ( this , arguments ) ;
}
return create ;
} ( ) ,
generateDoc : function generateDoc ( _ref ) {
var content = _ref . body ,
response = _ref . response ,
_ref$alreadyDecoded = _ref . alreadyDecoded ,
alreadyDecoded = _ref$alreadyDecoded === void 0 ? false : _ref$alreadyDecoded ;
var _response$headers$con = response . headers [ 'content-type' ] ,
contentType = _response$headers$con === void 0 ? '' : _response$headers$con ; // TODO: Implement is_text function from
// https://github.com/ReadabilityHoldings/readability/blob/8dc89613241d04741ebd42fa9fa7df1b1d746303/readability/utils/text.py#L57
if ( ! contentType . includes ( 'html' ) && ! contentType . includes ( 'text' ) ) {
throw new Error ( 'Content does not appear to be text.' ) ;
}
var $ = this . encodeDoc ( {
content : content ,
contentType : contentType ,
alreadyDecoded : alreadyDecoded
} ) ;
if ( $ . root ( ) . children ( ) . length === 0 ) {
throw new Error ( 'No children, likely a bad parse.' ) ;
}
$ = normalizeMetaTags ( $ ) ;
$ = convertLazyLoadedImages ( $ ) ;
$ = clean ( $ ) ;
return $ ;
} ,
encodeDoc : function encodeDoc ( _ref2 ) {
var content = _ref2 . content ,
contentType = _ref2 . contentType ,
_ref2$alreadyDecoded = _ref2 . alreadyDecoded ,
alreadyDecoded = _ref2$alreadyDecoded === void 0 ? false : _ref2$alreadyDecoded ;
if ( alreadyDecoded ) {
return cheerio$1 . load ( content ) ;
}
var encoding = getEncoding$1 ( contentType ) ;
var decodedContent = iconv . decode ( content , encoding ) ;
var $ = cheerio$1 . load ( decodedContent ) ; // after first cheerio.load, check to see if encoding matches
var contentTypeSelector = cheerio$1 . browser ? 'meta[http-equiv=content-type]' : 'meta[http-equiv=content-type i]' ;
var metaContentType = $ ( contentTypeSelector ) . attr ( 'content' ) || $ ( 'meta[charset]' ) . attr ( 'charset' ) ;
var properEncoding = getEncoding$1 ( metaContentType ) ; // if encodings in the header/body dont match, use the one in the body
if ( metaContentType && properEncoding !== encoding ) {
decodedContent = iconv . decode ( content , properEncoding ) ;
$ = cheerio$1 . load ( decodedContent ) ;
}
return $ ;
}
} ;
var _marked =
/*#__PURE__*/
_regeneratorRuntime . mark ( range ) ;
function range ( ) {
var start ,
end ,
_args = arguments ;
return _regeneratorRuntime . wrap ( function range$ ( _context ) {
while ( 1 ) {
switch ( _context . prev = _context . next ) {
case 0 :
start = _args . length > 0 && _args [ 0 ] !== undefined ? _args [ 0 ] : 1 ;
end = _args . length > 1 && _args [ 1 ] !== undefined ? _args [ 1 ] : 1 ;
case 2 :
if ( ! ( start <= end ) ) {
_context . next = 7 ;
break ;
}
_context . next = 5 ;
return start += 1 ;
case 5 :
_context . next = 2 ;
break ;
case 7 :
case "end" :
return _context . stop ( ) ;
}
}
} , _marked , this ) ;
} // extremely simple url validation as a first step
// A parsed URL is considered valid only when it carries a hostname.
function validateUrl(_ref) {
  var hostname = _ref.hostname;
  return Boolean(hostname);
}
// Build a {domain: extractor} lookup mapping every domain to the extractor.
var merge = function merge(extractor, domains) {
  var lookup = {};
  domains.forEach(function (domain) {
    lookup[domain] = extractor;
  });
  return lookup;
};
// Map an extractor's primary domain — and any supportedDomains — to it.
function mergeSupportedDomains(extractor) {
  var domains = extractor.supportedDomains
    ? [extractor.domain].concat(_toConsumableArray$1(extractor.supportedDomains))
    : [extractor.domain];
  return merge(extractor, domains);
}
// Registry of custom extractors keyed by domain.
var apiExtractors = {};
// Register a custom extractor for its domain(s); returns the registry,
// or an error object when the extractor is missing a domain.
function addExtractor(extractor) {
  var isValid = extractor && extractor.domain;
  if (!isValid) {
    return {
      error: true,
      message: 'Unable to add custom extractor. Invalid parameters.'
    };
  }
  _Object$assign(apiExtractors, mergeSupportedDomains(extractor));
  return apiExtractors;
}
// Custom extractor for blogspot.com (Blogger-hosted blogs).
var BloggerExtractor = {
  domain: 'blogspot.com',
  content: {
    // Blogger is insane and does not load its content
    // initially in the page, but it's all there
    // in noscript
    selectors: ['.post-content noscript'],
    // Selectors to remove from the extracted content
    clean: [],
    // Convert the noscript tag to a div
    transforms: {
      noscript: 'div'
    }
  },
  author: {
    selectors: ['.post-author-name']
  },
  title: {
    selectors: ['.post h2.title']
  },
  date_published: {
    selectors: ['span.publishdate']
  }
};
// Custom extractor for nymag.com.
var NYMagExtractor = {
  domain: 'nymag.com',
  content: {
    // Order by most likely. Extractor will stop on first occurrence
    selectors: ['div.article-content', 'section.body', 'article.article'],
    // Selectors to remove from the extracted content
    clean: ['.ad', '.single-related-story'],
    // Object of transformations to make on matched elements.
    // Each key is the selector, each value is the tag to transform to.
    // If a function is given, it should return a string to convert to,
    // or nothing (in which case it will not perform the transformation).
    transforms: {
      // Convert h1s to h2s
      h1: 'h2',
      // Convert lazy-loaded noscript images to figures
      noscript: function noscript($node, $) {
        // Browser build: noscript content is raw text that must be re-parsed;
        // server build: the children are already parsed nodes.
        var $children = $.browser ? $($node.text()) : $node.children();
        // Only promote to <figure> when the noscript wraps exactly one <img>.
        if ($children.length === 1 && $children.get(0) !== undefined && $children.get(0).tagName.toLowerCase() === 'img') {
          return 'figure';
        }
        return null;
      }
    }
  },
  title: {
    selectors: ['h1.lede-feature-title', 'h1.headline-primary', 'h1']
  },
  author: {
    selectors: ['.by-authors', '.lede-feature-author']
  },
  dek: {
    selectors: ['.lede-feature-teaser']
  },
  date_published: {
    // [selector, attribute] pairs read the attribute instead of the node text.
    selectors: [['time.article-timestamp[datetime]', 'datetime'], 'time.article-timestamp']
  }
};
// Custom extractor for wikipedia.org.
var WikipediaExtractor = {
  domain: 'wikipedia.org',
  content: {
    selectors: ['#mw-content-text'],
    // Skip the generic post-extraction cleaner so wiki markup survives.
    defaultCleaner: false,
    // transform top infobox to an image with caption
    transforms: {
      '.infobox img': function infoboxImg($node) {
        var $parent = $node.parents('.infobox');

        // Only prepend the first image in .infobox
        if ($parent.children('img').length === 0) {
          $parent.prepend($node);
        }
      },
      '.infobox caption': 'figcaption',
      '.infobox': 'figure'
    },
    // Selectors to remove from the extracted content
    clean: ['.mw-editsection', 'figure tr, figure td, figure tbody', '#toc', '.navbox']
  },
  // Static byline: articles have no single author.
  author: 'Wikipedia Contributors',
  title: {
    selectors: ['h2.title']
  },
  date_published: {
    selectors: ['#footer-info-lastmod']
  }
};
// Custom extractor for twitter.com (legacy permalink pages).
var TwitterExtractor = {
  domain: 'twitter.com',
  content: {
    transforms: {
      // We're transforming essentially the whole page here.
      // Twitter doesn't have nice selectors, so our initial
      // selector grabs the whole page, then we're re-writing
      // it to fit our needs before we clean it up.
      '.permalink[role=main]': function permalinkRoleMain($node, $) {
        var tweets = $node.find('.tweet');
        var $tweetContainer = $('<div id="TWEETS_GO_HERE"></div>');
        $tweetContainer.append(tweets);
        $node.replaceWith($tweetContainer);
      },
      // Twitter wraps @ mentions in <s> tags, which
      // render as a strikethrough; neutralize them to spans.
      s: 'span'
    },
    selectors: ['.permalink[role=main]'],
    defaultCleaner: false,
    clean: ['.stream-item-footer', 'button', '.tweet-details-fixer']
  },
  author: {
    selectors: ['.tweet.permalink-tweet .username']
  },
  date_published: {
    selectors: [['.permalink-tweet ._timestamp[data-time-ms]', 'data-time-ms']]
  }
};
// Custom extractor for www.nytimes.com.
var NYTimesExtractor = {
  domain: 'www.nytimes.com',
  title: {
    selectors: ['h1[data-testid="headline"]', 'h1.g-headline', 'h1[itemprop="headline"]', 'h1.headline', 'h1 .balancedHeadline']
  },
  author: {
    selectors: [['meta[name="author"]', 'value'], '.g-byline', '.byline', ['meta[name="byl"]', 'value']]
  },
  content: {
    selectors: ['div.g-blocks', 'section[name="articleBody"]', 'article#story'],
    transforms: {
      // Fill in the lazy-image size placeholder with a concrete width.
      'img.g-lazy': function imgGLazy($node) {
        var src = $node.attr('src');
        var width = 640;
        // assumes src contains a literal '{{size}}' placeholder — TODO confirm
        src = src.replace('{{size}}', width);
        $node.attr('src', src);
      }
    },
    // Selectors removed from the extracted result.
    clean: ['.ad', 'header#story-header', '.story-body-1 .lede.video', '.visually-hidden', '#newsletter-promo', '.promo', '.comments-button', '.hidden', '.comments', '.supplemental', '.nocontent', '.story-footer-links']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], ['meta[name="article:published"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  dek: null,
  next_page_url: null,
  excerpt: null
}; // Rename CustomExtractor
// to fit your publication
// Custom extractor for www.theatlantic.com.
var TheAtlanticExtractor = {
  domain: 'www.theatlantic.com',
  title: {
    selectors: ['h1', '.c-article-header__hed']
  },
  author: {
    selectors: [['meta[name="author"]', 'value'], '.c-byline__author']
  },
  content: {
    selectors: ['article', '.article-body'],
    // No pre-processing transforms needed.
    // NOTE(review): template leftover — empty array here while most other
    // extractors use an empty object; harmless either way.
    transforms: [],
    // Selectors removed from the extracted result.
    clean: ['.partner-box', '.callout', '.c-article-writer__image', '.c-article-writer__content', '.c-letters-cta__text', '.c-footer__logo', '.c-recirculation-link', '.twitter-tweet']
  },
  dek: {
    selectors: [['meta[name="description"]', 'value']]
  },
  date_published: {
    selectors: [['time[itemprop="datePublished"]', 'datetime']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  next_page_url: null,
  excerpt: null
}; // Rename CustomExtractor
// to fit your publication
// (e.g., NYTimesExtractor)
// Custom extractor for www.newyorker.com.
var NewYorkerExtractor = {
  domain: 'www.newyorker.com',
  title: {
    selectors: ['h1[class^="content-header"]', 'h1[class^="ArticleHeader__hed"]', 'h1[class*="ContentHeaderHed"]', ['meta[name="og:title"]', 'value']]
  },
  author: {
    selectors: ['article header div[class^="BylinesWrapper"]', ['meta[name="article:author"]', 'value'], 'div[class^="ArticleContributors"] a[rel="author"]', 'article header div[class*="Byline__multipleContributors"]']
  },
  content: {
    selectors: ['.article__body', 'article.article.main-content', 'main[class^="Layout__content"]'],
    // Turn caption text/credit wrappers into proper figcaptions.
    transforms: {
      '.caption__text': 'figcaption',
      '.caption__credit': 'figcaption'
    },
    // Selectors removed from the extracted result.
    clean: ['footer[class^="ArticleFooter__footer"]', 'aside']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], 'time.content-header__publish-date', ['meta[name="pubdate"]', 'value']],
    // Dates without an explicit zone are interpreted as Eastern time.
    timezone: 'America/New_York'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  dek: {
    selectors: ['div[class^="ContentHeaderDek"]', 'div.content-header__dek', 'h2[class^="ArticleHeader__dek"]']
  },
  next_page_url: null,
  excerpt: null
}; // Rename CustomExtractor
// to fit your publication
// (e.g., NYTimesExtractor)
// Custom extractor for www.wired.com.
var WiredExtractor = {
  domain: 'www.wired.com',
  title: {
    selectors: ['h1[data-testId="ContentHeaderHed"]']
  },
  author: {
    selectors: [['meta[name="article:author"]', 'value'], 'a[rel="author"]']
  },
  content: {
    selectors: ['article.article.main-content', 'article.content'],
    // No pre-processing transforms needed (template leftover empty array).
    transforms: [],
    // Selectors removed from the extracted result.
    clean: ['.visually-hidden', 'figcaption img.photo', '.alert-message']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  dek: {
    selectors: []
  },
  next_page_url: null,
  excerpt: null
}; // Rename CustomExtractor
// to fit your publication
// (e.g., NYTimesExtractor)
// Custom extractor for www.msn.com.
var MSNExtractor = {
  domain: 'www.msn.com',
  title: {
    selectors: ['h1']
  },
  author: {
    selectors: ['span.authorname-txt']
  },
  content: {
    selectors: ['div.richtext'],
    // No pre-processing transforms needed (template leftover empty array).
    transforms: [],
    // Selectors removed from the extracted result.
    clean: ['span.caption']
  },
  date_published: {
    selectors: ['span.time']
  },
  lead_image_url: {
    selectors: []
  },
  dek: {
    selectors: []
  },
  next_page_url: null,
  excerpt: null
}; // Rename CustomExtractor
// to fit your publication
// (e.g., NYTimesExtractor)
// Custom extractor for www.yahoo.com.
var YahooExtractor = {
  domain: 'www.yahoo.com',
  title: {
    selectors: ['header.canvas-header']
  },
  author: {
    selectors: ['span.provider-name']
  },
  content: {
    selectors: ['.content-canvas'],
    // No pre-processing transforms needed (template leftover empty array).
    transforms: [],
    // Selectors removed from the extracted result.
    clean: ['.figure-caption']
  },
  date_published: {
    selectors: [['time.date[datetime]', 'datetime']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  dek: {
    // No dek selectors identified yet.
    selectors: []
  },
  next_page_url: null,
  excerpt: null
}; // Rename CustomExtractor
// to fit your publication
// (e.g., NYTimesExtractor)
// Custom extractor for www.buzzfeed.com (and buzzfeednews.com).
var BuzzfeedExtractor = {
  domain: 'www.buzzfeed.com',
  supportedDomains: ['www.buzzfeednews.com'],
  title: {
    selectors: ['h1.embed-headline-title']
  },
  author: {
    // NOTE(review): 'byline__author' has no leading '.', so it matches a
    // <byline__author> element rather than the class — confirm intent.
    selectors: ['a[data-action="user/username"]', 'byline__author', ['meta[name="author"]', 'value']]
  },
  content: {
    selectors: [['div[class^="featureimage_featureImageWrapper"]', '.js-subbuzz-wrapper'], ['.js-subbuzz-wrapper']],
    defaultCleaner: false,
    transforms: {
      h2: 'b',
      'div.longform_custom_header_media': function divLongform_custom_header_media($node) {
        // FIX: cheerio's .has() returns a selection object, which is always
        // truthy, so the previous `if ($node.has('img') && $node.has(...))`
        // condition could never fail. Test the selection length so the node
        // only becomes a <figure> when it really contains both an image and
        // a source line.
        if ($node.has('img').length > 0 && $node.has('.longform_header_image_source').length > 0) {
          return 'figure';
        }

        return null;
      },
      'figure.longform_custom_header_media .longform_header_image_source': 'figcaption'
    },
    // Selectors removed from the extracted result.
    clean: ['.instapaper_ignore', '.suplist_list_hide .buzz_superlist_item .buzz_superlist_number_inline', '.share-box', '.print', '.js-inline-share-bar', '.js-ad-placement']
  },
  date_published: {
    selectors: [['time[datetime]', 'datetime']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  dek: {
    selectors: ['.embed-headline-description']
  },
  next_page_url: null,
  excerpt: null
}; // Rename CustomExtractor
// to fit your publication
// (e.g., NYTimesExtractor)
// Custom extractor for fandom.wikia.com.
var WikiaExtractor = {
  domain: 'fandom.wikia.com',
  title: {
    selectors: ['h1.entry-title']
  },
  author: {
    // NOTE(review): '.author vcard' targets a <vcard> element inside .author;
    // likely intended '.author .vcard' or '.author.vcard' — confirm before
    // changing.
    selectors: ['.author vcard', '.fn']
  },
  content: {
    selectors: ['.grid-content', '.entry-content'],
    // No pre-processing transforms needed (template leftover empty array).
    transforms: [],
    // Selectors removed from the extracted result.
    clean: []
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  dek: {
    selectors: []
  },
  next_page_url: null,
  excerpt: null
}; // Rename CustomExtractor
// to fit your publication
// (e.g., NYTimesExtractor)
// Custom extractor for www.littlethings.com (no date_published mapping).
var LittleThingsExtractor = {
  domain: 'www.littlethings.com',
  title: {
    selectors: ['h1[class*="PostHeader"]', 'h1.post-title']
  },
  author: {
    selectors: ['div[class^="PostHeader__ScAuthorNameSection"]', ['meta[name="author"]', 'value']]
  },
  content: {
    selectors: ['section[class*="PostMainArticle"]', '.mainContentIntro', '.content-wrapper'],
    // No pre-processing transforms needed (template leftover empty array).
    transforms: [],
    // Selectors removed from the extracted result.
    clean: []
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  next_page_url: null,
  excerpt: null
};
// Custom extractor for www.politico.com.
var PoliticoExtractor = {
  domain: 'www.politico.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value']]
  },
  author: {
    selectors: [['div[itemprop="author"] meta[itemprop="name"]', 'value'], '.story-meta__authors .vcard', '.story-main-content .byline .vcard']
  },
  content: {
    selectors: [['.story-text'], '.story-main-content', '.story-core'],
    transforms: [],
    // Selectors removed from the extracted result.
    clean: ['figcaption', '.story-meta', '.ad']
  },
  date_published: {
    selectors: [['time[itemprop="datePublished"]', 'datetime'], ['.story-meta__details time[datetime]', 'datetime'], ['.story-main-content .timestamp time[datetime]', 'datetime']],
    // Dates without an explicit zone are interpreted as Eastern time.
    timezone: 'America/New_York'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value']]
  }
};
// Custom extractor for deadspin.com and the wider Kinja network of sites.
var DeadspinExtractor = {
  domain: 'deadspin.com',
  supportedDomains: ['jezebel.com', 'lifehacker.com', 'kotaku.com', 'gizmodo.com', 'jalopnik.com', 'kinja.com', 'avclub.com', 'clickhole.com', 'splinternews.com', 'theonion.com', 'theroot.com', 'thetakeout.com', 'theinventory.com'],
  title: {
    selectors: ['header h1', 'h1.headline']
  },
  author: {
    selectors: ['a[data-ga*="Author"]', '.author']
  },
  content: {
    selectors: ['.js_post-content', '.post-content', '.entry-content'],
    transforms: {
      // Point lazy-loaded YouTube embeds straight at the real embed URL.
      'iframe.lazyload[data-recommend-id^="youtube://"]': function iframeLazyloadDataRecommendIdYoutube($node) {
        // assumes the iframe id looks like "youtube-<videoId>" — TODO confirm;
        // a missing/malformed id attribute would throw here.
        var youtubeId = $node.attr('id').split('youtube-')[1];
        $node.attr('src', "https://www.youtube.com/embed/".concat(youtubeId));
      }
    },
    // Selectors removed from the extracted result.
    clean: ['.magnifier', '.lightbox']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], ['time.updated[datetime]', 'datetime']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  dek: {
    selectors: []
  },
  next_page_url: {
    selectors: []
  },
  excerpt: {
    selectors: []
  }
}; // Rename CustomExtractor
// to fit your publication
// (e.g., NYTimesExtractor)
// Custom extractor for www.broadwayworld.com.
var BroadwayWorldExtractor = {
  domain: 'www.broadwayworld.com',
  title: {
    selectors: ['h1[itemprop=headline]', 'h1.article-title']
  },
  author: {
    selectors: ['span[itemprop=author]']
  },
  content: {
    selectors: ['div[itemprop=articlebody]'],
    // No pre-processing transforms needed.
    transforms: {},
    // Selectors removed from the extracted result.
    clean: []
  },
  date_published: {
    selectors: [['meta[itemprop=datePublished]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  dek: {
    selectors: []
  },
  next_page_url: {
    selectors: []
  },
  excerpt: {
    selectors: []
  }
}; // Rename CustomExtractor
// to fit your publication
// (e.g., NYTimesExtractor)
// Custom extractor for www.apartmenttherapy.com.
var ApartmentTherapyExtractor = {
  domain: 'www.apartmenttherapy.com',
  title: {
    selectors: ['h1.headline']
  },
  author: {
    selectors: ['.PostByline__name']
  },
  content: {
    selectors: ['div.post__content'],
    transforms: {
      // Lazy pictures keep their sources as JSON in data-props; replace the
      // React placeholder with a plain <img> pointing at the first source.
      'div[data-render-react-id="images/LazyPicture"]': function divDataRenderReactIdImagesLazyPicture($node, $) {
        // NOTE(review): JSON.parse throws if data-props is missing or
        // malformed — confirm upstream guarantees the attribute.
        var data = JSON.parse($node.attr('data-props'));
        var src = data.sources[0].src;
        var $img = $('<img />').attr('src', src);
        $node.replaceWith($img);
      }
    },
    // Selectors removed from the extracted result.
    clean: []
  },
  date_published: {
    selectors: [['.PostByline__timestamp[datetime]', 'datetime']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  dek: {
    selectors: []
  },
  next_page_url: {
    selectors: []
  },
  excerpt: {
    selectors: []
  }
};
// Custom extractor for medium.com.
var MediumExtractor = {
  domain: 'medium.com',
  title: {
    selectors: ['h1', ['meta[name="og:title"]', 'value']]
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  content: {
    selectors: ['article'],
    transforms: {
      // Allow drop cap character.
      'section span:first-of-type': function sectionSpanFirstOfType($node) {
        var $text = $node.html();

        // Unwrap a single-character drop-cap span into plain text.
        if ($text.length === 1 && /^[a-zA-Z()]+$/.test($text)) {
          $node.replaceWith($text);
        }
      },
      // Re-write lazy-loaded youtube videos
      iframe: function iframe($node) {
        // NOTE(review): the dots in "i.embed.ly" are unescaped, so the regex
        // is slightly looser than the literal host; works in practice.
        var ytRe = /https:\/\/i.embed.ly\/.+url=https:\/\/i\.ytimg\.com\/vi\/(\w+)\//;
        var thumb = decodeURIComponent($node.attr('data-thumbnail'));
        var $parent = $node.parents('figure');

        if (ytRe.test(thumb)) {
          // Recover the video id from the thumbnail URL and point the
          // iframe straight at the YouTube embed.
          var _thumb$match = thumb.match(ytRe),
              _thumb$match2 = _slicedToArray$1(_thumb$match, 2),
              _ = _thumb$match2[0],
              youtubeId = _thumb$match2[1]; // eslint-disable-line

          $node.attr('src', "https://www.youtube.com/embed/".concat(youtubeId));
          var $caption = $parent.find('figcaption');
          $parent.empty().append([$node, $caption]);
          return;
        } // If we can't draw the YouTube preview, remove the figure.

        $parent.remove();
      },
      // rewrite figures to pull out image and caption, remove rest
      figure: function figure($node) {
        // ignore if figure has an iframe
        if ($node.find('iframe').length > 0) return;
        var $img = $node.find('img').slice(-1)[0];
        var $caption = $node.find('figcaption');
        $node.empty().append([$img, $caption]);
      },
      // Remove any smaller images that did not get caught by the generic image
      // cleaner (author photo 48px, leading sentence images 79px, etc.).
      img: function img($node) {
        var width = _parseInt$1($node.attr('width'), 10);
        if (width < 100) $node.remove();
      }
    },
    // Selectors removed from the extracted result.
    clean: ['span a', 'svg']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  dek: null,
  next_page_url: {
    selectors: []
  },
  excerpt: {
    selectors: []
  }
};
// Custom extractor for www.tmz.com (static staff byline; Pacific-time dates).
var WwwTmzComExtractor = {
  domain: 'www.tmz.com',
  title: {
    selectors: ['.post-title-breadcrumb', 'h1', '.headline']
  },
  author: 'TMZ STAFF',
  date_published: {
    selectors: ['.article__published-at', '.article-posted-date'],
    timezone: 'America/Los_Angeles'
  },
  dek: {
    selectors: []
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.article__blocks', '.article-content', '.all-post-body'],
    // No pre-processing transforms needed.
    transforms: {},
    // Selectors removed from the extracted result.
    clean: ['.lightbox-link']
  }
};
// Custom extractor for www.washingtonpost.com.
var WwwWashingtonpostComExtractor = {
  domain: 'www.washingtonpost.com',
  title: {
    selectors: ['h1', '#topper-headline-wrapper']
  },
  author: {
    selectors: ['.pb-author-name']
  },
  date_published: {
    selectors: [['.author-timestamp[itemprop="datePublished"]', 'content']]
  },
  dek: {
    selectors: []
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.article-body'],
    transforms: {
      'div.inline-content': function divInlineContent($node) {
        // Keep media wrappers as figures; drop inline boxes without media.
        // (Correctly checks .has(...).length — .has() itself is always
        // truthy.)
        if ($node.has('img,iframe,video').length > 0) {
          return 'figure';
        }

        $node.remove();
        return null;
      },
      '.pb-caption': 'figcaption'
    },
    // Selectors removed from the extracted result.
    clean: ['.interstitial-link', '.newsletter-inline-unit']
  }
};
// Custom extractor for www.huffingtonpost.com.
var WwwHuffingtonpostComExtractor = {
  domain: 'www.huffingtonpost.com',
  title: {
    selectors: ['h1.headline__title']
  },
  author: {
    selectors: ['span.author-card__details__name']
  },
  date_published: {
    // Modified time is preferred over the original publish time.
    selectors: [['meta[name="article:modified_time"]', 'value'], ['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: ['h2.headline__subtitle']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.entry__body'],
    // Skip the generic cleaner; rely solely on the clean list below.
    defaultCleaner: false,
    transforms: {},
    // Selectors removed from the extracted result.
    clean: ['.pull-quote', '.tag-cloud', '.embed-asset', '.below-entry', '.entry-corrections', '#suggested-story']
  }
};
// Custom extractor for newrepublic.com.
var NewrepublicComExtractor = {
  domain: 'newrepublic.com',
  title: {
    selectors: ['h1.article-headline']
  },
  author: {
    selectors: ['span.AuthorList']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
    // Dates without an explicit zone are interpreted as Eastern time.
    timezone: 'America/New_York'
  },
  dek: {
    selectors: ['h2.article-subhead']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: [['div.article-body']],
    transforms: {},
    // Selectors removed from the extracted result.
    clean: ['aside']
  }
};
// Custom extractor for money.cnn.com.
var MoneyCnnComExtractor = {
  domain: 'money.cnn.com',
  title: {
    selectors: ['.article-title']
  },
  author: {
    selectors: [['meta[name="author"]', 'value'], '.byline a']
  },
  date_published: {
    selectors: [['meta[name="date"]', 'value']],
    timezone: 'GMT'
  },
  dek: {
    selectors: ['#storytext h2']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['#storytext'],
    transforms: {},
    // Selectors removed from the extracted result.
    clean: ['.inStoryHeading']
  }
};
// Custom extractor for www.theverge.com (also covers www.polygon.com).
var WwwThevergeComExtractor = {
  domain: 'www.theverge.com',
  supportedDomains: ['www.polygon.com'],
  title: {
    selectors: ['h1']
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: ['.p-dek']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: [// feature template multi-match
    ['.c-entry-hero .e-image', '.c-entry-intro', '.c-entry-content'], // regular post multi-match
    ['.e-image--hero', '.c-entry-content'], // feature template fallback
    '.l-wrapper .l-feature', // regular post fallback
    'div.c-entry-content'],
    // Transform lazy-loaded images
    transforms: {
      noscript: function noscript($node) {
        var $children = $node.children();

        // Promote a noscript wrapping exactly one <img> to an inline span.
        // NOTE(review): unlike NYMagExtractor's version there is no
        // undefined guard, and tagName is compared lowercase (server-side
        // cheerio) — confirm this never runs in the browser build.
        if ($children.length === 1 && $children.get(0).tagName === 'img') {
          return 'span';
        }

        return null;
      }
    },
    // Selectors removed from the extracted result.
    clean: ['.aside', 'img.c-dynamic-image']
  }
};
// Custom extractor for www.cnn.com.
var WwwCnnComExtractor = {
  domain: 'www.cnn.com',
  title: {
    selectors: ['h1.pg-headline', 'h1']
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: [// a more specific selector to grab the lead image and the body
    ['.media__video--thumbnail', '.zn-body-text'], // a fallback for the above
    '.zn-body-text', 'div[itemprop="articleBody"]'],
    transforms: {
      // Wrap paragraph-styled divs in real <p> tags when they have content.
      '.zn-body__paragraph, .el__leafmedia--sourced-paragraph': function znBody__paragraphEl__leafmediaSourcedParagraph($node) {
        var $text = $node.html();

        if ($text) {
          return 'p';
        }

        return null;
      },
      // this transform cleans the short, all-link sections linking
      // to related content but not marked as such in any way.
      '.zn-body__paragraph': function znBody__paragraph($node) {
        // FIX: cheerio's .has() returns a selection object, which is always
        // truthy — the previous `if ($node.has('a'))` always passed (and so
        // also removed empty paragraphs, since '' === ''). Check the
        // selection length so only paragraphs that actually contain links
        // are considered for removal.
        if ($node.has('a').length > 0) {
          if ($node.text().trim() === $node.find('a').text().trim()) {
            $node.remove();
          }
        }
      },
      '.media__video--thumbnail': 'figure'
    },
    // Selectors removed from the extracted result.
    clean: []
  }
};
// Custom extractor for www.aol.com.
var WwwAolComExtractor = {
  domain: 'www.aol.com',
  title: {
    selectors: ['h1.p-article__title']
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    selectors: ['.p-article__byline__date'],
    // Dates without an explicit zone are interpreted as Eastern time.
    timezone: 'America/New_York'
  },
  dek: {
    selectors: []
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.article-content'],
    transforms: {},
    // Selectors removed from the extracted result.
    clean: []
  }
};
// Custom extractor for www.youtube.com: rebuilds the watch page as an
// embedded player plus the video description.
var WwwYoutubeComExtractor = {
  domain: 'www.youtube.com',
  title: {
    selectors: [['meta[name="title"]', 'value'], '.watch-title', 'h1.watch-title-container']
  },
  author: {
    selectors: [['link[itemprop="name"]', 'content'], '.yt-user-info']
  },
  date_published: {
    selectors: [['meta[itemProp="datePublished"]', 'value']],
    timezone: 'GMT'
  },
  dek: {
    selectors: []
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    // Keep the injected iframe markup intact.
    defaultCleaner: false,
    selectors: ['#player-container-outer', 'ytd-expandable-video-description-body-renderer #description', ['#player-api', '#description']],
    transforms: {
      // Legacy layout: swap the player shell for a plain embed iframe.
      '#player-api': function playerApi($node, $) {
        var videoId = $('meta[itemProp="videoId"]').attr('value');
        $node.html("\n <iframe src=\"https://www.youtube.com/embed/".concat(videoId, "\" frameborder=\"0\" allowfullscreen></iframe>"));
      },
      // Current layout: embed iframe followed by the description text.
      '#player-container-outer': function playerContainerOuter($node, $) {
        var videoId = $('meta[itemProp="videoId"]').attr('value');
        var description = $('meta[itemProp="description"]').attr('value');
        $node.html("\n <iframe src=\"https://www.youtube.com/embed/".concat(videoId, "\" frameborder=\"0\" allowfullscreen></iframe>\n <div><span>").concat(description, "</span></div>"));
      }
    },
    // Selectors removed from the extracted result.
    clean: []
  }
};
// Custom extractor for www.theguardian.com.
var WwwTheguardianComExtractor = {
  domain: 'www.theguardian.com',
  title: {
    selectors: ['h1', '.content__headline']
  },
  author: {
    selectors: ['address[data-link-name="byline"]', 'p.byline']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: ['div[data-gu-name="standfirst"]', '.content__standfirst']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['#maincontent', '.content__article-body'],
    transforms: {},
    // Selectors removed from the extracted result.
    clean: ['.hide-on-mobile', '.inline-icon']
  }
};
// Custom extractor for www.sbnation.com.
var WwwSbnationComExtractor = {
  domain: 'www.sbnation.com',
  title: {
    selectors: ['h1.c-page-title']
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: ['p.c-entry-summary.p-dek', 'h2.c-entry-summary.p-dek']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.c-entry-content'],
    transforms: {},
    // Selectors removed from the extracted result.
    clean: []
  }
};
// Custom extractor for www.bloomberg.com; covers the normal, /graphics/,
// and /news/ page templates.
var WwwBloombergComExtractor = {
  domain: 'www.bloomberg.com',
  title: {
    selectors: [// normal articles
    '.lede-headline', // /graphics/ template
    'h1.article-title', // /news/ template
    'h1[class^="headline"]', 'h1.lede-text-only__hed']
  },
  author: {
    selectors: [['meta[name="parsely-author"]', 'value'], '.byline-details__link', // /graphics/ template
    '.bydek', // /news/ template
    '.author', 'p[class*="author"]']
  },
  date_published: {
    selectors: [['time.published-at', 'datetime'], ['time[datetime]', 'datetime'], ['meta[name="date"]', 'value'], ['meta[name="parsely-pub-date"]', 'value'], ['meta[name="parsely-pub-date"]', 'content']]
  },
  dek: {
    selectors: []
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value'], ['meta[name="og:image"]', 'content']]
  },
  content: {
    selectors: ['.article-body__content', '.body-content', // /graphics/ template
    ['section.copy-block'], // /news/ template
    '.body-copy'],
    transforms: {},
    // Selectors removed from the extracted result.
    clean: ['.inline-newsletter', '.page-ad']
  }
};
// Extraction rules for www.bustle.com articles.
var WwwBustleComExtractor = {
  domain: 'www.bustle.com',
  title: {
    selectors: ['h1', 'h1.post-page__title']
  },
  author: {
    selectors: ['a[href*="profile"]', 'div.content-meta__author']
  },
  date_published: {
    selectors: [['time', 'datetime']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['article', '.post-page__body'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.npr.org articles.
var WwwNprOrgExtractor = {
  domain: 'www.npr.org',
  title: {
    selectors: ['h1', '.storytitle']
  },
  author: {
    selectors: ['p.byline__name.byline__name--block']
  },
  date_published: {
    selectors: [['.dateblock time[datetime]', 'datetime'], ['meta[name="date"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value'], ['meta[name="twitter:image:src"]', 'value']]
  },
  content: {
    selectors: ['.storytext'],
    // String transform values rename the matched element's tag.
    transforms: {
      '.bucketwrap.image': 'figure',
      '.bucketwrap.image .credit-caption': 'figcaption'
    },
    // Drop the "enlarge" UI wrapper from the result.
    clean: ['div.enlarge_measure']
  }
};
// Extraction rules for www.recode.net articles.
var WwwRecodeNetExtractor = {
  domain: 'www.recode.net',
  title: {
    selectors: ['h1.c-page-title']
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: ['h2.c-entry-summary.p-dek']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    // The first entry is a multi-match: hero figure plus entry content together.
    selectors: [['figure.e-image--hero', '.c-entry-content'], '.c-entry-content'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for qz.com articles.
var QzComExtractor = {
  domain: 'qz.com',
  title: {
    selectors: ['article header h1']
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], ['time[datetime]', 'datetime']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value'], ['meta[property="og:image"]', 'content'], ['meta[name="twitter:image"]', 'content']]
  },
  content: {
    selectors: ['#article-content'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.dmagazine.com articles.
var WwwDmagazineComExtractor = {
  domain: 'www.dmagazine.com',
  title: {
    selectors: ['h1.story__title']
  },
  author: {
    selectors: ['.story__info .story__info__item:first-child']
  },
  date_published: {
    // The date is free text inside the story info block; it is parsed with
    // the explicit format/timezone below.
    selectors: ['.story__info'],
    timezone: 'America/Chicago',
    format: 'MMMM D, YYYY h:mm a'
  },
  dek: {
    selectors: ['.story__subhead']
  },
  lead_image_url: {
    // The lead image URL is taken from the anchor wrapping the figure.
    selectors: [['article figure a:first-child', 'href']]
  },
  content: {
    selectors: ['.story__content'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.reuters.com articles.
var WwwReutersComExtractor = {
  domain: 'www.reuters.com',
  title: {
    selectors: ['h1[class*="ArticleHeader-headline-"]', 'h1.article-headline']
  },
  author: {
    selectors: [['meta[name="og:article:author"]', 'value'], '.author']
  },
  date_published: {
    selectors: [['meta[name="og:article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.ArticleBodyWrapper', '#article-text'],
    // Subtitles are plain divs on the page; promote them to headings.
    transforms: {
      '.article-subtitle': 'h4'
    },
    // Strip byline containers that would otherwise appear inside the body.
    clean: ['div[class^="ArticleBody-byline-container-"]', '#article-byline .author']
  }
};
// Extraction rules for mashable.com articles.
var MashableComExtractor = {
  domain: 'mashable.com',
  title: {
    selectors: ['header h1', 'h1.title']
  },
  author: {
    selectors: [['meta[name="article:author"]', 'value'], 'span.author_name a']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['#article', 'section.article-content.blueprint'],
    // Image credit divs become proper figcaptions.
    transforms: {
      '.image-credit': 'figcaption'
    },
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.chicagotribune.com articles.
var WwwChicagotribuneComExtractor = {
  domain: 'www.chicagotribune.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value']]
  },
  author: {
    selectors: ['div.article_byline span:first-of-type']
  },
  date_published: {
    selectors: ['time']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['article'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.vox.com articles.
var WwwVoxComExtractor = {
  domain: 'www.vox.com',
  title: {
    selectors: ['h1.c-page-title']
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: ['.p-dek']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    // Hero figure plus entry content together, with a plain-content fallback.
    selectors: [['figure.e-image--hero', '.c-entry-content'], '.c-entry-content'],
    transforms: {
      // Lazy-loaded figures keep the real <img> inside a <noscript>; swap the
      // dynamic placeholder for that static markup so the image survives.
      'figure .e-image__image noscript': function ($node) {
        var staticMarkup = $node.html();
        $node.parents('.e-image__image').find('.c-dynamic-image').replaceWith(staticMarkup);
      },
      // Image metadata becomes a proper caption.
      'figure .e-image__meta': 'figcaption'
    },
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for news.nationalgeographic.com articles.
var NewsNationalgeographicComExtractor = {
  domain: 'news.nationalgeographic.com',
  title: {
    selectors: ['h1', 'h1.main-title']
  },
  author: {
    selectors: ['.byline-component__contributors b span']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
    format: 'ddd MMM DD HH:mm:ss zz YYYY',
    timezone: 'EST'
  },
  dek: {
    selectors: ['.article__deck']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: [['.parsys.content', '.__image-lead__'], '.content'],
    transforms: {
      // Surface the lazy-loaded hero image as a plain <img> prepended to the
      // article body so it is kept in the extracted content.
      '.parsys.content': function ($node, $) {
        var leadSrc = $node
          .find('.image.parbase.section')
          .find('.picturefill')
          .first()
          .data('platform-src');
        if (leadSrc) {
          $node.prepend($("<img class=\"__image-lead__\" src=\"".concat(leadSrc, "\"/>")));
        }
      }
    },
    // Large pull quotes duplicate body text; drop them.
    clean: ['.pull-quote.pull-quote--large']
  }
};
// Extraction rules for www.nationalgeographic.com articles.
var WwwNationalgeographicComExtractor = {
  domain: 'www.nationalgeographic.com',
  title: {
    selectors: ['h1', 'h1.main-title']
  },
  author: {
    selectors: ['.byline-component__contributors b span']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: ['.Article__Headline__Desc', '.article__deck']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['section.Article__Content', ['.parsys.content', '.__image-lead__'], '.content'],
    transforms: {
      // Surface the lazy-loaded lead image(s) as plain <img> tags prepended to
      // the body. Image-group leads carry two data-* paths; single leads carry
      // one platform-src.
      '.parsys.content': function ($node, $) {
        var $lead = $node.children().first();
        if (!$lead.hasClass('imageGroup')) {
          var leadSrc = $node.find('.image.parbase.section').find('.picturefill').first().data('platform-src');
          if (leadSrc) {
            $node.prepend($("<img class=\"__image-lead__\" src=\"".concat(leadSrc, "\"/>")));
          }
          return;
        }
        var $attrContainer = $lead.find('.media--medium__container').children().first();
        var firstPath = $attrContainer.data('platform-image1-path');
        var secondPath = $attrContainer.data('platform-image2-path');
        if (secondPath && firstPath) {
          $node.prepend($("<div class=\"__image-lead__\">\n <img src=\"".concat(firstPath, "\"/>\n <img src=\"").concat(secondPath, "\"/>\n </div>")));
        }
      }
    },
    // Small pull quotes duplicate body text; drop them.
    clean: ['.pull-quote.pull-quote--small']
  }
};
// Extraction rules for www.latimes.com (new template plus legacy trb_* pages).
var WwwLatimesComExtractor = {
  domain: 'www.latimes.com',
  title: {
    selectors: ['h1.headline', '.trb_ar_hl']
  },
  author: {
    selectors: ['a[data-click="standardBylineAuthorName"]', ['meta[name="author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], ['meta[itemprop="datePublished"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.page-article-body', '.trb_ar_main'],
    transforms: {
      // Collapse the lead-art wrapper down to just its <figure>.
      '.trb_ar_la': function ($node) {
        var $art = $node.find('figure');
        $node.replaceWith($art);
      }
    },
    // Strip the byline and credit blocks from the body.
    clean: ['.trb_ar_by', '.trb_ar_cr']
  }
};
// Extraction rules for pagesix.com; the same rules also apply to nypost.com.
var PagesixComExtractor = {
  domain: 'pagesix.com',
  supportedDomains: ['nypost.com'],
  title: {
    selectors: [['meta[name="og:title"]', 'value']]
  },
  author: {
    selectors: ['.byline']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: [['meta[name="description"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    // Featured image plus entry content together, with a plain fallback.
    selectors: [['#featured-image-wrapper', '.entry-content'], '.entry-content'],
    // Promote the featured-image wrapper and WP captions to semantic markup.
    transforms: {
      '#featured-image-wrapper': 'figure',
      '.wp-caption-text': 'figcaption'
    },
    // Drop modal/lightbox trigger elements.
    clean: ['.modal-trigger']
  }
};
// Extraction rules for thefederalistpapers.org articles.
var ThefederalistpapersOrgExtractor = {
  domain: 'thefederalistpapers.org',
  title: {
    selectors: ['h1.entry-title']
  },
  author: {
    selectors: ['.author-meta-title', 'main span.entry-author-name']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.content'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Strip sharing widgets, comment UI, related-post blocks and inline-styled
    // paragraphs from the result.
    clean: ['header', '.article-sharing', '.after-article', '.type-commenting', '.more-posts', ['p[style]']]
  }
};
// Extraction rules for www.cbssports.com articles.
var WwwCbssportsComExtractor = {
  domain: 'www.cbssports.com',
  title: {
    selectors: ['.Article-headline', '.article-headline']
  },
  author: {
    selectors: ['.ArticleAuthor-nameText', '.author-name']
  },
  date_published: {
    selectors: [['meta[itemprop="datePublished"]', 'value']],
    timezone: 'UTC'
  },
  dek: {
    selectors: ['.Article-subline', '.article-subline']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.article'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.msnbc.com articles.
var WwwMsnbcComExtractor = {
  domain: 'www.msnbc.com',
  title: {
    selectors: ['h1', 'h1.is-title-pane']
  },
  author: {
    selectors: ['.byline-name', '.author']
  },
  date_published: {
    selectors: [['meta[itemprop="datePublished"]', 'value'], ['meta[name="DC.date.issued"]', 'value']]
  },
  dek: {
    selectors: [['meta[name="description"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.article-body__content', '.pane-node-body'],
    transforms: {
      // Legacy pane-based pages have no inline lead image; copy it in from the
      // og:image meta tag declared in lead_image_url above.
      '.pane-node-body': function ($node, $) {
        var pair = WwwMsnbcComExtractor.lead_image_url.selectors[0];
        var selector = pair[0];
        var attr = pair[1];
        var src = $(selector).attr(attr);
        if (src) {
          $node.prepend("<img src=\"".concat(src, "\" />"));
        }
      }
    },
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.thepoliticalinsider.com articles.
var WwwThepoliticalinsiderComExtractor = {
  domain: 'www.thepoliticalinsider.com',
  title: {
    selectors: [['meta[name="sailthru.title"]', 'value']]
  },
  author: {
    selectors: [['meta[name="sailthru.author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="sailthru.date"]', 'value']],
    timezone: 'America/New_York'
  },
  dek: {
    // No dek selector identified for this site yet.
    selectors: []
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div#article-body'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.mentalfloss.com articles.
var WwwMentalflossComExtractor = {
  domain: 'www.mentalfloss.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value'], 'h1.title', '.title-group', '.inner']
  },
  author: {
    selectors: ['a[data-vars-label*="authors"]', '.field-name-field-enhanced-authors']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], '.date-display-single'],
    timezone: 'America/New_York'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['article main', 'div.field.field-name-body'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Drop <small> footnote/credit text from the result.
    clean: ['small']
  }
};
// Extraction rules for abcnews.go.com articles.
var AbcnewsGoComExtractor = {
  domain: 'abcnews.go.com',
  title: {
    selectors: ['div[class*="Article_main__body"] h1', '.article-header h1']
  },
  author: {
    selectors: ['.ShareByline span:nth-child(2)', '.authors'],
    // Strip overlay/"By" decoration from the matched byline text.
    clean: ['.author-overlay', '.by-text']
  },
  date_published: {
    // The byline text contains a human-readable timestamp; parse it with the
    // explicit format/timezone below.
    selectors: ['.ShareByline', '.timestamp'],
    format: 'MMMM D, YYYY h:mm a',
    timezone: 'America/New_York'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['article', '.article-copy'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.nydailynews.com articles.
var WwwNydailynewsComExtractor = {
  domain: 'www.nydailynews.com',
  title: {
    selectors: ['h1.headline', 'h1#ra-headline']
  },
  author: {
    selectors: ['.article_byline span', ['meta[name="parsely-author"]', 'value']]
  },
  date_published: {
    selectors: ['time', ['meta[name="sailthru.date"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['article', 'article#ra-body'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Strip tag lists, related links, editor links and share widgets.
    clean: ['dl#ra-tags', '.ra-related', 'a.ra-editor', 'dl#ra-share-bottom']
  }
};
// Extraction rules for www.cnbc.com articles.
var WwwCnbcComExtractor = {
  domain: 'www.cnbc.com',
  title: {
    selectors: ['h1.title', 'h1.ArticleHeader-headline']
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div#article_body.content', 'div.story', 'div.ArticleBody-articleBody'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.popsugar.com articles.
var WwwPopsugarComExtractor = {
  domain: 'www.popsugar.com',
  title: {
    // 'title-text' matches a custom element tag, not a class.
    selectors: ['h2.post-title', 'title-text']
  },
  author: {
    selectors: [['meta[name="article:author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['#content'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Strip share UI, tag lists and reaction widgets.
    clean: ['.share-copy-title', '.post-tags', '.reactions']
  }
};
// Extraction rules for observer.com articles.
var ObserverComExtractor = {
  domain: 'observer.com',
  title: {
    selectors: ['h1.entry-title']
  },
  author: {
    selectors: ['.author', '.vcard']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: ['h2.dek']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.entry-content'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for people.com articles.
var PeopleComExtractor = {
  domain: 'people.com',
  title: {
    selectors: ['.article-header h1', ['meta[name="og:title"]', 'value']]
  },
  author: {
    selectors: [['meta[name="sailthru.author"]', 'value'], 'a.author.url.fn']
  },
  date_published: {
    selectors: ['.mntl-attribution__item-date', ['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  dek: {
    selectors: ['.article-header h2']
  },
  content: {
    selectors: ['div[class^="loc article-content"]', 'div.article-body__inner'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.usmagazine.com articles.
var WwwUsmagazineComExtractor = {
  domain: 'www.usmagazine.com',
  title: {
    selectors: ['header h1']
  },
  author: {
    selectors: ['a.author', 'a.article-byline.tracked-offpage']
  },
  date_published: {
    timezone: 'America/New_York',
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.article-content'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Drop the related-content module from the result.
    clean: ['.module-related']
  }
};
// Extraction rules for www.rollingstone.com articles.
var WwwRollingstoneComExtractor = {
  domain: 'www.rollingstone.com',
  title: {
    selectors: ['h1.l-article-header__row--title', 'h1.content-title']
  },
  author: {
    selectors: ['a.c-byline__link', 'a.content-author.tracked-offpage']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], 'time.content-published-date'],
    timezone: 'America/New_York'
  },
  dek: {
    selectors: ['h2.l-article-header__row--lead', '.content-description']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.l-article-content', ['.lead-container', '.article-content'], '.article-content'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Strip related-links modules from the result.
    clean: ['.c-related-links-wrapper', '.module-related']
  }
};
// Extraction rules for 247sports.com articles.
var twofortysevensportsComExtractor = {
  domain: '247sports.com',
  title: {
    // Falls back to the document <title> when no article heading is found.
    selectors: ['title', 'article header h1']
  },
  author: {
    selectors: ['.article-cnt__author', '.author']
  },
  date_published: {
    selectors: [['time[data-published]', 'data-published']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.article-body', 'section.body.article'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for uproxx.com articles.
var UproxxComExtractor = {
  domain: 'uproxx.com',
  title: {
    selectors: ['div.entry-header h1']
  },
  author: {
    selectors: [['meta[name="qc:author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.entry-content'],
    // Promote image wrappers and WP media credits to semantic markup.
    transforms: {
      'div.image': 'figure',
      'div.image .wp-media-credit': 'figcaption'
    },
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.eonline.com articles.
var WwwEonlineComExtractor = {
  domain: 'www.eonline.com',
  title: {
    selectors: ['h1.article-detail__title', 'h1.article__title']
  },
  author: {
    selectors: ['.article-detail__meta__author', '.entry-meta__author a']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], ['meta[itemprop="datePublished"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: [['.article-detail__main-content section'], ['.post-content section, .post-content div.post-content__image']],
    // Promote image wrappers and credits to semantic markup.
    transforms: {
      'div.post-content__image': 'figure',
      'div.post-content__image .image__credits': 'figcaption'
    },
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.miamiherald.com articles.
// Note: no author rule is defined for this site.
var WwwMiamiheraldComExtractor = {
  domain: 'www.miamiherald.com',
  title: {
    selectors: ['h1.title']
  },
  date_published: {
    selectors: ['p.published-date'],
    timezone: 'America/New_York'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.dateline-storybody'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.refinery29.com articles.
var WwwRefinery29ComExtractor = {
  domain: 'www.refinery29.com',
  title: {
    selectors: ['h1.title']
  },
  author: {
    selectors: ['.contributor']
  },
  date_published: {
    selectors: [['meta[name="sailthru.date"]', 'value']],
    timezone: 'America/New_York'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: [['.full-width-opener', '.article-content'], '.article-content', '.body'],
    transforms: {
      // Lazy-load placeholders keep the static markup in a <noscript>; swap
      // the whole placeholder for that markup.
      'div.loading noscript': function ($node) {
        var staticHtml = $node.html();
        $node.parents('.loading').replaceWith(staticHtml);
      },
      // Promote section images/captions/text to semantic elements.
      '.section-image': 'figure',
      '.section-image .content-caption': 'figcaption',
      '.section-text': 'p'
    },
    // Drop the share widget from the result.
    clean: ['.story-share']
  }
};
// Extraction rules for www.macrumors.com articles.
var WwwMacrumorsComExtractor = {
  domain: 'www.macrumors.com',
  title: {
    selectors: ['h1', 'h1.title']
  },
  author: {
    selectors: ['article a[rel="author"]', '.author-url']
  },
  date_published: {
    selectors: [['time', 'datetime']],
    timezone: 'America/Los_Angeles'
  },
  dek: {
    selectors: [['meta[name="description"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['article', '.article'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.androidcentral.com articles.
var WwwAndroidcentralComExtractor = {
  domain: 'www.androidcentral.com',
  title: {
    selectors: ['h1', 'h1.main-title']
  },
  author: {
    selectors: [['meta[name="parsely-author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: [['meta[name="description"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['#article-body'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Strip intro blocks and blockquotes from the result.
    clean: ['.intro', 'blockquote']
  }
};
// Extraction rules for www.si.com articles.
var WwwSiComExtractor = {
  domain: 'www.si.com',
  title: {
    selectors: ['h1', 'h1.headline']
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="published"]', 'value']],
    timezone: 'America/New_York'
  },
  dek: {
    selectors: ['.m-detail-header--dek']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.m-detail--body', ['p', '.marquee_large_2x', '.component.image']],
    transforms: {
      // A <noscript> wrapping exactly one <img> is really a figure; any other
      // <noscript> is left untouched (null = no transform).
      noscript: function ($node) {
        var kids = $node.children();
        var isLoneImage = kids.length === 1 && kids.get(0).tagName === 'img';
        return isLoneImage ? 'figure' : null;
      }
    },
    // Strip inline thumbnails and their descriptive chrome from the result.
    clean: [['.inline-thumb', '.primary-message', '.description', '.instructions']]
  }
};
// Extraction rules for www.rawstory.com articles.
var WwwRawstoryComExtractor = {
  domain: 'www.rawstory.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value'], '.blog-title']
  },
  author: {
    selectors: ['div.main-post-head .social-author__name', '.blog-author a:first-of-type']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], '.blog-author a:last-of-type'],
    timezone: 'EST'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.post-body', '.blog-content'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for CNET (www.cnet.com).
var WwwCnetComExtractor = {
  domain: 'www.cnet.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value']],
  },
  author: {
    selectors: ['span.author', 'a.author'],
  },
  date_published: {
    selectors: ['time'],
    timezone: 'America/Los_Angeles',
  },
  dek: {
    selectors: ['.c-head_dek', '.article-dek'],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: [['img.__image-lead__', '.article-main-body'], '.article-main-body'],
    // Re-home the image out of its figure wrapper and mark it as the lead
    // image before the content is returned.
    transforms: {
      'figure.image': function figureImage($node) {
        var $image = $node.find('img');
        $image.attr('width', '100%');
        $image.attr('height', '100%');
        $image.addClass('__image-lead__');
        $node.remove('.imgContainer').prepend($image);
      },
    },
    // Nothing extra to strip from the result.
    clean: [],
  },
};
// Extraction rules for TODAY (www.today.com).
var WwwTodayComExtractor = {
  domain: 'www.today.com',
  title: {
    selectors: ['h1.article-hero-headline__htag', 'h1.entry-headline'],
  },
  author: {
    selectors: ['span.byline-name', ['meta[name="author"]', 'value']],
  },
  date_published: {
    selectors: ['time[datetime]', ['meta[name="DC.date.issued"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['div.article-body__content', '.entry-container'],
    // No node transforms required for this site.
    transforms: {},
    // Strip the comment-count label from the result.
    clean: ['.label-comment'],
  },
};
// Extraction rules for AL.com (www.al.com).
var WwwAlComExtractor = {
  domain: 'www.al.com',
  title: {
    selectors: [['meta[name="title"]', 'value']],
  },
  author: {
    selectors: [['meta[name="article_author"]', 'value']],
  },
  date_published: {
    selectors: [['meta[name="article_date_original"]', 'value']],
    timezone: 'EST',
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.entry-content'],
    // No node transforms required for this site.
    transforms: {},
    // Nothing extra to strip from the result.
    clean: [],
  },
};
// Extraction rules for The Penny Hoarder (www.thepennyhoarder.com).
var WwwThepennyhoarderComExtractor = {
  domain: 'www.thepennyhoarder.com',
  title: {
    selectors: [['meta[name="dcterms.title"]', 'value']],
  },
  author: {
    selectors: [['link[rel="author"]', 'title']],
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: [['.post-img', '.post-text'], '.post-text', '.single-post-content-inner'],
    // No node transforms required for this site.
    transforms: {},
    // Nothing extra to strip from the result.
    clean: [],
  },
};
// Extraction rules for Western Journalism (www.westernjournalism.com).
var WwwWesternjournalismComExtractor = {
  domain: 'www.westernjournalism.com',
  title: {
    selectors: ['title', 'h1.entry-title'],
  },
  author: {
    selectors: [['meta[name="author"]', 'value']],
  },
  date_published: {
    selectors: [['meta[name="DC.date.issued"]', 'value']],
  },
  dek: {
    selectors: ['.subtitle'],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['div.article-sharing.top + div'],
    // No node transforms required for this site.
    transforms: {},
    // Strip ad notices from the result.
    clean: ['.ad-notice-small'],
  },
};
// Extraction rules for America Now (www.americanow.com).
var WwwAmericanowComExtractor = {
  domain: 'www.americanow.com',
  title: {
    selectors: ['.title', ['meta[name="title"]', 'value']],
  },
  author: {
    selectors: ['.byline'],
  },
  date_published: {
    selectors: [['meta[name="publish_date"]', 'value']],
  },
  dek: {
    // enter selectors
    selectors: [],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: [['.article-content', '.image', '.body'], '.body'],
    // No node transforms required for this site.
    transforms: {},
    // Strip video wrappers and mobile-only elements from the result.
    clean: ['.article-video-wrapper', '.show-for-small-only'],
  },
};
// Extraction rules for ScienceFly (sciencefly.com).
var ScienceflyComExtractor = {
  domain: 'sciencefly.com',
  title: {
    selectors: ['.entry-title', '.cb-entry-title', '.cb-single-title'],
  },
  author: {
    selectors: ['div.cb-author', 'div.cb-author-title'],
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  dek: {
    // enter selectors
    selectors: [],
  },
  lead_image_url: {
    selectors: [['div.theiaPostSlider_slides img', 'src']],
  },
  content: {
    selectors: ['div.theiaPostSlider_slides'],
    // No node transforms required for this site.
    transforms: {},
    // Nothing extra to strip from the result.
    clean: [],
  },
};
// Extraction rules for HelloGiggles (hellogiggles.com).
var HellogigglesComExtractor = {
  domain: 'hellogiggles.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value'], '.title'],
  },
  author: {
    selectors: ['.byline-wrapper span.author_name', '.author-link'],
  },
  date_published: {
    selectors: [
      ['meta[property="article:published_time"]', 'content'],
      ['meta[name="article:published_time"]', 'value'],
    ],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.main-content', '.entry-content'],
    // No node transforms required for this site.
    transforms: {},
    // Nothing extra to strip from the result.
    clean: [],
  },
};
// Extraction rules for Thought Catalog (thoughtcatalog.com).
var ThoughtcatalogComExtractor = {
  domain: 'thoughtcatalog.com',
  title: {
    selectors: ['h1.title', ['meta[name="og:title"]', 'value']],
  },
  author: {
    selectors: [
      'cite a',
      'div.col-xs-12.article_header div.writer-container.writer-container-inline.writer-no-avatar h4.writer-name',
      'h1.writer-name',
    ],
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.entry.post'],
    // No node transforms required for this site.
    transforms: {},
    // Strip watermarks and figure captions from the result.
    clean: ['.tc_mark', 'figcaption'],
  },
};
// Extraction rules for The Inquisitr (www.inquisitr.com).
var WwwInquisitrComExtractor = {
  domain: 'www.inquisitr.com',
  title: {
    selectors: ['h1.entry-title.story--header--title'],
  },
  author: {
    selectors: ['div.story--header--author'],
  },
  date_published: {
    selectors: [['meta[name="datePublished"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    // FIX: the fallback selector was '.entry-content.' — the trailing '.'
    // makes it an invalid CSS class selector, so the fallback could never
    // match (css-select rejects a '.' with no class name after it).
    selectors: ['article.story', '.entry-content'],
    // No node transforms required for this site.
    transforms: {},
    // Strip category labels and story-header chrome from the result.
    clean: ['.post-category', '.story--header--socials', '.story--header--content'],
  },
};
// Extraction rules for NBC News (www.nbcnews.com).
var WwwNbcnewsComExtractor = {
  domain: 'www.nbcnews.com',
  title: {
    selectors: ['div.article-hero-headline h1', 'div.article-hed h1'],
  },
  author: {
    selectors: ['div.article-inline-byline span.byline-name', 'span.byline_author'],
  },
  date_published: {
    selectors: [
      ['meta[name="article:published"]', 'value'],
      ['.flag_article-wrapper time.timestamp_article[datetime]', 'datetime'],
      '.flag_article-wrapper time',
    ],
    timezone: 'America/New_York',
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['div.article-body__content', 'div.article-body'],
    // No node transforms required for this site.
    transforms: {},
    // Nothing extra to strip from the result.
    clean: [],
  },
};
// Extraction rules for Fortune (fortune.com).
var FortuneComExtractor = {
  domain: 'fortune.com',
  title: {
    selectors: ['h1'],
  },
  author: {
    selectors: [['meta[name="author"]', 'value']],
  },
  date_published: {
    selectors: ['.MblGHNMJ'],
    timezone: 'UTC',
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: [['picture', 'article.row'], 'article.row'],
    // No node transforms required for this site.
    transforms: {},
    // Nothing extra to strip from the result.
    clean: [],
  },
};
// Extraction rules for LinkedIn articles (www.linkedin.com).
var WwwLinkedinComExtractor = {
  domain: 'www.linkedin.com',
  title: {
    selectors: ['.article-title', 'h1'],
  },
  author: {
    selectors: [
      '.main-author-card h3',
      ['meta[name="article:author"]', 'value'],
      '.entity-name a[rel=author]',
    ],
  },
  date_published: {
    selectors: ['.base-main-card__metadata', ['time[itemprop="datePublished"]', 'datetime']],
    timezone: 'America/Los_Angeles',
  },
  dek: {
    // enter selectors
    selectors: [],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.article-content__body', ['header figure', '.prose'], '.prose'],
    // No node transforms required for this site.
    transforms: {},
    // Strip entity images from the result.
    clean: ['.entity-image'],
  },
};
// Extraction rules for the Obama White House archives
// (obamawhitehouse.archives.gov), also used for whitehouse.gov.
var ObamawhitehouseArchivesGovExtractor = {
  domain: 'obamawhitehouse.archives.gov',
  supportedDomains: ['whitehouse.gov'],
  title: {
    selectors: ['h1', '.pane-node-title'],
  },
  author: {
    selectors: ['.blog-author-link', '.node-person-name-link'],
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  dek: {
    selectors: ['.field-name-field-forall-summary'],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    // Skip the generic cleaner; only the explicit `clean` list below applies.
    defaultCleaner: false,
    selectors: ['div#content-start', '.pane-node-field-forall-body'],
    // No node transforms required for this site.
    transforms: {},
    // Strip the duplicated title pane and custom pane from the result.
    clean: ['.pane-node-title', '.pane-custom.pane-1'],
  },
};
// Extraction rules for Opposing Views (www.opposingviews.com).
var WwwOpposingviewsComExtractor = {
  domain: 'www.opposingviews.com',
  title: {
    selectors: ['h1.m-detail-header--title', 'h1.title'],
  },
  author: {
    selectors: [['meta[name="author"]', 'value'], 'div.date span span a'],
  },
  date_published: {
    selectors: [
      ['meta[name="published"]', 'value'],
      ['meta[name="publish_date"]', 'value'],
    ],
  },
  dek: {
    // enter selectors
    selectors: [],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.m-detail--body', '.article-content'],
    // No node transforms required for this site.
    transforms: {},
    // Strip mobile-only elements from the result.
    clean: ['.show-for-small-only'],
  },
};
// Extraction rules for Prospect Magazine (www.prospectmagazine.co.uk).
var WwwProspectmagazineCoUkExtractor = {
  domain: 'www.prospectmagazine.co.uk',
  title: {
    selectors: ['.blog-header__title', '.page-title'],
  },
  author: {
    selectors: ['.blog-header__author-link', '.aside_author .title'],
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], '.post-info'],
    timezone: 'Europe/London',
  },
  dek: {
    selectors: ['.blog-header__description', '.page-subtitle'],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.blog__container', 'article .post_content'],
    // No node transforms required for this site.
    transforms: {},
    // Nothing extra to strip from the result.
    clean: [],
  },
};
// Extraction rules for The Forward (forward.com).
var ForwardComExtractor = {
  domain: 'forward.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value']],
  },
  author: {
    selectors: ['.post-author a', '.author-name', ['meta[name="sailthru.author"]', 'value']],
  },
  date_published: {
    selectors: [
      ['meta[name="article:published_time"]', 'value'],
      ['meta[name="date"]', 'value'],
    ],
  },
  dek: {
    // enter selectors
    selectors: [],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.content-container article', ['.post-item-media-wrap', '.post-item p']],
    // No node transforms required for this site.
    transforms: {},
    // Strip bylines, donation boxes, and subtitles from the result.
    clean: ['.post-author', '.donate-box', '.message', '.subtitle'],
  },
};
// Extraction rules for Qdaily (www.qdaily.com).
var WwwQdailyComExtractor = {
  domain: 'www.qdaily.com',
  title: {
    selectors: ['h2', 'h2.title'],
  },
  author: {
    selectors: ['.name'],
  },
  date_published: {
    selectors: [['.date.smart-date', 'data-origindate']],
  },
  dek: {
    selectors: ['.excerpt'],
  },
  lead_image_url: {
    selectors: [['.article-detail-hd img', 'src']],
  },
  content: {
    selectors: ['.detail'],
    // No node transforms required for this site.
    transforms: {},
    // Strip lazy-load placeholders from the result.
    // NOTE(review): '.lazylad' and '.lazylood' look like misspellings of
    // '.lazyload' — confirm against the site's markup before changing them.
    clean: ['.lazyload', '.lazylad', '.lazylood'],
  },
};
// Extraction rules for Gothamist and its sibling city sites.
var GothamistComExtractor = {
  domain: 'gothamist.com',
  supportedDomains: ['chicagoist.com', 'laist.com', 'sfist.com', 'shanghaiist.com', 'dcist.com'],
  title: {
    selectors: ['h1', '.entry-header h1'],
  },
  author: {
    // There are multiple article-metadata and byline-author classes, but the
    // main article's is the 3rd child of the l-container class.
    selectors: ['.article-metadata:nth-child(3) .byline-author', '.author'],
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], 'abbr', 'abbr.published'],
    timezone: 'America/New_York',
  },
  dek: {
    selectors: [null],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.article-body', '.entry-body'],
    // Convert the site's positioned image wrappers into semantic
    // <figure>/<figcaption> markup.
    transforms: {
      'div.image-none': 'figure',
      '.image-none i': 'figcaption',
      'div.image-left': 'figure',
      '.image-left i': 'figcaption',
      'div.image-right': 'figure',
      '.image-right i': 'figcaption',
    },
    // Strip stray <br>s inside figures and gallery widgets from the result.
    clean: ['.image-none br', '.image-left br', '.image-right br', '.galleryEase'],
  },
};
// Extraction rules for The Motley Fool (www.fool.com).
var WwwFoolComExtractor = {
  domain: 'www.fool.com',
  title: {
    selectors: ['h1'],
  },
  author: {
    selectors: [['meta[name="author"]', 'value'], '.author-inline .author-name'],
  },
  date_published: {
    selectors: [['meta[name="date"]', 'value']],
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value'], 'header h2'],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.tailwind-article-body', '.article-content'],
    // Rebuild captioned images as <figure>/<figcaption> markup.
    transforms: {
      '.caption img': function captionImg($node) {
        var imageSrc = $node.attr('src');
        $node.parent().replaceWith('<figure><img src="' + imageSrc + '"/></figure>');
      },
      '.caption': 'figcaption',
    },
    // Strip the promotional pitch block from the result.
    clean: ['#pitch'],
  },
};
// Extraction rules for Slate (www.slate.com).
var WwwSlateComExtractor = {
  domain: 'www.slate.com',
  title: {
    selectors: ['.hed', 'h1'],
  },
  author: {
    selectors: ['a[rel=author]'],
  },
  date_published: {
    selectors: ['.pub-date'],
    timezone: 'America/New_York',
  },
  dek: {
    selectors: ['.dek'],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.body'],
    // No node transforms required for this site.
    transforms: {},
    // Strip author bios, pull quotes, signup forms, and comments.
    clean: ['.about-the-author', '.pullquote', '.newsletter-signup-component', '.top-comment'],
  },
};
// Extraction rules for Radio-Canada (ici.radio-canada.ca).
var IciRadioCanadaCaExtractor = {
  domain: 'ici.radio-canada.ca',
  title: {
    selectors: ['h1'],
  },
  author: {
    selectors: [['meta[name="dc.creator"]', 'value']],
  },
  date_published: {
    selectors: [['meta[name="dc.date.created"]', 'value']],
    // Dates appear like "2019-01-01|10h30" — parse with an explicit format.
    format: 'YYYY-MM-DD|HH[h]mm',
    timezone: 'America/New_York',
  },
  dek: {
    selectors: ['div.lead-container', '.bunker-component.lead'],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['section.document-content-style', ['.main-multimedia-item', '.news-story-content']],
    // No node transforms required for this site.
    transforms: {},
    // Nothing extra to strip from the result.
    clean: [],
  },
};
// Extraction rules for Fortinet (www.fortinet.com).
var WwwFortinetComExtractor = {
  domain: 'www.fortinet.com',
  title: {
    selectors: ['h1'],
  },
  author: {
    selectors: ['.b15-blog-meta__author'],
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['div.responsivegrid.aem-GridColumn.aem-GridColumn--default--12'],
    // Promote a <noscript> wrapping a single <img> to a <figure>.
    transforms: {
      noscript: function noscript($node) {
        var kids = $node.children();
        if (kids.length === 1 && kids.get(0).tagName === 'img') {
          return 'figure';
        }
        return null;
      },
    },
  },
};
// Extraction rules for Fast Company (www.fastcompany.com).
var WwwFastcompanyComExtractor = {
  domain: 'www.fastcompany.com',
  title: {
    selectors: ['h1'],
  },
  author: {
    selectors: [['meta[name="author"]', 'value']],
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  dek: {
    selectors: ['.post__deck'],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.post__article'],
  },
};
// Extraction rules for Blister Review (blisterreview.com).
var BlisterreviewComExtractor = {
  domain: 'blisterreview.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value'], 'h1.entry-title'],
  },
  author: {
    selectors: ['span.author-name'],
  },
  date_published: {
    selectors: [
      ['meta[name="article:published_time"]', 'value'],
      ['time.entry-date', 'datetime'],
      ['meta[itemprop="datePublished"]', 'content'],
    ],
  },
  dek: {
    // enter selectors
    selectors: [],
  },
  lead_image_url: {
    selectors: [
      ['meta[name="og:image"]', 'value'],
      ['meta[property="og:image"]', 'content'],
      ['meta[itemprop="image"]', 'content'],
      ['meta[name="twitter:image"]', 'content'],
      ['img.attachment-large', 'src'],
    ],
  },
  content: {
    selectors: [
      ['.elementor-section-wrap', '.elementor-text-editor > p, .elementor-text-editor > ul > li, .attachment-large, .wp-caption-text'],
    ],
    // Render caption elements as plain paragraphs.
    transforms: {
      figcaption: 'p',
    },
    // Strip the comments section from the result.
    clean: ['.comments-area'],
  },
};
// Extraction rules for Mynavi News (news.mynavi.jp).
var NewsMynaviJpExtractor = {
  domain: 'news.mynavi.jp',
  title: {
    selectors: [['meta[name="og:title"]', 'value']],
  },
  author: {
    selectors: ['a.articleHeader_name', 'main div.article-author a.article-author__name'],
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['div.article-body', 'main article div'],
    // Swap lazy-load placeholders for the real URL kept in `data-original`.
    transforms: {
      img: function img($node) {
        var src = $node.attr('data-original');
        // FIX: the old guard was `src !== ''`, which also passed when the
        // attribute was absent (src === undefined) and then tried to write an
        // undefined src. Only rewrite when a non-empty value actually exists.
        if (src != null && src !== '') {
          $node.attr('src', src);
        }
      },
    },
    // Nothing extra to strip from the result.
    clean: [],
  },
};
// Extraction rules for ClinicalTrials.gov study records.
var ClinicaltrialsGovExtractor = {
  domain: 'clinicaltrials.gov',
  title: {
    selectors: ['h1.tr-solo_record'],
  },
  author: {
    selectors: ['div#sponsor.tr-info-text'],
  },
  date_published: {
    // selectors: ['span.term[data-term="Last Update Posted"]'],
    selectors: ['div:has(> span.term[data-term="Last Update Posted"])'],
  },
  content: {
    selectors: ['div#tab-body'],
    // No node transforms required for this site.
    transforms: {},
    // Strip alert images from the result.
    clean: ['.usa-alert> img'],
  },
};
// Extraction rules for GitHub repository pages (github.com).
var GithubComExtractor = {
  domain: 'github.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value']],
  },
  author: {
    // enter author selectors
    selectors: [],
  },
  date_published: {
    selectors: [
      ['relative-time[datetime]', 'datetime'],
      ['span[itemprop="dateModified"] relative-time', 'datetime'],
    ],
  },
  dek: {
    selectors: [['meta[name="description"]', 'value'], 'span[itemprop="about"]'],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: [['#readme article']],
    // No node transforms required for this site.
    transforms: {},
    // Nothing extra to strip from the result.
    clean: [],
  },
};
// Extraction rules for Reddit post pages (www.reddit.com).
var WwwRedditComExtractor = {
  domain: 'www.reddit.com',
  title: {
    selectors: ['div[data-test-id="post-content"] h1', 'div[data-test-id="post-content"] h2'],
  },
  author: {
    selectors: ['div[data-test-id="post-content"] a[href*="user/"]'],
  },
  date_published: {
    selectors: [
      'div[data-test-id="post-content"] span[data-click-id="timestamp"]',
      'div[data-test-id="post-content"] a[data-click-id="timestamp"]',
    ],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: [
      ['div[data-test-id="post-content"] p'], // text post
      [
        'div[data-test-id="post-content"] a[target="_blank"]:not([data-click-id="timestamp"])', // external link
        'div[data-test-id="post-content"] div[data-click-id="media"]',
      ], // external link with media preview (YouTube, imgur album, etc...)
      ['div[data-test-id="post-content"] div[data-click-id="media"]'], // Embedded media (Reddit video)
      ['div[data-test-id="post-content"] a'], // external link
      'div[data-test-id="post-content"]',
    ],
    // Link-preview thumbnails are rendered as a CSS background image on a
    // div[role="img"]; lift the URL back onto the nested <img>.
    transforms: {
      'div[role="img"]': function divRoleImg($node) {
        var $image = $node.find('img');
        var background = $node.css('background-image');
        if ($image.length === 1 && background) {
          $image.attr('src', background.match(/\((.*?)\)/)[1].replace(/('|")/g, ''));
          return $image;
        }
        return $node;
      },
    },
    // Strip icons, award badges, and the comments link from the result.
    clean: ['.icon', 'span[id^="PostAwardBadges"]', 'div a[data-test-id="comments-page-link-num-comments"]'],
  },
};
// Extraction rules for OTRS (otrs.com).
var OtrsComExtractor = {
  domain: 'otrs.com',
  title: {
    selectors: ['#main article h1'],
  },
  author: {
    selectors: ['div.dateplusauthor a'],
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['#main article'],
    // Skip the generic cleaner; only the explicit `clean` list below applies.
    defaultCleaner: false,
    transforms: {},
    clean: ['div.dateplusauthor', 'div.gr-12.push-6.footershare', '#atftbx', 'div.category-modul'],
  },
};
// Extraction rules for OSS News (www.ossnews.jp).
var WwwOssnewsJpExtractor = {
  domain: 'www.ossnews.jp',
  title: {
    selectors: ['#alpha-block h1.hxnewstitle'],
  },
  author: null,
  date_published: {
    selectors: ['p.fs12'],
    // Japanese-formatted timestamp, e.g. "2019年01月01日 10:30".
    format: 'YYYY年MM月DD日 HH:mm',
    timezone: 'Asia/Tokyo',
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['#alpha-block .section:has(h1.hxnewstitle)'],
    defaultCleaner: false,
    transforms: {},
    clean: [],
  },
};
// Extraction rules for BUZZAP! (buzzap.jp).
var BuzzapJpExtractor = {
  domain: 'buzzap.jp',
  title: {
    selectors: ['h1.entry-title'],
  },
  author: null,
  date_published: {
    selectors: [['time.entry-date', 'datetime']],
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['div.ctiframe'],
    defaultCleaner: false,
    transforms: {},
    clean: [],
  },
};
// Extraction rules for The Asahi Shimbun (www.asahi.com).
var WwwAsahiComExtractor = {
  domain: 'www.asahi.com',
  title: {
    selectors: ['main h1', '.ArticleTitle h1'],
  },
  author: {
    selectors: [['meta[name="article:author"]', 'value']],
  },
  date_published: {
    selectors: [['meta[name="pubdate"]', 'value']],
  },
  dek: null,
  excerpt: {
    selectors: [['meta[name="og:description"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['main'],
    defaultCleaner: false,
    transforms: {},
    clean: ['div.AdMod', 'div.LoginSelectArea', 'time', 'div.notPrint'],
  },
};
// Extraction rules for Sanwa Supply (www.sanwa.co.jp).
var WwwSanwaCoJpExtractor = {
  domain: 'www.sanwa.co.jp',
  title: {
    selectors: ['#newsContent h1'],
  },
  author: null,
  date_published: {
    selectors: ['dl.date'],
    // Dotted date format, e.g. "2019.01.01".
    format: 'YYYY.MM.DD',
    timezone: 'Asia/Tokyo',
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['#newsContent'],
    defaultCleaner: false,
    transforms: {},
    clean: ['#smartphone', 'div.sns_box', 'div.contentFoot'],
  },
};
// Extraction rules for Elecom (www.elecom.co.jp).
var WwwElecomCoJpExtractor = {
  domain: 'www.elecom.co.jp',
  title: {
    selectors: ['title'],
  },
  author: null,
  date_published: {
    selectors: ['p.section-last'],
    format: 'YYYY.MM.DD',
    timezone: 'Asia/Tokyo',
  },
  dek: null,
  lead_image_url: null,
  content: {
    selectors: ['td.TableMain2'],
    defaultCleaner: false,
    // Let tables reflow instead of keeping fixed widths.
    transforms: {
      table: function table($node) {
        $node.attr('width', 'auto');
      },
    },
    clean: [],
  },
};
// Extraction rules for ScanNetSecurity (scan.netsecurity.ne.jp).
var ScanNetsecurityNeJpExtractor = {
  domain: 'scan.netsecurity.ne.jp',
  title: {
    selectors: ['header.arti-header h1.head'],
  },
  author: null,
  date_published: {
    selectors: [['meta[name="article:modified_time"]', 'value']],
  },
  dek: {
    selectors: ['header.arti-header p.arti-summary'],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['div.arti-content.arti-content--thumbnail'],
    defaultCleaner: false,
    transforms: {},
    clean: ['aside.arti-giga'],
  },
};
// Extraction rules for JVN iPedia (jvndb.jvn.jp).
var JvndbJvnJpExtractor = {
  domain: 'jvndb.jvn.jp',
  title: {
    selectors: ['title'],
  },
  author: null,
  date_published: {
    selectors: ['div.modifytxt:nth-child(2)'],
    format: 'YYYY/MM/DD',
    timezone: 'Asia/Tokyo',
  },
  dek: null,
  lead_image_url: null,
  content: {
    selectors: ['#news-list'],
    defaultCleaner: false,
    transforms: {},
    clean: [],
  },
};
// Extraction rules for Genius lyrics pages (genius.com).
var GeniusComExtractor = {
  domain: 'genius.com',
  title: {
    selectors: ['h1'],
  },
  author: {
    selectors: ['h2 a'],
  },
  date_published: {
    // Page metadata is embedded as JSON; pull the release date out of it.
    selectors: [
      ['meta[itemprop=page_data]', 'value', function (res) {
        var pageData = JSON.parse(res);
        return pageData.song.release_date;
      }],
    ],
  },
  dek: {
    // enter selectors
    selectors: [],
  },
  lead_image_url: {
    // Same embedded JSON carries the album cover art URL.
    selectors: [
      ['meta[itemprop=page_data]', 'value', function (res) {
        var pageData = JSON.parse(res);
        return pageData.song.album.cover_art_url;
      }],
    ],
  },
  content: {
    selectors: ['.lyrics'],
    // No node transforms required for this site.
    transforms: {},
    // Nothing extra to strip from the result.
    clean: [],
  },
};
// Extraction rules for JNSA (www.jnsa.org).
var WwwJnsaOrgExtractor = {
  domain: 'www.jnsa.org',
  title: {
    selectors: ['#wgtitle h2'],
  },
  author: null,
  date_published: null,
  dek: null,
  excerpt: {
    selectors: [['meta[name="og:description"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['#main_area'],
    transforms: {},
    // Strip breadcrumbs and the sidebar from the result.
    clean: ['#pankuzu', '#side'],
  },
};
// Extraction rules for phpspot (phpspot.org).
var PhpspotOrgExtractor = {
  domain: 'phpspot.org',
  title: {
    selectors: ['h3.hl'],
  },
  author: null,
  date_published: {
    selectors: ['h4.hl'],
    // Japanese-formatted date, e.g. "2019年01月01日".
    format: 'YYYY年MM月DD日',
    timezone: 'Asia/Tokyo',
  },
  dek: null,
  lead_image_url: null,
  content: {
    selectors: ['div.entrybody'],
    defaultCleaner: false,
    transforms: {},
    clean: [],
  },
};
// Extraction rules for InfoQ (www.infoq.com).
var WwwInfoqComExtractor = {
  domain: 'www.infoq.com',
  title: {
    selectors: ['h1.heading'],
  },
  author: {
    selectors: ['div.widget.article__authors'],
  },
  date_published: {
    selectors: ['.article__readTime.date'],
    // Japanese-formatted date, e.g. "2019年01月01日".
    format: 'YYYY年MM月DD日',
    timezone: 'Asia/Tokyo',
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['div.article__data'],
    defaultCleaner: false,
    transforms: {},
    clean: [],
  },
};
// Site-specific extraction rules for www.moongift.jp.
var WwwMoongiftJpExtractor = {
  domain: 'www.moongift.jp',
  title: {
    selectors: ['h1.title a']
  },
  author: null,
  date_published: {
    // First non-social meta list item carries the publish date (JST).
    selectors: ['ul.meta li:not(.social):first-of-type'],
    timezone: 'Asia/Tokyo'
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['#main'],
    transforms: {},
    clean: ['ul.mg_service.cf']
  }
};
// Site-specific extraction rules for www.itmedia.co.jp and its sibling
// ITmedia-family sites (shared CMS markup: #cmsTitle/#cmsAbstract/#cmsBody).
var WwwItmediaCoJpExtractor = {
  domain: 'www.itmedia.co.jp',
  supportedDomains: ['www.atmarkit.co.jp', 'techtarget.itmedia.co.jp', 'nlab.itmedia.co.jp'],
  title: {
    selectors: ['#cmsTitle h1']
  },
  author: {
    selectors: ['#byline']
  },
  date_published: {
    // Uses the modified time; no published-time meta tag is targeted here.
    selectors: [['meta[name="article:modified_time"]', 'value']]
  },
  dek: {
    selectors: ['#cmsAbstract h2']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['#cmsBody'],
    // Skip the generic content cleaner; the selected node is used as-is.
    defaultCleaner: false,
    transforms: {},
    clean: ['#snsSharebox']
  }
};
// Site-specific extraction rules for www.publickey1.jp.
var WwwPublickey1JpExtractor = {
  domain: 'www.publickey1.jp',
  title: {
    selectors: ['h1']
  },
  author: {
    selectors: ['.bloggerinchief p:first-of-type', '#subcol p:has(img)']
  },
  date_published: {
    selectors: ['div.pubdate'],
    // Japanese-formatted date string, JST.
    format: 'YYYY年MM月DD日',
    timezone: 'Asia/Tokyo'
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['#maincol'],
    // Skip the generic content cleaner; the selected node is used as-is.
    defaultCleaner: false,
    transforms: {},
    clean: ['#breadcrumbs', 'div.sbm', 'div.ad_footer']
  }
};
// Site-specific extraction rules for takagi-hiromitsu.jp.
var TakagihiromitsuJpExtractor = {
  domain: 'takagi-hiromitsu.jp',
  title: {
    selectors: ['h3']
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    // Falls back on the HTTP-equiv Last-Modified meta tag for a date.
    selectors: [['meta[http-equiv="Last-Modified"]', 'value']]
  },
  dek: null,
  lead_image_url: null,
  content: {
    selectors: ['div.body'],
    // Skip the generic content cleaner; the selected node is used as-is.
    defaultCleaner: false,
    transforms: {},
    clean: []
  }
};
// Site-specific extraction rules for bookwalker.jp.
// Each field lists a newer-layout selector first, then a legacy-layout fallback.
var BookwalkerJpExtractor = {
  domain: 'bookwalker.jp',
  title: {
    selectors: ['h1.p-main__title', 'h1.main-heading']
  },
  author: {
    selectors: ['div.p-author__list', 'div.authors']
  },
  date_published: {
    selectors: ['dl.p-information__data dd:nth-of-type(7)', '.work-info .work-detail:first-of-type .work-detail-contents:last-of-type'],
    timezone: 'Asia/Tokyo'
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    // The inner array is a multi-match group: both nodes are collected together.
    selectors: ['div.p-main__information', ['div.main-info', 'div.main-cover-inner']],
    // Skip the generic content cleaner; the selected nodes are used as-is.
    defaultCleaner: false,
    transforms: {},
    clean: ['span.label.label--trial', 'dt.info-head.info-head--coin', 'dd.info-contents.info-contents--coin', 'div.info-notice.fn-toggleClass']
  }
};
// Site-specific extraction rules for www.yomiuri.co.jp.
var WwwYomiuriCoJpExtractor = {
  domain: 'www.yomiuri.co.jp',
  title: {
    selectors: ['h1.title-article.c-article-title']
  },
  author: null,
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.p-main-contents'],
    transforms: {},
    clean: []
  }
};
// Site-specific extraction rules for japan.cnet.com.
var JapanCnetComExtractor = {
  domain: 'japan.cnet.com',
  title: {
    selectors: ['.leaf-headline-ttl']
  },
  author: {
    selectors: ['.writer']
  },
  date_published: {
    selectors: ['.date'],
    // Japanese-formatted datetime (e.g. 2019年01月01日 09時30分), JST.
    format: 'YYYY年MM月DD日 HH時mm分',
    timezone: 'Asia/Tokyo'
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.article_body'],
    transforms: {},
    clean: []
  }
};
// Site-specific extraction rules for deadline.com.
var DeadlineComExtractor = {
  domain: 'deadline.com',
  title: {
    selectors: ['h1']
  },
  author: {
    selectors: ['section.author h2']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.a-article-grid__main.pmc-a-grid article.pmc-a-grid-item'],
    transforms: {
      // Unwrap embedded tweets: swap the wrapper element for its inner markup.
      '.embed-twitter': function embedTwitter($node) {
        $node.replaceWith($node.html());
      }
    },
    // Drop image captions from the extracted content.
    clean: ['figcaption']
  }
};
// Site-specific extraction rules for www.gizmodo.jp.
var WwwGizmodoJpExtractor = {
  domain: 'www.gizmodo.jp',
  title: {
    selectors: ['h1.p-post-title']
  },
  author: {
    selectors: ['li.p-post-AssistAuthor']
  },
  date_published: {
    selectors: [['li.p-post-AssistTime time', 'datetime']]
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['article.p-post'],
    transforms: {
      // The thumbnail's real URL is wrapped as ...=%27<url>%27; — strip the
      // wrapper so a plain image URL remains in src.
      'img.p-post-thumbnailImage': function imgPPostThumbnailImage($node) {
        var src = $node.attr('src');

        // Fix: an image without a src attribute previously threw a TypeError
        // (calling .replace on undefined) and aborted extraction.
        if (src) {
          $node.attr('src', src.replace(/^.*=%27/, '').replace(/%27;$/, ''));
        }
      }
    },
    clean: ['h1.p-post-title', 'ul.p-post-Assist']
  }
};
// Site-specific extraction rules for getnews.jp.
var GetnewsJpExtractor = {
  domain: 'getnews.jp',
  title: {
    selectors: ['article h1']
  },
  author: {
    // Meta tag first, then the on-page profile span as a fallback.
    selectors: [['meta[name="article:author"]', 'value'], 'span.prof']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], ['ul.cattag-top time', 'datetime']]
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.post-bodycopy'],
    transforms: {},
    clean: []
  }
};
// Site-specific extraction rules for www.lifehacker.jp.
// Each field lists a newer-layout selector first, then a legacy-layout fallback.
var WwwLifehackerJpExtractor = {
  domain: 'www.lifehacker.jp',
  title: {
    selectors: ['h1[class^="article_pArticle_Title"]', 'h1.lh-summary-title']
  },
  author: {
    selectors: [['meta[name="author"]', 'value'], 'p.lh-entryDetailInner--credit']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], ['div.lh-entryDetail-header time', 'datetime']]
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div[class^="article_pArticle_Body__"]', 'div.lh-entryDetail-body'],
    transforms: {
      // Lazy-loaded images wrap their real URL as ...=%27<url>%27; — strip the
      // wrapper so a plain image URL remains in src.
      'img.lazyload': function imgLazyload($node) {
        var src = $node.attr('src');

        // Fix: a lazyload image without a src attribute previously threw a
        // TypeError (calling .replace on undefined) and aborted extraction.
        if (src) {
          $node.attr('src', src.replace(/^.*=%27/, '').replace(/%27;$/, ''));
        }
      }
    },
    clean: ['p.lh-entryDetailInner--credit']
  }
};
// Site-specific extraction rules for sect.iij.ad.jp.
var SectIijAdJpExtractor = {
  domain: 'sect.iij.ad.jp',
  title: {
    selectors: ['div.title-box-inner h1', 'h3']
  },
  author: {
    selectors: ['p.post-author a', 'dl.entrydate dd']
  },
  date_published: {
    selectors: ['time'],
    // Japanese-formatted date string, JST.
    format: 'YYYY年MM月DD日',
    timezone: 'Asia/Tokyo'
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.entry-inner', '#article'],
    transforms: {},
    clean: ['dl.entrydate']
  }
};
// Site-specific extraction rules for www.oreilly.co.jp.
var WwwOreillyCoJpExtractor = {
  domain: 'www.oreilly.co.jp',
  title: {
    selectors: [['meta[name="og:title"]', 'value'], 'h3']
  },
  author: {
    selectors: ['span[itemprop="author"]', 'li[itemprop="author"]']
  },
  date_published: {
    // schema.org datePublished, from either a dd content attr or a meta tag.
    selectors: [['dd[itemprop="datePublished"]', 'content'], ['meta[itemprop="datePublished"]', 'value']],
    timezone: 'Asia/Tokyo'
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image:secure_url"]', 'value'], ['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['section.detail', '#content'],
    // Skip the generic content cleaner; the selected node is used as-is.
    defaultCleaner: false,
    transforms: {},
    clean: ['.social-tools']
  }
};
// Site-specific extraction rules for www.ipa.go.jp.
var WwwIpaGoJpExtractor = {
  domain: 'www.ipa.go.jp',
  title: {
    selectors: ['h1']
  },
  author: null,
  date_published: {
    selectors: ['p.ipar_text_right'],
    // Japanese date with non-zero-padded month/day (e.g. 2019年1月5日), JST.
    format: 'YYYY年M月D日',
    timezone: 'Asia/Tokyo'
  },
  dek: null,
  lead_image_url: null,
  content: {
    selectors: ['#ipar_main'],
    // Skip the generic content cleaner; the selected node is used as-is.
    defaultCleaner: false,
    transforms: {},
    // The date line lives inside the content node, so strip it here.
    clean: ['p.ipar_text_right']
  }
};
// Site-specific extraction rules for weekly.ascii.jp.
var WeeklyAsciiJpExtractor = {
  domain: 'weekly.ascii.jp',
  title: {
    selectors: ['article h1', 'h1[itemprop="headline"]']
  },
  author: {
    selectors: ['p.author']
  },
  date_published: {
    selectors: ['p.date', ['meta[name="odate"]', 'value']],
    // Japanese-formatted datetime (e.g. 2019年01月01日 09:30), JST.
    format: 'YYYY年MM月DD日 HH:mm',
    timezone: 'Asia/Tokyo'
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div#contents_detail', 'div.article'],
    transforms: {},
    clean: []
  }
};
// Site-specific extraction rules for techlog.iij.ad.jp.
var TechlogIijAdJpExtractor = {
  domain: 'techlog.iij.ad.jp',
  title: {
    selectors: ['h1.entry-title']
  },
  author: {
    selectors: ['a[rel="author"]']
  },
  date_published: {
    selectors: [['time.entry-date', 'datetime']]
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.entry-content'],
    // Skip the generic content cleaner; the selected node is used as-is.
    defaultCleaner: false,
    transforms: {},
    clean: ['.wp_social_bookmarking_light']
  }
};
// Site-specific extraction rules for wired.jp.
// Each field lists a newer-layout selector first, then a legacy-layout fallback.
var WiredJpExtractor = {
  domain: 'wired.jp',
  title: {
    selectors: ['h1[data-testid="ContentHeaderHed"]', 'h1.post-title']
  },
  author: {
    selectors: [['meta[name="article:author"]', 'value'], 'p[itemprop="author"]']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], ['time', 'datetime']]
  },
  dek: {
    selectors: ['div[class^="ContentHeaderDek"]', '.post-intro']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div[data-attribute-verso-pattern="article-body"]', 'article.article-detail'],
    transforms: {
      // Lazy-loaded images keep their real location in data-original; resolve
      // it against the placeholder src and promote the result to src.
      'img[data-original]': function imgDataOriginal($node) {
        var resolved = URL$1.resolve($node.attr('src'), $node.attr('data-original'));
        $node.attr('src', resolved);
      }
    },
    clean: ['.post-category', 'time', 'h1.post-title', '.social-area-syncer']
  }
};
// Site-specific extraction rules for japan.zdnet.com.
var JapanZdnetComExtractor = {
  domain: 'japan.zdnet.com',
  title: {
    selectors: ['h1']
  },
  author: {
    // cXense analytics meta tag carries the author name.
    selectors: [['meta[name="cXenseParse:author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.article_body'],
    transforms: {},
    clean: []
  }
};
// Site-specific extraction rules for www.rbbtoday.com.
var WwwRbbtodayComExtractor = {
  domain: 'www.rbbtoday.com',
  title: {
    selectors: ['h1']
  },
  author: {
    selectors: ['.writer.writer-name']
  },
  date_published: {
    selectors: [['header time', 'datetime']]
  },
  dek: {
    selectors: [['meta[name="description"]', 'value'], '.arti-summary']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.arti-content'],
    transforms: {},
    clean: ['.arti-giga']
  }
};
// Site-specific extraction rules for www.lemonde.fr.
var WwwLemondeFrExtractor = {
  domain: 'www.lemonde.fr',
  title: {
    selectors: ['h1.article__title']
  },
  author: {
    selectors: ['.author__name']
  },
  date_published: {
    selectors: [['meta[name="og:article:published_time"]', 'value']]
  },
  dek: {
    selectors: ['.article__desc']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.article__content'],
    transforms: {},
    clean: ['figcaption']
  }
};
// Site-specific extraction rules for www.phoronix.com.
var WwwPhoronixComExtractor = {
  domain: 'www.phoronix.com',
  title: {
    selectors: ['article h1', 'article header']
  },
  author: {
    selectors: ['.author a:first-child']
  },
  date_published: {
    selectors: ['.author'],
    // 1 June 2019 at 08:34 PM EDT
    // NOTE(review): the sample above includes a PM meridiem, but the format
    // string below has no meridiem token — confirm afternoon times parse correctly.
    format: 'D MMMM YYYY at hh:mm',
    timezone: 'America/New_York'
  },
  dek: null,
  lead_image_url: null,
  content: {
    selectors: ['.content'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
// Site-specific extraction rules for pitchfork.com.
var PitchforkComExtractor = {
  domain: 'pitchfork.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value'], 'title']
  },
  author: {
    selectors: [['meta[name="article:author"]', 'value'], '.authors-detail__display-name']
  },
  date_published: {
    selectors: ['div[class^="InfoSliceWrapper-"]', ['.pub-date', 'datetime']]
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value'], '.review-detail__abstract']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value'], ['.single-album-tombstone__art img', 'src']]
  },
  content: {
    selectors: ['div.body__inner-container', '.review-detail__text']
  },
  // Extra, site-specific field: the album review score.
  extend: {
    score: {
      selectors: ['p[class*="Rating"]', '.score']
    }
  }
};
var BiorxivOrgExtractor = {
domain : 'biorxiv.org' ,
title : {
selectors : [ 'h1#page-title' ]
} ,
author : {
selectors : [ 'div.highwire-citation-biorxiv-article-top > div.highwire-cite-authors' ]
} ,
content : {
selectors : [ 'div#abstract-1' ] ,
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms : { } ,
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean : [ ]
}
} ;
// Site-specific extraction rules for epaper.zeit.de.
// The e-paper uses custom/nonstandard element names (subtitle, byline, linkbox,
// box, image-credits), which are normalized to plain HTML below.
var EpaperZeitDeExtractor = {
  domain: 'epaper.zeit.de',
  title: {
    selectors: ['p.title']
  },
  author: {
    selectors: ['.article__author']
  },
  date_published: null,
  excerpt: {
    selectors: ['subtitle']
  },
  lead_image_url: null,
  content: {
    selectors: ['.article'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    // String values rename the matched element to the given tag.
    transforms: {
      'p.title': 'h1',
      '.article__author': 'p',
      byline: 'p',
      linkbox: 'p'
    },
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['image-credits', 'box[type=citation]']
  }
};
// Site-specific extraction rules for www.ladbible.com.
var WwwLadbibleComExtractor = {
  domain: 'www.ladbible.com',
  title: {
    selectors: ['h1']
  },
  author: {
    // Styled-component class names are unstable; match on the substring.
    selectors: ['[class*=Byline]']
  },
  date_published: {
    selectors: ['time'],
    timezone: 'Europe/London'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['[class*=ArticleContainer]'],
    clean: ['time', 'source', 'a[href^="https://www.ladbible.com/"]', 'picture', '[class*=StyledCardBlock]']
  }
};
// Site-specific extraction rules for timesofindia.indiatimes.com.
var TimesofindiaIndiatimesComExtractor = {
  domain: 'timesofindia.indiatimes.com',
  title: {
    selectors: ['h1']
  },
  // Extra, site-specific field: the reporting byline (used instead of author).
  extend: {
    reporter: {
      selectors: ['div.byline'],
      transforms: {}
    }
  },
  date_published: {
    selectors: ['.byline'],
    format: 'MMM D, YYYY, HH:mm z',
    timezone: 'Asia/Kolkata'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.contentwrapper:has(section)'],
    // Skip the generic content cleaner; the selected node is used as-is.
    defaultCleaner: false,
    clean: ['section', 'h1', '.byline', '.img_cptn', '.icon_share_wrap', 'ul[itemtype="https://schema.org/BreadcrumbList"]']
  }
};
// Site-specific extraction rules for ma.ttias.be.
var MaTtiasBeExtractor = {
  domain: 'ma.ttias.be',
  title: {
    selectors: [['meta[name="twitter:title"]', 'value']]
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  content: {
    selectors: [['.content']],
    transforms: {
      // Headings carry "id" attributes whose values would score poorly and get
      // the elements removed, so the ids are cleared in both cases below.
      h2: function h2($elem) {
        $elem.attr('id', null);

        // h1 elements get demoted to h2 by the parser, so push h2 down to h3.
        return 'h3';
      },
      h1: function h1($elem) {
        $elem.attr('id', null);

        // A following h2 is removed unless a paragraph precedes it; the empty
        // paragraph added here satisfies that and is itself cleaned away later.
        $elem.after('<p></p>');
      },
      ul: function ul($elem) {
        // Link lists here look like navigation but are article content; this
        // class keeps the cleaner from stripping them.
        $elem.attr('class', 'entry-content-asset');
      }
    }
  }
};
// Site-specific extraction rules for pastebin.com.
var PastebinComExtractor = {
  domain: 'pastebin.com',
  title: {
    selectors: ['h1']
  },
  author: {
    selectors: ['.username', '.paste_box_line2 .t_us + a']
  },
  date_published: {
    selectors: ['.date', '.paste_box_line2 .t_da + span'],
    timezone: 'America/New_York',
    format: 'MMMM D, YYYY'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.source', '#selectable .text'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    // The paste renders as an ordered list of lines; flatten it into
    // plain paragraphs (string values rename the matched element).
    transforms: {
      ol: 'div',
      li: 'p'
    },
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
/* eslint-disable no-nested-ternary */
/* eslint-disable no-unused-expressions */
// Decodes Abendblatt's obfuscated paywall text in place and marks the node as
// deobfuscated. The obfuscation maps a handful of character codes to literal
// punctuation (177 -> '%', 178 -> '!', 180 -> ';', 181 -> '='), keeps spaces and
// newlines, and shifts every other printable character up by one code point.
// Returns null (the transform keeps the element's tag).
function deobfuscateAbendblattNode($node) {
  if (!$node.hasClass('obfuscated')) return null;
  var text = $node.text();
  var decoded = '';

  for (var i = 0; i < text.length; i += 1) {
    var code = text.charCodeAt(i);

    if (code === 177) {
      decoded += '%';
    } else if (code === 178) {
      decoded += '!';
    } else if (code === 180) {
      decoded += ';';
    } else if (code === 181) {
      decoded += '=';
    } else if (code === 32) {
      decoded += ' ';
    } else if (code === 10) {
      decoded += '\n';
    } else if (code > 33) {
      // Everything else is shifted by one; codes <= 33 (other than the cases
      // above) are dropped entirely.
      decoded += String.fromCharCode(code - 1);
    }
  }

  $node.html(decoded);
  $node.removeClass('obfuscated');
  $node.addClass('deobfuscated');
  return null;
}

// Site-specific extraction rules for www.abendblatt.de.
var WwwAbendblattDeExtractor = {
  domain: 'www.abendblatt.de',
  title: {
    selectors: ['h2.article__header__headline']
  },
  author: {
    selectors: ['span.author-info__name-text']
  },
  date_published: {
    selectors: [['time.teaser-stream-time', 'datetime'], ['time.article__header__date', 'datetime']]
  },
  dek: {
    selectors: [['meta[name="description"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.article__body'],
    // Obfuscated text can appear in either p or div elements; both run the
    // same decoder (previously two identical inline copies of this logic).
    transforms: {
      p: deobfuscateAbendblattNode,
      div: deobfuscateAbendblattNode
    },
    clean: []
  }
};
// Site-specific extraction rules for www.gruene.de.
var WwwGrueneDeExtractor = {
  domain: 'www.gruene.de',
  title: {
    selectors: ['header h1']
  },
  author: null,
  date_published: null,
  dek: null,
  lead_image_url: {
    // Note: property/content form (not the name/value form used elsewhere).
    selectors: [['meta[property="og:image"]', 'content']]
  },
  content: {
    // selectors: ['section'],
    // The inner array is a multi-match group: all listed nodes are collected.
    selectors: [['section header', 'section h2', 'section p', 'section ol']],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['figcaption', 'p[class]']
  }
};
// Site-specific extraction rules for www.engadget.com.
var WwwEngadgetComExtractor = {
  domain: 'www.engadget.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value']]
  },
  author: {
    selectors: ['a.th-meta[data-ylk*="subsec:author"]']
  },
  // Engadget stories have publish dates, but the only representation of them on the page
  // is in a format like "2h ago". There are also these tags with blank values:
  // <meta class="swiftype" name="published_at" data-type="date" value="">
  date_published: {
    selectors: [// enter selectors
    ]
  },
  dek: {
    selectors: ['div[class*="o-title_mark"] div']
  },
  // Engadget stories do have lead images specified by an og:image meta tag, but selecting
  // the value attribute of that tag fails. I believe the "&image" sequence of characters
  // is triggering this inability to select the attribute value.
  lead_image_url: {
    selectors: [// enter selectors
    ]
  },
  content: {
    selectors: [[// Some figures will be inside div.article-text, but some header figures/images
    // will not.
    '#page_body figure:not(div.article-text figure)', 'div.article-text']],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
// Site-specific extraction rules for arstechnica.com.
//
// Articles from this site are often paginated, but I was unable to write a CSS
// selector to find the next page. On the last page, there will be a link with a CSS
// selector indicating that the previous page is next. But the parser appears to find
// the next page without this extractor finding it, as long as the fallback option is
// left at its default value of true.
var ArstechnicaComExtractor = {
  domain: 'arstechnica.com',
  title: {
    selectors: ['title']
  },
  author: {
    selectors: ['*[rel="author"] *[itemprop="name"]']
  },
  date_published: {
    selectors: [['.byline time', 'datetime']]
  },
  dek: {
    selectors: ['h2[itemprop="description"]']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div[itemprop="articleBody"]'],
    transforms: {
      // A significant h2 that doesn't follow a paragraph would be stripped by
      // the generic cleaner; inserting an empty paragraph (itself removed
      // later) keeps the heading in the result.
      h2: function h2($heading) {
        $heading.before('<p></p>');
      }
    },
    clean: [
      // Enlarge links and separators inside image captions.
      'figcaption .enlarge-link',
      'figcaption .sep',
      // Videos could not be transformed into usable elements, so drop them.
      'figure.video',
      // Image galleries that do not work outside the site.
      '.gallery',
      'aside',
      '.sidebar'
    ]
  }
};
// Site-specific extraction rules for www.ndtv.com.
var WwwNdtvComExtractor = {
  domain: 'www.ndtv.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value'], 'h1.entry-title']
  },
  author: {
    selectors: ['span[itemprop="author"] span[itemprop="name"]']
  },
  date_published: {
    selectors: [['span[itemprop="dateModified"]', 'content']]
  },
  dek: {
    selectors: ['h2']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div[itemprop="articleBody"]'],
    transforms: {
      // This site puts a dateline in a 'b' above the first paragraph, and then somehow
      // blends it into the first paragraph with CSS. This transform moves the dateline
      // to the first paragraph.
      '.place_cont': function place_cont($node) {
        if (!$node.parents('p').length) {
          var nextSibling = $node.next('p');

          // Fix: a cheerio selection is always truthy, so the old
          // `if (nextSibling)` check never skipped the empty case and the
          // dateline was removed even when no paragraph followed. Check the
          // selection's length instead.
          if (nextSibling.length) {
            $node.remove();
            nextSibling.prepend($node);
          }
        }
      }
    },
    clean: ['.highlghts_Wdgt', '.ins_instory_dv_caption', 'input', '._world-wrapper .mt20']
  }
};
// Site-specific extraction rules for www.spektrum.de.
var SpektrumExtractor = {
  domain: 'www.spektrum.de',
  title: {
    selectors: ['.content__title']
  },
  author: {
    selectors: ['.content__author__info__name']
  },
  date_published: {
    selectors: ['.content__meta__date'],
    timezone: 'Europe/Berlin'
  },
  dek: {
    selectors: ['.content__intro']
  },
  lead_image_url: {
    selectors: [// This is how the meta tag appears in the original source code.
    ['meta[name="og:image"]', 'value'], // This is how the meta tag appears in the DOM in Chrome.
    // The selector is included here to make the code work within the browser as well.
    ['meta[property="og:image"]', 'content'], // This is the image that is shown on the page.
    // It can be slightly cropped compared to the original in the meta tag.
    '.image__article__top img']
  },
  content: {
    selectors: ['article.content'],
    clean: ['.breadcrumbs', '.hide-for-print', 'aside', 'header h2', '.image__article__top', '.content__author', '.copyright', '.callout-box']
  }
};
// Site-specific extraction rules for postlight.com.
var PostlightComExtractor = {
  domain: 'postlight.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value']]
  },
  author: {
    // Parse.ly analytics meta tag carries the author name.
    selectors: [['meta[name="parsely-author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: ['h2.single-hero__abstract']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['main.post'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['section.pl-post-link', 'aside', 'section.insights_featured_case_studies']
  }
};
// Site-specific extraction rules for www.investmentexecutive.com.
var WwwInvestmentexecutiveComExtractor = {
  domain: 'www.investmentexecutive.com',
  title: {
    selectors: ['h1']
  },
  author: {
    selectors: ['div[itemprop="author"]']
  },
  date_published: {
    selectors: [['meta[itemprop="datePublished"]', 'value']]
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['section.article-body'],
    clean: ['.hidden']
  }
};
// Site-specific extraction rules for www.cbc.ca.
var WwwCbcCaExtractor = {
  domain: 'www.cbc.ca',
  title: {
    selectors: ['h1']
  },
  author: {
    selectors: ['.authorText', '.bylineDetails']
  },
  date_published: {
    selectors: [['.timeStamp[datetime]', 'datetime']]
  },
  dek: {
    selectors: ['.deck']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.story'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
var CustomExtractors =
/*#__PURE__*/
_Object$freeze ( {
BloggerExtractor : BloggerExtractor ,
NYMagExtractor : NYMagExtractor ,
WikipediaExtractor : WikipediaExtractor ,
TwitterExtractor : TwitterExtractor ,
NYTimesExtractor : NYTimesExtractor ,
TheAtlanticExtractor : TheAtlanticExtractor ,
NewYorkerExtractor : NewYorkerExtractor ,
WiredExtractor : WiredExtractor ,
MSNExtractor : MSNExtractor ,
YahooExtractor : YahooExtractor ,
BuzzfeedExtractor : BuzzfeedExtractor ,
WikiaExtractor : WikiaExtractor ,
LittleThingsExtractor : LittleThingsExtractor ,
PoliticoExtractor : PoliticoExtractor ,
DeadspinExtractor : DeadspinExtractor ,
BroadwayWorldExtractor : BroadwayWorldExtractor ,
ApartmentTherapyExtractor : ApartmentTherapyExtractor ,
MediumExtractor : MediumExtractor ,
WwwTmzComExtractor : WwwTmzComExtractor ,
WwwWashingtonpostComExtractor : WwwWashingtonpostComExtractor ,
WwwHuffingtonpostComExtractor : WwwHuffingtonpostComExtractor ,
NewrepublicComExtractor : NewrepublicComExtractor ,
MoneyCnnComExtractor : MoneyCnnComExtractor ,
WwwThevergeComExtractor : WwwThevergeComExtractor ,
WwwCnnComExtractor : WwwCnnComExtractor ,
WwwAolComExtractor : WwwAolComExtractor ,
WwwYoutubeComExtractor : WwwYoutubeComExtractor ,
WwwTheguardianComExtractor : WwwTheguardianComExtractor ,
WwwSbnationComExtractor : WwwSbnationComExtractor ,
WwwBloombergComExtractor : WwwBloombergComExtractor ,
WwwBustleComExtractor : WwwBustleComExtractor ,
WwwNprOrgExtractor : WwwNprOrgExtractor ,
WwwRecodeNetExtractor : WwwRecodeNetExtractor ,
QzComExtractor : QzComExtractor ,
WwwDmagazineComExtractor : WwwDmagazineComExtractor ,
WwwReutersComExtractor : WwwReutersComExtractor ,
MashableComExtractor : MashableComExtractor ,
WwwChicagotribuneComExtractor : WwwChicagotribuneComExtractor ,
WwwVoxComExtractor : WwwVoxComExtractor ,
NewsNationalgeographicComExtractor : NewsNationalgeographicComExtractor ,
WwwNationalgeographicComExtractor : WwwNationalgeographicComExtractor ,
WwwLatimesComExtractor : WwwLatimesComExtractor ,
PagesixComExtractor : PagesixComExtractor ,
ThefederalistpapersOrgExtractor : ThefederalistpapersOrgExtractor ,
WwwCbssportsComExtractor : WwwCbssportsComExtractor ,
WwwMsnbcComExtractor : WwwMsnbcComExtractor ,
WwwThepoliticalinsiderComExtractor : WwwThepoliticalinsiderComExtractor ,
WwwMentalflossComExtractor : WwwMentalflossComExtractor ,
AbcnewsGoComExtractor : AbcnewsGoComExtractor ,
WwwNydailynewsComExtractor : WwwNydailynewsComExtractor ,
WwwCnbcComExtractor : WwwCnbcComExtractor ,
WwwPopsugarComExtractor : WwwPopsugarComExtractor ,
ObserverComExtractor : ObserverComExtractor ,
PeopleComExtractor : PeopleComExtractor ,
WwwUsmagazineComExtractor : WwwUsmagazineComExtractor ,
WwwRollingstoneComExtractor : WwwRollingstoneComExtractor ,
twofortysevensportsComExtractor : twofortysevensportsComExtractor ,
UproxxComExtractor : UproxxComExtractor ,
WwwEonlineComExtractor : WwwEonlineComExtractor ,
WwwMiamiheraldComExtractor : WwwMiamiheraldComExtractor ,
WwwRefinery29ComExtractor : WwwRefinery29ComExtractor ,
WwwMacrumorsComExtractor : WwwMacrumorsComExtractor ,
WwwAndroidcentralComExtractor : WwwAndroidcentralComExtractor ,
WwwSiComExtractor : WwwSiComExtractor ,
WwwRawstoryComExtractor : WwwRawstoryComExtractor ,
WwwCnetComExtractor : WwwCnetComExtractor ,
WwwTodayComExtractor : WwwTodayComExtractor ,
WwwAlComExtractor : WwwAlComExtractor ,
WwwThepennyhoarderComExtractor : WwwThepennyhoarderComExtractor ,
WwwWesternjournalismComExtractor : WwwWesternjournalismComExtractor ,
WwwAmericanowComExtractor : WwwAmericanowComExtractor ,
ScienceflyComExtractor : ScienceflyComExtractor ,
HellogigglesComExtractor : HellogigglesComExtractor ,
ThoughtcatalogComExtractor : ThoughtcatalogComExtractor ,
WwwInquisitrComExtractor : WwwInquisitrComExtractor ,
WwwNbcnewsComExtractor : WwwNbcnewsComExtractor ,
FortuneComExtractor : FortuneComExtractor ,
WwwLinkedinComExtractor : WwwLinkedinComExtractor ,
ObamawhitehouseArchivesGovExtractor : ObamawhitehouseArchivesGovExtractor ,
WwwOpposingviewsComExtractor : WwwOpposingviewsComExtractor ,
WwwProspectmagazineCoUkExtractor : WwwProspectmagazineCoUkExtractor ,
ForwardComExtractor : ForwardComExtractor ,
WwwQdailyComExtractor : WwwQdailyComExtractor ,
GothamistComExtractor : GothamistComExtractor ,
WwwFoolComExtractor : WwwFoolComExtractor ,
WwwSlateComExtractor : WwwSlateComExtractor ,
IciRadioCanadaCaExtractor : IciRadioCanadaCaExtractor ,
WwwFortinetComExtractor : WwwFortinetComExtractor ,
WwwFastcompanyComExtractor : WwwFastcompanyComExtractor ,
BlisterreviewComExtractor : BlisterreviewComExtractor ,
NewsMynaviJpExtractor : NewsMynaviJpExtractor ,
ClinicaltrialsGovExtractor : ClinicaltrialsGovExtractor ,
GithubComExtractor : GithubComExtractor ,
WwwRedditComExtractor : WwwRedditComExtractor ,
OtrsComExtractor : OtrsComExtractor ,
WwwOssnewsJpExtractor : WwwOssnewsJpExtractor ,
BuzzapJpExtractor : BuzzapJpExtractor ,
WwwAsahiComExtractor : WwwAsahiComExtractor ,
WwwSanwaCoJpExtractor : WwwSanwaCoJpExtractor ,
WwwElecomCoJpExtractor : WwwElecomCoJpExtractor ,
ScanNetsecurityNeJpExtractor : ScanNetsecurityNeJpExtractor ,
JvndbJvnJpExtractor : JvndbJvnJpExtractor ,
GeniusComExtractor : GeniusComExtractor ,
WwwJnsaOrgExtractor : WwwJnsaOrgExtractor ,
PhpspotOrgExtractor : PhpspotOrgExtractor ,
WwwInfoqComExtractor : WwwInfoqComExtractor ,
WwwMoongiftJpExtractor : WwwMoongiftJpExtractor ,
WwwItmediaCoJpExtractor : WwwItmediaCoJpExtractor ,
WwwPublickey1JpExtractor : WwwPublickey1JpExtractor ,
TakagihiromitsuJpExtractor : TakagihiromitsuJpExtractor ,
BookwalkerJpExtractor : BookwalkerJpExtractor ,
WwwYomiuriCoJpExtractor : WwwYomiuriCoJpExtractor ,
JapanCnetComExtractor : JapanCnetComExtractor ,
DeadlineComExtractor : DeadlineComExtractor ,
WwwGizmodoJpExtractor : WwwGizmodoJpExtractor ,
GetnewsJpExtractor : GetnewsJpExtractor ,
WwwLifehackerJpExtractor : WwwLifehackerJpExtractor ,
SectIijAdJpExtractor : SectIijAdJpExtractor ,
WwwOreillyCoJpExtractor : WwwOreillyCoJpExtractor ,
WwwIpaGoJpExtractor : WwwIpaGoJpExtractor ,
WeeklyAsciiJpExtractor : WeeklyAsciiJpExtractor ,
TechlogIijAdJpExtractor : TechlogIijAdJpExtractor ,
WiredJpExtractor : WiredJpExtractor ,
JapanZdnetComExtractor : JapanZdnetComExtractor ,
WwwRbbtodayComExtractor : WwwRbbtodayComExtractor ,
WwwLemondeFrExtractor : WwwLemondeFrExtractor ,
WwwPhoronixComExtractor : WwwPhoronixComExtractor ,
PitchforkComExtractor : PitchforkComExtractor ,
BiorxivOrgExtractor : BiorxivOrgExtractor ,
EpaperZeitDeExtractor : EpaperZeitDeExtractor ,
WwwLadbibleComExtractor : WwwLadbibleComExtractor ,
TimesofindiaIndiatimesComExtractor : TimesofindiaIndiatimesComExtractor ,
MaTtiasBeExtractor : MaTtiasBeExtractor ,
PastebinComExtractor : PastebinComExtractor ,
WwwAbendblattDeExtractor : WwwAbendblattDeExtractor ,
WwwGrueneDeExtractor : WwwGrueneDeExtractor ,
WwwEngadgetComExtractor : WwwEngadgetComExtractor ,
ArstechnicaComExtractor : ArstechnicaComExtractor ,
WwwNdtvComExtractor : WwwNdtvComExtractor ,
SpektrumExtractor : SpektrumExtractor ,
PostlightComExtractor : PostlightComExtractor ,
WwwInvestmentexecutiveComExtractor : WwwInvestmentexecutiveComExtractor ,
WwwCbcCaExtractor : WwwCbcCaExtractor
} ) ;
// Build the flat domain → extractor lookup used at parse time: each custom
// extractor is merged in via mergeSupportedDomains (keyed by the domains it
// declares).
var Extractors = _Object$keys(CustomExtractors).reduce(function (lookup, extractorName) {
  var extractor = CustomExtractors[extractorName];
  var merged = mergeSupportedDomains(extractor);
  return _objectSpread({}, lookup, merged);
}, {}); // CLEAN AUTHOR CONSTANTS
// Strips a leading "posted by"/"written by"/"by" prefix; the author name
// itself is captured in $2.
var CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i; // CLEAN DEK CONSTANTS
// A plain-text http(s) link; a dek containing one is rejected.
var TEXT_LINK_RE = new RegExp('http(s)?://', 'i'); // An ordered list of meta tag names that denote likely article deks.
var MS_DATE_STRING = /^\d{13}$/i; // 13 digits: epoch milliseconds
var SEC_DATE_STRING = /^\d{10}$/i; // 10 digits: epoch seconds
var CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i;
var TIME_MERIDIAN_SPACE_RE = /(.*\d)(am|pm)(.*)/i;
var TIME_MERIDIAN_DOTS_RE = /\.m\./i;
var TIME_NOW_STRING = /^\s*(just|right)?\s*now\s*/i;
var timeUnits = ['seconds?', 'minutes?', 'hours?', 'days?', 'weeks?', 'months?', 'years?'];
var allTimeUnits = timeUnits.join('|');
var TIME_AGO_STRING = new RegExp("(\\d+)\\s+(".concat(allTimeUnits, ")\\s+ago"), 'i');
var months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'];
var allMonths = months.join('|');
var timestamp1 = '[0-9]{1,2}:[0-9]{2,2}( ?[ap].?m.?)?';
var timestamp2 = '[0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4}';
var timestamp3 = '-[0-9]{3,4}$';
var SPLIT_DATE_STRING = new RegExp("(".concat(timestamp1, ")|(").concat(timestamp2, ")|(").concat(timestamp3, ")|([0-9]{1,4})|(").concat(allMonths, ")"), 'ig'); // 2016-11-22T08:57-500
// Check if datetime string has an offset at the end
var TIME_WITH_OFFSET_RE = /-\d{3,4}$/; // CLEAN TITLE CONSTANTS
// A regular expression that will match separating characters on a
// title, that usually denote breadcrumbs or something similar.
var TITLE_SPLITTERS_RE = /(: | - | \| )/g;
// FIX: the dots must be escaped. The unescaped version ('.com$', '.co.uk$',
// ...) treated '.' as "any character", so e.g. 'telecom' was stripped down
// to 'tel'. Escaped, only a literal '.com'/'.net'/'.org'/'.co.uk' suffix
// is removed.
var DOMAIN_ENDINGS_RE = new RegExp('\\.com$|\\.net$|\\.org$|\\.co\\.uk$', 'g'); // just the name(s): 'David Smith'.
function cleanAuthor(author) {
  // Drop a leading byline prefix ("By", "Posted by", ...) then collapse
  // whitespace, leaving just the name(s).
  var withoutPrefix = author.replace(CLEAN_AUTHOR_RE, '$2');
  return normalizeSpaces$1(withoutPrefix.trim());
}
function clean$1(leadImageUrl) {
  // A lead image URL is only kept when it is a syntactically valid web URI.
  var trimmed = leadImageUrl.trim();
  if (!validUrl$1.isWebUri(trimmed)) {
    return null;
  }
  return trimmed;
} // Return None if the dek wasn't good enough.
function cleanDek(dek, _ref) {
  var $ = _ref.$;
  var excerpt = _ref.excerpt;
  // Sanity check that we didn't get too short or long of a dek.
  if (dek.length > 1000 || dek.length < 5) {
    return null;
  }
  // Check that dek isn't the same as excerpt
  var isExcerptDuplicate = excerpt && excerptContent$1(excerpt, 10) === excerptContent$1(dek, 10);
  if (isExcerptDuplicate) {
    return null;
  }
  var dekText = stripTags$1(dek, $);
  // Plain text links shouldn't exist in the dek. If we have some, it's
  // not a good dek - bail.
  if (TEXT_LINK_RE.test(dekText)) {
    return null;
  }
  return normalizeSpaces$1(dekText.trim());
}
function cleanDateString(dateString) {
  // Keep only date-like fragments, normalize "a.m."/"p.m." forms, and drop
  // a leading "published:" label.
  var fragments = dateString.match(SPLIT_DATE_STRING) || [];
  var joined = fragments.join(' ');
  return joined.replace(TIME_MERIDIAN_DOTS_RE, 'm').replace(TIME_MERIDIAN_SPACE_RE, '$1 $2 $3').replace(CLEAN_DATE_STRING_RE, '$1').trim();
}
// Build a moment date from a raw date string. Three special shapes are
// handled before generic parsing: a trailing numeric UTC offset
// (e.g. "2016-11-22T08:57-500"), a relative "<n> <units> ago" phrase,
// and a "just now"-style phrase.
function createDate(dateString, timezone, format) {
  if (TIME_WITH_OFFSET_RE.test(dateString)) {
    // String carries its own offset; let the native Date parser honor it.
    return moment(new Date(dateString));
  }
  if (TIME_AGO_STRING.test(dateString)) {
    // Relative date: fragments[1] is the count, fragments[2] the unit;
    // subtract from "now".
    var fragments = TIME_AGO_STRING.exec(dateString);
    return moment().subtract(fragments[1], fragments[2]);
  }
  if (TIME_NOW_STRING.test(dateString)) {
    return moment();
  }
  // Otherwise parse with the caller's format (or a guessed one via
  // parseFormat), in the caller's timezone when provided.
  return timezone ? moment.tz(dateString, format || parseFormat(dateString), timezone) : moment(dateString, format || parseFormat(dateString));
} // Take a date published string, and hopefully return a date out of
// it. Return none if we fail.
function cleanDatePublished(dateString) {
  var _ref = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {};
  var timezone = _ref.timezone;
  var format = _ref.format;
  // Epoch timestamps: 13 digits are milliseconds, 10 digits are seconds.
  if (MS_DATE_STRING.test(dateString)) {
    return new Date(_parseInt$1(dateString, 10)).toISOString();
  }
  if (SEC_DATE_STRING.test(dateString)) {
    return new Date(_parseInt$1(dateString, 10) * 1000).toISOString();
  }
  // Try the string as-is first; if that fails, scrub it down to its
  // date-like fragments and try once more.
  var date = createDate(dateString, timezone, format);
  if (!date.isValid()) {
    date = createDate(cleanDateString(dateString), timezone, format);
  }
  return date.isValid() ? date.toISOString() : null;
}
function extractCleanNode ( article , _ref ) {
var $ = _ref . $ ,
_ref$cleanConditional = _ref . cleanConditionally ,
cleanConditionally = _ref$cleanConditional === void 0 ? true : _ref$cleanConditional ,
_ref$title = _ref . title ,
title = _ref$title === void 0 ? '' : _ref$title ,
_ref$url = _ref . url ,
url = _ref$url === void 0 ? '' : _ref$url ,
_ref$defaultCleaner = _ref . defaultCleaner ,
defaultCleaner = _ref$defaultCleaner === void 0 ? true : _ref$defaultCleaner ; // Rewrite the tag name to div if it's a top level node like body or
// html to avoid later complications with multiple body tags.
rewriteTopLevel$$1 ( article , $ ) ; // Drop small images and spacer images
// Only do this is defaultCleaner is set to true;
// this can sometimes be too aggressive.
if ( defaultCleaner ) cleanImages$1 ( article , $ ) ; // Make links absolute
makeLinksAbsolute$$1 ( article , $ , url ) ; // Mark elements to keep that would normally be removed.
// E.g., stripJunkTags will remove iframes, so we're going to mark
// YouTube/Vimeo videos as elements we want to keep.
markToKeep$1 ( article , $ , url ) ; // Drop certain tags like <title>, etc
// This is -mostly- for cleanliness, not security.
stripJunkTags$1 ( article , $ ) ; // H1 tags are typically the article title, which should be extracted
// by the title extractor instead. If there's less than 3 of them (<3),
// strip them. Otherwise, turn 'em into H2s.
cleanHOnes$$1 ( article , $ ) ; // Clean headers
cleanHeaders$1 ( article , $ , title ) ; // We used to clean UL's and OL's here, but it was leading to
// too many in-article lists being removed. Consider a better
// way to detect menus particularly and remove them.
// Also optionally running, since it can be overly aggressive.
if ( defaultCleaner ) cleanTags$$1 ( article , $ , cleanConditionally ) ; // Remove empty paragraph nodes
removeEmpty$1 ( article , $ ) ; // Remove unnecessary attributes
cleanAttributes$$1 ( article , $ ) ;
return article ;
}
// Clean an extracted title: resolve breadcrumb-style separators, fall back
// to the document's single <h1> for absurdly long titles, then strip HTML
// tags and normalize whitespace.
function cleanTitle$$1(title, _ref) {
  var url = _ref.url,
      $ = _ref.$;
  // FIX: TITLE_SPLITTERS_RE is declared with the /g flag, and
  // RegExp#test on a global regex is stateful (it resumes matching at
  // lastIndex), so successive calls could intermittently skip titles
  // that do contain a splitter. Reset lastIndex so every call starts
  // matching from the beginning of the string.
  TITLE_SPLITTERS_RE.lastIndex = 0;
  // If title has |, :, or - in it, see if
  // we can clean it up.
  if (TITLE_SPLITTERS_RE.test(title)) {
    title = resolveSplitTitle(title, url);
  }
  // Final sanity check that we didn't get a crazy title.
  // if (title.length > 150 || title.length < 15) {
  if (title.length > 150) {
    // If we did, return h1 from the document if it exists
    var h1 = $('h1');
    if (h1.length === 1) {
      title = h1.text();
    }
  }
  // strip any html tags in the title text
  return normalizeSpaces$1(stripTags$1(title, $).trim());
}
function extractBreadcrumbTitle ( splitTitle , text ) {
// This must be a very breadcrumbed title, like:
// The Best Gadgets on Earth : Bits : Blogs : NYTimes.com
// NYTimes - Blogs - Bits - The Best Gadgets on Earth
if ( splitTitle . length >= 6 ) {
// Look to see if we can find a breadcrumb splitter that happens
// more than once. If we can, we'll be able to better pull out
// the title.
var termCounts = splitTitle . reduce ( function ( acc , titleText ) {
acc [ titleText ] = acc [ titleText ] ? acc [ titleText ] + 1 : 1 ;
return acc ;
} , { } ) ;
var _Reflect$ownKeys$redu = _Reflect$ownKeys$1 ( termCounts ) . reduce ( function ( acc , key ) {
if ( acc [ 1 ] < termCounts [ key ] ) {
return [ key , termCounts [ key ] ] ;
}
return acc ;
} , [ 0 , 0 ] ) ,
_Reflect$ownKeys$redu2 = _slicedToArray$1 ( _Reflect$ownKeys$redu , 2 ) ,
maxTerm = _Reflect$ownKeys$redu2 [ 0 ] ,
termCount = _Reflect$ownKeys$redu2 [ 1 ] ; // We found a splitter that was used more than once, so it
// is probably the breadcrumber. Split our title on that instead.
// Note: max_term should be <= 4 characters, so that " >> "
// will match, but nothing longer than that.
if ( termCount >= 2 && maxTerm . length <= 4 ) {
splitTitle = text . split ( maxTerm ) ;
}
var splitEnds = [ splitTitle [ 0 ] , splitTitle . slice ( - 1 ) ] ;
var longestEnd = splitEnds . reduce ( function ( acc , end ) {
return acc . length > end . length ? acc : end ;
} , '' ) ;
if ( longestEnd . length > 10 ) {
return longestEnd ;
}
return text ;
}
return null ;
}
function cleanDomainFromTitle(splitTitle, url) {
  // Search the ends of the title, looking for bits that fuzzy match
  // the URL too closely. If one is found, discard it and return the
  // rest.
  //
  // Strip out the big TLDs - it just makes the matching a bit more
  // accurate. Not the end of the world if it doesn't strip right.
  var host = URL$1.parse(url).host;
  var nakedDomain = host.replace(DOMAIN_ENDINGS_RE, '');
  var startSlug = splitTitle[0].toLowerCase().replace(' ', '');
  var startSlugRatio = wuzzy$1.levenshtein(startSlug, nakedDomain);
  if (startSlugRatio > 0.4 && startSlug.length > 5) {
    // The first segment resembles the site name; drop it plus its splitter.
    return splitTitle.slice(2).join('');
  }
  var endSlug = splitTitle.slice(-1)[0].toLowerCase().replace(' ', '');
  var endSlugRatio = wuzzy$1.levenshtein(endSlug, nakedDomain);
  if (endSlugRatio > 0.4 && endSlug.length >= 5) {
    // The last segment resembles the site name; drop it plus its splitter.
    return splitTitle.slice(0, -2).join('');
  }
  return null;
} // Given a title with separators in it (colons, dashes, etc),
// resolve whether any of the segments should be removed.
function resolveSplitTitle(title) {
  var url = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : '';
  // Splits while preserving splitters, like:
  // ['The New New York', ' - ', 'The Washington Post']
  var splitTitle = title.split(TITLE_SPLITTERS_RE);
  if (splitTitle.length === 1) {
    return title;
  }
  // Prefer the breadcrumb heuristic; fall back to domain similarity.
  var newTitle = extractBreadcrumbTitle(splitTitle, title);
  if (newTitle) {
    return newTitle;
  }
  newTitle = cleanDomainFromTitle(splitTitle, url);
  if (newTitle) {
    return newTitle;
  }
  // Fuzzy ratio didn't find anything, so this title is probably legit.
  // Just return it all.
  return title;
}
// Map of extracted field name → cleaner function. Each raw extraction
// result is passed through the matching cleaner before being returned.
var Cleaners = {
  author: cleanAuthor,
  lead_image_url: clean$1,
  dek: cleanDek,
  date_published: cleanDatePublished,
  content: extractCleanNode,
  title: cleanTitle$$1
}; // likely to be article text.
//
// If strip_unlikely_candidates is True, remove any elements that
// match certain criteria first. (Like, does this element have a
// classname of "comment")
//
// If weight_nodes is True, use classNames and IDs to determine the
// worthiness of nodes.
//
// Returns a cheerio object $
function extractBestNode($, opts) {
  // Optionally prune obviously non-article elements before scoring.
  if (opts.stripUnlikelyCandidates) {
    $ = stripUnlikelyCandidates$1($);
  }
  // Normalize to paragraphs, score the content, then pick the winner.
  $ = convertToParagraphs$$1($);
  $ = scoreContent$$1($, opts.weightNodes);
  return findTopCandidate$$1($);
}
var GenericContentExtractor = {
defaultOpts : {
stripUnlikelyCandidates : true ,
weightNodes : true ,
cleanConditionally : true
} ,
// Extract the content for this resource - initially, pass in our
// most restrictive opts which will return the highest quality
// content. On each failure, retry with slightly more lax opts.
//
// :param return_type: string. If "node", should return the content
// as a cheerio node rather than as an HTML string.
//
// Opts:
// stripUnlikelyCandidates: Remove any elements that match
// non-article-like criteria first.(Like, does this element
// have a classname of "comment")
//
// weightNodes: Modify an elements score based on whether it has
// certain classNames or IDs. Examples: Subtract if a node has
// a className of 'comment', Add if a node has an ID of
// 'entry-content'.
//
// cleanConditionally: Clean the node to return of some
// superfluous content. Things like forms, ads, etc.
extract : function extract ( _ref , opts ) {
var $ = _ref . $ ,
html = _ref . html ,
title = _ref . title ,
url = _ref . url ;
opts = _objectSpread ( { } , this . defaultOpts , opts ) ;
$ = $ || cheerio$1 . load ( html ) ; // Cascade through our extraction-specific opts in an ordered fashion,
// turning them off as we try to extract content.
var node = this . getContentNode ( $ , title , url , opts ) ;
if ( nodeIsSufficient$1 ( node ) ) {
return this . cleanAndReturnNode ( node , $ ) ;
} // We didn't succeed on first pass, one by one disable our
// extraction opts and try again.
// eslint-disable-next-line no-restricted-syntax
var _iteratorNormalCompletion = true ;
var _didIteratorError = false ;
var _iteratorError = undefined ;
try {
for ( var _iterator = _getIterator$1 ( _Reflect$ownKeys$1 ( opts ) . filter ( function ( k ) {
return opts [ k ] === true ;
} ) ) , _step ; ! ( _iteratorNormalCompletion = ( _step = _iterator . next ( ) ) . done ) ; _iteratorNormalCompletion = true ) {
var key = _step . value ;
opts [ key ] = false ;
$ = cheerio$1 . load ( html ) ;
node = this . getContentNode ( $ , title , url , opts ) ;
if ( nodeIsSufficient$1 ( node ) ) {
break ;
}
}
} catch ( err ) {
_didIteratorError = true ;
_iteratorError = err ;
} finally {
try {
if ( ! _iteratorNormalCompletion && _iterator . return != null ) {
_iterator . return ( ) ;
}
} finally {
if ( _didIteratorError ) {
throw _iteratorError ;
}
}
}
return this . cleanAndReturnNode ( node , $ ) ;
} ,
// Get node given current options
getContentNode : function getContentNode ( $ , title , url , opts ) {
return extractCleanNode ( extractBestNode ( $ , opts ) , {
$ : $ ,
cleanConditionally : opts . cleanConditionally ,
title : title ,
url : url
} ) ;
} ,
// Once we got here, either we're at our last-resort node, or
// we broke early. Make sure we at least have -something- before we
// move forward.
cleanAndReturnNode : function cleanAndReturnNode ( node , $ ) {
if ( ! node ) {
return null ;
}
return normalizeSpaces$1 ( $ . html ( node ) ) ;
}
} ; // TODO: It would be great if we could merge the meta and selector lists into
// a list of objects, because we could then rank them better. For example,
// .hentry .entry-title is far better suited than <meta title>.
// An ordered list of meta tag names that denote likely article titles. All
// attributes should be lowercase for faster case-insensitive matching. From
// most distinct to least distinct.
var STRONG_TITLE_META_TAGS = ['tweetmeme-title', 'dc.title', 'rbtitle', 'headline', 'title']; // og:title is weak because it typically contains context that we don't like,
// for example the source site's name. Gotta get that brand into facebook!
var WEAK_TITLE_META_TAGS = ['og:title']; // An ordered list of XPath Selectors to find likely article titles. From
// most explicit to least explicit.
//
// Note - this does not use classes like CSS. This checks to see if the string
// exists in the className, which is not as accurate as .className (which
// splits on spaces/endlines), but for our purposes it's close enough. The
// speed tradeoff is worth the accuracy hit.
var STRONG_TITLE_SELECTORS = ['.hentry .entry-title', 'h1#articleHeader', 'h1.articleHeader', 'h1.article', '.instapaper_title', '#meebo-title'];
// NOTE(review): 'post post-title' (no dots) selects a <post-title> inside a
// <post> element, which looks like a typo for '.post .post-title' — confirm
// against upstream before changing, since it is runtime behavior.
var WEAK_TITLE_SELECTORS = ['article h1', '#entry-title', '.entry-title', '#entryTitle', '#entrytitle', '.entryTitle', '.entrytitle', '#articleTitle', '.articleTitle', 'post post-title', 'h1.title', 'h2.article', 'h1', 'html head title', 'title'];
var GenericTitleExtractor = {
  extract: function extract(_ref) {
    var $ = _ref.$,
        url = _ref.url,
        metaCache = _ref.metaCache;
    // Try each source from strongest to weakest signal: strong meta tags,
    // strong selectors, weak meta tags, then weak selectors. Thunks keep
    // the lookups lazy — later sources only run when earlier ones miss.
    var sources = [
      function () { return extractFromMeta$$1($, STRONG_TITLE_META_TAGS, metaCache); },
      function () { return extractFromSelectors$$1($, STRONG_TITLE_SELECTORS); },
      function () { return extractFromMeta$$1($, WEAK_TITLE_META_TAGS, metaCache); },
      function () { return extractFromSelectors$$1($, WEAK_TITLE_SELECTORS); }
    ];
    for (var i = 0; i < sources.length; i += 1) {
      var title = sources[i]();
      if (title) {
        return cleanTitle$$1(title, {
          url: url,
          $: $
        });
      }
    }
    // If no matches, return an empty string
    return '';
  }
}; // An ordered list of meta tag names that denote likely article authors. All
// attributes should be lowercase for faster case-insensitive matching. From
// most distinct to least distinct.
//
// Note: "author" is too often the -developer- of the page, so it is not
// added here.
var AUTHOR_META_TAGS = ['byl', 'clmst', 'dc.author', 'dcsext.author', 'dc.creator', 'rbauthors', 'authors'];
// Author strings longer than this are almost certainly not a byline.
var AUTHOR_MAX_LENGTH = 300; // An ordered list of XPath Selectors to find likely article authors. From
// most explicit to least explicit.
//
// Note - this does not use classes like CSS. This checks to see if the string
// exists in the className, which is not as accurate as .className (which
// splits on spaces/endlines), but for our purposes it's close enough. The
// speed tradeoff is worth the accuracy hit.
var AUTHOR_SELECTORS = ['.entry .entry-author', '.author.vcard .fn', '.author .vcard .fn', '.byline.vcard .fn', '.byline .vcard .fn', '.byline .by .author', '.byline .by', '.byline .author', '.post-author.vcard', '.post-author .vcard', 'a[rel=author]', '#by_author', '.by_author', '#entryAuthor', '.entryAuthor', '.byline a[href*=author]', '#author .authorname', '.author .authorname', '#author', '.author', '.articleauthor', '.ArticleAuthor', '.byline']; // An ordered list of Selectors to find likely article authors, with
// regular expression for content.
// Matches text that begins with "By" (any case, leading whitespace allowed).
var bylineRe = /^[\n\s]*By/i;
var BYLINE_SELECTORS_RE = [['#byline', bylineRe], ['.byline', bylineRe]];
var GenericAuthorExtractor = {
  // Returns a cleaned author string, or null when no plausible byline
  // (under AUTHOR_MAX_LENGTH characters) is found.
  extract: function extract(_ref) {
    var $ = _ref.$,
        metaCache = _ref.metaCache;
    var author;
    // First, check to see if we have a matching
    // meta tag that we can make use of.
    author = extractFromMeta$$1($, AUTHOR_META_TAGS, metaCache);
    if (author && author.length < AUTHOR_MAX_LENGTH) {
      return cleanAuthor(author);
    }
    // Second, look through our selectors looking for potential authors.
    author = extractFromSelectors$$1($, AUTHOR_SELECTORS, 2);
    if (author && author.length < AUTHOR_MAX_LENGTH) {
      return cleanAuthor(author);
    }
    // Last, use our looser regular-expression based selectors for
    // potential authors.
    // (Babel-generated for...of boilerplate over [selector, regex] pairs.)
    // eslint-disable-next-line no-restricted-syntax
    var _iteratorNormalCompletion = true;
    var _didIteratorError = false;
    var _iteratorError = undefined;
    try {
      for (var _iterator = _getIterator$1(BYLINE_SELECTORS_RE), _step; !(_iteratorNormalCompletion = (_step = _iterator.next()).done); _iteratorNormalCompletion = true) {
        var _step$value = _slicedToArray$1(_step.value, 2),
            selector = _step$value[0],
            regex = _step$value[1];
        var node = $(selector);
        // Only trust a selector that matches exactly one node whose text
        // looks like a "By ..." byline.
        if (node.length === 1) {
          var text = node.text();
          if (regex.test(text)) {
            return cleanAuthor(text);
          }
        }
      }
    } catch (err) {
      _didIteratorError = true;
      _iteratorError = err;
    } finally {
      try {
        if (!_iteratorNormalCompletion && _iterator.return != null) {
          _iterator.return();
        }
      } finally {
        if (_didIteratorError) {
          throw _iteratorError;
        }
      }
    }
    // No byline found anywhere.
    return null;
  }
}; // An ordered list of meta tag names that denote
// likely date published dates. All attributes
// should be lowercase for faster case-insensitive matching.
// From most distinct to least distinct.
var DATE_PUBLISHED_META_TAGS = ['article:published_time', 'displaydate', 'dc.date', 'dc.date.issued', 'rbpubdate', 'publish_date', 'pub_date', 'pagedate', 'pubdate', 'revision_date', 'doc_date', 'date_created', 'content_create_date', 'lastmodified', 'created', 'date']; // An ordered list of XPath Selectors to find
// likely date published dates. From most explicit
// to least explicit.
var DATE_PUBLISHED_SELECTORS = ['.hentry .dtstamp.published', '.hentry .published', '.hentry .dtstamp.updated', '.hentry .updated', '.single .published', '.meta .published', '.meta .postDate', '.entry-date', '.byline .date', '.postmetadata .date', '.article_datetime', '.date-header', '.story-date', '.dateStamp', '#story .datetime', '.dateline', '.pubdate']; // An ordered list of compiled regular expressions to find likely date
// published dates from the URL. These should always have the first
// reference be a date string that is parseable by dateutil.parser.parse
var abbrevMonthsStr = '(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)';
// Matches /2020/01/31/, 2020-01-31, and /2020/jan/31/ style URL dates.
var DATE_PUBLISHED_URL_RES = [new RegExp('/(20\\d{2}/\\d{2}/\\d{2})/', 'i'), new RegExp('(20\\d{2}-[01]\\d-[0-3]\\d)', 'i'), new RegExp("/(20\\d{2}/".concat(abbrevMonthsStr, "/[0-3]\\d)/"), 'i')];
var GenericDatePublishedExtractor = {
  extract: function extract(_ref) {
    var $ = _ref.$,
        url = _ref.url,
        metaCache = _ref.metaCache;
    // Try meta tags first (without tag-cleaning the string), then
    // date-ish selectors, and finally a date embedded in the URL itself.
    var datePublished = extractFromMeta$$1($, DATE_PUBLISHED_META_TAGS, metaCache, false);
    if (!datePublished) {
      datePublished = extractFromSelectors$$1($, DATE_PUBLISHED_SELECTORS);
    }
    if (!datePublished) {
      datePublished = extractFromUrl$1(url, DATE_PUBLISHED_URL_RES);
    }
    // The first raw hit wins; its cleaned value (possibly null) is the
    // final answer, just as in the original early-return cascade.
    return datePublished ? cleanDatePublished(datePublished) : null;
  }
}; // Currently there is only one selector for
// deks. We should simply return null here
// until we have a more robust generic option.
// Below is the original source for this, for reference.
var GenericDekExtractor = {
  // Deliberately a no-op: generic dek extraction always reports "none".
  extract: function extract() {
    return null;
  }
}; // An ordered list of meta tag names that denote likely article leading images.
// All attributes should be lowercase for faster case-insensitive matching.
// From most distinct to least distinct.
var LEAD_IMAGE_URL_META_TAGS = ['og:image', 'twitter:image', 'image_src'];
var LEAD_IMAGE_URL_SELECTORS = ['link[rel=image_src]'];
// URL substrings that suggest a real content image...
var POSITIVE_LEAD_IMAGE_URL_HINTS = ['upload', 'wp-content', 'large', 'photo', 'wp-image'];
var POSITIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(POSITIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i');
// ...and substrings that suggest chrome/decoration/ads instead.
var NEGATIVE_LEAD_IMAGE_URL_HINTS = ['spacer', 'sprite', 'blank', 'throbber', 'gradient', 'tile', 'bg', 'background', 'icon', 'social', 'header', 'hdr', 'advert', 'spinner', 'loader', 'loading', 'default', 'rating', 'share', 'facebook', 'twitter', 'theme', 'promo', 'ads', 'wp-includes'];
var NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i');
// File-extension matchers (query string allowed after the extension).
var GIF_RE = /\.gif(\?.*)?$/i;
var JPG_RE = /\.jpe?g(\?.*)?$/i;
// Build a "class id" signature string for a node; missing attributes
// contribute an empty string, so the space separator is always present.
function getSig($node) {
  var className = $node.attr('class') || '';
  var id = $node.attr('id') || '';
  return className + ' ' + id;
} // Scores image urls based on a variety of heuristics.
function scoreImageUrl(url) {
  var trimmed = url.trim();
  var score = 0;
  // Keyword hints in the URL argue for or against a lead image.
  if (POSITIVE_LEAD_IMAGE_URL_HINTS_RE.test(trimmed)) {
    score += 20;
  }
  if (NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.test(trimmed)) {
    score -= 20;
  }
  // TODO: We might want to consider removing this as
  // gifs are much more common/popular than they once were
  if (GIF_RE.test(trimmed)) {
    score -= 10;
  }
  if (JPG_RE.test(trimmed)) {
    score += 10;
  }
  // PNGs are neutral.
  return score;
} // Alt attribute usually means non-presentational image.
function scoreAttr($img) {
  // A non-empty alt text is a small positive signal.
  return $img.attr('alt') ? 5 : 0;
} // Look through our parent and grandparent for figure-like
// container elements, give a bonus if we find them
function scoreByParents($img) {
  var score = 0;
  // Any enclosing <figure> is a strong photo signal.
  var $figParent = $img.parents('figure').first();
  if ($figParent.length === 1) {
    score += 25;
  }
  var $parent = $img.parent();
  var $gParent;
  if ($parent.length === 1) {
    $gParent = $parent.parent();
  }
  // FIX: when the image has no single parent, $gParent stays undefined
  // and getSig(undefined) would throw a TypeError. Skip missing nodes
  // instead of crashing.
  [$parent, $gParent].forEach(function ($node) {
    if ($node && PHOTO_HINTS_RE$1$1.test(getSig($node))) {
      score += 15;
    }
  });
  return score;
} // Look at our immediate sibling and see if it looks like it's a
// caption. Bonus if so.
function scoreBySibling($img) {
  var score = 0;
  var $sibling = $img.next();
  var siblingEl = $sibling.get(0);
  // A <figcaption> right after the image strongly suggests a real photo.
  if (siblingEl && siblingEl.tagName.toLowerCase() === 'figcaption') {
    score += 25;
  }
  // Photo-ish class/id on the sibling is a weaker hint.
  if (PHOTO_HINTS_RE$1$1.test(getSig($sibling))) {
    score += 15;
  }
  return score;
}
// Score an image by its declared width/height attributes: penalize
// skinny/short images and sprites, reward area (1 point per 1000 px²).
function scoreByDimensions($img) {
  var score = 0;
  // Global parseFloat is spec-identical to the core-js _parseFloat$1 shim.
  var width = parseFloat($img.attr('width'));
  var height = parseFloat($img.attr('height'));
  var src = $img.attr('src');
  // Penalty for skinny images
  if (width && width <= 50) {
    score -= 50;
  }
  // Penalty for short images
  if (height && height <= 50) {
    score -= 50;
  }
  // FIX: src can be undefined (no src attribute); the original called
  // src.includes(...) unconditionally and threw a TypeError in that
  // case. A missing src is treated as "not a sprite".
  if (width && height && !(src || '').includes('sprite')) {
    var area = width * height;
    if (area < 5000) {
      // Smaller than 50 x 100
      score -= 100;
    } else {
      score += Math.round(area / 1000);
    }
  }
  return score;
}
// Small positional bonus: earlier images in the collection score higher
// (the first gets +length/2, the midpoint gets 0, later ones go negative).
function scoreByPosition($imgs, index) {
  var midpoint = $imgs.length / 2;
  return midpoint - index;
} // it. Like content and next page extraction, uses a scoring system
// to determine what the most likely image may be. Short circuits
// on really probable things like og:image meta tags.
//
// Potential signals to still take advantage of:
// * domain
// * weird aspect ratio
var GenericLeadImageUrlExtractor = {
  // Returns the cleaned lead-image URL, or null when nothing plausible
  // is found. Tries meta tags, then scored content <img>s, then
  // really-probable nodes like <link rel="image_src">.
  extract: function extract(_ref) {
    var $ = _ref.$,
        content = _ref.content,
        metaCache = _ref.metaCache,
        html = _ref.html;
    var cleanUrl;
    if (!$.browser && $('head').length === 0) {
      // No <head> parsed (and not a browser-backed document): re-inject
      // the raw html so meta tags become findable.
      // NOTE(review): prepending the full html onto the first element is
      // a workaround inherited from upstream — confirm it is still needed.
      $('*').first().prepend(html);
    }
    // Check to see if we have a matching meta tag that we can make use of.
    // Moving this higher because common practice is now to use large
    // images on things like Open Graph or Twitter cards.
    // images usually have for things like Open Graph.
    var imageUrl = extractFromMeta$$1($, LEAD_IMAGE_URL_META_TAGS, metaCache, false);
    if (imageUrl) {
      cleanUrl = clean$1(imageUrl);
      if (cleanUrl) return cleanUrl;
    }
    // Next, try to find the "best" image via the content.
    // We'd rather not have to fetch each image and check dimensions,
    // so try to do some analysis and determine them instead.
    var $content = $(content);
    var imgs = $('img', $content).toArray();
    var imgScores = {};
    imgs.forEach(function (img, index) {
      var $img = $(img);
      var src = $img.attr('src');
      if (!src) return;
      // Sum every heuristic: url keywords, alt attr, parent/sibling
      // context, declared dimensions, and position in the document.
      var score = scoreImageUrl(src);
      score += scoreAttr($img);
      score += scoreByParents($img);
      score += scoreBySibling($img);
      score += scoreByDimensions($img);
      score += scoreByPosition(imgs, index);
      imgScores[src] = score;
    });
    // Pick the highest-scoring src. (Babel-compiled destructuring of the
    // [url, score] pair produced by the reduce.)
    var _Reflect$ownKeys$redu = _Reflect$ownKeys$1(imgScores).reduce(function (acc, key) {
      return imgScores[key] > acc[1] ? [key, imgScores[key]] : acc;
    }, [null, 0]),
        _Reflect$ownKeys$redu2 = _slicedToArray$1(_Reflect$ownKeys$redu, 2),
        topUrl = _Reflect$ownKeys$redu2[0],
        topScore = _Reflect$ownKeys$redu2[1];
    if (topScore > 0) {
      cleanUrl = clean$1(topUrl);
      if (cleanUrl) return cleanUrl;
    }
    // If nothing else worked, check to see if there are any really
    // probable nodes in the doc, like <link rel="image_src" />.
    // (Babel-generated for...of boilerplate over the selector list.)
    // eslint-disable-next-line no-restricted-syntax
    var _iteratorNormalCompletion = true;
    var _didIteratorError = false;
    var _iteratorError = undefined;
    try {
      for (var _iterator = _getIterator$1(LEAD_IMAGE_URL_SELECTORS), _step; !(_iteratorNormalCompletion = (_step = _iterator.next()).done); _iteratorNormalCompletion = true) {
        var selector = _step.value;
        var $node = $(selector).first();
        // Try src, then href, then value on the matched node.
        var src = $node.attr('src');
        if (src) {
          cleanUrl = clean$1(src);
          if (cleanUrl) return cleanUrl;
        }
        var href = $node.attr('href');
        if (href) {
          cleanUrl = clean$1(href);
          if (cleanUrl) return cleanUrl;
        }
        var value = $node.attr('value');
        if (value) {
          cleanUrl = clean$1(value);
          if (cleanUrl) return cleanUrl;
        }
      }
    } catch (err) {
      _didIteratorError = true;
      _iteratorError = err;
    } finally {
      try {
        if (!_iteratorNormalCompletion && _iterator.return != null) {
          _iterator.return();
        }
      } finally {
        if (_didIteratorError) {
          throw _iteratorError;
        }
      }
    }
    return null;
  }
};
function scoreSimilarity(score, articleUrl, href) {
  // Do this last and only if we have a real candidate, because it's
  // potentially expensive computationally. Compare the link to this
  // URL using difflib to get the % similarity of these URLs. On a
  // sliding scale, subtract points from this link based on
  // similarity.
  if (score <= 0) {
    return 0;
  }
  var similarity = new difflib$1.SequenceMatcher(null, articleUrl, href).ratio();
  // Subtract .1 from diff_percent when calculating modifier,
  // which means that if it's less than 10% different, we give a
  // bonus instead. Ex:
  // 3% different = +17.5 points
  // 10% different = 0 points
  // 20% different = -25 points
  // NOTE(review): the examples above describe a 0.1 breakpoint, but the
  // code uses 0.2 (so 20% different = 0 points) — confirm which is intended.
  var diffPercent = 1.0 - similarity;
  var diffModifier = -(250 * (diffPercent - 0.2));
  return score + diffModifier;
}
function scoreLinkText(linkText, pageNum) {
  // If the link text can be parsed as a number, give it a minor
  // bonus, with a slight bias towards lower numbered pages. This is
  // so that pages that might not have 'next' in their text can still
  // get scored, and sorted properly by score.
  var score = 0;
  if (!IS_DIGIT_RE$1.test(linkText.trim())) {
    return score;
  }
  var linkTextAsNum = _parseInt$1(linkText, 10);
  // If it's the first page, we already got it on the first call.
  // Give it a negative score. Otherwise, up to page 10, give a
  // small bonus.
  score = linkTextAsNum < 2 ? -30 : Math.max(0, 10 - linkTextAsNum);
  // If it appears that the current page number is greater than
  // this links page number, it's a very bad sign. Give it a big
  // penalty.
  if (pageNum && pageNum >= linkTextAsNum) {
    score -= 50;
  }
  return score;
}
function scorePageInLink(pageNum, isWp) {
  // page in the link = bonus. Intentionally ignore wordpress because
  // their ?p=123 link style gets caught by this even though it means
  // separate documents entirely.
  return pageNum && !isWp ? 50 : 0;
}
// Matches any single digit.
var DIGIT_RE$2 = /\d/; // A list of words that, if found in link text or URLs, likely mean that
// this link is not a next page link.
var EXTRANEOUS_LINK_HINTS$1 = ['print', 'archive', 'comment', 'discuss', 'e-mail', 'email', 'share', 'reply', 'all', 'login', 'sign', 'single', 'adx', 'entry-unrelated'];
var EXTRANEOUS_LINK_HINTS_RE$1 = new RegExp(EXTRANEOUS_LINK_HINTS$1.join('|'), 'i'); // Match any link text/classname/id that looks like it could mean the next
// page. Things like: next, continue, >, >>, » but not >|, »| as those can
// mean last page.
var NEXT_LINK_TEXT_RE$1 = new RegExp('(next|weiter|continue|>([^|]|$)|»([^|]|$))', 'i'); // Match any link text/classname/id that looks like it is an end link: things
// like "first", "last", "end", etc.
var CAP_LINK_TEXT_RE$1 = new RegExp('(first|last|end)', 'i'); // Match any link text/classname/id that looks like it means the previous
// page.
var PREV_LINK_TEXT_RE$1 = new RegExp('(prev|earl|old|new|<|«)', 'i'); // Match any phrase that looks like it could be page, or paging, or pagination
// Penalize hrefs whose URL itself contains extraneous terms ("comment",
// "share", ...): such URLs are rarely next-page links.
function scoreExtraneousLinks(href) {
  return EXTRANEOUS_LINK_HINTS_RE$1.test(href) ? -25 : 0;
}
// Build a signature for a node from its class and id attributes,
// space-separated; missing attributes become empty strings.
function makeSig($link) {
  var className = $link.attr('class') || '';
  var id = $link.attr('id') || '';
  return className + ' ' + id;
}
// Walk up to four ancestors of $link, scoring by each ancestor's
// class/id signature: +25 once if any ancestor looks pagination-related
// (PAGE_RE$1), and -25 once if any ancestor looks negative/extraneous
// without also looking positive. Returns the combined parent score.
function scoreByParents$1($link) {
  // If a parent node contains paging-like classname or id, give a
  // bonus. Additionally, if a parent_node contains bad content
  // (like 'sponsor'), give a penalty.
  var $parent = $link.parent();
  var positiveMatch = false;
  var negativeMatch = false;
  var score = 0;
  _Array$from(range(0, 4)).forEach(function () {
    if ($parent.length === 0) {
      return;
    }
    // NOTE(review): makeSig takes a single argument; the extra ' ' is ignored.
    var parentData = makeSig($parent, ' '); // If we have 'page' or 'paging' in our data, that's a good
    // sign. Add a bonus (at most once across all ancestors).
    if (!positiveMatch && PAGE_RE$1.test(parentData)) {
      positiveMatch = true;
      score += 25;
    } // If we have 'comment' or something in our data, and
    // we don't have something like 'content' as well, that's
    // a bad sign. Give a penalty (also at most once).
    if (!negativeMatch && NEGATIVE_SCORE_RE$2.test(parentData) && EXTRANEOUS_LINK_HINTS_RE$1.test(parentData)) {
      if (!POSITIVE_SCORE_RE$2.test(parentData)) {
        negativeMatch = true;
        score -= 25;
      }
    }
    $parent = $parent.parent();
  });
  return score;
}
// Links labeled like "previous"/"older" point backwards; score them low
// enough (-200) that they can never win the next-page contest.
function scorePrevLink(linkData) {
  if (!PREV_LINK_TEXT_RE$1.test(linkData)) {
    return 0;
  }
  return -200;
}
// First-pass filter for candidate next-page links. Rejects URLs already
// visited, the article/base URL itself, cross-domain links, URLs with no
// digit past the base URL, links with extraneous text, and link text
// longer than 25 characters; true means the link is worth scoring.
function shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrls) {
  // Already fetched this page on a previous iteration.
  var alreadySeen = previousUrls.find(function (url) {
    return href === url;
  }) !== undefined;
  if (alreadySeen) {
    return false;
  }
  // Empty hrefs, and links back to the article or its base URL, are out.
  if (!href || href === articleUrl || href === baseUrl) {
    return false;
  }
  // Next pages live on the same host as the article.
  var hostname = parsedUrl.hostname;
  var _URL$parse = URL$1.parse(href),
      linkHost = _URL$parse.hostname;
  if (linkHost !== hostname) {
    return false;
  }
  // Without a digit somewhere past the base URL, this can't be page N.
  var fragment = href.replace(baseUrl, '');
  if (!DIGIT_RE$2.test(fragment)) {
    return false;
  }
  // Link text mentioning "comment", "share", etc. disqualifies it.
  if (EXTRANEOUS_LINK_HINTS_RE$1.test(linkText)) {
    return false;
  }
  // Next-page labels are short; long text means something else.
  if (linkText.length > 25) {
    return false;
  }
  return true;
}
// Penalize hrefs that don't start with the article's base URL. They can
// still be next-page links (some sites paginate on other paths), but the
// odds are lower. Example:
// http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
function scoreBaseUrl(href, baseRegex) {
  return baseRegex.test(href) ? 0 : -25;
}
// Strong bonus for link data containing next-ish markers ("next", ">>", …).
function scoreNextLinkText(linkData) {
  return NEXT_LINK_TEXT_RE$1.test(linkData) ? 50 : 0;
}
// "Cap" links carry first/last/end wording. One that ALSO matches the
// next-link pattern is ambiguous ("last" alongside "next") and gets -65;
// plain cap links and non-cap links score 0. (Regex .test here is
// side-effect free, so evaluating both checks up front is safe.)
function scoreCapLinks(linkData) {
  var looksLikeCap = CAP_LINK_TEXT_RE$1.test(linkData);
  var looksLikeNext = NEXT_LINK_TEXT_RE$1.test(linkData);
  if (looksLikeCap && looksLikeNext) {
    return -65;
  }
  return 0;
}
// Build a case-insensitive regex anchored at the start of baseUrl.
// NOTE(review): baseUrl is interpolated unescaped, so regex
// metacharacters in the URL keep their special meaning — this matches
// the original behavior.
function makeBaseRegex(baseUrl) {
  return new RegExp('^' + baseUrl, 'i');
}
// Signature used for link scoring: "<text> <class> <id>", falling back
// to the node's own text when linkText isn't supplied.
function makeSig$1($link, linkText) {
  var text = linkText || $link.text();
  var className = $link.attr('class') || '';
  var id = $link.attr('id') || '';
  return text + ' ' + className + ' ' + id;
}
function scoreLinks ( _ref ) {
var links = _ref . links ,
articleUrl = _ref . articleUrl ,
baseUrl = _ref . baseUrl ,
parsedUrl = _ref . parsedUrl ,
$ = _ref . $ ,
_ref$previousUrls = _ref . previousUrls ,
previousUrls = _ref$previousUrls === void 0 ? [ ] : _ref$previousUrls ;
parsedUrl = parsedUrl || URL$1 . parse ( articleUrl ) ;
var baseRegex = makeBaseRegex ( baseUrl ) ;
var isWp = isWordpress$1 ( $ ) ; // Loop through all links, looking for hints that they may be next-page
// links. Things like having "page" in their textContent, className or
// id, or being a child of a node with a page-y className or id.
//
// After we do that, assign each page a score, and pick the one that
// looks most like the next page link, as long as its score is strong
// enough to have decent confidence.
var scoredPages = links . reduce ( function ( possiblePages , link ) {
// Remove any anchor data since we don't do a good job
// standardizing URLs (it's hard), we're going to do
// some checking with and without a trailing slash
var attrs = getAttrs$1 ( link ) ; // if href is undefined, return
if ( ! attrs . href ) return possiblePages ;
var href = removeAnchor$1 ( attrs . href ) ;
var $link = $ ( link ) ;
var linkText = $link . text ( ) ;
if ( ! shouldScore ( href , articleUrl , baseUrl , parsedUrl , linkText , previousUrls ) ) {
return possiblePages ;
} // ## PASSED THE FIRST-PASS TESTS. Start scoring. ##
if ( ! possiblePages [ href ] ) {
possiblePages [ href ] = {
score : 0 ,
linkText : linkText ,
href : href
} ;
} else {
possiblePages [ href ] . linkText = "" . concat ( possiblePages [ href ] . linkText , "|" ) . concat ( linkText ) ;
}
var possiblePage = possiblePages [ href ] ;
var linkData = makeSig$1 ( $link , linkText ) ;
var pageNum = pageNumFromUrl$1 ( href ) ;
var score = scoreBaseUrl ( href , baseRegex ) ;
score += scoreNextLinkText ( linkData ) ;
score += scoreCapLinks ( linkData ) ;
score += scorePrevLink ( linkData ) ;
score += scoreByParents$1 ( $link ) ;
score += scoreExtraneousLinks ( href ) ;
score += scorePageInLink ( pageNum , isWp ) ;
score += scoreLinkText ( linkText , pageNum ) ;
score += scoreSimilarity ( score , articleUrl , href ) ;
possiblePage . score = score ;
return possiblePages ;
} , { } ) ;
return _Reflect$ownKeys$1 ( scoredPages ) . length === 0 ? null : scoredPages ;
} // for multi-page articles
var GenericNextPageUrlExtractor = {
extract : function extract ( _ref ) {
var $ = _ref . $ ,
url = _ref . url ,
parsedUrl = _ref . parsedUrl ,
_ref$previousUrls = _ref . previousUrls ,
previousUrls = _ref$previousUrls === void 0 ? [ ] : _ref$previousUrls ;
parsedUrl = parsedUrl || URL$1 . parse ( url ) ;
var articleUrl = removeAnchor$1 ( url ) ;
var baseUrl = articleBaseUrl$1 ( url , parsedUrl ) ;
var links = $ ( 'a[href]' ) . toArray ( ) ;
var scoredLinks = scoreLinks ( {
links : links ,
articleUrl : articleUrl ,
baseUrl : baseUrl ,
parsedUrl : parsedUrl ,
$ : $ ,
previousUrls : previousUrls
} ) ; // If no links were scored, return null
if ( ! scoredLinks ) return null ; // now that we've scored all possible pages,
// find the biggest one.
var topPage = _Reflect$ownKeys$1 ( scoredLinks ) . reduce ( function ( acc , link ) {
var scoredLink = scoredLinks [ link ] ;
return scoredLink . score > acc . score ? scoredLink : acc ;
} , {
score : - 100
} ) ; // If the score is less than 50, we're not confident enough to use it,
// so we fail.
if ( topPage . score >= 50 ) {
return topPage . href ;
}
return null ;
}
} ;
// Meta tag names consulted (in order) for a canonical URL fallback.
var CANONICAL_META_SELECTORS = ['og:url'];
// Extract the hostname ("domain") portion of a URL string.
function parseDomain(url) {
  var parsed = URL$1.parse(url);
  return parsed.hostname;
}
// Package a URL together with its parsed domain for extractor output.
function result(url) {
  return {
    url: url,
    domain: parseDomain(url)
  };
}
// Resolve the canonical URL for a document: prefer <link rel=canonical>,
// then canonical meta tags (og:url), finally the fetched URL itself.
var GenericUrlExtractor = {
  extract: function extract(_ref) {
    var $ = _ref.$,
        url = _ref.url,
        metaCache = _ref.metaCache;
    var $canonical = $('link[rel=canonical]');
    if ($canonical.length !== 0) {
      var href = $canonical.attr('href');
      if (href) {
        return result(href);
      }
    }
    var metaUrl = extractFromMeta$$1($, CANONICAL_META_SELECTORS, metaCache);
    return result(metaUrl || url);
  }
};
// Meta tag names consulted (in order) when extracting an excerpt.
var EXCERPT_META_SELECTORS = ['og:description', 'twitter:description'];
// Collapse whitespace in content and ellipsize it to maxLength (default
// 200 characters). The $ parameter is unused but kept for call-site
// compatibility; maxLength arrives as the third positional argument.
function clean$2(content, $) {
  var maxLength = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 200;
  var collapsed = content.replace(/[\s\n]+/g, ' ').trim();
  return ellipsize$1(collapsed, maxLength, {
    ellipse: '…'
  });
}
// Extract a short excerpt: prefer description meta tags; otherwise
// derive one from the beginning of the extracted content.
var GenericExcerptExtractor = {
  extract: function extract(_ref) {
    var $ = _ref.$,
        content = _ref.content,
        metaCache = _ref.metaCache;
    var excerpt = extractFromMeta$$1($, EXCERPT_META_SELECTORS, metaCache);
    if (excerpt) {
      return clean$2(stripTags$1(excerpt, $));
    }
    // Fall back to excerpting from the extracted content. Slice
    // generously (5x the target length) since tags get stripped.
    var maxLength = 200;
    var shortContent = content.slice(0, maxLength * 5);
    return clean$2($(shortContent).text(), $, maxLength);
  }
};
// Count words by loading the content HTML and splitting the text of its
// first <div> on whitespace.
var getWordCount = function getWordCount(content) {
  var $ = cheerio$1.load(content);
  var text = normalizeSpaces$1($('div').first().text());
  return text.split(/\s/).length;
};
// Fallback word counter: strip tags with a regex, collapse whitespace,
// and count space-separated tokens. Used when the DOM-based counter
// yields a degenerate result.
var getWordCountAlt = function getWordCountAlt(content) {
  var text = content.replace(/<[^>]*>/g, ' ').replace(/\s+/g, ' ').trim();
  return text.split(' ').length;
};
// Word-count extractor: use the DOM-based counter, falling back to the
// regex-based counter when the DOM count is 1 (i.e. the content wasn't
// in a <div> the first counter could find).
var GenericWordCountExtractor = {
  extract: function extract(_ref) {
    var content = _ref.content;
    var count = getWordCount(content);
    return count === 1 ? getWordCountAlt(content) : count;
  }
};
// The default extractor, used for any domain without a custom one.
// Each field delegates to a field-specific generic extractor; extract()
// runs them in dependency order (content needs title; lead image, dek,
// excerpt and word count need content; direction needs title).
var GenericExtractor = {
  // This extractor is the default for all domains
  domain: '*',
  title: GenericTitleExtractor.extract,
  date_published: GenericDatePublishedExtractor.extract,
  author: GenericAuthorExtractor.extract,
  // bound because GenericContentExtractor.extract uses `this` internally
  content: GenericContentExtractor.extract.bind(GenericContentExtractor),
  lead_image_url: GenericLeadImageUrlExtractor.extract,
  dek: GenericDekExtractor.extract,
  next_page_url: GenericNextPageUrlExtractor.extract,
  url_and_domain: GenericUrlExtractor.extract,
  excerpt: GenericExcerptExtractor.extract,
  word_count: GenericWordCountExtractor.extract,
  // Text direction (ltr/rtl) derived from the title string.
  direction: function direction(_ref) {
    var title = _ref.title;
    return stringDirection$1.getDirection(title);
  },
  extract: function extract(options) {
    var html = options.html,
        $ = options.$;
    // Lazily parse the HTML when the caller supplied raw markup only.
    if (html && !$) {
      var loaded = cheerio$1.load(html);
      options.$ = loaded;
    }
    var title = this.title(options);
    var date_published = this.date_published(options);
    var author = this.author(options);
    // Title is passed to the content extractor so it can be stripped
    // from the extracted body.
    var content = this.content(_objectSpread({}, options, {
      title: title
    }));
    var lead_image_url = this.lead_image_url(_objectSpread({}, options, {
      content: content
    }));
    var dek = this.dek(_objectSpread({}, options, {
      content: content
    }));
    var next_page_url = this.next_page_url(options);
    var excerpt = this.excerpt(_objectSpread({}, options, {
      content: content
    }));
    var word_count = this.word_count(_objectSpread({}, options, {
      content: content
    }));
    var direction = this.direction({
      title: title
    });
    var _this$url_and_domain = this.url_and_domain(options),
        url = _this$url_and_domain.url,
        domain = _this$url_and_domain.domain;
    return {
      title: title,
      author: author,
      date_published: date_published || null,
      dek: dek,
      lead_image_url: lead_image_url,
      content: content,
      next_page_url: next_page_url,
      url: url,
      domain: domain,
      excerpt: excerpt,
      word_count: word_count,
      direction: direction
    };
  }
};
var Detectors = {
'meta[name="al:ios:app_name"][value="Medium"]' : MediumExtractor ,
'meta[name="generator"][value="blogger"]' : BloggerExtractor
} ;
// Return the platform-specific extractor whose detector selector matches
// the document, or undefined when none do.
function detectByHtml($) {
  var match = _Reflect$ownKeys$1(Detectors).find(function (selector) {
    return $(selector).length > 0;
  });
  return Detectors[match];
}
// Choose the extractor for a URL: exact hostname first, then base
// domain (last two labels), across API-registered and built-in
// extractors; then HTML-based platform detection; finally the generic
// fallback extractor.
function getExtractor(url, parsedUrl, $) {
  parsedUrl = parsedUrl || URL$1.parse(url);
  var hostname = parsedUrl.hostname;
  var baseDomain = hostname.split('.').slice(-2).join('.');
  return apiExtractors[hostname] || apiExtractors[baseDomain] || Extractors[hostname] || Extractors[baseDomain] || detectByHtml($) || GenericExtractor;
}
// Remove any elements matching the extractor's `clean` selectors from
// $content; no-op when no clean list is configured.
function cleanBySelectors($content, $, _ref) {
  var clean = _ref.clean;
  if (clean) {
    $(clean.join(','), $content).remove();
  }
  return $content;
}
// Apply the extractor's `transforms` map to $content. String values
// convert matching nodes to that tag name; function values run per node
// and convert the node only when they return a string.
function transformElements($content, $, _ref2) {
  var transforms = _ref2.transforms;
  if (!transforms) return $content;
  _Reflect$ownKeys$1(transforms).forEach(function (key) {
    var $matches = $(key, $content);
    var value = transforms[key];
    if (typeof value === 'string') {
      // Direct conversion: every matching node becomes <value>.
      $matches.each(function (index, node) {
        convertNodeTo$$1($(node), $, value);
      });
    } else if (typeof value === 'function') {
      // Let the transform inspect each node; a string return value
      // means "convert this node to that tag".
      $matches.each(function (index, node) {
        var result = value($(node), $);
        if (typeof result === 'string') {
          convertNodeTo$$1($(node), $, result);
        }
      });
    }
  });
  return $content;
}
function findMatchingSelector ( $ , selectors , extractHtml , allowMultiple ) {
return selectors . find ( function ( selector ) {
if ( _Array$isArray ( selector ) ) {
if ( extractHtml ) {
return selector . reduce ( function ( acc , s ) {
return acc && $ ( s ) . length > 0 ;
} , true ) ;
}
var _selector = _slicedToArray$1 ( selector , 2 ) ,
s = _selector [ 0 ] ,
attr = _selector [ 1 ] ;
return ( allowMultiple || ! allowMultiple && $ ( s ) . length === 1 ) && $ ( s ) . attr ( attr ) && $ ( s ) . attr ( attr ) . trim ( ) !== '' ;
}
return ( allowMultiple || ! allowMultiple && $ ( selector ) . length === 1 ) && $ ( selector ) . text ( ) . trim ( ) !== '' ;
} ) ;
}
function select ( opts ) {
var $ = opts . $ ,
type = opts . type ,
extractionOpts = opts . extractionOpts ,
_opts$extractHtml = opts . extractHtml ,
extractHtml = _opts$extractHtml === void 0 ? false : _opts$extractHtml ; // Skip if there's not extraction for this type
if ( ! extractionOpts ) return null ; // If a string is hardcoded for a type (e.g., Wikipedia
// contributors), return the string
if ( typeof extractionOpts === 'string' ) return extractionOpts ;
var selectors = extractionOpts . selectors ,
_extractionOpts$defau = extractionOpts . defaultCleaner ,
defaultCleaner = _extractionOpts$defau === void 0 ? true : _extractionOpts$defau ,
allowMultiple = extractionOpts . allowMultiple ;
var overrideAllowMultiple = type === 'lead_image_url' || allowMultiple ;
var matchingSelector = findMatchingSelector ( $ , selectors , extractHtml , overrideAllowMultiple ) ;
if ( ! matchingSelector ) return null ;
function transformAndClean ( $node ) {
makeLinksAbsolute$$1 ( $node , $ , opts . url || '' ) ;
cleanBySelectors ( $node , $ , extractionOpts ) ;
transformElements ( $node , $ , extractionOpts ) ;
return $node ;
}
function selectHtml ( ) {
// If the selector type requests html as its return type
// transform and clean the element with provided selectors
var $content ; // If matching selector is an array, we're considering this a
// multi-match selection, which allows the parser to choose several
// selectors to include in the result. Note that all selectors in the
// array must match in order for this selector to trigger
if ( _Array$isArray ( matchingSelector ) ) {
$content = $ ( matchingSelector . join ( ',' ) ) ;
var $wrapper = $ ( '<div></div>' ) ;
$content . each ( function ( _ , element ) {
$wrapper . append ( element ) ;
} ) ;
$content = $wrapper ;
} else {
$content = $ ( matchingSelector ) ;
} // Wrap in div so transformation can take place on root element
$content . wrap ( $ ( '<div></div>' ) ) ;
$content = $content . parent ( ) ;
$content = transformAndClean ( $content ) ;
if ( Cleaners [ type ] ) {
Cleaners [ type ] ( $content , _objectSpread ( { } , opts , {
defaultCleaner : defaultCleaner
} ) ) ;
}
if ( allowMultiple ) {
return $content . children ( ) . toArray ( ) . map ( function ( el ) {
return $ . html ( $ ( el ) ) ;
} ) ;
}
return $ . html ( $content ) ;
}
if ( extractHtml ) {
return selectHtml ( matchingSelector ) ;
}
var $match ;
var result ; // if selector is an array (e.g., ['img', 'src']),
// extract the attr
if ( _Array$isArray ( matchingSelector ) ) {
var _matchingSelector = _slicedToArray$1 ( matchingSelector , 3 ) ,
selector = _matchingSelector [ 0 ] ,
attr = _matchingSelector [ 1 ] ,
transform = _matchingSelector [ 2 ] ;
$match = $ ( selector ) ;
$match = transformAndClean ( $match ) ;
result = $match . map ( function ( _ , el ) {
var item = $ ( el ) . attr ( attr ) . trim ( ) ;
return transform ? transform ( item ) : item ;
} ) ;
} else {
$match = $ ( matchingSelector ) ;
$match = transformAndClean ( $match ) ;
result = $match . map ( function ( _ , el ) {
return $ ( el ) . text ( ) . trim ( ) ;
} ) ;
}
result = _Array$isArray ( result . toArray ( ) ) && allowMultiple ? result . toArray ( ) : result [ 0 ] ; // Allow custom extractor to skip default cleaner
// for this type; defaults to true
if ( defaultCleaner && Cleaners [ type ] ) {
return Cleaners [ type ] ( result , _objectSpread ( { } , opts , extractionOpts ) ) ;
}
return result ;
}
// Run `select` for each extended (custom) type declared by an extractor,
// collecting the results keyed by type name.
function selectExtendedTypes(extend, opts) {
  var results = {};
  _Reflect$ownKeys$1(extend).forEach(function (t) {
    if (results[t]) return;
    results[t] = select(_objectSpread({}, opts, {
      type: t,
      extractionOpts: extend[t]
    }));
  });
  return results;
}
// Extract one field using the custom extractor's config for `type`;
// when it yields nothing and fallback is enabled (the default), defer
// to the generic extractor for that field.
function extractResult(opts) {
  var type = opts.type,
      extractor = opts.extractor,
      _opts$fallback = opts.fallback,
      fallback = _opts$fallback === void 0 ? true : _opts$fallback;
  var result = select(_objectSpread({}, opts, {
    extractionOpts: extractor[type]
  }));
  if (result) {
    return result;
  }
  if (fallback) {
    return GenericExtractor[type](opts);
  }
  return null;
}
// Dispatches extraction to a site-specific extractor, falling back per
// field to GenericExtractor (see extractResult). The generic extractor
// (domain '*') short-circuits and runs its own extract() wholesale.
var RootExtractor = {
  extract: function extract() {
    var extractor = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : GenericExtractor;
    var opts = arguments.length > 1 ? arguments[1] : undefined;
    var _opts = opts,
        contentOnly = _opts.contentOnly,
        extractedTitle = _opts.extractedTitle; // This is the generic extractor. Run its extract method
    if (extractor.domain === '*') return extractor.extract(opts);
    opts = _objectSpread({}, opts, {
      extractor: extractor
    });
    // contentOnly is used when fetching subsequent pages of a multi-page
    // article: the title is already known, only content is needed.
    if (contentOnly) {
      var _content = extractResult(_objectSpread({}, opts, {
        type: 'content',
        extractHtml: true,
        title: extractedTitle
      }));
      return {
        content: _content
      };
    }
    var title = extractResult(_objectSpread({}, opts, {
      type: 'title'
    }));
    var date_published = extractResult(_objectSpread({}, opts, {
      type: 'date_published'
    }));
    var author = extractResult(_objectSpread({}, opts, {
      type: 'author'
    }));
    var next_page_url = extractResult(_objectSpread({}, opts, {
      type: 'next_page_url'
    }));
    // Content extraction depends on the title; several later fields
    // depend on the extracted content.
    var content = extractResult(_objectSpread({}, opts, {
      type: 'content',
      extractHtml: true,
      title: title
    }));
    var lead_image_url = extractResult(_objectSpread({}, opts, {
      type: 'lead_image_url',
      content: content
    }));
    var excerpt = extractResult(_objectSpread({}, opts, {
      type: 'excerpt',
      content: content
    }));
    var dek = extractResult(_objectSpread({}, opts, {
      type: 'dek',
      content: content,
      excerpt: excerpt
    }));
    var word_count = extractResult(_objectSpread({}, opts, {
      type: 'word_count',
      content: content
    }));
    var direction = extractResult(_objectSpread({}, opts, {
      type: 'direction',
      title: title
    }));
    var _ref3 = extractResult(_objectSpread({}, opts, {
      type: 'url_and_domain'
    })) || {
      url: null,
      domain: null
    },
        url = _ref3.url,
        domain = _ref3.domain;
    // Custom extractors may declare extra fields via `extend`; those
    // results are merged into the returned object.
    var extendedResults = {};
    if (extractor.extend) {
      extendedResults = selectExtendedTypes(extractor.extend, opts);
    }
    return _objectSpread({
      title: title,
      content: content,
      author: author,
      date_published: date_published,
      lead_image_url: lead_image_url,
      dek: dek,
      next_page_url: next_page_url,
      url: url,
      domain: domain,
      excerpt: excerpt,
      word_count: word_count,
      direction: direction
    }, extendedResults);
  }
};
function collectAllPages ( _x ) {
return _collectAllPages . apply ( this , arguments ) ;
}
// Transpiled async function (regenerator state machine): follows
// next_page_url up to a 26-page cap, fetching each page, extracting its
// content, and appending it to result.content with a page separator.
// Returns the merged result plus total_pages/rendered_pages/word_count.
function _collectAllPages() {
  _collectAllPages = _asyncToGenerator(
  /*#__PURE__*/
  _regeneratorRuntime.mark(function _callee(_ref) {
    var next_page_url, html, $, metaCache, result, Extractor, title, url, pages, previousUrls, extractorOpts, nextPageResult, word_count;
    return _regeneratorRuntime.wrap(function _callee$(_context) {
      while (1) {
        switch (_context.prev = _context.next) {
          case 0:
            next_page_url = _ref.next_page_url, html = _ref.html, $ = _ref.$, metaCache = _ref.metaCache, result = _ref.result, Extractor = _ref.Extractor, title = _ref.title, url = _ref.url; // At this point, we've fetched just the first page
            pages = 1;
            // Track visited URLs so scoring never revisits a page.
            previousUrls = [removeAnchor$1(url)];
            // If we've gone over 26 pages, something has
            // likely gone wrong.
          case 3:
            // Loop head: stop when there is no next page or the cap is hit.
            if (!(next_page_url && pages < 26)) {
              _context.next = 16;
              break;
            }
            pages += 1; // eslint-disable-next-line no-await-in-loop
            _context.next = 7;
            return Resource.create(next_page_url);
          case 7:
            // Resume point after awaiting the fetched page.
            $ = _context.sent;
            html = $.html();
            extractorOpts = {
              url: next_page_url,
              html: html,
              $: $,
              metaCache: metaCache,
              extractedTitle: title,
              previousUrls: previousUrls
            };
            nextPageResult = RootExtractor.extract(Extractor, extractorOpts);
            previousUrls.push(next_page_url);
            // Append the new page's content with an <hr> + page header.
            result = _objectSpread({}, result, {
              content: "".concat(result.content, "<hr><h4>Page ").concat(pages, "</h4>").concat(nextPageResult.content)
            }); // eslint-disable-next-line prefer-destructuring
            next_page_url = nextPageResult.next_page_url;
            _context.next = 3;
            break;
          case 16:
            // Recompute the word count over the full concatenated content.
            word_count = GenericExtractor.word_count({
              content: "<div>".concat(result.content, "</div>")
            });
            return _context.abrupt("return", _objectSpread({}, result, {
              total_pages: pages,
              rendered_pages: pages,
              word_count: word_count
            }));
          case 18:
          case "end":
            return _context.stop();
        }
      }
    }, _callee, this);
  }));
  return _collectAllPages.apply(this, arguments);
}
// Public parser API. parse() is a transpiled async function (regenerator
// state machine): validate the URL, fetch the resource, pick an
// extractor, extract all fields, optionally follow pagination, and
// finally render content as html/markdown/text per contentType.
var Parser = {
  parse: function () {
    var _parse = _asyncToGenerator(
    /*#__PURE__*/
    _regeneratorRuntime.mark(function _callee(url) {
      var _ref,
          html,
          opts,
          _opts$fetchAllPages,
          fetchAllPages,
          _opts$fallback,
          fallback,
          _opts$contentType,
          contentType,
          _opts$headers,
          headers,
          extend,
          customExtractor,
          parsedUrl,
          $,
          Extractor,
          metaCache,
          extendedTypes,
          result,
          _result,
          title,
          next_page_url,
          turndownService,
          _args = arguments;
      return _regeneratorRuntime.wrap(function _callee$(_context) {
        while (1) {
          switch (_context.prev = _context.next) {
            case 0:
              // Destructure options; html may be supplied directly to skip
              // fetching. Defaults: fetchAllPages/fallback true, html type.
              _ref = _args.length > 1 && _args[1] !== undefined ? _args[1] : {}, html = _ref.html, opts = _objectWithoutProperties(_ref, ["html"]);
              _opts$fetchAllPages = opts.fetchAllPages, fetchAllPages = _opts$fetchAllPages === void 0 ? true : _opts$fetchAllPages, _opts$fallback = opts.fallback, fallback = _opts$fallback === void 0 ? true : _opts$fallback, _opts$contentType = opts.contentType, contentType = _opts$contentType === void 0 ? 'html' : _opts$contentType, _opts$headers = opts.headers, headers = _opts$headers === void 0 ? {} : _opts$headers, extend = opts.extend, customExtractor = opts.customExtractor; // if no url was passed and this is the browser version,
              // set url to window.location.href and load the html
              // from the current page
              if (!url && cheerio$1.browser) {
                url = window.location.href; // eslint-disable-line no-undef
                html = html || cheerio$1.html();
              }
              parsedUrl = URL$1.parse(url);
              // Bail out early with an error object on an invalid URL.
              if (validateUrl(parsedUrl)) {
                _context.next = 6;
                break;
              }
              return _context.abrupt("return", {
                error: true,
                message: 'The url parameter passed does not look like a valid URL. Please check your URL and try again.'
              });
            case 6:
              _context.next = 8;
              return Resource.create(url, html, parsedUrl, headers);
            case 8:
              // Resume point after awaiting the fetched resource ($).
              $ = _context.sent;
              // A failed resource is returned as-is (carries error info).
              if (!$.failed) {
                _context.next = 11;
                break;
              }
              return _context.abrupt("return", $);
            case 11:
              // Add custom extractor via cli.
              if (customExtractor) {
                addExtractor(customExtractor);
              }
              Extractor = getExtractor(url, parsedUrl, $); // console.log(`Using extractor for ${Extractor.domain}`);
              // if html still has not been set (i.e., url passed to Parser.parse),
              // set html from the response of Resource.create
              if (!html) {
                html = $.html();
              } // Cached value of every meta name in our document.
              // Used when extracting title/author/date_published/dek
              metaCache = $('meta').map(function (_, node) {
                return $(node).attr('name');
              }).toArray();
              // Caller-supplied extended field definitions.
              extendedTypes = {};
              if (extend) {
                extendedTypes = selectExtendedTypes(extend, {
                  $: $,
                  url: url,
                  html: html
                });
              }
              result = RootExtractor.extract(Extractor, {
                url: url,
                html: html,
                $: $,
                metaCache: metaCache,
                parsedUrl: parsedUrl,
                fallback: fallback,
                contentType: contentType
              });
              _result = result, title = _result.title, next_page_url = _result.next_page_url; // Fetch more pages if next_page_url found
              if (!(fetchAllPages && next_page_url)) {
                _context.next = 25;
                break;
              }
              _context.next = 22;
              return collectAllPages({
                Extractor: Extractor,
                next_page_url: next_page_url,
                html: html,
                $: $,
                metaCache: metaCache,
                result: result,
                title: title,
                url: url
              });
            case 22:
              // Resume point: multi-page result replaces the single page.
              result = _context.sent;
              _context.next = 26;
              break;
            case 25:
              // Single-page article: annotate page counts.
              result = _objectSpread({}, result, {
                total_pages: 1,
                rendered_pages: 1
              });
            case 26:
              // Optional output conversion: markdown (via turndown) or text.
              if (contentType === 'markdown') {
                turndownService = new TurndownService();
                result.content = turndownService.turndown(result.content);
              } else if (contentType === 'text') {
                result.content = $.text($(result.content));
              }
              return _context.abrupt("return", _objectSpread({}, result, extendedTypes));
            case 28:
            case "end":
              return _context.stop();
          }
        }
      }, _callee, this);
    }));
    function parse(_x) {
      return _parse.apply(this, arguments);
    }
    return parse;
  }(),
  // True when running the browser build of cheerio.
  browser: !!cheerio$1.browser,
  // A convenience method for getting a resource
  // to work with, e.g., for custom extractor generator
  fetchResource: function fetchResource(url) {
    return Resource.create(url);
  },
  // Register an additional custom extractor at runtime.
  addExtractor: function addExtractor$$1(extractor) {
    return addExtractor(extractor);
  }
};
var mercury = Parser ;
// Interleave template-literal string parts with their interpolated
// values (tag-function style). Values lacking a usable toString become
// empty strings; with no values at all, the parts are simply joined.
function insertValues(strings) {
  for (var _len = arguments.length, values = new Array(_len > 1 ? _len - 1 : 0), _key = 1; _key < _len; _key++) {
    values[_key - 1] = arguments[_key];
  }
  if (!values.length) {
    return strings.join('');
  }
  return strings.reduce(function (result, part, idx) {
    var value = values[idx];
    var rendered = value && typeof value.toString === 'function' ? value.toString() : '';
    return result + part + rendered;
  }, '');
}
var bodyPattern = /^\n([\s\S]+)\s{2}$/gm ;
var trailingWhitespace = /\s+$/ ;
// Tag-style template renderer: interpolate values, locate the indented
// body, strip a fixed level of leading indentation from each line, trim
// trailing whitespace, and drop the first (blank) line.
function template(strings) {
  for (var _len = arguments.length, values = new Array(_len > 1 ? _len - 1 : 0), _key = 1; _key < _len; _key++) {
    values[_key - 1] = arguments[_key];
  }
  var compiled = insertValues.apply(void 0, [strings].concat(values));
  var matches = compiled.match(bodyPattern) || [];
  var body = matches[0];
  // A full-body match strips up to 4 leading spaces per line; otherwise
  // fall back to the whole compiled string with a 2-space indent.
  var indentLevel = /^\s{0,4}(.+)$/g;
  if (!body) {
    body = compiled;
    indentLevel = /^\s{0,2}(.+)$/g;
  }
  var lines = body.split('\n').slice(1);
  return lines.map(function (line) {
    var stripped = line.replace(indentLevel, '$1');
    return trailingWhitespace.test(stripped) ? stripped.replace(trailingWhitespace, '') : stripped;
  }).join('\n');
}
function _templateObject ( ) {
var data = _taggedTemplateLiteral ( [ "\n export const " , " = {\n domain: '" , "',\n\n title: {\n selectors: [\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n // enter author selectors\n ],\n },\n\n date_published: {\n selectors: [\n // enter selectors\n ],\n },\n\n dek: {\n selectors: [\n // enter selectors\n ],\n },\n\n lead_image_url: {\n selectors: [\n // enter selectors\n ],\n },\n\n content: {\n selectors: [\n // enter content selectors\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ]\n },\n }\n " ] ) ;
_templateObject = function _templateObject ( ) {
return data ;
} ;
return data ;
}
// Render the scaffold source for a new custom extractor.
function extractorTemplate(hostname, name) {
  var parts = _templateObject();
  return template(parts, name, hostname);
}
function _templateObject2 ( ) {
var data = _taggedTemplateLiteral ( [ "\n import assert from 'assert';\n import URL from 'url';\n import cheerio from 'cheerio';\n\n import Parser from 'mercury';\n import getExtractor from 'extractors/get-extractor';\n import { excerptContent } from 'utils/text';\n\n const fs = require('fs');\n\n describe('" , "', () => {\n describe('initial test case', () => {\n let result;\n let url;\n beforeAll(() => {\n url =\n '" , "';\n const html =\n fs.readFileSync('" , "');\n result =\n Parser.parse(url, { html, fallback: false });\n });\n\n it('is selected properly', () => {\n // This test should be passing by default.\n // It sanity checks that the correct parser\n // is being selected for URLs from this domain\n const extractor = getExtractor(url);\n assert.equal(extractor.domain, URL.parse(url).hostname)\n })\n\n " , "\n\n it('returns the content', async () => {\n // To pass this test, fill out the content selector\n // in " , "/index.js.\n // You may also want to make use of the clean and transform\n // options.\n const { content } = await result;\n\n const $ = cheerio.load(content || '');\n\n const first13 = excerptContent($('*').first().text(), 13)\n\n // Update these values with the expected values from\n // the article.\n assert.equal(first13, 'Add the first 13 words of the article here');\n });\n });\n });\n " ] ) ;
_templateObject2 = function _templateObject2 ( ) {
return data ;
} ;
return data ;
}
function _templateObject$1 ( ) {
var data = _taggedTemplateLiteral ( [ "\n it('returns the " , "', async () => {\n // To pass this test, fill out the " , " selector\n // in " , "/index.js.\n const { " , " } = await result\n\n // Update these values with the expected values from\n // the article.\n assert.equal(" , ", " , ")\n });\n " ] ) ;
_templateObject$1 = function _templateObject ( ) {
return data ;
} ;
return data ;
}
// Result keys that never get a generated per-field test: url/domain/content
// are covered by dedicated assertions in the scaffold template, and the rest
// are derived/meta fields not worth asserting by default.
var IGNORE = ['url', 'domain', 'content', 'word_count', 'next_page_url', 'excerpt', 'direction', 'total_pages', 'rendered_pages'];

/**
 * Build the test snippet for one extractor result field.
 * @param key   result field name (e.g. 'title', 'author')
 * @param value expected value observed in the parsed fixture; falsy values
 *              render as '' so the generated assertion compares against an
 *              empty string literal
 * @param dir   extractor directory path referenced in the generated comment
 * @returns the generated test source, or '' for ignored keys
 */
function testFor(key, value, dir) {
  // Direct membership test; the previous IGNORE.find(k => k === key) used
  // find() where includes() expresses the intent.
  if (IGNORE.includes(key)) return '';
  return template(_templateObject$1(), key, key, dir, key, key, value ? "`".concat(value, "`") : "''");
}
// Render the full generated test file for a new custom extractor.
// file: fixture html path; url: article url; dir: extractor directory;
// result: parsed article fields; name: extractor display name.
function extractorTestTemplate(file, url, dir, result, name) {
  // One generated test per result field (ignored fields render as '').
  var fieldTests = _Reflect$ownKeys(result)
    .map(function (key) {
      return testFor(key, result[key], dir);
    })
    .join('\n\n');
  return template(_templateObject2(), name, url, file, fieldTests, dir);
}
// Inquirer prompt shown when no url argument is supplied on the CLI.
var questions = [{
  type: 'input',
  name: 'website',
  message: "Paste a url to an article you'd like to create or extend a parser for:",
  validate: function validate(value) {
    // Accept the input iff url.parse can find a hostname in it.
    var hostname = URL.parse(value).hostname;
    return hostname ? true : false;
  }
}];
// Module-level spinner handle; savePage() marks it succeeded once the
// async fetch resolves.
var spinner;

// Run fn(...args) behind an ora spinner labelled msg. Synchronous results
// succeed the spinner immediately; thenable results are chained into
// savePage (which succeeds the spinner itself).
// NOTE(review): a rejected promise here has no .catch, so the spinner would
// hang on failure — confirm against caller expectations before changing.
function confirm(fn, args, msg, newParser) {
  spinner = ora({
    text: msg
  });
  spinner.start();
  var outcome = fn.apply(void 0, _toConsumableArray(args));
  var isThenable = Boolean(outcome && outcome.then);
  if (!isThenable) {
    spinner.succeed();
  } else {
    outcome.then(function (page) {
      return savePage(page, args, newParser);
    });
  }
  return outcome;
}
// Create dir (behind a spinner labelled msg) unless it already exists.
function confirmCreateDir(dir, msg) {
  if (fs.existsSync(dir)) return;
  confirm(fs.mkdirSync, [dir], msg);
}
// Map an article url to its custom-extractor directory, keyed by hostname.
function getDir(url) {
  var hostname = URL.parse(url).hostname;
  return "./src/extractors/custom/".concat(hostname);
}
// Scaffold (or extend) a custom parser for url: create the extractor and
// fixtures directories on first sight of the hostname, then fetch the page
// as a fixture. newParser tells savePage whether to also generate code.
function scaffoldCustomParser(url) {
  var dir = getDir(url);
  var hostname = URL.parse(url).hostname;
  var newParser = !fs.existsSync(dir);
  if (newParser) {
    confirmCreateDir(dir, "Creating ".concat(hostname, " directory"));
    confirmCreateDir("./fixtures/".concat(hostname), 'Creating fixtures directory');
  }
  confirm(mercury.fetchResource, [url], 'Fetching fixture', newParser);
} // if an arg was given, assume it is a url and skip the prompt
// CLI entry point: a url passed as the first argument skips the prompt.
var urlArg = process.argv[2];
if (!urlArg) {
  inquirer.prompt(questions).then(function (answers) {
    scaffoldCustomParser(answers.website);
  });
} else {
  scaffoldCustomParser(urlArg);
}
/**
 * Write the scaffolded extractor (index.js) and its test (index.test.js),
 * register the extractor in the custom-extractor index, then lint the
 * generated files.
 * @param url    article url the extractor is for
 * @param file   path of the saved html fixture
 * @param result parsed article fields used to seed the generated tests
 */
function generateScaffold(url, file, result) {
  var _URL$parse4 = URL.parse(url),
      hostname = _URL$parse4.hostname;
  // Hoisted: getDir(url) was previously recomputed four times and
  // extractorName(hostname) twice.
  var dir = getDir(url);
  var name = extractorName(hostname);
  var extractor = extractorTemplate(hostname, name);
  var extractorTest = extractorTestTemplate(file, url, dir, result, name);
  fs.writeFileSync("".concat(dir, "/index.js"), extractor);
  fs.writeFileSync("".concat(dir, "/index.test.js"), extractorTest);
  fs.appendFileSync('./src/extractors/custom/index.js', exportString(url));
  // NOTE(review): dir derives from a user-supplied url's hostname and is
  // interpolated into a shell command; hostnames are narrow, but verify if
  // the input source ever widens.
  child_process.exec("npm run lint-fix-quiet -- ".concat(dir, "/*.js"));
}
/**
 * Persist a fetched page as an html fixture and, for a new parser, generate
 * the extractor scaffold. Invoked by confirm() once mercury.fetchResource
 * resolves; _ref is the original [url] args array.
 * @param $         cheerio handle for the fetched document
 * @param _ref      args array whose first element is the article url
 * @param newParser true when no extractor directory existed for this host
 */
function savePage($, _ref, newParser) {
  var _ref2 = _slicedToArray(_ref, 1),
      url = _ref2[0];
  var _URL$parse5 = URL.parse(url),
      hostname = _URL$parse5.hostname;
  spinner.succeed();
  // Timestamped filename keeps multiple fixtures per host distinct.
  var filename = new Date().getTime();
  var file = "./fixtures/".concat(hostname, "/").concat(filename, ".html"); // fix http(s) relative links:
  makeLinksAbsolute$$1($('*').first(), $, url);
  $('[src], [href]').each(function (index, node) {
    var $node = $(node);
    // Fix: the selector matches nodes with either attribute, but only
    // `src` was being rewritten, so protocol-relative hrefs ("//host/...")
    // stayed broken in the saved fixture. Rewrite both attributes.
    ['src', 'href'].forEach(function (attr) {
      var link = $node.attr(attr);
      if (link && link.slice(0, 2) === '//') {
        $node.attr(attr, "http:".concat(link));
      }
    });
  });
  // Strip <script> tags before persisting the fixture to disk.
  var html = stripJunkTags($('*').first(), $, ['script']).html();
  fs.writeFileSync(file, html);
  mercury.parse(url, {
    html: html
  }).then(function (result) {
    if (newParser) {
      confirm(generateScaffold, [url, file, result], 'Generating parser and tests');
      console.log("Your custom site extractor has been set up. To get started building it, run\n yarn watch:test -- ".concat(hostname, "\n -- OR --\n npm run watch:test -- ").concat(hostname));
    } else {
      console.log("\n It looks like you already have a custom parser for this url.\n The page you linked to has been added to ".concat(file, ". Copy and paste\n the following code to use that page in your tests:\n const html = fs.readFileSync('").concat(file, "');"));
    }
  });
}
// Build the re-export line appended to src/extractors/custom/index.js for
// the given article url's hostname.
function exportString(url) {
  var hostname = URL.parse(url).hostname;
  return "export * from './".concat(hostname, "';");
}
// Derive the extractor's identifier from a hostname: capitalize each
// dot-separated segment, join, and append 'Extractor'
// (e.g. 'www.example.com' -> 'WwwExampleComExtractor').
function extractorName(hostname) {
  var segments = hostname.split('.');
  var camelized = segments
    .map(function (segment) {
      return segment.charAt(0).toUpperCase() + segment.slice(1);
    })
    .join('');
  return camelized + 'Extractor';
}
//# sourceMappingURL=generate-custom-parser.js.map