@ -900,7 +903,6 @@ function removeUnlessContent($node, $, weight) {
}
}
/* eslint-disable */
functionabsolutize($,rootUrl,attr,$content){
$('['+attr+']',$content).each(function(_,node){
varattrs=getAttrs(node);
@ -1013,14 +1015,15 @@ function setAttr(node, attr, val) {
returnnode;
}
/* eslint-disable */
functionsetAttrs(node,attrs){
if(node.attribs){
node.attribs=attrs;
}elseif(node.attributes){
while(node.attributes.length>0){
node.removeAttribute(node.attributes[0].name);
}_Reflect$ownKeys(attrs).forEach(function(key){
}
_Reflect$ownKeys(attrs).forEach(function(key){
node.setAttribute(key,attrs[key]);
});
}
@ -1030,17 +1033,62 @@ function setAttrs(node, attrs) {
// DOM manipulation
// Bundle prelude (pre-change side of the diff). The statement below is kept byte-identical;
// its keyword/identifier whitespace was lost in this excerpt.
//  - _interopDefault$1(ex): yields ex['default'] when ex is truthy, is typed 'object'
//    (via _typeof) and has a 'default' key; otherwise yields ex unchanged.
//  - The var declarations pass each imported binding (regenerator, URL, cheerio, request,
//    moment, ...) through _interopDefault$1.
//  - range(start, end): regenerator-wrapped generator; both arguments default to 1 and,
//    while start <= end, each step yields the value of `start += 1`.
//    NOTE(review): that is the incremented value, not the pre-increment one - confirm callers expect this.
function_interopDefault$1(ex){returnex&&(typeofex==='undefined'?'undefined':_typeof(ex))==='object'&&'default'inex?ex['default']:ex;}var_regeneratorRuntime=_interopDefault$1(regenerator);var_extends$1=_interopDefault$1(_extends);var_asyncToGenerator=_interopDefault$1(asyncToGenerator);varURL$1=_interopDefault$1(URL);varcheerio$1=_interopDefault$1(cheerio);var_Promise=_interopDefault$1(promise);varrequest$1=_interopDefault$1(request);var _Reflect$ownKeys$1=_interopDefault$1(_Reflect$ownKeys);var_toConsumableArray$1=_interopDefault$1(_toConsumableArray);var_defineProperty$1=_interopDefault$1(_defineProperty);var_slicedToArray$1=_interopDefault$1(_slicedToArray);var_typeof$1=_interopDefault$1(_typeof);var_getIterator$1=_interopDefault$1(_getIterator);var_Object$keys=_interopDefault$1(keys);varstringDirection$1=_interopDefault$1(stringDirection);varvalidUrl$1=_interopDefault$1(validUrl);varmoment$1=_interopDefault$1(moment);varwuzzy$1=_interopDefault$1(wuzzy);vardifflib$1=_interopDefault$1(difflib);var_Array$from=_interopDefault$1(from);varellipsize$1=_interopDefault$1(ellipsize);var_marked=[range].map(_regeneratorRuntime.mark);functionrange(){varstart=arguments.length>0&&arguments[0]!==undefined?arguments[0]:1;varend=arguments.length>1&&arguments[1]!==undefined?arguments[1]:1;return_regeneratorRuntime.wrap(functionrange$(_context){while(1){switch(_context.prev=_context.next){case0:if(!(start<=end)){_context.next=5;break;}_context.next=3;returnstart+=1;case3:_context.next=0;break;case5:case"end":return_context.stop();}}},_marked[0],this);}// extremely simple url validation as a first step
// Bundle prelude (post-change side of the diff). The statement below is kept byte-identical;
// its keyword/identifier whitespace was lost in this excerpt.
//  - _interopDefault$1(ex): yields ex['default'] when ex is truthy, is typed 'object'
//    (via _typeof) and has a 'default' key; otherwise yields ex unchanged.
//  - The var declarations pass each imported binding through _interopDefault$1; this side
//    binds iconv (from iconvLite), moment (from momentTimezone) and parseFormat
//    (from momentParseformat).
//  - range(start, end): regenerator-wrapped generator; both arguments default to 1 and,
//    while start <= end, each step yields the value of `start += 1`.
//    NOTE(review): that is the incremented value, not the pre-increment one - confirm callers expect this.
function_interopDefault$1(ex){returnex&&(typeofex==='undefined'?'undefined':_typeof(ex))==='object'&&'default'inex?ex['default']:ex;}var_regeneratorRuntime=_interopDefault$1(regenerator);var_extends$1=_interopDefault$1(_extends);var_asyncToGenerator=_interopDefault$1(asyncToGenerator);varURL$1=_interopDefault$1(URL);varcheerio$1=_interopDefault$1(cheerio);var_Promise=_interopDefault$1(promise);varrequest$1=_interopDefault$1(request);variconv=_interopDefault$1(iconvLite);var_slicedToArray$1=_interopDefault$1(_slicedToArray);var_Reflect$ownKeys$1=_interopDefault$1(_Reflect$ownKeys);var_toConsumableArray$1=_interopDefault$1(_toConsumableArray);var_defineProperty$1=_interopDefault$1(_defineProperty);var_typeof$1=_interopDefault$1(_typeof);var_getIterator$1=_interopDefault$1(_getIterator);var_Object$keys=_interopDefault$1(keys);varstringDirection$1=_interopDefault$1(stringDirection);varvalidUrl$1=_interopDefault$1(validUrl);varmoment=_interopDefault$1(momentTimezone);varparseFormat=_interopDefault$1(momentParseformat);varwuzzy$1=_interopDefault$1(wuzzy);vardifflib$1=_interopDefault$1(difflib);var_Array$from=_interopDefault$1(from);varellipsize$1=_interopDefault$1(ellipsize);var_marked=[range].map(_regeneratorRuntime.mark);functionrange(){varstart=arguments.length>0&&arguments[0]!==undefined?arguments[0]:1;varend=arguments.length>1&&arguments[1]!==undefined?arguments[1]:1;return_regeneratorRuntime.wrap(functionrange$(_context){while(1){switch(_context.prev=_context.next){case0:if(!(start<=end)){_context.next=5;break;}_context.next=3;returnstart+=1;case3:_context.next=0;break;case5:case"end":return_context.stop();}}},_marked[0],this);}// extremely simple url validation as a first step
// Reports whether a parsed URL object carries a hostname.
//   _ref - object with a `hostname` property (e.g. the result of a URL parse)
// Returns true when hostname is truthy, false otherwise.
function validateUrl(_ref) {
  var hostname = _ref.hostname;
  // If this isn't a valid url, return an error message
  return !!hostname;
}

// Canned error payloads, keyed by failure kind.
var Errors = {
  badUrl: {
    error: true,
    messages: 'The url parameter passed does not look like a valid URL. Please check your data and try again.'
  }
};

// Headers attached to outgoing resource requests.
var REQUEST_HEADERS = {
  'User-Agent': 'Readability - http://readability.com/about/'
};
// The number of milliseconds to attempt to fetch a resource before timing out.
return!!hostname;}varErrors={badUrl:{error:true,messages:'The url parameter passed does not look like a valid URL. Please check your data and try again.'}};varNORMALIZE_RE$1=/\s{2,}/g;functionnormalizeSpaces$1(text){returntext.replace(NORMALIZE_RE$1,' ').trim();}// Given a node type to search for, and a list of regular expressions,
// look to see if this extraction can be found in the URL. Expects
// that each expression in r_list will return group(1) as the proper
// string to be cleaned.
// Only used for date_published currently.
// Returns capture group 1 of the first expression in regexList that matches url.
//   url       - string to search
//   regexList - array of RegExp objects, each expected to expose the wanted text as group 1
// Returns the captured string (undefined if the matching expression has no group 1),
// or null when no expression matches.
function extractFromUrl$1(url, regexList) {
  for (var i = 0; i < regexList.length; i += 1) {
    var re = regexList[i];
    // A single exec() per expression: calling test() and then exec() on a /g or /y
    // regex advances lastIndex in between, so the second call would miss the match.
    re.lastIndex = 0;
    var match = re.exec(url);
    re.lastIndex = 0;
    if (match) {
      return match[1];
    }
  }
  return null;
}
// An expression that looks to try to find the page digit within a URL, if
returnpageNum<100?pageNum:null;}functionremoveAnchor$1(url){returnurl.split('#')[0].replace(/\/$/,'');}functionisGoodSegment$1(segment,index,firstSegmentHasLetters){vargoodSegment=true;// If this is purely a number, and it's the first or second
// url_segment, it's probably a page number. Remove it.
if(index<2&&IS_DIGIT_RE$1.test(segment)&&segment.length<3){goodSegment=true;}// If this is the first url_segment and it's just "index",
// remove it
if(index===0&&segment.toLowerCase()==='index'){goodSegment=false;}// If our first or second url_segment is smaller than 3 characters,
// and the first url_segment had no alphas, remove it.
if(index<2&&segment.length<3&&!firstSegmentHasLetters){goodSegment=false;}returngoodSegment;}// Take a URL, and return the article base of said URL. That is, no
// pagination data exists in it. Useful for comparing to other links
// that might have pagination data within them.
// Builds the pagination-free base of an article URL.
//   url    - URL string; parsed with URL$1.parse when `parsed` is not supplied
//   parsed - optional pre-parsed URL object exposing protocol, host and path
// Returns protocol + '//' + host + the retained path segments joined by '/'.
function articleBaseUrl$1(url, parsed) {
  var parsedUrl = parsed || URL$1.parse(url);
  var protocol = parsedUrl.protocol;
  var host = parsedUrl.host;
  var path = parsedUrl.path;
  var firstSegmentHasLetters = false;
  // Segments are walked from the end of the path backwards, so index 0 is the
  // last path segment.
  var cleanedSegments = path.split('/').reverse().reduce(function (acc, rawSegment, index) {
    var segment = rawSegment;
    // Split off and save anything that looks like a file type.
    if (segment.includes('.')) {
      var pieces = segment.split('.');
      var possibleSegment = pieces[0];
      var fileExt = pieces[1];
      if (IS_ALPHA_RE$1.test(fileExt)) {
        segment = possibleSegment;
      }
    }
    // If our first or second segment has anything looking like a page
    // number, remove it.
    if (PAGE_IN_HREF_RE$1.test(segment) && index < 2) {
      segment = segment.replace(PAGE_IN_HREF_RE$1, '');
    }
    // If we're on the first segment, check to see if we have any
    // characters in it. The first segment is actually the last bit of
    // the URL, and this will be helpful to determine if we're on a URL
    // segment that looks like "/2/" for example.
    if (index === 0) {
      firstSegmentHasLetters = HAS_ALPHA_RE$1.test(segment);
    }
    // If it's not marked for deletion, push it to cleaned_segments.
    if (isGoodSegment$1(segment, index, firstSegmentHasLetters)) {
      acc.push(segment);
    }
    return acc;
  }, []);
  return protocol + '//' + host + cleanedSegments.reverse().join('/');
}
// Given a string, return True if it appears to have an ending sentence
// within it, false otherwise.
// Matches a literal period followed by a space or the end of the string.
// The dot is escaped: an unescaped '.' matches any character, which made nearly
// every non-empty string count as containing a sentence end.
var SENTENCE_END_RE$1 = new RegExp('\\.( |$)');

// Returns true when text contains a period followed by a space or end-of-string.
function hasSentenceEnd$1(text) {
  return SENTENCE_END_RE$1.test(text);
}

// Returns the first `words` whitespace-separated words of content, joined by single spaces.
//   content - string to excerpt (trimmed first)
//   words   - optional second argument, maximum number of words; defaults to 10
function excerptContent$1(content) {
  var words = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 10;
  return content.trim().split(/\s+/).slice(0, words).join(' ');
}
// check a string for encoding; this is
// used in our fetchResource function to
// ensure correctly encoded responses
// Extracts capture group 1 of ENCODING_RE$1 from str.
//   str - value to inspect (the caller passes a content-type header value)
// Returns the captured text, or null when ENCODING_RE$1 does not match.
function getEncoding$1(str) {
  // One exec() instead of test() followed by exec(): the pair is redundant and
  // would disagree if the shared expression ever carried a /g or /y flag.
  ENCODING_RE$1.lastIndex = 0;
  var match = ENCODING_RE$1.exec(str);
  return match ? match[1] : null;
}
// Browser does not like us setting user agent
// Request headers for resource fetches: empty when cheerio$1.browser is truthy,
// otherwise a single User-Agent header.
var REQUEST_HEADERS = cheerio$1.browser ? {} : {
  'User-Agent': 'Mercury - https://mercury.postlight.com/web-parser/'
};
// The number of milliseconds to attempt to fetch a resource before timing out.
var FETCH_TIMEOUT = 10000;
// Content types that we do not extract content from
var BAD_CONTENT_TYPES = ['audio/mpeg', 'image/gif', 'image/jpeg', 'image/jpg'];
// Case-insensitive, anchored alternation over BAD_CONTENT_TYPES.
var BAD_CONTENT_TYPES_RE = new RegExp('^(' + BAD_CONTENT_TYPES.join('|') + ')$', 'i');
// Use this setting as the maximum size an article can be
// for us to attempt parsing. Defaults to 5 MB.
var MAX_CONTENT_LENGTH = 5242880;
// Turn the global proxy on or off
// Proxying is not currently enabled in Python source
// so not implementing logic in port.
// Promise wrapper around the callback-style request$1 client.
//   options - passed straight through to request$1
// Resolves with { body, response }; rejects with the error request$1 reports.
function get(options) { // eslint-disable-line
  return new _Promise(function (resolve, reject) {
    request$1(options, function (err, response, body) {
      if (err) {
        reject(err);
      } else {
        resolve({ body: body, response: response });
      }
    });
  });
}
// Evaluate a response to ensure it's something we should be keeping.
// Promise wrapper around the callback-style request$1 client that also re-decodes
// the body using the charset named in the response's content-type header.
//   options - passed straight through to request$1
// Resolves with { body, response }; body is run through iconv.decode only when
// iconv.encodingExists accepts the detected encoding. Rejects with the error
// request$1 reports.
function get(options) {
  return new _Promise(function (resolve, reject) {
    request$1(options, function (err, response, body) {
      if (err) {
        reject(err);
        return;
      }
      var encoding = getEncoding$1(response.headers['content-type']);
      var decodedBody = iconv.encodingExists(encoding) ? iconv.decode(body, encoding) : body;
      resolve({ body: decodedBody, response: response });
    });
  });
}
// Evaluate a response to ensure it's something we should be keeping.
// This does not validate in the sense of a response being 200 level or
// not. Validation here means that we haven't found reason to bail from
// further processing of this url.
@ -1059,7 +1107,7 @@ if(contentLength>MAX_CONTENT_LENGTH){throw new Error('Content for this resource
// proper exceptions on the many failure cases of HTTP.
// TODO: Ensure we are not fetching something enormous. Always return
// unicode content for HTML, with charset conversion.
varfetchResource$1=function(){var_ref2=_asyncToGenerator(_regeneratorRuntime.mark(function_callee(url,parsedUrl){varoptions,_ref3,response,body;return_regeneratorRuntime.wrap(function_callee$(_context){while(1){switch(_context.prev=_context.next){case0:parsedUrl=parsedUrl||URL$1.parse(encodeURI(url));options={url:parsedUrl,headers:_extends$1({},REQUEST_HEADERS),timeout:FETCH_TIMEOUT,// Don't set encoding; fixes issues
varfetchResource$1=function(){var_ref2=_asyncToGenerator(_regeneratorRuntime.mark(function_callee(url,parsedUrl){varoptions,_ref3,response,body;return_regeneratorRuntime.wrap(function_callee$(_context){while(1){switch(_context.prev=_context.next){case0:parsedUrl=parsedUrl||URL$1.parse(encodeURI(url));options={url:parsedUrl.href,headers:_extends$1({},REQUEST_HEADERS),timeout:FETCH_TIMEOUT,// Don't set encoding; fixes issues
// In addition, normalize 'property' attributes to 'name' for ease of
// querying later. See, e.g., og or twitter meta tags.
// Runs convertMetaProp twice over the document handle: first 'content' -> 'value',
// then 'property' -> 'name'.
//   $ - document handle, passed to and taken back from convertMetaProp
// Returns whatever the second convertMetaProp call returns.
function normalizeMetaTags($) {
  $ = convertMetaProp($, 'content', 'value');
  $ = convertMetaProp($, 'property', 'name');
  return $;
}
// Spacer images to be removed
// Case-insensitive pattern for spacer-image hints. The bare 'trans' alternative is
// dropped: it also matched unrelated words such as "translation" or "transit",
// while 'transparent' still covers the intended case.
var SPACER_RE$1 = new RegExp('transparent|spacer|blank', 'i');
// The class we will use to mark elements we want to keep
// Case-insensitive pattern for spacer-image hints (transparent / spacer / blank).
var SPACER_RE$1 = new RegExp('transparent|spacer|blank', 'i');
// The class we will use to mark elements we want to keep
// but would normally remove
// Class name used to mark elements that should be kept.
var KEEP_CLASS$1 = 'mercury-parser-keep';
// Selectors for YouTube and Vimeo player iframes (http and https).
var KEEP_SELECTORS$1 = [
  'iframe[src^="https://www.youtube.com"]',
  'iframe[src^="http://www.youtube.com"]',
  'iframe[src^="https://player.vimeo"]',
  'iframe[src^="http://player.vimeo"]'
];
// A list of tags to strip from the output if we encounter them.
// (By-reference mutation, though. Returned just for convenience.)
functionconvertToParagraphs$$1($){$=brsToPs$$1($);$=convertDivs$1($);$=convertSpans$2($);return$;}functionconvertNodeTo$$1($node,$){vartag=arguments.length>2&&arguments[2]!==undefined?arguments[2]:'p';varnode=$node.get(0);if(!node){return$;}varattrs=getAttrs$1(node)||{};varattribString=_Reflect$ownKeys$1(attrs).map(function(key){returnkey+'='+attrs[key];}).join(' ');varhtml=void0;if($.browser){// In the browser, the contents of noscript tags aren't rendered, therefore
varattribString=_Reflect$ownKeys$1(attrs).map(function(key){returnkey+'='+attrs[key];}).join(' ');varhtml=void0;if($.browser){// In the browser, the contents of noscript tags aren't rendered, therefore
// transforms on the noscript tag (commonly used for lazy-loading) don't work
// as expected. This test case handles that
html=node.tagName.toLowerCase()==='noscript'?$node.text():$node.html();}else{html=$node.contents();}$node.replaceWith('<'+tag+' '+attribString+'>'+html+'</'+tag+'>');return$;}functioncleanForHeight$1($img,$){varheight=parseInt($img.attr('height'),10);varwidth=parseInt($img.attr('width'),10)||20;// Remove images that explicitly have very small heights or
@ -1279,8 +1328,7 @@ if(PHOTO_HINTS_RE$1$1.test(classes)){score+=10;}// add 25 if class matches entry
if(READABILITY_ASSET$1$1.test(classes)){score+=25;}}returnscore;}// returns the score of a node based on
returnparseFloat($node.attr('score'))||null;}// return 1 for every comma in text
// Reads the 'score' attribute of a node as a float.
//   $node - object exposing attr(name)
// Returns the parsed number, or null when the attribute is missing, unparsable or 0
// (any falsy parse result collapses to null).
function getScore$1($node) {
  return parseFloat($node.attr('score')) || null;
}
// return 1 for every comma in text
functionscoreCommas$1(text){return(text.match(/,/g)||[]).length;}varidkRe$1=newRegExp('^(p|pre)$','i');functionscoreLength$1(textLength){vartagName=arguments.length>1&&arguments[1]!==undefined?arguments[1]:'p';varchunks=textLength/50;if(chunks>0){varlengthBonus=void0;// No idea why p or pre are being tamped down here
scorePs$1($,weightNodes);scorePs$1($,weightNodes);return$;}varNORMALIZE_RE$1=/\s{2,}/g;functionnormalizeSpaces$1(text){returntext.replace(NORMALIZE_RE$1,' ').trim();}// Given a node type to search for, and a list of regular expressions,
// look to see if this extraction can be found in the URL. Expects
// that each expression in r_list will return group(1) as the proper
returnpageNum<100?pageNum:null;}functionremoveAnchor$1(url){returnurl.split('#')[0].replace(/\/$/,'');}functionisGoodSegment$1(segment,index,firstSegmentHasLetters){vargoodSegment=true;// If this is purely a number, and it's the first or second
// url_segment, it's probably a page number. Remove it.
if(index<2&&IS_DIGIT_RE$1.test(segment)&&segment.length<3){goodSegment=true;}// If this is the first url_segment and it's just "index",
// remove it
if(index===0&&segment.toLowerCase()==='index'){goodSegment=false;}// If our first or second url_segment is smaller than 3 characters,
// and the first url_segment had no alphas, remove it.
if(index<2&&segment.length<3&&!firstSegmentHasLetters){goodSegment=false;}returngoodSegment;}// Take a URL, and return the article base of said URL. That is, no
// pagination data exists in it. Useful for comparing to other links
// that might have pagination data within them.
// Builds the pagination-free base of an article URL.
//   url    - URL string; parsed with URL$1.parse when `parsed` is not supplied
//   parsed - optional pre-parsed URL object exposing protocol, host and path
// Returns protocol + '//' + host + the retained path segments joined by '/'.
function articleBaseUrl$1(url, parsed) {
  var parsedUrl = parsed || URL$1.parse(url);
  var protocol = parsedUrl.protocol;
  var host = parsedUrl.host;
  var path = parsedUrl.path;
  var firstSegmentHasLetters = false;
  // Segments are walked from the end of the path backwards, so index 0 is the
  // last path segment.
  var cleanedSegments = path.split('/').reverse().reduce(function (acc, rawSegment, index) {
    var segment = rawSegment;
    // Split off and save anything that looks like a file type.
    if (segment.includes('.')) {
      var pieces = segment.split('.');
      var possibleSegment = pieces[0];
      var fileExt = pieces[1];
      if (IS_ALPHA_RE$1.test(fileExt)) {
        segment = possibleSegment;
      }
    }
    // If our first or second segment has anything looking like a page
    // number, remove it.
    if (PAGE_IN_HREF_RE$1.test(segment) && index < 2) {
      segment = segment.replace(PAGE_IN_HREF_RE$1, '');
    }
    // If we're on the first segment, check to see if we have any
    // characters in it. The first segment is actually the last bit of
    // the URL, and this will be helpful to determine if we're on a URL
    // segment that looks like "/2/" for example.
    if (index === 0) {
      firstSegmentHasLetters = HAS_ALPHA_RE$1.test(segment);
    }
    // If it's not marked for deletion, push it to cleaned_segments.
    if (isGoodSegment$1(segment, index, firstSegmentHasLetters)) {
      acc.push(segment);
    }
    return acc;
  }, []);
  return protocol + '//' + host + cleanedSegments.reverse().join('/');
}
// Given a string, return True if it appears to have an ending sentence
// within it, false otherwise.
// Matches a literal period followed by a space or the end of the string.
// The dot is escaped: an unescaped '.' matches any character, which made nearly
// every non-empty string count as containing a sentence end.
var SENTENCE_END_RE$1 = new RegExp('\\.( |$)');

// Returns true when text contains a period followed by a space or end-of-string.
function hasSentenceEnd$1(text) {
  return SENTENCE_END_RE$1.test(text);
}

// Returns the first `words` whitespace-separated words of content, joined by single spaces.
//   content - string to excerpt (trimmed first)
//   words   - optional second argument, maximum number of words; defaults to 10
function excerptContent$1(content) {
  var words = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 10;
  return content.trim().split(/\s+/).slice(0, words).join(' ');
}
// Now that we have a top_candidate, look through the siblings of
scorePs$1($,weightNodes);scorePs$1($,weightNodes);return$;}// Now that we have a top_candidate, look through the siblings of
// it to see if any of them are decently scored. If they are, they
// may be split parts of the content (Like two divs, a preamble and
// Converts the document's html and body elements to divs via convertNodeTo$$1.
//   article - not used by this function
//   $       - document handle; called with 'html' and 'body' selectors
// Returns the handle returned by the second convertNodeTo$$1 call.
function rewriteTopLevel$$1(article, $) {
  // I'm not using context here because
  // it's problematic when converting the
  // top-level/root node - AP
  $ = convertNodeTo$$1($('html'), $, 'div');
  $ = convertNodeTo$$1($('body'), $, 'div');
  return $;
}

/* eslint-disable */
// Rewrites every `attr` attribute found under $content to an absolute URL.
//   $        - selector function, called as $('[attr]', $content)
//   rootUrl  - base URL handed to URL$1.resolve
//   attr     - attribute name to rewrite
//   $content - selection that scopes the lookup
// Attributes whose value is falsy (missing or empty) are left untouched.
function absolutize$1($, rootUrl, attr, $content) {
  $('[' + attr + ']', $content).each(function (_, node) {
    var attrs = getAttrs$1(node);
    var url = attrs[attr];
    if (url) {
      var absoluteUrl = URL$1.resolve(rootUrl, url);
      setAttr$1(node, attr, absoluteUrl);
    }
  });
}

// Applies absolutize$1 to the 'href' and 'src' attributes under $content.
// Returns $content.
function makeLinksAbsolute$$1($content, $, url) {
  ['href', 'src'].forEach(function (attr) {
    return absolutize$1($, url, attr, $content);
  });
  return $content;
}

// Length of text after trimming and collapsing each whitespace run to one space.
function textLength$1(text) {
  return text.trim().replace(/\s+/g, ' ').length;
}
// Determines what percentage of the text
$=convertNodeTo$$1($('html'),$,'div');$=convertNodeTo$$1($('body'),$,'div');return$;}functionabsolutize$1($,rootUrl,attr,$content){$('['+attr+']',$content).each(function(_,node){varattrs=getAttrs$1(node);varurl=attrs[attr];if(url){varabsoluteUrl=URL$1.resolve(rootUrl,url);setAttr$1(node,attr,absoluteUrl);}});}functionmakeLinksAbsolute$$1($content,$,url){['href','src'].forEach(function(attr){returnabsolutize$1($,url,attr,$content);});return$content;}functiontextLength$1(text){returntext.trim().replace(/\s+/g,' ').length;}// Determines what percentage of the text
// in a node is link text
// Takes a node, returns a float
// Ratio of link text length to total text length for a node.
//   $node - object exposing text() and find('a').text()
// Returns linkLength / totalTextLength when the node has text; 1 when it has no
// text but its links do; 0 otherwise. Lengths are measured with textLength$1.
function linkDensity$1($node) {
  var totalTextLength = textLength$1($node.text());
  var linkText = $node.find('a').text();
  var linkLength = textLength$1(linkText);
  if (totalTextLength > 0) {
    return linkLength / totalTextLength;
  } else if (totalTextLength === 0 && linkLength > 0) {
    return 1;
  }
  return 0;
}
// Given a node type to search for, and a list of meta tag names to
@ -1447,21 +1452,21 @@ function stripTags$1(text,$){// Wrapping text in html element prevents errors wh
varcleanText=$('<span>'+text+'</span>').text();returncleanText===''?text:cleanText;}functionwithinComment$$1($node){varparents=$node.parents().toArray();varcommentParent=parents.find(function(parent){varattrs=getAttrs$1(parent);varnodeClass=attrs.class,id=attrs.id;varclassAndId=nodeClass+' '+id;returnclassAndId.includes('comment');});returncommentParent!==undefined;}// Given a node, determine if it's article-like enough to return
// param: node (a cheerio node)
// return: boolean
// True when the node's trimmed text is at least 100 characters long.
function nodeIsSufficient$1($node) {
  return $node.text().trim().length >= 100;
}

// True when the document has at least one match for IS_WP_SELECTOR$1.
function isWordpress$1($) {
  return $(IS_WP_SELECTOR$1).length > 0;
}

// Returns a node's attributes as a plain name -> value object.
//   node - either exposes `attribs` (returned as-is) or an `attributes` collection
// When only `attributes` is present, its own keys are walked and an object is built
// from each entry's name/value. Own keys that do not hold an attribute entry (for
// example `length`) are skipped; without that guard they added an 'undefined' key.
function getAttrs$1(node) {
  var attribs = node.attribs;
  var attributes = node.attributes;
  if (!attribs && attributes) {
    var attrs = _Reflect$ownKeys$1(attributes).reduce(function (acc, index) {
      var attr = attributes[index];
      if (!attr || !attr.name) {
        return acc;
      }
      acc[attr.name] = attr.value;
      return acc;
    }, {});
    return attrs;
  }
  return attribs;
}

// Sets one attribute on a node, through `attribs` when present, otherwise through
// setAttribute when the node has `attributes`. Returns the node.
function setAttr$1(node, attr, val) {
  if (node.attribs) {
    node.attribs[attr] = val;
  } else if (node.attributes) {
    node.setAttribute(attr, val);
  }
  return node;
}

/* eslint-disable */
// Replaces all attributes of a node with `attrs`. Returns the node.
// With `attribs` the object is swapped wholesale; with `attributes` every existing
// attribute is removed first and each key of `attrs` is then set.
function setAttrs$1(node, attrs) {
  if (node.attribs) {
    node.attribs = attrs;
  } else if (node.attributes) {
    while (node.attributes.length > 0) {
      node.removeAttribute(node.attributes[0].name);
    }
    _Reflect$ownKeys$1(attrs).forEach(function (key) {
      node.setAttribute(key, attrs[key]);
    });
  }
  return node;
}
// DOM manipulation
// True when the node's trimmed text is at least 100 characters long.
function nodeIsSufficient$1($node) {
  return $node.text().trim().length >= 100;
}

// True when the document has at least one match for IS_WP_SELECTOR$1.
function isWordpress$1($) {
  return $(IS_WP_SELECTOR$1).length > 0;
}

// Returns a node's attributes as a plain name -> value object.
//   node - either exposes `attribs` (returned as-is) or an `attributes` collection
// When only `attributes` is present, its own keys are walked and an object is built
// from each entry's name/value; entries lacking a truthy name or a truthy value
// (which includes empty-string values) are skipped.
function getAttrs$1(node) {
  var attribs = node.attribs;
  var attributes = node.attributes;
  if (!attribs && attributes) {
    var attrs = _Reflect$ownKeys$1(attributes).reduce(function (acc, index) {
      var attr = attributes[index];
      if (!attr.name || !attr.value) {
        return acc;
      }
      acc[attr.name] = attr.value;
      return acc;
    }, {});
    return attrs;
  }
  return attribs;
}

// Sets one attribute on a node, through `attribs` when present, otherwise through
// setAttribute when the node has `attributes`. Returns the node.
function setAttr$1(node, attr, val) {
  if (node.attribs) {
    node.attribs[attr] = val;
  } else if (node.attributes) {
    node.setAttribute(attr, val);
  }
  return node;
}

// Replaces all attributes of a node with `attrs`. Returns the node.
// With `attribs` the object is swapped wholesale; with `attributes` every existing
// attribute is removed first and each key of `attrs` is then set.
function setAttrs$1(node, attrs) {
  if (node.attribs) {
    node.attribs = attrs;
  } else if (node.attributes) {
    while (node.attributes.length > 0) {
      node.removeAttribute(node.attributes[0].name);
    }
    _Reflect$ownKeys$1(attrs).forEach(function (key) {
      node.setAttribute(key, attrs[key]);
    });
  }
  return node;
}
// DOM manipulation
// Case-insensitive test for an http:// or https:// scheme anywhere in a string.
var IS_LINK = new RegExp('https?://', 'i');
// Case-insensitive test for a .png / .gif / .jpg / .jpeg extension. The dot is
// escaped: an unescaped '.' matched any character, so a value that merely
// contained "png", "gif" or "jpg" after some other character counted as an image.
var IS_IMAGE = new RegExp('\\.(png|gif|jpe?g)', 'i');
// Comma-joined selector list of tags removed during cleaning.
var TAGS_TO_REMOVE = ['script', 'style', 'form'].join(',');
// Convert all instances of images with potentially
// lazy loaded images into normal images.
// Many sites will have img tags with no source, or an image tag with a src
// attribute that is a placeholder. We need to be able to properly fill in
// the src attribute so the images are no longer lazy loaded.
functionconvertLazyLoadedImages($){$('img').each(function(_,img){varattrs=getAttrs$1(img);_Reflect$ownKeys$1(attrs).forEach(function(attr){varvalue=attrs[attr];if(attr!=='src'&&IS_LINK.test(value)&&IS_IMAGE.test(value)){$(img).attr('src',value);}});});return$;}functionisComment(index,node){returnnode.type==='comment';}functioncleanComments($){$('*').first().contents().filter(isComment).remove();return$;}functionclean($){$(TAGS_TO_REMOVE).remove();$=cleanComments($);return$;}varResource={// Create a Resource.
functionconvertLazyLoadedImages($){$('img').each(function(_,img){varattrs=getAttrs$1(img);_Reflect$ownKeys$1(attrs).forEach(function(attr){varvalue=attrs[attr];if(attr!=='src'&&IS_LINK.test(value)&&IS_IMAGE.test(value)){$(img).attr('src',value);}});});return$;}functionisComment(index,node){returnnode.type==='comment';}functioncleanComments($){$.root().find('*').contents().filter(isComment).remove();return$;}functionclean($){$(TAGS_TO_REMOVE).remove();$=cleanComments($);return$;}varResource={// Create a Resource.
//
// :param url: The URL for the document we should retrieve.
// :param response: If set, use as the response rather than
// attempting to fetch it ourselves. Expects a
// string.
create:functioncreate(url,preparedResponse,parsedUrl){var_this=this;return_asyncToGenerator(_regeneratorRuntime.mark(function_callee(){varresult,validResponse;return_regeneratorRuntime.wrap(function_callee$(_context){while(1){switch(_context.prev=_context.next){case0:result=void0;if(!preparedResponse){_context.next=6;break;}validResponse={statusMessage:'OK',statusCode:200,headers:{'content-type':'text/html','content-length':500}};result={body:preparedResponse,response:validResponse};_context.next=9;break;case6:_context.next=8;returnfetchResource$1(url,parsedUrl);case8:result=_context.sent;case9:if(!result.error){_context.next=11;break;}return_context.abrupt('return',result);case11:return_context.abrupt('return',_this.generateDoc(result));case12:case'end':return_context.stop();}}},_callee,_this);}))();},generateDoc:functiongenerateDoc(_ref){varcontent=_ref.body,response=_ref.response;varcontentType=response.headers['content-type'];// TODO: Implement is_text function from
create:functioncreate(url,preparedResponse,parsedUrl){var_this=this;return_asyncToGenerator(_regeneratorRuntime.mark(function_callee(){varresult,validResponse;return_regeneratorRuntime.wrap(function_callee$(_context){while(1){switch(_context.prev=_context.next){case0:result=void0;if(!preparedResponse){_context.next=6;break;}validResponse={statusMessage:'OK',statusCode:200,headers:{'content-type':'text/html','content-length':500}};result={body:preparedResponse,response:validResponse};_context.next=9;break;case6:_context.next=8;returnfetchResource$1(url,parsedUrl);case8:result=_context.sent;case9:if(!result.error){_context.next=12;break;}result.failed=true;return_context.abrupt('return',result);case12:return_context.abrupt('return',_this.generateDoc(result));case13:case'end':return_context.stop();}}},_callee,_this);}))();},generateDoc:functiongenerateDoc(_ref){varcontent=_ref.body,response=_ref.response;varcontentType=response.headers['content-type'];// TODO: Implement is_text function from
if(!contentType.includes('html')&&!contentType.includes('text')){thrownewError('Content does not appear to be text.');}var$=cheerio$1.load(content,{normalizeWhitespace:true});if($('*').first().children().length===0){thrownewError('No children, likely a bad parse.');}$=normalizeMetaTags($);$=convertLazyLoadedImages($);$=clean($);return$;}};varmerge=functionmerge(extractor,domains){returndomains.reduce(function(acc,domain){acc[domain]=extractor;returnacc;},{});};functionmergeSupportedDomains(extractor){returnextractor.supportedDomains?merge(extractor,[extractor.domain].concat(_toConsumableArray$1(extractor.supportedDomains))):merge(extractor,[extractor.domain]);}varBloggerExtractor={domain:'blogspot.com',content:{// Blogger is insane and does not load its content
if(!contentType.includes('html')&&!contentType.includes('text')){thrownewError('Content does not appear to be text.');}var$=cheerio$1.load(content,{normalizeWhitespace:true});if($.root().children().length===0){thrownewError('No children, likely a bad parse.');}$=normalizeMetaTags($);$=convertLazyLoadedImages($);$=clean($);return$;}};varmerge=functionmerge(extractor,domains){returndomains.reduce(function(acc,domain){acc[domain]=extractor;returnacc;},{});};functionmergeSupportedDomains(extractor){returnextractor.supportedDomains?merge(extractor,[extractor.domain].concat(_toConsumableArray$1(extractor.supportedDomains))):merge(extractor,[extractor.domain]);}varBloggerExtractor={domain:'blogspot.com',content:{// Blogger is insane and does not load its content
// initially in the page, but it's all there
// in noscript
selectors:['.post-content noscript'],// Selectors to remove from the extracted content
@ -1476,7 +1481,7 @@ clean:['.ad','.single-related-story'],// Object of tranformations to make on mat
// the transformation.
transforms:{// Convert h1s to h2s
h1:'h2',// Convert lazy-loaded noscript images to figures
noscript:functionnoscript($node,$){if($.browser){var$children=$($node.text());if($children.length===1&&$children.get(0)!==undefined&&$children.get(0).tagName.toLowerCase()==='img'){return'figure';}}else{var_$children=$node.children();if(_$children.length===1&&_$children.get(0).tagName==='img'){return'figure';}}returnnull;}}},title:{selectors:['h1.lede-feature-title','h1.headline-primary','h1']},author:{selectors:['.by-authors','.lede-feature-author']},dek:{selectors:['.lede-feature-teaser']},date_published:{selectors:[['time.article-timestamp[datetime]','datetime'],'time.article-timestamp']}};varWikipediaExtractor={domain:'wikipedia.org',content:{selectors:['#mw-content-text'],defaultCleaner:false,// transform top infobox to an image with caption
noscript:functionnoscript($node,$){var$children=$.browser?$($node.text()):$node.children();if($children.length===1&&$children.get(0)!==undefined&&$children.get(0).tagName.toLowerCase()==='img'){return'figure';}returnnull;}}},title:{selectors:['h1.lede-feature-title','h1.headline-primary','h1']},author:{selectors:['.by-authors','.lede-feature-author']},dek:{selectors:['.lede-feature-teaser']},date_published:{selectors:[['time.article-timestamp[datetime]','datetime'],'time.article-timestamp']}};varWikipediaExtractor={domain:'wikipedia.org',content:{selectors:['#mw-content-text'],defaultCleaner:false,// transform top infobox to an image with caption
transforms:{'.infobox img':functioninfoboxImg($node){var$parent=$node.parents('.infobox');// Only prepend the first image in .infobox
if($parent.children('img').length===0){$parent.prepend($node);}},'.infobox caption':'figcaption','.infobox':'figure'},// Selectors to remove from the extracted content
'.permalink[role=main]':functionpermalinkRoleMain($node,$){vartweets=$node.find('.tweet');var$tweetContainer=$('<div id="TWEETS_GO_HERE"></div>');$tweetContainer.append(tweets);$node.replaceWith($tweetContainer);},// Twitter wraps @ with s, which
varTheAtlanticExtractor={domain:'www.theatlantic.com',title:{selectors:['h1.hed']},author:{selectors:['article#article .article-cover-extra .metadata .byline a']},content:{selectors:['.article-body'],// Is there anything in the content you selected that needs transformed
varTheAtlanticExtractor={domain:'www.theatlantic.com',title:{selectors:['h1.hed']},author:{selectors:['article#article .article-cover-extra .metadata .byline a']},content:{selectors:[['.article-cover figure.lead-img','.article-body'],'.article-body'],// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms:[],// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
varNewYorkerExtractor={domain:'www.newyorker.com',title:{selectors:['h1.title']},author:{selectors:['.contributors']},content:{selectors:['div#articleBody','div.articleBody'],// Is there anything in the content you selected that needs transformed
@ -1509,7 +1514,7 @@ var NewYorkerExtractor={domain:'www.newyorker.com',title:{selectors:['h1.title']
transforms:[],// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
varWiredExtractor={domain:'www.wired.com',title:{selectors:['h1.post-title']},author:{selectors:['a[rel="author"]']},content:{selectors:['article.content'],// Is there anything in the content you selected that needs transformed
@ -1517,7 +1522,7 @@ var WiredExtractor={domain:'www.wired.com',title:{selectors:['h1.post-title']},a
transforms:[],// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
varMSNExtractor={domain:'www.msn.com',title:{selectors:['h1']},author:{selectors:['span.authorname-txt']},content:{selectors:['div.richtext'],// Is there anything in the content you selected that needs transformed
@ -1525,7 +1530,7 @@ var MSNExtractor={domain:'www.msn.com',title:{selectors:['h1']},author:{selector
transforms:[],// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
varYahooExtractor={domain:'www.yahoo.com',title:{selectors:['header.canvas-header']},author:{selectors:['span.provider-name']},content:{selectors:[// enter content selectors
@ -1534,15 +1539,16 @@ var YahooExtractor={domain:'www.yahoo.com',title:{selectors:['header.canvas-head
transforms:[],// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
clean:['.figure-caption']},date_published:{selectors:[['time.date[datetime]','datetime']]},lead_image_url:{selectors:[['meta[name="og:image"]','value']]},dek:{selectors:[// enter dek selectors
varBuzzfeedExtractor={domain:'www.buzzfeed.com',title:{selectors:['h1[id="post-title"]']},author:{selectors:['a[data-action="user/username"]','byline__author']},content:{selectors:['#buzz_sub_buzz'],defaultCleaner:false,// Is there anything in the content you selected that needs transformed
varBuzzfeedExtractor={domain:'www.buzzfeed.com',title:{selectors:['h1[id="post-title"]']},author:{selectors:['a[data-action="user/username"]','byline__author']},content:{selectors:[['.longform_custom_header_media','#buzz_sub_buzz'],'#buzz_sub_buzz'],defaultCleaner:false,// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms:{h2:'b'},// Is there anything that is in the result that shouldn't be?
transforms:{h2:'b','div.longform_custom_header_media':functiondivLongform_custom_header_media($node){if($node.has('img')&&$node.has('.longform_header_image_source')){return'figure';}returnnull;},'figure.longform_custom_header_media .longform_header_image_source':'figcaption'},// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
varWikiaExtractor={domain:'fandom.wikia.com',title:{selectors:['h1.entry-title']},author:{selectors:['.author vcard','.fn']},content:{selectors:['.grid-content','.entry-content'],// Is there anything in the content you selected that needs transformed
@ -1550,7 +1556,7 @@ var WikiaExtractor={domain:'fandom.wikia.com',title:{selectors:['h1.entry-title'
transforms:[],// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
varLittleThingsExtractor={domain:'www.littlethings.com',title:{selectors:['h1.post-title']},author:{selectors:[['meta[name="author"]','value']]},content:{selectors:[// enter content selectors
@ -1570,12 +1576,12 @@ transforms:[],// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean:['figcaption']},date_published:{selectors:[['.story-main-content .timestamp time[datetime]','datetime']]},lead_image_url:{selectors:[// enter lead_image_url selectors
['meta[name="og:image"]','value']]},dek:{selectors:[['meta[name="description"]','value']]},next_page_url:null,excerpt:null};varDeadspinExtractor={domain:'deadspin.com',supportedDomains:['jezebel.com','lifehacker.com','kotaku.com','gizmodo.com','jalopnik.com','kinja.com'],title:{selectors:['h1.headline']},author:{selectors:['.author']},content:{selectors:['.post-content','.entry-content'],// Is there anything in the content you selected that needs transformed
['meta[name="og:image"]','value']]},dek:{selectors:[]},next_page_url:null,excerpt:null};varDeadspinExtractor={domain:'deadspin.com',supportedDomains:['jezebel.com','lifehacker.com','kotaku.com','gizmodo.com','jalopnik.com','kinja.com'],title:{selectors:['h1.headline']},author:{selectors:['.author']},content:{selectors:['.post-content','.entry-content'],// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms:{'iframe.lazyload[data-recommend-id^="youtube://"]':functioniframeLazyloadDataRecommendIdYoutube($node){varyoutubeId=$node.attr('id').split('youtube-')[1];$node.attr('src','https://www.youtube.com/embed/'+youtubeId);}},// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean:[]},date_published:{selectors:[['time.updated[datetime]','datetime']]},lead_image_url:{selectors:[['meta[name="og:image"]','value']]},dek:{selectors:[// enter selectors
clean:['.magnifier','.lightbox']},date_published:{selectors:[['time.updated[datetime]','datetime']]},lead_image_url:{selectors:[['meta[name="og:image"]','value']]},dek:{selectors:[// enter selectors
]},next_page_url:{selectors:[// enter selectors
]},excerpt:{selectors:[// enter selectors
]}};// Rename CustomExtractor
@ -1586,7 +1592,7 @@ var BroadwayWorldExtractor={domain:'www.broadwayworld.com',title:{selectors:['h1
transforms:{},// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean:[]},date_published:{selectors:[['meta[itemprop=datePublished]','value']]},lead_image_url:{selectors:[['meta[name="og:image"]','value']]},dek:{selectors:[['meta[name="og:description"]','value']]},next_page_url:{selectors:[// enter selectors
clean:[]},date_published:{selectors:[['meta[itemprop=datePublished]','value']]},lead_image_url:{selectors:[['meta[name="og:image"]','value']]},dek:{selectors:[]},next_page_url:{selectors:[// enter selectors
]},excerpt:{selectors:[// enter selectors
]}};// Rename CustomExtractor
// to fit your publication
@ -1596,7 +1602,7 @@ var ApartmentTherapyExtractor={domain:'www.apartmenttherapy.com',title:{selector
transforms:{'div[data-render-react-id="images/LazyPicture"]':functiondivDataRenderReactIdImagesLazyPicture($node,$){vardata=JSON.parse($node.attr('data-props'));varsrc=data.sources[0].src;var$img=$('<img />').attr('src',src);$node.replaceWith($img);}},// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean:[]},date_published:{selectors:[['.PostByline__timestamp[datetime]','datetime']]},lead_image_url:{selectors:[['meta[name="og:image"]','value']]},dek:{selectors:[['meta[name=description]','value']]},next_page_url:{selectors:[// enter selectors
clean:[]},date_published:{selectors:[['.PostByline__timestamp[datetime]','datetime']]},lead_image_url:{selectors:[['meta[name="og:image"]','value']]},dek:{selectors:[]},next_page_url:{selectors:[// enter selectors
]},excerpt:{selectors:[// enter selectors
]}};varMediumExtractor={domain:'medium.com',supportedDomains:['trackchanges.postlight.com'],title:{selectors:['h1']},author:{selectors:[['meta[name="author"]','value']]},content:{selectors:['.section-content'],// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
clean:[]},date_published:{selectors:[['time[datetime]','datetime']]},lead_image_url:{selectors:[['meta[name="og:image"]','value']]},dek:{selectors:[// enter selectors
]}};varWwwTmzComExtractor={domain:'www.tmz.com',title:{selectors:['.post-title-breadcrumb','h1','.headline']},author:'TMZ STAFF',date_published:{selectors:['.article-posted-date'],timezone:'America/Los_Angeles'},dek:{selectors:[// enter selectors
]},lead_image_url:{selectors:[['meta[name="og:image"]','value']]},content:{selectors:['.article-content','.all-post-body'],// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms:{},// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean:['.lightbox-link']}};varWwwWashingtonpostComExtractor={domain:'www.washingtonpost.com',title:{selectors:['h1','#topper-headline-wrapper']},author:{selectors:['.pb-byline']},date_published:{selectors:[['.pb-timestamp[itemprop="datePublished"]','content']]},dek:{selectors:[]},lead_image_url:{selectors:[['meta[name="og:image"]','value']]},content:{selectors:['.article-body'],// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms:{'div.inline-content':functiondivInlineContent($node){if($node.has('img,iframe,video').length>0){return'figure';}$node.remove();returnnull;},'.pb-caption':'figcaption'},// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean:['.interstitial-link','.newsletter-inline-unit']}};varWwwHuffingtonpostComExtractor={domain:'www.huffingtonpost.com',title:{selectors:['h1.headline__title']},author:{selectors:['span.author-card__details__name']},date_published:{selectors:[['meta[name="article:modified_time"]','value'],['meta[name="article:published_time"]','value']]},dek:{selectors:['h2.headline__subtitle']},lead_image_url:{selectors:[['meta[name="og:image"]','value']]},content:{selectors:['div.entry__body'],defaultCleaner:false,// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms:{// 'div.top-media': ($node) => {
// const $figure = $node.children('figure');
// $node.replaceWith($figure);
// },
},// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean:['.pull-quote','.tag-cloud','.embed-asset','.below-entry','.entry-corrections','#suggested-story']}};varNewrepublicComExtractor={domain:'newrepublic.com',title:{selectors:['h1.article-headline','.minutes-primary h1.minute-title']},author:{selectors:['div.author-list','.minutes-primary h3.minute-byline']},date_published:{selectors:[['meta[name="article:published_time"]','value']],timezone:'America/New_York'},dek:{selectors:['h2.article-subhead']},lead_image_url:{selectors:[['meta[name="og:image"]','value']]},content:{selectors:['div.content-body','.minutes-primary div.content-body'],// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms:{},// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean:['aside']}};varMoneyCnnComExtractor={domain:'money.cnn.com',title:{selectors:['.article-title']},author:{selectors:['.byline a']},date_published:{selectors:[['meta[name="date"]','value']],timezone:'GMT'},dek:{selectors:['#storytext h2']},lead_image_url:{selectors:[['meta[name="og:image"]','value']]},content:{selectors:['#storytext'],// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms:{},// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
transforms:{noscript:functionnoscript($node){var$children=$node.children();if($children.length===1&&$children.get(0).tagName==='img'){return'span';}returnnull;}},// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean:['.aside','img.c-dynamic-image']}};varWwwCnnComExtractor={domain:'www.cnn.com',title:{selectors:['h1.pg-headline','h1']},author:{selectors:['.metadata__byline__author']},date_published:{selectors:[['meta[name="pubdate"]','value']]},lead_image_url:{selectors:[['meta[name="og:image"]','value']]},content:{selectors:[// a more specific selector to grab the lead image and the body
['.media__video--thumbnail','.zn-body-text'],// a fallback for the above
'.zn-body-text','div[itemprop="articleBody"]'],// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms:{'.zn-body__paragraph, .el__leafmedia--sourced-paragraph':functionznBody__paragraphEl__leafmediaSourcedParagraph($node){var$text=$node.html();if($text){return'p';}returnnull;},// this transform cleans the short, all-link sections linking
// to related content but not marked as such in any way.
'.zn-body__paragraph':functionznBody__paragraph($node){if($node.has('a')){if($node.text().trim()===$node.find('a').text().trim()){$node.remove();}}},'.media__video--thumbnail':'figure'},// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean:[]}};varWwwAolComExtractor={domain:'www.aol.com',title:{selectors:['h1.p-article__title']},author:{selectors:[['meta[name="author"]','value']]},date_published:{selectors:['.p-article__byline__date'],timezone:'America/New_York'},dek:{selectors:[// enter selectors
]},lead_image_url:{selectors:[['meta[name="og:image"]','value']]},content:{selectors:['.article-content'],// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms:{},// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// Matches an "http://" or "https://" scheme anywhere in a string,
// case-insensitively. cleanDek uses it to reject deks that contain
// plain-text links.
var TEXT_LINK_RE = new RegExp('http(s)?://', 'i');
// An ordered list of meta tag names that denote likely article deks.
@ -1626,24 +1685,26 @@ var TEXT_LINK_RE=new RegExp('http(s)?://','i');// An ordered list of meta tag na
// Should be more restrictive than not, as a failed dek can be pretty
// detrimental to the aesthetics of an article.
// CLEAN DATE PUBLISHED CONSTANTS
// A string made of exactly 13 digits (epoch milliseconds).
var MS_DATE_STRING = /^\d{13}$/i;
// A string made of exactly 10 digits (epoch seconds).
var SEC_DATE_STRING = /^\d{10}$/i;
// Captures whatever follows a leading "published" / "published:" label.
var CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i;
// Splits "10:30pm" style text so a space can be inserted before am/pm.
var TIME_MERIDIAN_SPACE_RE = /(.*\d)(am|pm)(.*)/i;
// Matches the ".m." in "a.m." / "p.m." so it can be collapsed to "m".
var TIME_MERIDIAN_DOTS_RE = /\.m\./i;
var months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'];
var allMonths = months.join('|');
var timestamp1 = '[0-9]{1,2}:[0-9]{2,2}( ?[ap].?m.?)?';
var timestamp2 = '[0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4}';
// Tokenizer for free-form date text: clock times, d/m/y style dates,
// bare 1-4 digit numbers and three-letter month abbreviations.
var SPLIT_DATE_STRING = new RegExp('(' + timestamp1 + ')|(' + timestamp2 + ')|([0-9]{1,4})|(' + allMonths + ')', 'ig');
// Check if datetime string has an offset at the end
// NOTE(review): this also matches any string that merely ends in "-ddd" or
// "-dddd" (e.g. "10-12-2016") - confirm that is acceptable for callers.
var TIME_WITH_OFFSET_RE = /-\d{3,4}$/;
// CLEAN TITLE CONSTANTS
// A regular expression that will match separating characters on a
// title, that usually denote breadcrumbs or something similar.
var TITLE_SPLITTERS_RE = /(: | - | \| )/g;
// Matches a trailing .com, .net, .org or .co.uk suffix. The dots are
// escaped so only a literal "." is accepted; unescaped, ".com$" would also
// match the tail of a word such as "telecom".
var DOMAIN_ENDINGS_RE = /\.com$|\.net$|\.org$|\.co\.uk$/g;
// Take an author string (like 'By David Smith ') and clean it to
// just the name(s): 'David Smith'.
function cleanAuthor(author) {
  // Keep only the second capture group of CLEAN_AUTHOR_RE, trim it, then
  // pass the result through normalizeSpaces$1.
  return normalizeSpaces$1(author.replace(CLEAN_AUTHOR_RE, '$2').trim());
}

// Trim a candidate lead image URL and return it only when
// validUrl$1.isWebUri accepts it; otherwise return null.
function clean$1(leadImageUrl) {
  leadImageUrl = leadImageUrl.trim();

  if (validUrl$1.isWebUri(leadImageUrl)) {
    return leadImageUrl;
  }

  return null;
}
// Take a dek HTML fragment, and return the cleaned version of it.
// Return None if the dek wasn't good enough.
function cleanDek(dek, _ref) {
  var $ = _ref.$,
      excerpt = _ref.excerpt;

  // Sanity check that we didn't get too short or long of a dek.
  if (dek.length > 1000 || dek.length < 5) return null;

  // Check that dek isn't the same as excerpt
  if (excerpt && excerptContent$1(excerpt, 10) === excerptContent$1(dek, 10)) return null;

  var dekText = stripTags$1(dek, $);

  // Plain text links shouldn't exist in the dek. If we have some, it's
  // not a good dek - bail.
  if (TEXT_LINK_RE.test(dekText)) return null;

  return normalizeSpaces$1(dekText.trim());
}
// Is there a compelling reason to use moment here?
// Mostly only being used for the isValid() method,
// but could just check for 'Invalid Date' string.
function cleanDateString(dateString) {
  // Keep only the date-like tokens, re-join them with single spaces, then
  // normalise "p.m." -> "pm", "10:30pm" -> "10:30 pm" and drop a leading
  // "published" label.
  return (dateString.match(SPLIT_DATE_STRING) || [])
    .join(' ')
    .replace(TIME_MERIDIAN_DOTS_RE, 'm')
    .replace(TIME_MERIDIAN_SPACE_RE, '$1 $2 $3')
    .replace(CLEAN_DATE_STRING_RE, '$1')
    .trim();
}

// Build a moment object from a date string.
// - Strings matching TIME_WITH_OFFSET_RE are handed to the native Date
//   parser and wrapped with moment.
// - Otherwise the string is parsed with the format returned by
//   parseFormat, through moment.tz when a timezone is given.
function createDate(dateString, timezone) {
  if (TIME_WITH_OFFSET_RE.test(dateString)) {
    return moment(new Date(dateString));
  }

  return timezone
    ? moment.tz(dateString, parseFormat(dateString), timezone)
    : moment(dateString, parseFormat(dateString));
}
// Take a date published string, and hopefully return a date out of
// it. Return none if we fail.
function cleanDatePublished(dateString) {
  var options = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {};
  var timezone = options.timezone;

  // If string is in milliseconds, convert to int and return
  if (MS_DATE_STRING.test(dateString)) {
    return new Date(parseInt(dateString, 10)).toISOString();
  }

  // Epoch seconds must be scaled to milliseconds before being handed to
  // Date; otherwise every 10-digit timestamp resolves to January 1970.
  if (SEC_DATE_STRING.test(dateString)) {
    return new Date(parseInt(dateString, 10) * 1000).toISOString();
  }

  var date = createDate(dateString, timezone);

  // Retry once with the tokenised/normalised form of the string.
  if (!date.isValid()) {
    dateString = cleanDateString(dateString);
    date = createDate(dateString, timezone);
  }

  return date.isValid() ? date.toISOString() : null;
}
// Clean our article content, returning a new, cleaned node.
functionextractCleanNode(article,_ref){var$=_ref.$,_ref$cleanConditional=_ref.cleanConditionally,cleanConditionally=_ref$cleanConditional===undefined?true:_ref$cleanConditional,_ref$title=_ref.title,title=_ref$title===undefined?'':_ref$title,_ref$url=_ref.url,url=_ref$url===undefined?'':_ref$url,_ref$defaultCleaner=_ref.defaultCleaner,defaultCleaner=_ref$defaultCleaner===undefined?true:_ref$defaultCleaner;// Rewrite the tag name to div if it's a top level node like body or
// html to avoid later complications with multiple body tags.
rewriteTopLevel$$1(article,$);// Drop small images and spacer images
@ -1671,7 +1732,7 @@ if(TITLE_SPLITTERS_RE.test(title)){title=resolveSplitTitle(title,url);}// Final
// if (title.length > 150 || title.length < 15) {
if(title.length>150){// If we did, return h1 from the document if it exists
varh1=$('h1');if(h1.length===1){title=h1.text();}}// strip any html tags in the title text
return stripTags$1(title,$).trim();}functionextractBreadcrumbTitle(splitTitle,text){// This must be a very breadcrumbed title, like:
returnnormalizeSpaces$1(stripTags$1(title,$).trim());}functionextractBreadcrumbTitle(splitTitle,text){// This must be a very breadcrumbed title, like:
// The Best Gadgets on Earth : Bits : Blogs : NYTimes.com
// NYTimes - Blogs - Bits - The Best Gadgets on Earth
if(splitTitle.length>=6){var_ret=function(){// Look to see if we can find a breadcrumb splitter that happens
// cleanConditionally: Clean the node to return of some
// superfluous content. Things like forms, ads, etc.
extract:functionextract(_ref,opts){var$=_ref.$,html=_ref.html,title=_ref.title,url=_ref.url,cheerio$$1=_ref.cheerio;opts=_extends$1({},this.defaultOpts,opts);$=$||cheerio$$1.load(html);// Cascade through our extraction-specific opts in an ordered fashion,
extract:functionextract(_ref,opts){var$=_ref.$,html=_ref.html,title=_ref.title,url=_ref.url;opts=_extends$1({},this.defaultOpts,opts);$=$||cheerio$1.load(html);// Cascade through our extraction-specific opts in an ordered fashion,
// turning them off as we try to extract content.
varnode=this.getContentNode($,title,url,opts);if(nodeIsSufficient$1(node)){returnthis.cleanAndReturnNode(node,$);}// We didn't succeed on first pass, one by one disable our
// extraction opts and try again.
var_iteratorNormalCompletion=true;var_didIteratorError=false;var_iteratorError=undefined;try{for(var_iterator=_getIterator$1(_Reflect$ownKeys$1(opts).filter(function(k){returnopts[k]===true;})),_step;!(_iteratorNormalCompletion=(_step=_iterator.next()).done);_iteratorNormalCompletion=true){varkey=_step.value;opts[key]=false;$=cheerio$$1.load(html);node=this.getContentNode($,title,url,opts);if(nodeIsSufficient$1(node)){break;}}}catch(err){_didIteratorError=true;_iteratorError=err;}finally{try{if(!_iteratorNormalCompletion&&_iterator.return){_iterator.return();}}finally{if(_didIteratorError){throw_iteratorError;}}}returnthis.cleanAndReturnNode(node,$);},// Get node given current options
var_iteratorNormalCompletion=true;var_didIteratorError=false;var_iteratorError=undefined;try{for(var_iterator=_getIterator$1(_Reflect$ownKeys$1(opts).filter(function(k){returnopts[k]===true;})),_step;!(_iteratorNormalCompletion=(_step=_iterator.next()).done);_iteratorNormalCompletion=true){varkey=_step.value;opts[key]=false;$=cheerio$1.load(html);node=this.getContentNode($,title,url,opts);if(nodeIsSufficient$1(node)){break;}}}catch(err){_didIteratorError=true;_iteratorError=err;}finally{try{if(!_iteratorNormalCompletion&&_iterator.return){_iterator.return();}}finally{if(_didIteratorError){throw_iteratorError;}}}returnthis.cleanAndReturnNode(node,$);},// Get node given current options
getContentNode:functiongetContentNode($,title,url,opts){returnextractCleanNode(extractBestNode($,opts),{$:$,cleanConditionally:opts.cleanConditionally,title:title,url:url});},// Once we got here, either we're at our last-resort node, or
// we broke early. Make sure we at least have -something- before we
// move forward.
@ -1854,7 +1915,7 @@ function scoreAttr($img){if($img.attr('alt')){return 5;}return 0;}// Look throug
// container elements, give a bonus if we find them
function scoreByParents($img) {
  var score = 0;
  var $figParent = $img.parents('figure').first();

  // +25 when the image sits somewhere inside a <figure>
  if ($figParent.length === 1) {
    score += 25;
  }

  var $parent = $img.parent();
  var $gParent = void 0;

  if ($parent.length === 1) {
    $gParent = $parent.parent();
  }

  // +15 for each of parent / grandparent whose signature matches the photo
  // hints. $gParent stays undefined when the image has no single parent,
  // so it must be skipped rather than passed to getSig.
  [$parent, $gParent].forEach(function ($node) {
    if ($node && PHOTO_HINTS_RE$1$1.test(getSig($node))) {
      score += 15;
    }
  });

  return score;
}
// Look at our immediate sibling and see if it looks like it's a
// caption. Bonus if so.
function scoreBySibling($img) {
  var score = 0;
  var $sibling = $img.next();
  var sibling = $sibling.get(0);

  // Tag names are compared case-insensitively.
  if (sibling && sibling.tagName.toLowerCase() === 'figcaption') {
    score += 25;
  }

  if (PHOTO_HINTS_RE$1$1.test(getSig($sibling))) {
    score += 15;
  }

  return score;
}

// Score an image from its width/height attributes: thin or short images
// are penalised, larger areas earn a bonus proportional to their size.
function scoreByDimensions($img) {
  var score = 0;
  var width = parseFloat($img.attr('width'));
  var height = parseFloat($img.attr('height'));
  var src = $img.attr('src');
  // An image without a src attribute cannot be a sprite; guarding here
  // avoids calling .includes on undefined.
  var isSprite = typeof src === 'string' && src.includes('sprite');

  // Penalty for skinny images
  if (width && width <= 50) {
    score -= 50;
  }

  // Penalty for short images
  if (height && height <= 50) {
    score -= 50;
  }

  if (width && height && !isSprite) {
    var area = width * height;

    if (area < 5000) {
      // Smaller than 50 x 100
      score -= 100;
    } else {
      score += Math.round(area / 1000);
    }
  }

  return score;
}

// Earlier images score higher: half the image count minus the index.
function scoreByPosition($imgs, index) {
  return $imgs.length / 2 - index;
}
// Given a resource, try to find the lead image URL from within
varscoredPages=links.reduce(function(possiblePages,link){// Remove any anchor data since we don't do a good job
// standardizing URLs (it's hard), we're going to do
// some checking with and without a trailing slash
varattrs=getAttrs$1(link);varhref=removeAnchor$1(attrs.href);var$link=$(link);varlinkText=$link.text();if(!shouldScore(href,articleUrl,baseUrl,parsedUrl,linkText,previousUrls)){returnpossiblePages;}// ## PASSED THE FIRST-PASS TESTS. Start scoring. ##
if(!possiblePages[href]){possiblePages[href]={score:0,linkText:linkText,href:href};}else{possiblePages[href].linkText=possiblePages[href].linkText+'|'+linkText;}varpossiblePage=possiblePages[href];varlinkData=makeSig($link,linkText);varpageNum=pageNumFromUrl$1(href);varscore=scoreBaseUrl(href,baseRegex);score+=scoreNextLinkText(linkData);score+=scoreCapLinks(linkData);score+=scorePrevLink(linkData);score+=scoreByParents$1($link);score+=scoreExtraneousLinks(href);score+=scorePageInLink(pageNum,isWp);score+=scoreLinkText(linkText,pageNum);score+=scoreSimilarity(score,articleUrl,href);possiblePage.score=score;returnpossiblePages;},{});return_Reflect$ownKeys$1(scoredPages).length===0?null:scoredPages;}/* eslint-disable */// Looks for and returns next page url
varattrs=getAttrs$1(link);// if href is undefined, return
if(!attrs.href)returnpossiblePages;varhref=removeAnchor$1(attrs.href);var$link=$(link);varlinkText=$link.text();if(!shouldScore(href,articleUrl,baseUrl,parsedUrl,linkText,previousUrls)){returnpossiblePages;}// ## PASSED THE FIRST-PASS TESTS. Start scoring. ##
if(!possiblePages[href]){possiblePages[href]={score:0,linkText:linkText,href:href};}else{possiblePages[href].linkText=possiblePages[href].linkText+'|'+linkText;}varpossiblePage=possiblePages[href];varlinkData=makeSig($link,linkText);varpageNum=pageNumFromUrl$1(href);varscore=scoreBaseUrl(href,baseRegex);score+=scoreNextLinkText(linkData);score+=scoreCapLinks(linkData);score+=scorePrevLink(linkData);score+=scoreByParents$1($link);score+=scoreExtraneousLinks(href);score+=scorePageInLink(pageNum,isWp);score+=scoreLinkText(linkText,pageNum);score+=scoreSimilarity(score,articleUrl,href);possiblePage.score=score;returnpossiblePages;},{});return_Reflect$ownKeys$1(scoredPages).length===0?null:scoredPages;}// Looks for and returns next page url
// for multi-page articles
varGenericNextPageUrlExtractor={extract:functionextract(_ref){var$=_ref.$,url=_ref.url,parsedUrl=_ref.parsedUrl,_ref$previousUrls=_ref.previousUrls,previousUrls=_ref$previousUrls===undefined?[]:_ref$previousUrls;parsedUrl=parsedUrl||URL$1.parse(url);vararticleUrl=removeAnchor$1(url);varbaseUrl=articleBaseUrl$1(url,parsedUrl);varlinks=$('a[href]').toArray();varscoredLinks=scoreLinks({links:links,articleUrl:articleUrl,baseUrl:baseUrl,parsedUrl:parsedUrl,$:$,previousUrls:previousUrls});// If no links were scored, return null
if(!scoredLinks)returnnull;// now that we've scored all possible pages,
@ -2112,37 +2174,48 @@ var topPage=_Reflect$ownKeys$1(scoredLinks).reduce(function(acc,link){var scored
// so we fail.
if(topPage.score>=50){returntopPage.href;}returnnull;}};varCANONICAL_META_SELECTORS=['og:url'];functionparseDomain(url){varparsedUrl=URL$1.parse(url);varhostname=parsedUrl.hostname;returnhostname;}functionresult(url){return{url:url,domain:parseDomain(url)};}varGenericUrlExtractor={extract:functionextract(_ref){var$=_ref.$,url=_ref.url,metaCache=_ref.metaCache;var$canonical=$('link[rel=canonical]');if($canonical.length!==0){varhref=$canonical.attr('href');if(href){returnresult(href);}}varmetaUrl=extractFromMeta$$1($,CANONICAL_META_SELECTORS,metaCache);if(metaUrl){returnresult(metaUrl);}returnresult(url);}};varEXCERPT_META_SELECTORS=['og:description','twitter:description'];functionclean$2(content,$){varmaxLength=arguments.length>2&&arguments[2]!==undefined?arguments[2]:200;content=content.replace(/[\s\n]+/g,' ').trim();returnellipsize$1(content,maxLength,{ellipse:'…'});}varGenericExcerptExtractor={extract:functionextract(_ref){var$=_ref.$,content=_ref.content,metaCache=_ref.metaCache;varexcerpt=extractFromMeta$$1($,EXCERPT_META_SELECTORS,metaCache);if(excerpt){returnclean$2(stripTags$1(excerpt,$));}// Fall back to excerpting from the extracted content
varmaxLength=200;varshortContent=content.slice(0,maxLength*5);returnclean$2($(shortContent).text(),$,maxLength);}};varGenericWordCountExtractor={extract:functionextract(_ref){varcontent=_ref.content;var$=cheerio$1.load(content);var$content=$('div').first();vartext=normalizeSpaces$1($content.text());returntext.split(/\s/).length;}};varGenericExtractor={// This extractor is the default for all domains
domain:'*',title:GenericTitleExtractor.extract,date_published:GenericDatePublishedExtractor.extract,author:GenericAuthorExtractor.extract,content:GenericContentExtractor.extract.bind(GenericContentExtractor),lead_image_url:GenericLeadImageUrlExtractor.extract,dek:GenericDekExtractor.extract,next_page_url:GenericNextPageUrlExtractor.extract,url_and_domain:GenericUrlExtractor.extract,excerpt:GenericExcerptExtractor.extract,word_count:GenericWordCountExtractor.extract,direction:functiondirection(_ref){vartitle=_ref.title;returnstringDirection$1.getDirection(title);},extract:functionextract(options){varhtml=options.html,cheerio$$1=options.cheerio,$=options.$;if(html&&!$){varloaded=cheerio$$1.load(html);options.$=loaded;}vartitle=this.title(options);vardate_published=this.date_published(options);varauthor=this.author(options);varcontent=this.content(_extends$1({},options,{title:title}));varlead_image_url=this.lead_image_url(_extends$1({},options,{content:content}));vardek=this.dek(_extends$1({},options,{content:content}));varnext_page_url=this.next_page_url(options);varexcerpt=this.excerpt(_extends$1({},options,{content:content}));varword_count=this.word_count(_extends$1({},options,{content:content}));vardirection=this.direction({title:title});var_url_and_domain=this.url_and_domain(options),url=_url_and_domain.url,domain=_url_and_domain.domain;return{title:title,author:author,date_published:date_published||null,dek:dek,lead_image_url:lead_image_url,content:content,next_page_url:next_page_url,url:url,domain:domain,excerpt:excerpt,word_count:word_count,direction:direction};}};functiongetExtractor(url,parsedUrl){parsedUrl=parsedUrl||URL$1.parse(url);var_parsedUrl=parsedUrl,hostname=_parsedUrl.hostname;varbaseDomain=hostname.split('.').slice(-2).join('.');returnExtractors[hostname]||Extractors[baseDomain]||GenericExtractor;}/* eslint-disable */// Remove elements by an array of selectors
domain:'*',title:GenericTitleExtractor.extract,date_published:GenericDatePublishedExtractor.extract,author:GenericAuthorExtractor.extract,content:GenericContentExtractor.extract.bind(GenericContentExtractor),lead_image_url:GenericLeadImageUrlExtractor.extract,dek:GenericDekExtractor.extract,next_page_url:GenericNextPageUrlExtractor.extract,url_and_domain:GenericUrlExtractor.extract,excerpt:GenericExcerptExtractor.extract,word_count:GenericWordCountExtractor.extract,direction:functiondirection(_ref){vartitle=_ref.title;returnstringDirection$1.getDirection(title);},extract:functionextract(options){varhtml=options.html,$=options.$;if(html&&!$){varloaded=cheerio$1.load(html);options.$=loaded;}vartitle=this.title(options);vardate_published=this.date_published(options);varauthor=this.author(options);varcontent=this.content(_extends$1({},options,{title:title}));varlead_image_url=this.lead_image_url(_extends$1({},options,{content:content}));vardek=this.dek(_extends$1({},options,{content:content}));varnext_page_url=this.next_page_url(options);varexcerpt=this.excerpt(_extends$1({},options,{content:content}));varword_count=this.word_count(_extends$1({},options,{content:content}));vardirection=this.direction({title:title});var_url_and_domain=this.url_and_domain(options),url=_url_and_domain.url,domain=_url_and_domain.domain;return{title:title,author:author,date_published:date_published||null,dek:dek,lead_image_url:lead_image_url,content:content,next_page_url:next_page_url,url:url,domain:domain,excerpt:excerpt,word_count:word_count,direction:direction};}};functiongetExtractor(url,parsedUrl){parsedUrl=parsedUrl||URL$1.parse(url);var_parsedUrl=parsedUrl,hostname=_parsedUrl.hostname;varbaseDomain=hostname.split('.').slice(-2).join('.');returnExtractors[hostname]||Extractors[baseDomain]||GenericExtractor;}// Remove elements by an array of selectors
function cleanBySelectors($content, $, _ref) {
  var clean = _ref.clean;

  // Nothing to remove: no clean list, or an empty one (which would
  // otherwise produce an empty selector string).
  if (!clean || clean.length === 0) return $content;

  // Remove every descendant of $content matching any of the selectors.
  $(clean.join(','), $content).remove();

  return $content;
}
// Transform matching elements
functiontransformElements($content,$,_ref2){vartransforms=_ref2.transforms;if(!transforms)return$content;_Reflect$ownKeys$1(transforms).forEach(function(key){var$matches=$(key,$content);varvalue=transforms[key];// If value is a string, convert directly
if(typeofvalue==='string'){$matches.each(function(index,node){convertNodeTo$$1($(node),$,transforms[key]);});}elseif(typeofvalue==='function'){// If value is function, apply function to node
$matches.each(function(index,node){varresult=value($(node),$);// If function returns a string, convert node to that value
return$(selector).length===1&&$(selector).text().trim()!=='';});}functionselect(opts){var$=opts.$,type=opts.type,extractionOpts=opts.extractionOpts,_opts$extractHtml=opts.extractHtml,extractHtml=_opts$extractHtml===undefined?false:_opts$extractHtml;// Skip if there's not extraction for this type
if(typeofresult==='string'){convertNodeTo$$1($(node),$,result);}});}});return$content;}functionfindMatchingSelector($,selectors,extractHtml){returnselectors.find(function(selector){if(Array.isArray(selector)){if(extractHtml){returnselector.reduce(function(acc,s){returnacc&&$(s).length>0;},true);}var_selector=_slicedToArray$1(selector,2),s=_selector[0],attr=_selector[1];return$(s).length===1&&$(s).attr(attr)&&$(s).attr(attr).trim()!=='';}return$(selector).length===1&&$(selector).text().trim()!=='';});}functionselect(opts){var$=opts.$,type=opts.type,extractionOpts=opts.extractionOpts,_opts$extractHtml=opts.extractHtml,extractHtml=_opts$extractHtml===undefined?false:_opts$extractHtml;// Skip if there's not extraction for this type
if(!extractionOpts)returnnull;// If a string is hardcoded for a type (e.g., Wikipedia
// contributors), return the string
if(typeofextractionOpts==='string')returnextractionOpts;varselectors=extractionOpts.selectors,_extractionOpts$defau=extractionOpts.defaultCleaner,defaultCleaner=_extractionOpts$defau===undefined?true:_extractionOpts$defau;varmatchingSelector=findMatchingSelector($,selectors);if(!matchingSelector)returnnull;// Declaring result; will contain either
if(typeofextractionOpts==='string')returnextractionOpts;varselectors=extractionOpts.selectors,_extractionOpts$defau=extractionOpts.defaultCleaner,defaultCleaner=_extractionOpts$defau===undefined?true:_extractionOpts$defau;varmatchingSelector=findMatchingSelector($,selectors,extractHtml);if(!matchingSelector)returnnull;// Declaring result; will contain either
// text or html, which will be cleaned
// by the appropriate cleaner type
// If the selector type requests html as its return type
// transform and clean the element with provided selectors
if(extractHtml){var$content=$(matchingSelector);// Wrap in div so transformation can take place on root element
var$content=void0;if(extractHtml){// If matching selector is an array, we're considering this a
// multi-match selection, which allows the parser to choose several
// selectors to include in the result. Note that all selectors in the
// array must match in order for this selector to trigger
if(Array.isArray(matchingSelector)){(function(){$content=$(matchingSelector.join(','));var$wrapper=$('<div></div>');$content.each(function(index,element){$wrapper.append(element);});$content=$wrapper;})();}else{$content=$(matchingSelector);}// Wrap in div so transformation can take place on root element
$content.wrap($('<div></div>'));$content=$content.parent();$content=transformElements($content,$,extractionOpts);$content=cleanBySelectors($content,$,extractionOpts);$content=Cleaners[type]($content,_extends$1({},opts,{defaultCleaner:defaultCleaner}));return$.html($content);}varresult=void0;// if selector is an array (e.g., ['img', 'src']),
// extract the attr
if(Array.isArray(matchingSelector)){var_matchingSelector=_slicedToArray$1(matchingSelector,2),selector=_matchingSelector[0],attr=_matchingSelector[1];result=$(selector).attr(attr).trim();}else{result=$(matchingSelector).text().trim();}// Allow custom extractor to skip default cleaner
// for this type; defaults to true
if(defaultCleaner){returnCleaners[type](result,opts);}returnresult;}functionextractResult(opts){vartype=opts.type,extractor=opts.extractor,_opts$fallback=opts.fallback,fallback=_opts$fallback===undefined?true:_opts$fallback;varresult=select(_extends$1({},opts,{extractionOpts:extractor[type]}));// If custom parser succeeds, return the result
if(defaultCleaner){returnCleaners[type](result,_extends$1({},opts,extractionOpts));}returnresult;}functionextractResult(opts){vartype=opts.type,extractor=opts.extractor,_opts$fallback=opts.fallback,fallback=_opts$fallback===undefined?true:_opts$fallback;varresult=select(_extends$1({},opts,{extractionOpts:extractor[type]}));// If custom parser succeeds, return the result
if(result){returnresult;}// If nothing matches the selector, and fallback is enabled,
// run the Generic extraction
if(fallback)returnGenericExtractor[type](opts);returnnull;}varRootExtractor={extract:functionextract(){varextractor=arguments.length>0&&arguments[0]!==undefined?arguments[0]:GenericExtractor;varopts=arguments[1];var_opts=opts,contentOnly=_opts.contentOnly,extractedTitle=_opts.extractedTitle;// This is the generic extractor. Run its extract method
if(extractor.domain==='*')returnextractor.extract(opts);opts=_extends$1({},opts,{extractor:extractor});if(contentOnly){var_content=extractResult(_extends$1({},opts,{type:'content',extractHtml:true,title:extractedTitle}));return{content:_content};}vartitle=extractResult(_extends$1({},opts,{type:'title'}));vardate_published=extractResult(_extends$1({},opts,{type:'date_published'}));varauthor=extractResult(_extends$1({},opts,{type:'author'}));varnext_page_url=extractResult(_extends$1({},opts,{type:'next_page_url'}));varcontent=extractResult(_extends$1({},opts,{type:'content',extractHtml:true,title:title}));varlead_image_url=extractResult(_extends$1({},opts,{type:'lead_image_url',content:content}));varexcerpt=extractResult(_extends$1({},opts,{type:'excerpt',content:content}));vardek=extractResult(_extends$1({},opts,{type:'dek',content:content,excerpt:excerpt}));varword_count=extractResult(_extends$1({},opts,{type:'word_count',content:content}));vardirection=extractResult(_extends$1({},opts,{type:'direction',title:title}));var_ref3=extractResult(_extends$1({},opts,{type:'url_and_domain'}))||{url:null,domain:null},url=_ref3.url,domain=_ref3.domain;return{title:title,content:content,author:author,date_published:date_published,lead_image_url:lead_image_url,dek:dek,next_page_url:next_page_url,url:url,domain:domain,excerpt:excerpt,word_count:word_count,direction:direction};}};varcollectAllPages=function(){var_ref=_asyncToGenerator(_regeneratorRuntime.mark(function_callee(_ref2){varnext_page_url=_ref2.next_page_url,html=_ref2.html,$=_ref2.$,metaCache=_ref2.metaCache,result=_ref2.result,Extractor=_ref2.Extractor,title=_ref2.title,url=_ref2.url,cheerio$$1=_ref2.cheerio;varpages,previousUrls,extractorOpts,nextPageResult,word_count;return_regeneratorRuntime.wrap(function_callee$(_context){while(1){switch(_context.prev=_context.next){case0:// At this point, we've fetched just the first page
if(extractor.domain==='*')returnextractor.extract(opts);opts=_extends$1({},opts,{extractor:extractor});if(contentOnly){var_content=extractResult(_extends$1({},opts,{type:'content',extractHtml:true,title:extractedTitle}));return{content:_content};}vartitle=extractResult(_extends$1({},opts,{type:'title'}));vardate_published=extractResult(_extends$1({},opts,{type:'date_published'}));varauthor=extractResult(_extends$1({},opts,{type:'author'}));varnext_page_url=extractResult(_extends$1({},opts,{type:'next_page_url'}));varcontent=extractResult(_extends$1({},opts,{type:'content',extractHtml:true,title:title}));varlead_image_url=extractResult(_extends$1({},opts,{type:'lead_image_url',content:content}));varexcerpt=extractResult(_extends$1({},opts,{type:'excerpt',content:content}));vardek=extractResult(_extends$1({},opts,{type:'dek',content:content,excerpt:excerpt}));varword_count=extractResult(_extends$1({},opts,{type:'word_count',content:content}));vardirection=extractResult(_extends$1({},opts,{type:'direction',title:title}));var_ref3=extractResult(_extends$1({},opts,{type:'url_and_domain'}))||{url:null,domain:null},url=_ref3.url,domain=_ref3.domain;return{title:title,content:content,author:author,date_published:date_published,lead_image_url:lead_image_url,dek:dek,next_page_url:next_page_url,url:url,domain:domain,excerpt:excerpt,word_count:word_count,direction:direction};}};varcollectAllPages=function(){var_ref=_asyncToGenerator(_regeneratorRuntime.mark(function_callee(_ref2){varnext_page_url=_ref2.next_page_url,html=_ref2.html,$=_ref2.$,metaCache=_ref2.metaCache,result=_ref2.result,Extractor=_ref2.Extractor,title=_ref2.title,url=_ref2.url;varpages,previousUrls,extractorOpts,nextPageResult,word_count;return_regeneratorRuntime.wrap(function_callee$(_context){while(1){switch(_context.prev=_context.next){case0:// At this point, we've fetched just the first page
pages=1;previousUrls=[removeAnchor$1(url)];// If we've gone over 26 pages, something has
_context.next=7;returnResource.create(url,html,parsedUrl);case7:$=_context.sent;if(!$.error){_context.next=10;break;}return_context.abrupt('return',$);case10:html=$.html();// Cached value of every meta name in our document.
case2:if(!(next_page_url&&pages<26)){_context.next=15;break;}pages+=1;_context.next=6;returnResource.create(next_page_url);case6:$=_context.sent;html=$.html();extractorOpts={url:next_page_url,html:html,$:$,metaCache:metaCache,contentOnly:true,extractedTitle:title,previousUrls:previousUrls};nextPageResult=RootExtractor.extract(Extractor,extractorOpts);previousUrls.push(next_page_url);result=_extends$1({},result,{content:result.content+'<hr><h4>Page '+pages+'</h4>'+nextPageResult.content});next_page_url=nextPageResult.next_page_url;_context.next=2;break;case15:word_count=GenericExtractor.word_count({content:'<div>'+result.content+'</div>'});return_context.abrupt('return',_extends$1({},result,{total_pages:pages,pages_rendered:pages,word_count:word_count}));case17:case'end':return_context.stop();}}},_callee,this);}));functioncollectAllPages(_x){return_ref.apply(this,arguments);}returncollectAllPages;}();varMercury={parse:functionparse(url,html){var_this=this;varopts=arguments.length>2&&arguments[2]!==undefined?arguments[2]:{};return_asyncToGenerator(_regeneratorRuntime.mark(function_callee(){var_opts$fetchAllPages,fetchAllPages,_opts$fallback,fallback,parsedUrl,Extractor,$,metaCache,result,_result,title,next_page_url;return_regeneratorRuntime.wrap(function_callee$(_context){while(1){switch(_context.prev=_context.next){case0:_opts$fetchAllPages=opts.fetchAllPages,fetchAllPages=_opts$fetchAllPages===undefined?true:_opts$fetchAllPages,_opts$fallback=opts.fallback,fallback=_opts$fallback===undefined?true:_opts$fallback;// if no url was passed and this is the browser version,
// set url to window.location.href and load the html
html=html||cheerio$1.html();}parsedUrl=URL$1.parse(url);if(validateUrl(parsedUrl)){_context.next=5;break;}return_context.abrupt('return',Errors.badUrl);case5:Extractor=getExtractor(url,parsedUrl);// console.log(`Using extractor for ${Extractor.domain}`);
_context.next=8;returnResource.create(url,html,parsedUrl);case8:$=_context.sent;if(!$.failed){_context.next=11;break;}return_context.abrupt('return',$);case11:// if html still has not been set (i.e., url passed to Mercury.parse),
// set html from the response of Resource.create
if(!html){html=$.html();}// Cached value of every meta name in our document.
// Used when extracting title/author/date_published/dek
metaCache=$('meta').map(function(_,node){return$(node).attr('name');}).toArray();result=RootExtractor.extract(Extractor,{url:url,html:html,$:$,metaCache:metaCache,parsedUrl:parsedUrl,fallback:fallback,cheerio:cheerio$1});_result=result,title=_result.title,next_page_url=_result.next_page_url;// Fetch more pages if next_page_url found
if(!(fetchAllPages&&next_page_url)){_context.next=20;break;}_context.next=17;returncollectAllPages({Extractor:Extractor,next_page_url:next_page_url,html:html,$:$,metaCache:metaCache,result:result,title:title,url:url,cheerio:cheerio$1});case17:result=_context.sent;_context.next=21;break;case20:result=_extends$1({},result,{total_pages:1,rendered_pages:1});case21:return_context.abrupt('return',result);case22:case'end':return_context.stop();}}},_callee,_this);}))();},// A convenience method for getting a resource
metaCache=$('meta').map(function(_,node){return$(node).attr('name');}).toArray();result=RootExtractor.extract(Extractor,{url:url,html:html,$:$,metaCache:metaCache,parsedUrl:parsedUrl,fallback:fallback});_result=result,title=_result.title,next_page_url=_result.next_page_url;// Fetch more pages if next_page_url found
if(!(fetchAllPages&&next_page_url)){_context.next=21;break;}_context.next=18;returncollectAllPages({Extractor:Extractor,next_page_url:next_page_url,html:html,$:$,metaCache:metaCache,result:result,title:title,url:url});case18:result=_context.sent;_context.next=22;break;case21:result=_extends$1({},result,{total_pages:1,rendered_pages:1});case22:// if this parse is happening in the browser,
// clean up any trace from the page.
if(cheerio$1.browser){cheerio$1.cleanup();}return_context.abrupt('return',result);case24:case'end':return_context.stop();}}},_callee,_this);}))();},browser:!!cheerio$1.browser,// A convenience method for getting a resource
// to work with, e.g., for custom extractor generator
@ -2206,23 +2279,23 @@ var extractorTemplate = function (hostname, name) {
returntemplate(_templateObject,name,hostname);
};
var_templateObject$1=_taggedTemplateLiteral(['\n it(\'returns the ','\', async () => {\n // To pass this test, fill out the ',' selector\n // in ','/index.js.\n const html =\n fs.readFileSync(\'','\');\n const articleUrl =\n \'','\';\n\n const { ',' } =\n await Mercury.parse(articleUrl, html, { fallback: false });\n\n // Update these values with the expected values from\n // the article.\n assert.equal(',', ',')\n });\n '],['\n it(\'returns the ','\', async () => {\n // To pass this test, fill out the ',' selector\n // in ','/index.js.\n const html =\n fs.readFileSync(\'','\');\n const articleUrl =\n \'','\';\n\n const { ',' } =\n await Mercury.parse(articleUrl, html, { fallback: false });\n\n // Update these values with the expected values from\n // the article.\n assert.equal(',', ',')\n });\n ']);
var_templateObject2=_taggedTemplateLiteral(['\n import assert from \'assert\';\n import fs from \'fs\';\n import URL from \'url\';\n import cheerio from \'cheerio\';\n\n import Mercury from \'mercury\';\n import getExtractor from \'extractors/get-extractor\';\n import { excerptContent } from \'utils/text\';\n\n describe(\'','\', () => {\n it(\'is selected properly\', () => {\n // This test should be passing by default.\n // It sanity checks that the correct parser\n // is being selected for URLs from this domain\n const url =\n \'','\';\n const extractor = getExtractor(url);\n assert.equal(extractor.domain, URL.parse(url).hostname)\n })\n\n ','\n\n it(\'returns the content\', async () => {\n // To pass this test, fill out the content selector\n // in ','/index.js.\n // You may also want to make use of the clean and transform\n // options.\n const html =\n fs.readFileSync(\'','\');\n const url =\n \'','\';\n\n const { content } =\n await Mercury.parse(url, html, { fallback: false });\n\n const $ = cheerio.load(content || \'\');\n\n const first13 = excerptContent($(\'*\').first().text(), 13)\n\n // Update these values with the expected values from\n // the article.\n assert.equal(first13, \'Add the first 13 words of the article here\');\n });\n });\n '],['\n import assert from \'assert\';\n import fs from \'fs\';\n import URL from \'url\';\n import cheerio from \'cheerio\';\n\n import Mercury from \'mercury\';\n import getExtractor from \'extractors/get-extractor\';\n import { excerptContent } from \'utils/text\';\n\n describe(\'','\', () => {\n it(\'is selected properly\', () => {\n // This test should be passing by default.\n // It sanity checks that the correct parser\n // is being selected for URLs from this domain\n const url =\n \'','\';\n const extractor = getExtractor(url);\n assert.equal(extractor.domain, URL.parse(url).hostname)\n })\n\n ','\n\n it(\'returns the content\', async () => {\n // To pass this test, fill out the content selector\n // in 
','/index.js.\n // You may also want to make use of the clean and transform\n // options.\n const html =\n fs.readFileSync(\'','\');\n const url =\n \'','\';\n\n const { content } =\n await Mercury.parse(url, html, { fallback: false });\n\n const $ = cheerio.load(content || \'\');\n\n const first13 = excerptContent($(\'*\').first().text(), 13)\n\n // Update these values with the expected values from\n // the article.\n assert.equal(first13, \'Add the first 13 words of the article here\');\n });\n });\n ']);
var_templateObject$1=_taggedTemplateLiteral(['\n it(\'returns the ','\', async () => {\n // To pass this test, fill out the ',' selector\n // in ','/index.js.\n const { ',' } = await result\n\n // Update these values with the expected values from\n // the article.\n assert.equal(',', ',')\n });\n '],['\n it(\'returns the ','\', async () => {\n // To pass this test, fill out the ',' selector\n // in ','/index.js.\n const { ',' } = await result\n\n // Update these values with the expected values from\n // the article.\n assert.equal(',', ',')\n });\n ']);
var_templateObject2=_taggedTemplateLiteral(['\n import assert from \'assert\';\n import fs from \'fs\';\n import URL from \'url\';\n import cheerio from \'cheerio\';\n\n import Mercury from \'mercury\';\n import getExtractor from \'extractors/get-extractor\';\n import { excerptContent } from \'utils/text\';\n\n describe(\'','\', () => {\n describe(\'initial test case\', () => {\n let result;\n let url;\n beforeAll(() => {\n url =\n \'','\';\n const html =\n fs.readFileSync(\'','\');\n result =\n Mercury.parse(url, html, { fallback: false });\n });\n\n it(\'is selected properly\', () => {\n// This test should be passing by default.\n // It sanity checks that the correct parser\n // is being selected for URLs from this domain\n const extractor = getExtractor(url);\n assert.equal(extractor.domain, URL.parse(url).hostname)\n })\n\n','\n\n it(\'returns the content\', async () => {\n // To pass this test, fill out the content selector\n // in ','/index.js.\n // You may also want to make use of the clean and transform\n // options.\n const { content } = await result;\n\n const $ = cheerio.load(content || \'\');\n\n const first13 = excerptContent($(\'*\').first().text(), 13)\n\n // Update these values with the expected values from\n // the article.\n assert.equal(first13, \'Add the first 13 words of the article here\');\n });\n });\n });\n '],['\n import assert from \'assert\';\n import fs from \'fs\';\n import URL from \'url\';\n import cheerio from \'cheerio\';\n\n import Mercury from \'mercury\';\n import getExtractor from \'extractors/get-extractor\';\n import { excerptContent } from \'utils/text\';\n\n describe(\'','\', () => {\n describe(\'initial test case\', () => {\n let result;\n let url;\n beforeAll(() => {\n url =\n \'','\';\n const html =\n fs.readFileSync(\'','\');\n result =\n Mercury.parse(url, html, { fallback: false });\n });\n\n it(\'is selected properly\', () => {\n// This test should be passing by default.\n // It sanity checks that the correct 
parser\n // is being selected for URLs from this domain\n const extractor = getExtractor(url);\n assert.equal(extractor.domain, URL.parse(url).hostname)\n })\n\n','\n\n it(\'returns the content\', async () => {\n // To pass this test, fill out the content selector\n // in ','/index.js.\n // You may also want to make use of the clean and transform\n // options.\n const { content } = await result;\n\n const $ = cheerio.load(content || \'\');\n\n const first13 = excerptContent($(\'*\').first().text(), 13)\n\n // Update these values with the expected values from\n // the article.\n assert.equal(first13, \'Add the first 13 words of the article here\');\n });\n });\n });\n ']);