@ -1264,6 +1264,7 @@ function absolutizeSet($, rootUrl, $content) {
// descriptors can only contain positive numbers followed immediately by either 'w' or 'x'
// space characters inside the URL should be encoded (%20 or +)
var candidates = urlSet . match ( /(?:\s*)(\S+(?:\s*[\d.]+[wx])?)(?:\s*,\s*)?/g ) ;
if ( ! candidates ) return ;
var absoluteCandidates = candidates . map ( function ( candidate ) {
// a candidate URL cannot start or end with a comma
// descriptors are separated from the URLs by unescaped whitespace
@ -1529,7 +1530,7 @@ function setAttrs(node, attrs) {
// Matches any string containing an http:// or https:// scheme (case-insensitive).
var IS_LINK = new RegExp('https?://', 'i');
// Image file extensions.
// NOTE(review): the leading '.' is an unescaped regex wildcard, so it matches
// any character in front of the extension, not only a literal dot - confirm
// whether that is intended before tightening it.
var IMAGE_RE = '.(png|gif|jpe?g)';
var IS_IMAGE = new RegExp("".concat(IMAGE_RE), 'i');
// An image reference, optionally followed by a query string, followed by a
// srcset descriptor: a number immediately followed by 'w' or 'x'.
var IS_SRCSET = new RegExp("".concat(IMAGE_RE, "(\\?\\S+)?(\\s*[\\d.]+[wx])"), 'i');
// Comma-joined selector list built from the tag names.
var TAGS_TO_REMOVE = ['script', 'style', 'form'].join(',');
// lazy loaded images into normal images.
@ -1934,30 +1935,31 @@ var TheAtlanticExtractor = {
var NewYorkerExtractor = {
domain : 'www.newyorker.com' ,
title : {
selectors : [ 'h1 .title' ]
selectors : [ 'h1 [class^="ArticleHeader__hed"]', [ 'meta[name="og:title"]' , 'value' ] ]
} ,
author : {
selectors : [ ' .contributors ']
selectors : [ ' div[class^="ArticleContributors"] a[rel="author"]', 'article header div[class*="Byline__multipleContributors"] ']
} ,
content : {
selectors : [ ' div#articleBody', 'div.articleBody '] ,
selectors : [ ' main[class^="Layout__content"] '] ,
// Is there anything in the content you selected that needs to be transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms : [ ] ,
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean : [ ]
clean : [ 'footer[class^="ArticleFooter__footer"]' ]
} ,
date _published : {
selectors : [ [ 'meta[name="article:published_time"]' , 'value' ] , [ 'time[itemProp="datePublished"]' , 'content' ] ] ,
selectors : [ [ 'meta[name="pubdate"]' , 'value' ] ] ,
format : 'YYYYMMDD' ,
timezone : 'America/New_York'
} ,
lead _image _url : {
selectors : [ [ 'meta[name="og:image"]' , 'value' ] ]
} ,
dek : {
selectors : [ ' .dek', 'h2.dek ']
selectors : [ ' h2[class^="ArticleHeader__dek"] ']
} ,
next _page _url : null ,
excerpt : null
@ -4760,6 +4762,30 @@ var NewsMynaviJpExtractor = {
}
} ;
// Custom extractor for clinicaltrials.gov: the CSS selectors used to locate
// each field of a page on that domain.
var ClinicaltrialsGovExtractor = {
  domain: 'clinicaltrials.gov',
  title: {
    selectors: ['h1.tr-solo_record']
  },
  author: {
    selectors: ['div#sponsor.tr-info-text']
  },
  date_published: {
    // Target the div that directly contains the "Last Update Posted" term
    // label rather than the label span itself.
    selectors: ['div:has(> span.term[data-term="Last Update Posted"])']
  },
  content: {
    selectors: ['div#tab-body'],
    // Transforms to apply to the selected content before it is consumable
    // (e.g. unusual lazy loaded images). None are needed here.
    transforms: {},
    // Anything matching these selectors is removed from the result.
    clean: ['.usa-alert> img']
  }
};
var GithubComExtractor = {
domain : 'github.com' ,
title : {
@ -4865,7 +4891,11 @@ var WwwOssnewsJpExtractor = {
selectors : [ '#alpha-block h1.hxnewstitle' ]
} ,
author : null ,
date _published : null ,
date _published : {
selectors : [ 'p.fs12' ] ,
format : 'YYYY年MM月DD日 HH:mm' ,
timezone : 'Asia/Tokyo'
} ,
dek : null ,
lead _image _url : {
selectors : [ [ 'meta[name="og:image"]' , 'value' ] ]
@ -4931,7 +4961,11 @@ var WwwSanwaCoJpExtractor = {
selectors : [ '#newsContent h1' ]
} ,
author : null ,
date _published : null ,
date _published : {
selectors : [ 'p.date' ] ,
format : 'YYYY.MM.DD' ,
timezone : 'Asia/Tokyo'
} ,
dek : {
selectors : [ [ 'meta[name="og:description"]' , 'value' ] ]
} ,
@ -4952,7 +4986,11 @@ var WwwElecomCoJpExtractor = {
selectors : [ 'title' ]
} ,
author : null ,
date _published : null ,
date _published : {
selectors : [ 'p.section-last' ] ,
format : 'YYYY.MM.DD' ,
timezone : 'Asia/Tokyo'
} ,
dek : null ,
lead _image _url : null ,
content : {
@ -4996,7 +5034,11 @@ var JvndbJvnJpExtractor = {
selectors : [ 'title' ]
} ,
author : null ,
date _published : null ,
date _published : {
selectors : [ 'div.modifytxt:nth-child(2)' ] ,
format : 'YYYY/MM/DD' ,
timezone : 'Asia/Tokyo'
} ,
dek : null ,
lead _image _url : null ,
content : {
@ -5064,6 +5106,590 @@ var WwwJnsaOrgExtractor = {
}
} ;
// Custom extractor for phpspot.org. A null field means no custom selector is
// configured for that piece of metadata.
var PhpspotOrgExtractor = {
  domain: 'phpspot.org',
  title: {
    selectors: ['h3.hl']
  },
  author: null,
  date_published: {
    selectors: ['h4.hl'],
    // Japanese year/month/day text, interpreted in Japan local time.
    format: 'YYYY年MM月DD日',
    timezone: 'Asia/Tokyo'
  },
  dek: null,
  lead_image_url: null,
  content: {
    selectors: ['div.entrybody'],
    // NOTE(review): presumably turns off the generic content cleaner - confirm.
    defaultCleaner: false,
    transforms: {},
    clean: []
  }
};
// Custom extractor for www.infoq.com.
// Entries of the form [selector, attribute] name an attribute of the matched
// element (presumably read instead of its text - confirm against the consumer).
var WwwInfoqComExtractor = {
  domain: 'www.infoq.com',
  title: {
    selectors: ['h1.heading']
  },
  author: {
    selectors: ['div.widget.article__authors']
  },
  date_published: {
    selectors: ['.article__readTime.date'],
    // Japanese year/month/day text, interpreted in Japan local time.
    format: 'YYYY年MM月DD日',
    timezone: 'Asia/Tokyo'
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.article__data'],
    // NOTE(review): presumably turns off the generic content cleaner - confirm.
    defaultCleaner: false,
    transforms: {},
    clean: []
  }
};
// Custom extractor for www.moongift.jp. A null field means no custom selector
// is configured for that piece of metadata.
var WwwMoongiftJpExtractor = {
  domain: 'www.moongift.jp',
  title: {
    selectors: ['h1.title a']
  },
  author: null,
  date_published: {
    // First non-social item of the meta list; no explicit format is given.
    selectors: ['ul.meta li:not(.social):first-of-type'],
    timezone: 'Asia/Tokyo'
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['#main'],
    transforms: {},
    // Anything matching these selectors is removed from the result.
    clean: ['ul.mg_service.cf']
  }
};
// Custom extractor for www.itmedia.co.jp; supportedDomains lists further
// domains that share this configuration.
var WwwItmediaCoJpExtractor = {
  domain: 'www.itmedia.co.jp',
  supportedDomains: ['www.atmarkit.co.jp', 'techtarget.itmedia.co.jp', 'nlab.itmedia.co.jp'],
  title: {
    selectors: ['#cmsTitle h1']
  },
  author: {
    selectors: ['#byline']
  },
  date_published: {
    // Taken from the article:modified_time meta tag.
    selectors: [['meta[name="article:modified_time"]', 'value']]
  },
  dek: {
    selectors: ['#cmsAbstract h2']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['#cmsBody'],
    // NOTE(review): presumably turns off the generic content cleaner - confirm.
    defaultCleaner: false,
    transforms: {},
    // Anything matching these selectors is removed from the result.
    clean: ['#snsSharebox']
  }
};
// Custom extractor for www.publickey1.jp. A null field means no custom
// selector is configured for that piece of metadata.
var WwwPublickey1JpExtractor = {
  domain: 'www.publickey1.jp',
  title: {
    selectors: ['h1']
  },
  author: {
    // Paragraph in the sub column that contains an image.
    selectors: ['#subcol p:has(img)']
  },
  date_published: {
    selectors: ['div.pubdate'],
    // Japanese year/month/day text, interpreted in Japan local time.
    format: 'YYYY年MM月DD日',
    timezone: 'Asia/Tokyo'
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['#maincol'],
    // NOTE(review): presumably turns off the generic content cleaner - confirm.
    defaultCleaner: false,
    transforms: {},
    // Anything matching these selectors is removed from the result.
    clean: ['#breadcrumbs', 'div.sbm', 'div.ad_footer']
  }
};
// Custom extractor for takagi-hiromitsu.jp. A null field means no custom
// selector is configured for that piece of metadata.
var TakagihiromitsuJpExtractor = {
  domain: 'takagi-hiromitsu.jp',
  title: {
    selectors: ['h3']
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    // Taken from the Last-Modified http-equiv meta tag.
    selectors: [['meta[http-equiv="Last-Modified"]', 'value']]
  },
  dek: null,
  lead_image_url: null,
  content: {
    selectors: ['div.body'],
    // NOTE(review): presumably turns off the generic content cleaner - confirm.
    defaultCleaner: false,
    transforms: {},
    clean: []
  }
};
// Custom extractor for bookwalker.jp. A null field means no custom selector
// is configured for that piece of metadata.
var BookwalkerJpExtractor = {
  domain: 'bookwalker.jp',
  title: {
    selectors: ['h1.main-heading']
  },
  author: {
    selectors: ['div.authors']
  },
  date_published: {
    selectors: ['.work-info .work-detail:first-of-type .work-detail-contents:last-of-type'],
    timezone: 'Asia/Tokyo'
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    // A single entry made of two element selectors.
    // NOTE(review): presumably both matches are combined into the content - confirm.
    selectors: [['div.main-info', 'div.main-cover-inner']],
    // NOTE(review): presumably turns off the generic content cleaner - confirm.
    defaultCleaner: false,
    transforms: {},
    // Anything matching these selectors is removed from the result.
    clean: ['span.label.label--trial', 'dt.info-head.info-head--coin', 'dd.info-contents.info-contents--coin', 'div.info-notice.fn-toggleClass']
  }
};
// Custom extractor for www.yomiuri.co.jp. A null field means no custom
// selector is configured for that piece of metadata.
var WwwYomiuriCoJpExtractor = {
  domain: 'www.yomiuri.co.jp',
  title: {
    selectors: ['h1.title-article.c-article-title']
  },
  author: null,
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.p-main-contents'],
    transforms: {},
    clean: []
  }
};
// Custom extractor for japan.cnet.com. A null field means no custom selector
// is configured for that piece of metadata.
var JapanCnetComExtractor = {
  domain: 'japan.cnet.com',
  title: {
    selectors: ['.leaf-headline-ttl']
  },
  author: {
    selectors: ['.writer']
  },
  date_published: {
    selectors: ['.date'],
    // Japanese date and 24-hour time text, interpreted in Japan local time.
    format: 'YYYY年MM月DD日 HH時mm分',
    timezone: 'Asia/Tokyo'
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.article_body'],
    transforms: {},
    clean: []
  }
};
// Custom extractor for deadline.com. A null field means no custom selector is
// configured for that piece of metadata.
var DeadlineComExtractor = {
  domain: 'deadline.com',
  title: {
    selectors: ['h1']
  },
  author: {
    selectors: ['section.author h3']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.a-article-grid__main.pmc-a-grid article.pmc-a-grid-item'],
    transforms: {
      // Unwrap the twitter embed: replace the wrapper node with its own
      // inner HTML.
      '.embed-twitter': function embedTwitter($node) {
        var innerHtml = $node.html();
        $node.replaceWith(innerHtml);
      }
    },
    clean: []
  }
};
// Custom extractor for www.gizmodo.jp. A null field means no custom selector
// is configured for that piece of metadata.
var WwwGizmodoJpExtractor = {
  domain: 'www.gizmodo.jp',
  title: {
    selectors: ['h1.p-post-title']
  },
  author: {
    selectors: ['li.p-post-AssistAuthor']
  },
  date_published: {
    selectors: [['li.p-post-AssistTime time', 'datetime']]
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['article.p-post'],
    transforms: {
      // Rewrite the thumbnail src by stripping everything up to and including
      // the last "=%27" and a trailing "%27;".
      'img.p-post-thumbnailImage': function imgPPostThumbnailImage($node) {
        var src = $node.attr('src');
        // An <img> without a src attribute has nothing to rewrite; calling
        // .replace on undefined would throw.
        if (typeof src !== 'string') return;
        $node.attr('src', src.replace(/^.*=%27/, '').replace(/%27;$/, ''));
      }
    },
    // Anything matching these selectors is removed from the result.
    clean: ['h1.p-post-title', 'ul.p-post-Assist']
  }
};
// Custom extractor for getnews.jp. A null field means no custom selector is
// configured for that piece of metadata.
var GetnewsJpExtractor = {
  domain: 'getnews.jp',
  title: {
    selectors: ['article h1']
  },
  author: {
    selectors: ['span.prof']
  },
  date_published: {
    selectors: [['ul.cattag-top time', 'datetime']]
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.post-bodycopy'],
    transforms: {},
    clean: []
  }
};
// Custom extractor for www.lifehacker.jp. A null field means no custom
// selector is configured for that piece of metadata.
var WwwLifehackerJpExtractor = {
  domain: 'www.lifehacker.jp',
  title: {
    selectors: ['h1.lh-summary-title']
  },
  author: {
    selectors: ['p.lh-entryDetailInner--credit']
  },
  date_published: {
    selectors: [['div.lh-entryDetail-header time', 'datetime']]
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.lh-entryDetail-body'],
    transforms: {
      // Rewrite the lazy-loaded image src by stripping everything up to and
      // including the last "=%27" and a trailing "%27;".
      'img.lazyload': function imgLazyload($node) {
        var src = $node.attr('src');
        // An <img> without a src attribute has nothing to rewrite; calling
        // .replace on undefined would throw.
        if (typeof src !== 'string') return;
        $node.attr('src', src.replace(/^.*=%27/, '').replace(/%27;$/, ''));
      }
    },
    // Anything matching these selectors is removed from the result.
    clean: ['p.lh-entryDetailInner--credit']
  }
};
// Custom extractor for sect.iij.ad.jp. A null field means no custom selector
// is configured for that piece of metadata.
var SectIijAdJpExtractor = {
  domain: 'sect.iij.ad.jp',
  title: {
    selectors: ['h3']
  },
  // NOTE(review): author and date_published use the same selector - confirm
  // that the entry-date <dd> really carries both pieces of information.
  author: {
    selectors: ['dl.entrydate dd']
  },
  date_published: {
    selectors: ['dl.entrydate dd'],
    // Japanese year/month/day text, interpreted in Japan local time.
    format: 'YYYY年MM月DD日',
    timezone: 'Asia/Tokyo'
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['#article'],
    transforms: {},
    // Anything matching these selectors is removed from the result.
    clean: ['dl.entrydate']
  }
};
// Custom extractor for www.oreilly.co.jp. A null field means no custom
// selector is configured for that piece of metadata.
var WwwOreillyCoJpExtractor = {
  domain: 'www.oreilly.co.jp',
  title: {
    selectors: ['h3']
  },
  author: {
    selectors: ['li[itemprop="author"]']
  },
  date_published: {
    selectors: [['meta[itemprop="datePublished"]', 'value']],
    timezone: 'Asia/Tokyo'
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['#content'],
    // NOTE(review): presumably turns off the generic content cleaner - confirm.
    defaultCleaner: false,
    transforms: {},
    // Anything matching these selectors is removed from the result.
    clean: ['.social-tools']
  }
};
// Custom extractor for www.ipa.go.jp. A null field means no custom selector
// is configured for that piece of metadata.
var WwwIpaGoJpExtractor = {
  domain: 'www.ipa.go.jp',
  title: {
    selectors: ['h1']
  },
  author: null,
  date_published: {
    selectors: ['p.ipar_text_right'],
    // Japanese year/month/day text with single-letter month and day tokens,
    // interpreted in Japan local time.
    format: 'YYYY年M月D日',
    timezone: 'Asia/Tokyo'
  },
  dek: null,
  lead_image_url: null,
  content: {
    selectors: ['#ipar_main'],
    // NOTE(review): presumably turns off the generic content cleaner - confirm.
    defaultCleaner: false,
    transforms: {},
    // Anything matching these selectors is removed from the result.
    clean: ['p.ipar_text_right']
  }
};
// Custom extractor for weekly.ascii.jp. A null field means no custom selector
// is configured for that piece of metadata.
var WeeklyAsciiJpExtractor = {
  domain: 'weekly.ascii.jp',
  title: {
    selectors: ['h1[itemprop="headline"]']
  },
  author: {
    selectors: ['p.author']
  },
  date_published: {
    selectors: [['meta[name="odate"]', 'value']]
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.article'],
    transforms: {},
    clean: []
  }
};
// Custom extractor for techlog.iij.ad.jp. A null field means no custom
// selector is configured for that piece of metadata.
var TechlogIijAdJpExtractor = {
  domain: 'techlog.iij.ad.jp',
  title: {
    selectors: ['h1.entry-title']
  },
  author: {
    selectors: ['a[rel="author"]']
  },
  date_published: {
    selectors: [['time.entry-date', 'datetime']]
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.entry-content'],
    // NOTE(review): presumably turns off the generic content cleaner - confirm.
    defaultCleaner: false,
    transforms: {},
    clean: []
  }
};
// Custom extractor for wired.jp.
var WiredJpExtractor = {
  domain: 'wired.jp',
  title: {
    selectors: ['h1.post-title']
  },
  author: {
    selectors: ['p[itemprop="author"]']
  },
  date_published: {
    selectors: [['time', 'datetime']]
  },
  dek: {
    selectors: ['.post-intro']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['article.article-detail'],
    transforms: {
      // Point src at the image named by data-original, resolved against the
      // current src.
      'img[data-original]': function imgDataOriginal($node) {
        var dataOriginal = $node.attr('data-original');
        var src = $node.attr('src');
        // Without a src there is no base to resolve against (and resolving
        // against undefined throws), so use data-original as it is.
        var url = typeof src === 'string' ? URL.resolve(src, dataOriginal) : dataOriginal;
        $node.attr('src', url);
      }
    },
    // Anything matching these selectors is removed from the result.
    clean: ['.post-category', 'time', 'h1.post-title', '.social-area-syncer']
  }
};
// Custom extractor for japan.zdnet.com. A null field means no custom selector
// is configured for that piece of metadata.
var JapanZdnetComExtractor = {
  domain: 'japan.zdnet.com',
  title: {
    selectors: ['h1']
  },
  author: {
    selectors: [['meta[name="cXenseParse:author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.article_body'],
    transforms: {},
    clean: []
  }
};
// Custom extractor for www.rbbtoday.com.
var WwwRbbtodayComExtractor = {
  domain: 'www.rbbtoday.com',
  title: {
    selectors: ['h1']
  },
  author: {
    selectors: ['.writer.writer-name']
  },
  date_published: {
    selectors: [['header time', 'datetime']]
  },
  dek: {
    selectors: ['.arti-summary']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.arti-content'],
    transforms: {},
    // Anything matching these selectors is removed from the result.
    clean: ['.arti-giga']
  }
};
// Custom extractor for www.lemonde.fr.
var WwwLemondeFrExtractor = {
  domain: 'www.lemonde.fr',
  title: {
    selectors: ['h1.article__title']
  },
  author: {
    selectors: ['.author__name']
  },
  date_published: {
    selectors: [['meta[name="og:article:published_time"]', 'value']]
  },
  dek: {
    selectors: ['.article__desc']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.article__content'],
    transforms: {},
    clean: []
  }
};
// Custom extractor for www.phoronix.com. A null field means no custom
// selector is configured for that piece of metadata.
var WwwPhoronixComExtractor = {
  domain: 'www.phoronix.com',
  title: {
    selectors: ['article header']
  },
  author: {
    selectors: ['.author a:first-child']
  },
  date_published: {
    selectors: ['.author'],
    // Sample text: 1 June 2019 at 08:34 PM EDT
    // 'hh' is a 12-hour token, so the meridiem token 'A' is needed as well;
    // without it an afternoon time such as 08:34 PM is read as 08:34 AM.
    format: 'D MMMM YYYY at hh:mm A',
    timezone: 'America/New_York'
  },
  dek: null,
  lead_image_url: null,
  content: {
    selectors: ['.content'],
    // Transforms to apply to the selected content before it is consumable
    // (e.g. unusual lazy loaded images). None are needed here.
    transforms: {},
    // Anything matching these selectors is removed from the result.
    clean: []
  }
};
// Custom extractor for pitchfork.com. Besides the standard fields it declares
// one extra field, score, under extend.
var PitchforkComExtractor = {
  domain: 'pitchfork.com',
  title: {
    selectors: ['title']
  },
  author: {
    selectors: ['.authors-detail__display-name']
  },
  date_published: {
    selectors: [['.pub-date', 'datetime']]
  },
  dek: {
    selectors: ['.review-detail__abstract']
  },
  lead_image_url: {
    selectors: [['.single-album-tombstone__art img', 'src']]
  },
  content: {
    selectors: ['.review-detail__text']
  },
  extend: {
    score: {
      selectors: ['.score']
    }
  }
};
var CustomExtractors = /*#__PURE__*/ Object . freeze ( {
@ -5162,6 +5788,7 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
WwwFastcompanyComExtractor : WwwFastcompanyComExtractor ,
BlisterreviewComExtractor : BlisterreviewComExtractor ,
NewsMynaviJpExtractor : NewsMynaviJpExtractor ,
ClinicaltrialsGovExtractor : ClinicaltrialsGovExtractor ,
GithubComExtractor : GithubComExtractor ,
WwwRedditComExtractor : WwwRedditComExtractor ,
OtrsComExtractor : OtrsComExtractor ,
@ -5173,7 +5800,31 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
ScanNetsecurityNeJpExtractor : ScanNetsecurityNeJpExtractor ,
JvndbJvnJpExtractor : JvndbJvnJpExtractor ,
GeniusComExtractor : GeniusComExtractor ,
WwwJnsaOrgExtractor : WwwJnsaOrgExtractor
WwwJnsaOrgExtractor : WwwJnsaOrgExtractor ,
PhpspotOrgExtractor : PhpspotOrgExtractor ,
WwwInfoqComExtractor : WwwInfoqComExtractor ,
WwwMoongiftJpExtractor : WwwMoongiftJpExtractor ,
WwwItmediaCoJpExtractor : WwwItmediaCoJpExtractor ,
WwwPublickey1JpExtractor : WwwPublickey1JpExtractor ,
TakagihiromitsuJpExtractor : TakagihiromitsuJpExtractor ,
BookwalkerJpExtractor : BookwalkerJpExtractor ,
WwwYomiuriCoJpExtractor : WwwYomiuriCoJpExtractor ,
JapanCnetComExtractor : JapanCnetComExtractor ,
DeadlineComExtractor : DeadlineComExtractor ,
WwwGizmodoJpExtractor : WwwGizmodoJpExtractor ,
GetnewsJpExtractor : GetnewsJpExtractor ,
WwwLifehackerJpExtractor : WwwLifehackerJpExtractor ,
SectIijAdJpExtractor : SectIijAdJpExtractor ,
WwwOreillyCoJpExtractor : WwwOreillyCoJpExtractor ,
WwwIpaGoJpExtractor : WwwIpaGoJpExtractor ,
WeeklyAsciiJpExtractor : WeeklyAsciiJpExtractor ,
TechlogIijAdJpExtractor : TechlogIijAdJpExtractor ,
WiredJpExtractor : WiredJpExtractor ,
JapanZdnetComExtractor : JapanZdnetComExtractor ,
WwwRbbtodayComExtractor : WwwRbbtodayComExtractor ,
WwwLemondeFrExtractor : WwwLemondeFrExtractor ,
WwwPhoronixComExtractor : WwwPhoronixComExtractor ,
PitchforkComExtractor : PitchforkComExtractor
} ) ;
var Extractors = _Object$keys ( CustomExtractors ) . reduce ( function ( acc , key ) {