release: 1.0.3 (#62)

8 years ago · a710efd2d5
parent 861c5f0dcb
commit a710efd2d5
5 changed files with 195 additions and 13 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,18 @@
 # Mercury Parser Changelog

+### 1.0.3 (Dec 9, 2016)
+
+##### Commits
+
+* [[`861c5f0dcb`](https://github.com/postlight/mercury-parser/commit/861c5f0dcb)] - **feat**: bustle extractor (#60) (Janet) 
+* [[`06397a4360`](https://github.com/postlight/mercury-parser/commit/06397a4360)] - **feat**: browser-friendly selector for medium (#61) (Adam Pash) 
+* [[`3297ab079d`](https://github.com/postlight/mercury-parser/commit/3297ab079d)] - **feat**: bloomberg extractor (#59) (Adam Pash) 
+* [[`e55e9da534`](https://github.com/postlight/mercury-parser/commit/e55e9da534)] - **feat**: sbnation extractor (#55) (Janet) 
+* [[`8070e4790b`](https://github.com/postlight/mercury-parser/commit/8070e4790b)] - **test**: streamlined guardian tests w/new single-extraction (#58) (Adam Pash) 
+* [[`bdb751fb53`](https://github.com/postlight/mercury-parser/commit/bdb751fb53)] - **feat**: more cleaning for wired (#56) (Adam Pash) 
+* [[`e7e41bd242`](https://github.com/postlight/mercury-parser/commit/e7e41bd242)] - **feat**: the guardian custom extractor (#41) (Janet) 
+* [[`332f85928f`](https://github.com/postlight/mercury-parser/commit/332f85928f)] - **release**: 1.0.2 (#54) (Adam Pash) 
+
 ### 1.0.2 (Dec 6, 2016)

 ##### Commits
--- a/dist/mercury.js
+++ b/dist/mercury.js
@@ -422,7 +422,8 @@ var REMOVE_ATTR_SELECTORS = REMOVE_ATTRS.map(function (selector) {
  return '[' + selector + ']';
 });
 var REMOVE_ATTR_LIST = REMOVE_ATTRS.join(',');
-var WHITELIST_ATTRS = ['src', 'srcset', 'href', 'class', 'id', 'alt'];
+var WHITELIST_ATTRS = ['src', 'srcset', 'href', 'class', 'id', 'alt', 'xlink:href', 'width', 'height'];
+
 var WHITELIST_ATTRS_RE = new RegExp('^(' + WHITELIST_ATTRS.join('|') + ')$', 'i');

 // removeEmpty
@@ -2183,7 +2184,7 @@ var WiredExtractor = {
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
-    clean: ['.visually-hidden']
+    clean: ['.visually-hidden', 'figcaption img.photo']
  },

  date_published: {
@@ -2661,7 +2662,7 @@ var MediumExtractor = {
  },

  content: {
-    selectors: ['.section-content'],
+    selectors: ['.section-content', 'article > div > section'],

    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
@@ -3128,6 +3129,170 @@ var WwwYoutubeComExtractor = {
  }
 };

+var WwwTheguardianComExtractor = { // Custom extractor definition for www.theguardian.com: per-field CSS selectors.
+  domain: 'www.theguardian.com',

+  title: {
+    selectors: ['.content__headline']
+  },

+  author: {
+    selectors: ['p.byline']
+  },

+  date_published: {
+    selectors: [['meta[name="article:published_time"]', 'value']] // NOTE(review): presumably a [selector, attribute] pair; confirm against the extraction code
+  },

+  dek: {
+    selectors: ['.content__standfirst']
+  },

+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']] // NOTE(review): assumes meta tags expose name/value attributes (presumably normalized upstream); confirm
+  },

+  content: {
+    selectors: ['.content__article-body'],

+    // Is there anything in the content you selected that needs to be transformed
+    // before it's consumable content? E.g., unusual lazy-loaded images. None are defined here.
+    transforms: {},

+    // Is there anything in the result that shouldn't be?
+    // The clean selectors will remove anything that matches from
+    // the result.
+    clean: ['.hide-on-mobile', '.inline-icon']
+  }
+};
+
+var WwwSbnationComExtractor = { // Custom extractor definition for www.sbnation.com: per-field CSS selectors.
+  domain: 'www.sbnation.com',

+  title: {
+    selectors: ['h1.c-page-title']
+  },

+  author: {
+    selectors: [['meta[name="author"]', 'value']] // NOTE(review): presumably a [selector, attribute] pair; confirm against the extraction code
+  },

+  date_published: {
+    selectors: [['meta[name="article:published_time"]', 'value']]
+  },

+  dek: {
+    selectors: ['h2.c-entry-summary.p-dek']
+  },

+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']] // NOTE(review): assumes meta tags expose name/value attributes (presumably normalized upstream); confirm
+  },

+  content: {
+    selectors: ['div.c-entry-content'],

+    // Is there anything in the content you selected that needs to be transformed
+    // before it's consumable content? E.g., unusual lazy-loaded images. None are defined here.
+    transforms: {},

+    // Is there anything in the result that shouldn't be?
+    // The clean selectors will remove anything that matches from
+    // the result. The list is empty for this site.
+    clean: []
+  }
+};
+
+var WwwBloombergComExtractor = { // Custom extractor definition for www.bloomberg.com; selectors cover the normal, /graphics/ and /news/ templates.
+  domain: 'www.bloomberg.com',

+  title: {
+    selectors: [
+    // normal articles
+    '.lede-headline',

+    // /graphics/ template
+    'h1.article-title',

+    // /news/ template
+    'h1.lede-text-only__hed']
+  },

+  author: {
+    selectors: [['meta[name="parsely-author"]', 'value'], '.byline-details__link',

+    // /graphics/ template
+    '.bydek',

+    // /news/ template
+    '.author']
+  },

+  date_published: {
+    selectors: [['time.published-at', 'datetime'], ['time[datetime]', 'datetime'], ['meta[name="date"]', 'value'], ['meta[name="parsely-pub-date"]', 'value']] // NOTE(review): presumably [selector, attribute] pairs; confirm against the extraction code
+  },

+  dek: {
+    selectors: [] // no dek selectors are defined for this site
+  },

+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']] // NOTE(review): assumes meta tags expose name/value attributes (presumably normalized upstream); confirm
+  },

+  content: {
+    selectors: ['.article-body__content',

+    // /graphics/ template
+    ['section.copy-block'], // NOTE(review): one-element array, unlike the plain strings around it; confirm this form is intentional

+    // /news/ template
+    '.body-copy'],

+    // Is there anything in the content you selected that needs to be transformed
+    // before it's consumable content? E.g., unusual lazy-loaded images. None are defined here.
+    transforms: {},

+    // Is there anything in the result that shouldn't be?
+    // The clean selectors will remove anything that matches from
+    // the result.
+    clean: ['.inline-newsletter', '.page-ad']
+  }
+};
+
+var WwwBustleComExtractor = { // Custom extractor definition for www.bustle.com: per-field CSS selectors; no dek entry is defined.
+  domain: 'www.bustle.com',

+  title: {
+    selectors: ['h1.post-page__title']
+  },

+  author: {
+    selectors: ['div.content-meta__author']
+  },

+  date_published: {
+    selectors: [['time.content-meta__published-date[datetime]', 'datetime']] // NOTE(review): presumably a [selector, attribute] pair; confirm against the extraction code
+  },

+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']] // NOTE(review): assumes meta tags expose name/value attributes (presumably normalized upstream); confirm
+  },

+  content: {
+    selectors: ['.post-page__body'],

+    // Is there anything in the content you selected that needs to be transformed
+    // before it's consumable content? E.g., unusual lazy-loaded images. None are defined here.
+    transforms: {},

+    // Is there anything in the result that shouldn't be?
+    // The clean selectors will remove anything that matches from
+    // the result. The list is empty for this site.
+    clean: []
+  }
+};
+


 var CustomExtractors = Object.freeze({
@@ -3157,7 +3322,11 @@ var CustomExtractors = Object.freeze({
 	WwwThevergeComExtractor: WwwThevergeComExtractor,
 	WwwCnnComExtractor: WwwCnnComExtractor,
 	WwwAolComExtractor: WwwAolComExtractor,
-	WwwYoutubeComExtractor: WwwYoutubeComExtractor
+	WwwYoutubeComExtractor: WwwYoutubeComExtractor,
+	WwwTheguardianComExtractor: WwwTheguardianComExtractor,
+	WwwSbnationComExtractor: WwwSbnationComExtractor,
+	WwwBloombergComExtractor: WwwBloombergComExtractor,
+	WwwBustleComExtractor: WwwBustleComExtractor
 });

 var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {
--- a/dist/mercury.js.map
+++ b/dist/mercury.js.map
--- a/dist/mercury.web.js
+++ b/dist/mercury.web.js
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
  "name": "mercury-parser",
-  "version": "1.0.2",
+  "version": "1.0.3",
  "description": "",
  "repository": "github:postlight/mercury-parser",
  "main": "./dist/mercury.js",