release: 1.0.0 so we can start doing proper releases (#39)

8 years ago · e9a36d6ebd
parent ff1963bdca
commit e9a36d6ebd
3 changed files with 258 additions and 22 deletions
--- a/dist/mercury.js
+++ b/dist/mercury.js
@ -2713,7 +2713,9 @@ var WwwTmzComExtractor = {
  author: 'TMZ STAFF',

  date_published: {
-    selectors: ['.article-posted-date']
+    selectors: ['.article-posted-date'],
+
+    timezone: 'America/Los_Angeles'
  },

  dek: {
@ -2783,7 +2785,90 @@ var WwwWashingtonpostComExtractor = {
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
-    clean: ['.interstitial-link']
+    clean: ['.interstitial-link', '.newsletter-inline-unit']
+  }
+};
+
+var WwwHuffingtonpostComExtractor = {
+  domain: 'www.huffingtonpost.com',
+
+  title: {
+    selectors: ['h1.headline__title']
+  },
+
+  author: {
+    selectors: ['span.author-card__details__name']
+  },
+
+  date_published: {
+    selectors: [['meta[name="article:modified_time"]', 'value'], ['meta[name="article:published_time"]', 'value']]
+  },
+
+  dek: {
+    selectors: ['h2.headline__subtitle']
+  },
+
+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']]
+  },
+
+  content: {
+    selectors: ['div.entry__body'],
+
+    defaultCleaner: false,
+
+    // Transforms for selected content that must change before it is consumable
+    // (e.g., unusual lazy-loaded images). Only a commented-out example for now.
+    transforms: {
+      // 'div.top-media': ($node) => {
+      //   const $figure = $node.children('figure');
+      //   $node.replaceWith($figure);
+      // },
+    },
+
+    // Selectors for anything that is in the result but shouldn't be.
+    // Anything matching one of these clean selectors is removed from
+    // the result.
+    clean: ['.pull-quote', '.tag-cloud', '.embed-asset', '.below-entry', '.entry-corrections', '#suggested-story']
+  }
+};
+
+var NewrepublicComExtractor = {
+  domain: 'newrepublic.com',
+
+  title: {
+    selectors: ['h1.article-headline', '.minutes-primary h1.minute-title']
+  },
+
+  author: {
+    selectors: ['div.author-list', '.minutes-primary h3.minute-byline']
+  },
+
+  date_published: {
+    selectors: [['meta[name="article:published_time"]', 'value']],
+
+    timezone: 'America/New_York'
+  },
+
+  dek: {
+    selectors: ['h2.article-subhead']
+  },
+
+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']]
+  },
+
+  content: {
+    selectors: ['div.content-body', '.minutes-primary div.content-body'],
+
+    // Transforms for selected content that must change before it is consumable
+    // (e.g., unusual lazy-loaded images). None are defined for this extractor.
+    transforms: {},
+
+    // Selectors for anything that is in the result but shouldn't be.
+    // Anything matching one of these clean selectors is removed from
+    // the result.
+    clean: ['aside']
  }
 };

@ -2799,7 +2884,9 @@ var MoneyCnnComExtractor = {
  },

  date_published: {
-    selectors: [['meta[name="date"]', 'value']]
+    selectors: [['meta[name="date"]', 'value']],
+
+    timezone: 'GMT'
  },

  dek: {
@ -2824,6 +2911,120 @@ var MoneyCnnComExtractor = {
  }
 };

+var WwwThevergeComExtractor = {
+  domain: 'www.theverge.com',
+
+  title: {
+    selectors: ['h1']
+  },
+
+  author: {
+    selectors: [['meta[name="author"]', 'value']]
+  },
+
+  date_published: {
+    selectors: [['meta[name="article:published_time"]', 'value']]
+  },
+
+  dek: {
+    selectors: ['h2.p-dek']
+  },
+
+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']]
+  },
+
+  content: {
+    selectors: [
+    // feature template multi-match (hero image, intro and body together)
+    ['.c-entry-hero .e-image', '.c-entry-intro', '.c-entry-content'],
+    // regular post multi-match (hero image and body together)
+    ['.e-image--hero', '.c-entry-content'],
+    // feature template fallback (single selector)
+    '.l-wrapper .l-feature',
+    // regular post fallback (single selector)
+    'div.c-entry-content'],
+
+    // Lazy-loaded images: 'span' for a noscript whose only child is an img, else null
+    transforms: {
+      noscript: function noscript($node) {
+        var $children = $node.children();
+        if ($children.length === 1 && $children.get(0).tagName === 'img') {
+          return 'span';
+        }
+
+        return null;
+      }
+    },
+
+    // Selectors for anything that is in the result but shouldn't be.
+    // Anything matching one of these clean selectors is removed from
+    // the result.
+    clean: ['.aside', 'img.c-dynamic-image']
+  }
+};
+
+var WwwCnnComExtractor = {
+  domain: 'www.cnn.com',
+
+  title: {
+    selectors: ['h1.pg-headline', 'h1']
+  },
+
+  author: {
+    selectors: ['.metadata__byline__author']
+  },
+
+  date_published: {
+    selectors: [['meta[name="pubdate"]', 'value']]
+  },
+
+  dek: null,
+
+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']]
+  },
+
+  content: {
+    selectors: [
+    // a more specific multi-match selector to grab the lead image and the body
+    ['.media__video--thumbnail', '.zn-body-text'],
+    // fallbacks for the above: body text only, then the articleBody container
+    '.zn-body-text', 'div[itemprop="articleBody"]'],
+
+    // Transforms for selected content that must change before it is consumable
+    // (e.g., unusual lazy-loaded images).
+    transforms: {
+      '.zn-body__paragraph, .el__leafmedia--sourced-paragraph': function znBody__paragraphEl__leafmediaSourcedParagraph($node) {
+        var $text = $node.html();
+        if ($text) {
+          return 'p';
+        }
+
+        return null;
+      },
+
+      // Removes short, all-link sections that link to related content but are not
+      // marked as such. NOTE(review): .has() returns a selection, so the outer if looks always truthy - confirm.
+      '.zn-body__paragraph': function znBody__paragraph($node) {
+        if ($node.has('a')) {
+          if ($node.text().trim() === $node.find('a').text().trim()) {
+            $node.remove();
+          }
+        }
+      },
+
+      '.media__video--thumbnail': 'figure'
+
+    },
+
+    // Selectors for anything that is in the result but shouldn't be.
+    // Anything matching one of these clean selectors is removed from
+    // the result. Nothing is listed for this extractor.
+    clean: []
+  }
+};
+


 var CustomExtractors = Object.freeze({
@ -2847,7 +3048,11 @@ var CustomExtractors = Object.freeze({
 	MediumExtractor: MediumExtractor,
 	WwwTmzComExtractor: WwwTmzComExtractor,
 	WwwWashingtonpostComExtractor: WwwWashingtonpostComExtractor,
-	MoneyCnnComExtractor: MoneyCnnComExtractor
+	WwwHuffingtonpostComExtractor: WwwHuffingtonpostComExtractor,
+	NewrepublicComExtractor: NewrepublicComExtractor,
+	MoneyCnnComExtractor: MoneyCnnComExtractor,
+	WwwThevergeComExtractor: WwwThevergeComExtractor,
+	WwwCnnComExtractor: WwwCnnComExtractor
 });

 var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {
@ -2906,7 +3111,7 @@ var DOMAIN_ENDINGS_RE = new RegExp('.com$|.net$|.org$|.co.uk$', 'g');
 // Take an author string (like 'By David Smith ') and clean it to
 // just the name(s): 'David Smith'.
 function cleanAuthor(author) {
-  return author.replace(CLEAN_AUTHOR_RE, '$2').trim();
+  return normalizeSpaces(author.replace(CLEAN_AUTHOR_RE, '$2').trim());
 }

 function clean$1(leadImageUrl) {
@ -2936,7 +3141,7 @@ function cleanDek(dek, _ref) {
  // not a good dek - bail.
  if (TEXT_LINK_RE.test(dekText)) return null;

-  return dekText.trim();
+  return normalizeSpaces(dekText.trim());
 }

 // Is there a compelling reason to use moment here?
@ -3053,7 +3258,7 @@ function cleanTitle$$1(title, _ref) {
  }

  // strip any html tags in the title text
-  return stripTags(title, $).trim();
+  return normalizeSpaces(stripTags(title, $).trim());
 }

 function extractBreadcrumbTitle(splitTitle, text) {
@ -4506,9 +4711,15 @@ function transformElements($content, $, _ref2) {
  return $content;
 }

-function findMatchingSelector($, selectors) {
+function findMatchingSelector($, selectors, extractHtml) {
  return selectors.find(function (selector) {
    if (Array.isArray(selector)) {
+      if (extractHtml) {
+        return selector.reduce(function (acc, s) {
+          return acc && $(s).length > 0;
+        }, true);
+      }
+
      var _selector = _slicedToArray(selector, 2),
          s = _selector[0],
          attr = _selector[1];
@ -4539,7 +4750,7 @@ function select(opts) {
      defaultCleaner = _extractionOpts$defau === undefined ? true : _extractionOpts$defau;


-  var matchingSelector = findMatchingSelector($, selectors);
+  var matchingSelector = findMatchingSelector($, selectors, extractHtml);

  if (!matchingSelector) return null;

@ -4549,8 +4760,25 @@ function select(opts) {

  // If the selector type requests html as its return type
  // transform and clean the element with provided selectors
+  var $content = void 0;
  if (extractHtml) {
-    var $content = $(matchingSelector);
+    // If matching selector is an array, we're considering this a
+    // multi-match selection, which allows the parser to choose several
+    // selectors to include in the result. Note that all selectors in the
+    // array must match in order for this selector to trigger
+    if (Array.isArray(matchingSelector)) {
+      (function () {
+        $content = $(matchingSelector.join(','));
+        var $wrapper = $('<div></div>');
+        $content.each(function (index, element) {
+          $wrapper.append(element);
+        });
+
+        $content = $wrapper;
+      })();
+    } else {
+      $content = $(matchingSelector);
+    }

    // Wrap in div so transformation can take place on root element
    $content.wrap($('<div></div>'));
@ -4581,7 +4809,7 @@ function select(opts) {
  // Allow custom extractor to skip default cleaner
  // for this type; defaults to true
  if (defaultCleaner) {
-    return Cleaners[type](result, opts);
+    return Cleaners[type](result, _extends({}, opts, extractionOpts));
  }

  return result;
@ -4847,9 +5075,16 @@ var Mercury = {
              });

            case 22:
+
+              // if this parse is happening in the browser,
+              // clean up any trace from the page.
+              if (cheerio.browser) {
+                cheerio.cleanup();
+              }
+
              return _context.abrupt('return', result);

-            case 23:
+            case 24:
            case 'end':
              return _context.stop();
          }
--- a/dist/mercury.js.map
+++ b/dist/mercury.js.map
--- a/dist/mercury.web.js
+++ b/dist/mercury.web.js