|
|
'use strict';
|
|
|
|
|
|
// CommonJS/ES-module interop shim: when a transpiled ES module is
// required, its default export lives under `.default` — unwrap it.
// Anything else (plain CJS exports, primitives, null) passes through.
function _interopDefault(ex) {
  if (ex && typeof ex === 'object' && 'default' in ex) {
    return ex['default'];
  }
  return ex;
}
|
|
|
|
|
|
var _regeneratorRuntime = _interopDefault(
|
|
|
require('@babel/runtime-corejs2/regenerator')
|
|
|
);
|
|
|
var _objectSpread = _interopDefault(
|
|
|
require('@babel/runtime-corejs2/helpers/objectSpread')
|
|
|
);
|
|
|
var _asyncToGenerator = _interopDefault(
|
|
|
require('@babel/runtime-corejs2/helpers/asyncToGenerator')
|
|
|
);
|
|
|
var URL = _interopDefault(require('url'));
|
|
|
var cheerio = _interopDefault(require('cheerio'));
|
|
|
var iconv = _interopDefault(require('iconv-lite'));
|
|
|
var _parseInt = _interopDefault(
|
|
|
require('@babel/runtime-corejs2/core-js/parse-int')
|
|
|
);
|
|
|
var _slicedToArray = _interopDefault(
|
|
|
require('@babel/runtime-corejs2/helpers/slicedToArray')
|
|
|
);
|
|
|
var _Promise = _interopDefault(
|
|
|
require('@babel/runtime-corejs2/core-js/promise')
|
|
|
);
|
|
|
var request = _interopDefault(require('request'));
|
|
|
var _Reflect$ownKeys = _interopDefault(
|
|
|
require('@babel/runtime-corejs2/core-js/reflect/own-keys')
|
|
|
);
|
|
|
var _toConsumableArray = _interopDefault(
|
|
|
require('@babel/runtime-corejs2/helpers/toConsumableArray')
|
|
|
);
|
|
|
var _defineProperty = _interopDefault(
|
|
|
require('@babel/runtime-corejs2/helpers/defineProperty')
|
|
|
);
|
|
|
var _parseFloat = _interopDefault(
|
|
|
require('@babel/runtime-corejs2/core-js/parse-float')
|
|
|
);
|
|
|
var _typeof = _interopDefault(require('@babel/runtime-corejs2/helpers/typeof'));
|
|
|
var _getIterator = _interopDefault(
|
|
|
require('@babel/runtime-corejs2/core-js/get-iterator')
|
|
|
);
|
|
|
var _Object$keys = _interopDefault(
|
|
|
require('@babel/runtime-corejs2/core-js/object/keys')
|
|
|
);
|
|
|
var stringDirection = _interopDefault(require('string-direction'));
|
|
|
var validUrl = _interopDefault(require('valid-url'));
|
|
|
var moment = _interopDefault(require('moment-timezone'));
|
|
|
var parseFormat = _interopDefault(require('moment-parseformat'));
|
|
|
var wuzzy = _interopDefault(require('wuzzy'));
|
|
|
var difflib = _interopDefault(require('difflib'));
|
|
|
var _Array$from = _interopDefault(
|
|
|
require('@babel/runtime-corejs2/core-js/array/from')
|
|
|
);
|
|
|
var ellipsize = _interopDefault(require('ellipsize'));
|
|
|
var _Array$isArray = _interopDefault(
|
|
|
require('@babel/runtime-corejs2/core-js/array/is-array')
|
|
|
);
|
|
|
|
|
|
// Runs of two or more whitespace characters (a single space/newline is
// left alone).
var NORMALIZE_RE = /\s{2,}/g;

// Collapse each whitespace run in `text` to a single space and strip
// leading/trailing whitespace.
function normalizeSpaces(text) {
  var collapsed = text.replace(NORMALIZE_RE, ' ');
  return collapsed.trim();
}
|
|
|
|
|
|
// Given a node type to search for, and a list of regular expressions,
|
|
|
// look to see if this extraction can be found in the URL. Expects
|
|
|
// that each expression in r_list will return group(1) as the proper
|
|
|
// string to be cleaned.
|
|
|
// Only used for date_published currently.
|
|
|
// Given a list of regular expressions, return group(1) of the first
// expression that matches the URL, or null when none match.
// Only used for date_published currently.
// FIX: the original found a match with re.test() and then re-ran the
// same regex with re.exec() — evaluating it twice and, for any regex
// carrying the /g flag, advancing lastIndex so the second call could
// return null (and throw on [1]). A single exec() per regex avoids both.
function extractFromUrl(url, regexList) {
  for (var i = 0; i < regexList.length; i += 1) {
    var match = regexList[i].exec(url);
    if (match) {
      return match[1];
    }
  }

  return null;
}
|
|
|
|
|
|
// An expression that looks to try to find the page digit within a URL, if
|
|
|
// it exists.
|
|
|
// Matches:
|
|
|
// page=1
|
|
|
// pg=1
|
|
|
// p=1
|
|
|
// paging=12
|
|
|
// pag=7
|
|
|
// pagination/1
|
|
|
// paging/88
|
|
|
// pa/83
|
|
|
// p/11
|
|
|
//
|
|
|
// Does not match:
|
|
|
// pg=102
|
|
|
// page:2
|
|
|
// An expression that looks to try to find the page digit within a URL, if
// it exists.
// Matches:
//  page=1, pg=1, p=1, paging=12, pag=7, pagination/1, paging/88, pa/83, p/11
// NOTE(review): the original notes claimed "Does not match: pg=102" — the
// pattern itself WILL match three-digit numbers ([0-9]{1,3}); values >= 100
// are only rejected later, in pageNumFromUrl. "page:2" is unmatched because
// ':' is not in the (=|/) separator set.
var PAGE_IN_HREF_RE = new RegExp(
  '(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|/)([0-9]{1,3})',
  'i'
);
// True when the string contains at least one ASCII letter.
var HAS_ALPHA_RE = /[a-z]/i;
// True when the string is ASCII letters only (used on file extensions).
var IS_ALPHA_RE = /^[a-z]+$/i;
// True when the string is digits only.
var IS_DIGIT_RE = /^[0-9]+$/i;
// Captures the charset token from a Content-Type header value.
var ENCODING_RE = /charset=([\w-]+)\b/;
// Fallback charset used when none is declared or the declared one is
// unknown to iconv (see getEncoding).
var DEFAULT_ENCODING = 'utf-8';
|
|
|
|
|
|
// Extract a plausible page number from a URL, or null when the URL has
// none (or the number is implausibly large).
function pageNumFromUrl(url) {
  var matches = url.match(PAGE_IN_HREF_RE);
  if (!matches) {
    return null;
  }

  var pageNum = _parseInt(matches[6], 10);

  // Anything of 100 or more is unlikely to be a real page number.
  return pageNum >= 100 ? null : pageNum;
}
|
|
|
|
|
|
// Strip the #fragment (and any trailing slash) from a URL.
function removeAnchor(url) {
  var withoutFragment = url.split('#')[0];
  return withoutFragment.replace(/\/$/, '');
}
|
|
|
|
|
|
// Decide whether a URL path segment should be kept when computing an
// article's base URL (see articleBaseUrl).
// :param segment: the path segment (file extension already split off)
// :param index: position in the REVERSED segment list (0 = last segment)
// :param firstSegmentHasLetters: whether the last URL segment had alphas
// NOTE(review): the original had a first branch for "purely numeric
// first/second segment" whose comment said "probably a page number.
// Remove it." but whose body assigned `goodSegment = true` — a no-op,
// since it already was true. That dead branch is deleted here; behavior
// is unchanged. If short numeric segments SHOULD be dropped, the branch
// must assign false instead — a behavior change to confirm with callers.
function isGoodSegment(segment, index, firstSegmentHasLetters) {
  var goodSegment = true;

  // If this is the first url_segment and it's just "index", remove it
  if (index === 0 && segment.toLowerCase() === 'index') {
    goodSegment = false;
  }

  // If our first or second url_segment is smaller than 3 characters,
  // and the first url_segment had no alphas, remove it.
  if (index < 2 && segment.length < 3 && !firstSegmentHasLetters) {
    goodSegment = false;
  }

  return goodSegment;
} // Take a URL, and return the article base of said URL. That is, no
|
|
|
// pagination data exists in it. Useful for comparing to other links
|
|
|
// that might have pagination data within them.
|
|
|
|
|
|
function articleBaseUrl(url, parsed) {
  // `parsed` is an optional, already-parsed result of URL.parse(url).
  var parsedUrl = parsed || URL.parse(url);
  var protocol = parsedUrl.protocol,
    host = parsedUrl.host,
    path = parsedUrl.path;
  // Set while visiting index 0 of the REVERSED segment list (i.e. the
  // last path segment of the URL); read by isGoodSegment for index 1.
  var firstSegmentHasLetters = false;
  var cleanedSegments = path
    .split('/')
    .reverse()
    .reduce(function(acc, rawSegment, index) {
      var segment = rawSegment; // Split off and save anything that looks like a file type.

      if (segment.includes('.')) {
        var _segment$split = segment.split('.'),
          _segment$split2 = _slicedToArray(_segment$split, 2),
          possibleSegment = _segment$split2[0],
          fileExt = _segment$split2[1];

        // Only treat it as a file extension when it is purely alphabetic
        // (e.g. ".html"); "2.5" keeps its dot.
        if (IS_ALPHA_RE.test(fileExt)) {
          segment = possibleSegment;
        }
      } // If our first or second segment has anything looking like a page
      // number, remove it.

      if (PAGE_IN_HREF_RE.test(segment) && index < 2) {
        segment = segment.replace(PAGE_IN_HREF_RE, '');
      } // If we're on the first segment, check to see if we have any
      // characters in it. The first segment is actually the last bit of
      // the URL, and this will be helpful to determine if we're on a URL
      // segment that looks like "/2/" for example.

      if (index === 0) {
        firstSegmentHasLetters = HAS_ALPHA_RE.test(segment);
      } // If it's not marked for deletion, push it to cleaned_segments.

      if (isGoodSegment(segment, index, firstSegmentHasLetters)) {
        acc.push(segment);
      }

      return acc;
    }, []);
  // Segments were accumulated in reverse order; flip them back before
  // reassembling the URL.
  return ''
    .concat(protocol, '//')
    .concat(host)
    .concat(cleanedSegments.reverse().join('/'));
}
|
|
|
|
|
|
// Given a string, return True if it appears to have an ending sentence
|
|
|
// within it, false otherwise.
|
|
|
// A period followed by a space or end-of-string.
// FIX: the pattern was built as new RegExp('.( |$)') — the unescaped '.'
// matched ANY character, so virtually every non-empty string "had a
// sentence end". The dot is now escaped to match a literal period.
var SENTENCE_END_RE = new RegExp('\\.( |$)');

// Given a string, return true if it appears to contain the end of a
// sentence, false otherwise.
function hasSentenceEnd(text) {
  return SENTENCE_END_RE.test(text);
}
|
|
|
|
|
|
// Return the first `words` whitespace-separated tokens of `content`
// (default 10), joined by single spaces.
function excerptContent(content) {
  var words =
    arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 10;
  var tokens = content.trim().split(/\s+/);
  var excerpt = tokens.slice(0, words);
  return excerpt.join(' ');
}
|
|
|
|
|
|
// used in our fetchResource function to
|
|
|
// ensure correctly encoded responses
|
|
|
|
|
|
// used in our fetchResource function to
// ensure correctly encoded responses
// Extract a charset from a Content-Type-like string; fall back to
// DEFAULT_ENCODING when absent or unsupported by iconv.
function getEncoding(str) {
  var match = ENCODING_RE.exec(str);

  if (match) {
    var candidate = match[1];

    // Only accept charsets iconv can actually decode.
    if (iconv.encodingExists(candidate)) {
      return candidate;
    }
  }

  return DEFAULT_ENCODING;
}
|
|
|
|
|
|
// Regenerator-transpiled generator yielding a run of integers.
// NOTE(review): the body mirrors a pre-transpilation source of the form
// `while (start <= end) yield (start += 1);` — it increments BEFORE
// yielding, so range(1, 3) yields 2, 3, 4 (not 1, 2, 3). Preserved
// as-is; confirm against callers before "fixing".
var _marked =
  /*#__PURE__*/
  _regeneratorRuntime.mark(range);

function range() {
  var start,
    end,
    _args = arguments;
  return _regeneratorRuntime.wrap(
    function range$(_context) {
      while (1) {
        switch ((_context.prev = _context.next)) {
          case 0:
            // Both bounds default to 1 when omitted.
            start = _args.length > 0 && _args[0] !== undefined ? _args[0] : 1;
            end = _args.length > 1 && _args[1] !== undefined ? _args[1] : 1;

          case 2:
            // Loop guard: jump to case 7 (done) once start > end.
            if (!(start <= end)) {
              _context.next = 7;
              break;
            }

            // The transpiled `yield`: increment start, yield the new value.
            _context.next = 5;
            return (start += 1);

          case 5:
            _context.next = 2;
            break;

          case 7:
          case 'end':
            return _context.stop();
        }
      }
    },
    _marked,
    this
  );
}
|
|
|
|
|
|
// extremely simple url validation as a first step
|
|
|
// extremely simple url validation as a first step
// A parsed URL with no hostname cannot be fetched.
function validateUrl(_ref) {
  var hostname = _ref.hostname;
  return Boolean(hostname);
}
|
|
|
|
|
|
// Canned error payloads that fetchResource RETURNS (it does not throw
// them) when a URL cannot be processed.
var Errors = {
  badUrl: {
    error: true,
    // NOTE(review): the key is `messages` (plural) — consumers must read
    // `.messages`, not `.message`. Possibly a typo, but renaming it would
    // break existing consumers; confirm before changing.
    messages:
      'The url parameter passed does not look like a valid URL. Please check your data and try again.',
  },
};
|
|
|
|
|
|
// Default outgoing request headers. Browser builds (cheerio.browser)
// send none — the browser supplies its own UA — while Node builds send
// a desktop Chrome user-agent string.
var REQUEST_HEADERS = cheerio.browser
  ? {}
  : {
      'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
    }; // The number of milliseconds to attempt to fetch a resource before timing out.

var FETCH_TIMEOUT = 10000; // Content types that we do not extract content from
|
|
|
|
|
|
// Content types that we refuse to extract content from (media blobs).
var BAD_CONTENT_TYPES = ['audio/mpeg', 'image/gif', 'image/jpeg', 'image/jpg'];

// Case-insensitive whole-string match against a Content-Type value.
var BAD_CONTENT_TYPES_RE = new RegExp(
  '^(' + BAD_CONTENT_TYPES.join('|') + ')$',
  'i'
); // Use this setting as the maximum size an article can be
// for us to attempt parsing. Defaults to 5 MB.

var MAX_CONTENT_LENGTH = 5242880; // Turn the global proxy on or off
|
|
// Promise adapter around the callback-style `request` library.
// Resolves with { body, response }; rejects with the transport error.
function get(options) {
  return new _Promise(function(resolve, reject) {
    request(options, function(err, response, body) {
      if (err) {
        reject(err);
        return;
      }

      resolve({ body: body, response: response });
    });
  });
} // Evaluate a response to ensure it's something we should be keeping.
|
|
|
// This does not validate in the sense of a response being 200 level or
|
|
|
// not. Validation here means that we haven't found reason to bail from
|
|
|
// further processing of this url.
|
|
|
|
|
|
// This does not validate in the sense of a response being 200 level or
// not. Validation here means that we haven't found reason to bail from
// further processing of this url. Throws on any disqualifying condition;
// returns true otherwise.
function validateResponse(response) {
  var parseNon2xx =
    arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : false;

  // Check if we got a valid status code.
  // statusMessage is required before short-circuiting because nock does
  // not set it in tests; when it's absent we fall back to statusCode,
  // which is only 200 for OK responses in tests.
  var badStatusMessage =
    response.statusMessage && response.statusMessage !== 'OK';

  if (badStatusMessage || response.statusCode !== 200) {
    if (!response.statusCode) {
      throw new Error(
        'Unable to fetch content. Original exception was ' + response.error
      );
    }

    if (!parseNon2xx) {
      throw new Error(
        'Resource returned a response status code of ' +
          response.statusCode +
          ' and resource was instructed to reject non-2xx level status codes.'
      );
    }
  }

  var headers = response.headers;
  var contentType = headers['content-type'];
  var contentLength = headers['content-length'];

  // Check that the content is not in BAD_CONTENT_TYPES.
  if (BAD_CONTENT_TYPES_RE.test(contentType)) {
    throw new Error(
      'Content-type for this resource was ' + contentType + ' and is not allowed.'
    );
  }

  // Check that the content length is below maximum. (The header value is
  // a string; `>` coerces it numerically.)
  if (contentLength > MAX_CONTENT_LENGTH) {
    throw new Error(
      'Content for this resource was too large. Maximum content length is ' +
        MAX_CONTENT_LENGTH +
        '.'
    );
  }

  return true;
} // Grabs the last two pieces of the URL and joins them back together
|
|
|
// TODO: This should gracefully handle timeouts and raise the
|
|
|
// proper exceptions on the many failure cases of HTTP.
|
|
|
// TODO: Ensure we are not fetching something enormous. Always return
|
|
|
// unicode content for HTML, with charset conversion.
|
|
|
|
|
|
// Public entry point: fetch the resource at `url`, optionally with a
// pre-parsed URL object. Delegates to the async implementation below;
// the _x/_x2 placeholder parameters only preserve the declared arity.
function fetchResource(_x, _x2) {
  return _fetchResource.apply(this, arguments);
}
|
|
|
|
|
|
// Async implementation behind fetchResource (regenerator-transpiled).
// Resolves to { body, response } on success; when validateResponse
// throws, it resolves (does NOT reject) with the canned Errors.badUrl
// object.
function _fetchResource() {
  _fetchResource = _asyncToGenerator(
    /*#__PURE__*/
    _regeneratorRuntime.mark(function _callee(url, parsedUrl) {
      var options, _ref2, response, body;

      return _regeneratorRuntime.wrap(
        function _callee$(_context) {
          while (1) {
            switch ((_context.prev = _context.next)) {
              case 0:
                parsedUrl = parsedUrl || URL.parse(encodeURI(url));
                options = {
                  url: parsedUrl.href,
                  headers: _objectSpread({}, REQUEST_HEADERS),
                  timeout: FETCH_TIMEOUT,
                  // Accept cookies
                  jar: true,
                  // Set to null so the response returns as binary and body as buffer
                  // https://github.com/request/request#requestoptions-callback
                  encoding: null,
                  // Accept and decode gzip
                  gzip: true,
                  // Follow any redirect
                  followAllRedirects: true,
                };
                _context.next = 4;
                return get(options);

              case 4:
                _ref2 = _context.sent;
                response = _ref2.response;
                body = _ref2.body;
                // Transpiled try/catch: cases 7-11 are the try body,
                // case 12 is the catch handler (see [[7, 12]] below).
                _context.prev = 7;
                validateResponse(response);
                return _context.abrupt('return', {
                  body: body,
                  response: response,
                });

              case 12:
                _context.prev = 12;
                _context.t0 = _context['catch'](7);
                // NOTE(review): every validation failure is flattened into
                // the generic badUrl payload; the original error message
                // is discarded here.
                return _context.abrupt('return', Errors.badUrl);

              case 15:
              case 'end':
                return _context.stop();
            }
          }
        },
        _callee,
        this,
        [[7, 12]]
      );
    })
  );
  return _fetchResource.apply(this, arguments);
}
|
|
|
|
|
|
// For every <meta> tag carrying attribute `from`, copy its value onto
// attribute `to` and delete `from`. Mutates the document in place;
// returns $ for chaining.
function convertMetaProp($, from, to) {
  var selector = 'meta[' + from + ']';
  $(selector).each(function(_, node) {
    var $node = $(node);
    $node.attr(to, $node.attr(from));
    $node.removeAttr(from);
  });
  return $;
} // For ease of use in extracting from meta tags,
|
|
|
// replace the "content" attribute on meta tags with the
|
|
|
// "value" attribute.
|
|
|
//
|
|
|
// In addition, normalize 'property' attributes to 'name' for ease of
|
|
|
// querying later. See, e.g., og or twitter meta tags.
|
|
|
|
|
|
// Normalize meta tags for extraction: rename "content" attributes to
// "value", and "property" attributes to "name" (so og:/twitter: tags can
// be queried uniformly).
function normalizeMetaTags($) {
  var normalized = convertMetaProp($, 'content', 'value');
  normalized = convertMetaProp(normalized, 'property', 'name');
  return normalized;
}
|
|
|
|
|
|
// Spacer images to be removed
|
|
|
// Spacer/placeholder images to be removed (matched against src).
var SPACER_RE = /transparent|spacer|blank/i; // The class we will use to mark elements we want to keep
// but would normally remove

var KEEP_CLASS = 'mercury-parser-keep';

// Embeds (YouTube/Vimeo iframes) that should survive cleaning.
var KEEP_SELECTORS = [
  'iframe[src^="https://www.youtube.com"]',
  'iframe[src^="https://www.youtube-nocookie.com"]',
  'iframe[src^="http://www.youtube.com"]',
  'iframe[src^="https://player.vimeo"]',
  'iframe[src^="http://player.vimeo"]',
];
|
|
|
|
|
|
// A list of tags to strip from the output if we encounter them.
var STRIP_OUTPUT_TAGS = [
  'title',
  'script',
  'noscript',
  'link',
  'style',
  'hr',
  'embed',
  'iframe',
  'object',
]; // cleanAttributes

// Attributes that survive attribute cleaning; everything else is dropped.
var WHITELIST_ATTRS = [
  'src',
  'srcset',
  'href',
  'class',
  'id',
  'alt',
  'xlink:href',
  'width',
  'height',
];

// Case-insensitive whole-string match against an attribute name.
var WHITELIST_ATTRS_RE = new RegExp(
  '^(' + WHITELIST_ATTRS.join('|') + ')$',
  'i'
); // removeEmpty
|
|
|
|
|
|
// Tags that are only removed conditionally, as a single CSS selector.
var CLEAN_CONDITIONALLY_TAGS = ['ul', 'ol', 'table', 'div', 'button', 'form'].join(
  ','
); // cleanHeaders

// Sub-headline tags (h1 is handled separately by cleanHOnes).
var HEADER_TAGS = ['h2', 'h3', 'h4', 'h5', 'h6'];
var HEADER_TAG_LIST = HEADER_TAGS.join(','); // // CONTENT FETCHING CONSTANTS ////
|
|
|
// A list of strings that can be considered unlikely candidates when
|
|
|
// extracting content from a resource. These strings are joined together
|
|
|
// and then tested for existence using re:test, so may contain simple,
|
|
|
// non-pipe style regular expression queries if necessary.
|
|
|
|
|
|
// Joined with '|' into CANDIDATES_BLACKLIST below and tested
// case-insensitively against each element's "class id" string.
var UNLIKELY_CANDIDATES_BLACKLIST = [
  'ad-break',
  'adbox',
  'advert',
  'addthis',
  'agegate',
  'aux',
  'blogger-labels',
  'combx',
  'comment',
  'conversation',
  'disqus',
  'entry-unrelated',
  'extra',
  'foot', // 'form', // This is too generic, has too many false positives
  'header',
  'hidden',
  'loader',
  'login', // Note: This can hit 'blogindex'.
  'menu',
  'meta',
  'nav',
  'outbrain',
  'pager',
  'pagination',
  'predicta', // readwriteweb inline ad box
  'presence_control_external', // lifehacker.com container full of false positives
  'popup',
  'printfriendly',
  'related',
  'remove',
  'remark',
  'rss',
  'share',
  'shoutbox',
  'sidebar',
  'sociable',
  'sponsor',
  'taboola',
  'tools',
]; // A list of strings that can be considered LIKELY candidates when
|
|
|
// extracting content from a resource. Essentially, the inverse of the
|
|
|
// blacklist above - if something matches both blacklist and whitelist,
|
|
|
// it is kept. This is useful, for example, if something has a className
|
|
|
// of "rss-content entry-content". It matched 'rss', so it would normally
|
|
|
// be removed, however, it's also the entry content, so it should be left
|
|
|
// alone.
|
|
|
//
|
|
|
// These strings are joined together and then tested for existence using
|
|
|
// re:test, so may contain simple, non-pipe style regular expression queries
|
|
|
// if necessary.
|
|
|
|
|
|
// Whitelist that overrides the blacklist above: an element matching both
// (e.g. class="rss-content entry-content") is kept.
var UNLIKELY_CANDIDATES_WHITELIST = [
  'and',
  'article',
  'body',
  'blogindex',
  'column',
  'content',
  'entry-content-asset',
  'format', // misuse of form
  'hfeed',
  'hentry',
  'hatom',
  'main',
  'page',
  'posts',
  'shadow',
]; // A list of tags which, if found inside, should cause a <div /> to NOT
// be turned into a paragraph tag. Shallow div tags without these elements
// should be turned into <p /> tags.

var DIV_TO_P_BLOCK_TAGS = [
  'a',
  'blockquote',
  'dl',
  'div',
  'img',
  'p',
  'pre',
  'table',
].join(','); // A list of tags that should be ignored when trying to find the top candidate
|
|
|
// an article container. Checked against className and id.
|
|
|
//
|
|
|
// TODO: Perhaps have these scale based on their odds of being quality?
|
|
|
|
|
|
// Strings that suggest an element IS article content; matched against
// className and id via POSITIVE_SCORE_RE below.
var POSITIVE_SCORE_HINTS = [
  'article',
  'articlecontent',
  'instapaper_body',
  'blog',
  'body',
  'content',
  'entry-content-asset',
  'entry',
  'hentry',
  'main',
  'Normal',
  'page',
  'pagination',
  'permalink',
  'post',
  'story',
  'text',
  '[-_]copy', // usatoday
  '\\Bcopy',
]; // The above list, joined into a matching regular expression

var POSITIVE_SCORE_RE = new RegExp(POSITIVE_SCORE_HINTS.join('|'), 'i'); // Readability publisher-specific guidelines
|
|
|
// an article container. Checked against className and id.
|
|
|
//
|
|
|
// TODO: Perhaps have these scale based on their odds of being quality?
|
|
|
|
|
|
// Strings that suggest an element is NOT article content (chrome,
// bylines, ads, navigation); matched via NEGATIVE_SCORE_RE below.
var NEGATIVE_SCORE_HINTS = [
  'adbox',
  'advert',
  'author',
  'bio',
  'bookmark',
  'bottom',
  'byline',
  'clear',
  'com-',
  'combx',
  'comment',
  'comment\\B',
  'contact',
  'copy',
  'credit',
  'crumb',
  'date',
  'deck',
  'excerpt',
  'featured', // tnr.com has a featured_content which throws us off
  'foot',
  'footer',
  'footnote',
  'graf',
  'head',
  'info',
  'infotext', // newscientist.com copyright
  'instapaper_ignore',
  'jump',
  'linebreak',
  'link',
  'masthead',
  'media',
  'meta',
  'modal',
  'outbrain', // slate.com junk
  'promo',
  'pr_', // autoblog - press release
  'related',
  'respond',
  'roundcontent', // lifehacker restricted content warning
  'scroll',
  'secondary',
  'share',
  'shopping',
  'shoutbox',
  'side',
  'sidebar',
  'sponsor',
  'stamp',
  'sub',
  'summary',
  'tags',
  'tools',
  'widget',
]; // The above list, joined into a matching regular expression

var NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i'); // Selector to try to determine if a page is wordpress. Not always successful.

// NOTE(review): this checks the `value` attribute (not `content`) because
// normalizeMetaTags renames meta `content` attributes to `value` before
// extraction runs.
var IS_WP_SELECTOR = 'meta[name=generator][value^=WordPress]'; // Matches pagination-ish tokens: page, paging, paginate/pagination.

var PAGE_RE = new RegExp('pag(e|ing|inat)', 'i'); // Match any link text/classname/id that looks like it could mean the next
|
|
|
// http://bit.ly/qneNIT
|
|
|
|
|
|
// HTML block-level tags; used (via the RE below) to stop collecting
// inline siblings when converting <br> runs into paragraphs.
var BLOCK_LEVEL_TAGS = [
  'article',
  'aside',
  'blockquote',
  'body',
  'br',
  'button',
  'canvas',
  'caption',
  'col',
  'colgroup',
  'dd',
  'div',
  'dl',
  'dt',
  'embed',
  'fieldset',
  'figcaption',
  'figure',
  'footer',
  'form',
  'h1',
  'h2',
  'h3',
  'h4',
  'h5',
  'h6',
  'header',
  'hgroup',
  'hr',
  'li',
  'map',
  'object',
  'ol',
  'output',
  'p',
  'pre',
  'progress',
  'section',
  'table',
  'tbody',
  'textarea',
  'tfoot',
  'th',
  'thead',
  'tr',
  'ul',
  'video',
];
// Case-insensitive whole-string match against a tag name.
var BLOCK_LEVEL_TAGS_RE = new RegExp(
  '^('.concat(BLOCK_LEVEL_TAGS.join('|'), ')$'),
  'i'
); // The removal is implemented as a blacklist and whitelist, this test finds
|
|
|
// blacklisted elements that aren't whitelisted. We do this all in one
|
|
|
// expression-both because it's only one pass, and because this skips the
|
|
|
// serialization for whitelisted nodes.
|
|
|
|
|
|
// Compiled forms of the two candidate lists above; tested against the
// concatenated "class id" string of each element.
var candidatesBlacklist = UNLIKELY_CANDIDATES_BLACKLIST.join('|');
var CANDIDATES_BLACKLIST = new RegExp(candidatesBlacklist, 'i');
var candidatesWhitelist = UNLIKELY_CANDIDATES_WHITELIST.join('|');
var CANDIDATES_WHITELIST = new RegExp(candidatesWhitelist, 'i');
|
|
|
|
|
|
// Loop through the provided document and remove any non-link nodes
// that are unlikely candidates for article content.
//
// Links are ignored because there are very often links to content
// that are identified as non-body-content, but may be inside
// article-like content.
//
// :param $: a cheerio object to strip nodes from
// :return $: the cleaned cheerio object
function stripUnlikelyCandidates($) {
  $('*')
    .not('a')
    .each(function(index, node) {
      var $node = $(node);
      var classes = $node.attr('class');
      var id = $node.attr('id');

      // Nothing to match against.
      if (!id && !classes) return;

      var classAndId = (classes || '') + ' ' + (id || '');

      // Whitelist wins over blacklist.
      if (CANDIDATES_WHITELIST.test(classAndId)) return;

      if (CANDIDATES_BLACKLIST.test(classAndId)) {
        $node.remove();
      }
    });
  return $;
}
|
|
|
|
|
|
// Another good candidate for refactoring/optimizing.
|
|
|
// Very imperative code, I don't love it. - AP
|
|
|
// Given cheerio object, convert consecutive <br /> tags into
|
|
|
// <p /> tags instead.
|
|
|
//
|
|
|
// :param $: A cheerio object
|
|
|
|
|
|
// Given a cheerio object, convert consecutive <br /> tags into
// <p /> tags instead: runs of <br>s are deleted and the final <br> of
// each run is paragraphized (collecting its inline siblings).
//
// :param $: A cheerio object
function brsToPs$$1($) {
  var collapsing = false;
  $('br').each(function(index, element) {
    var $element = $(element);
    var next = $element.next().get(0);
    var nextIsBr = Boolean(next) && next.tagName.toLowerCase() === 'br';

    if (nextIsBr) {
      // Inside a run of <br>s: drop this one, remember we're collapsing.
      collapsing = true;
      $element.remove();
      return;
    }

    if (collapsing) {
      // Last <br> of a run: turn it into a paragraph.
      collapsing = false;
      paragraphize(element, $, true);
    }
  });
  return $;
}
|
|
|
|
|
|
// make sure it conforms to the constraints of a P tag (I.E. does
|
|
|
// not contain any other block tags.)
|
|
|
//
|
|
|
// If the node is a <br />, it treats the following inline siblings
|
|
|
// as if they were its children.
|
|
|
//
|
|
|
// :param node: The node to paragraphize; this is a raw node
|
|
|
// :param $: The cheerio object to handle dom manipulation
|
|
|
// :param br: Whether or not the passed node is a br
|
|
|
|
|
|
// Given a node, make sure it conforms to the constraints of a P tag
// (i.e. does not contain any other block tags).
//
// If the node is a <br />, it treats the following inline siblings
// as if they were its children, moving them into a new <p>.
//
// :param node: The node to paragraphize; this is a raw node
// :param $: The cheerio object to handle dom manipulation
// :param br: Whether or not the passed node is a br
function paragraphize(node, $) {
  var br =
    arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : false;
  var $node = $(node);

  if (br) {
    var sibling = node.nextSibling;
    var p = $('<p></p>'); // while the next node is text or not a block level element
    // append it to a new p node

    while (
      sibling &&
      !(sibling.tagName && BLOCK_LEVEL_TAGS_RE.test(sibling.tagName))
    ) {
      // Capture nextSibling BEFORE appendTo moves the node — appending
      // mutates the DOM and would break the sibling chain mid-walk.
      var _sibling = sibling,
        nextSibling = _sibling.nextSibling;
      $(sibling).appendTo(p);
      sibling = nextSibling;
    }

    // Replace the <br> itself with the assembled paragraph.
    $node.replaceWith(p);
    $node.remove();
    return $;
  }

  // Non-br nodes are left untouched.
  return $;
}
|
|
|
|
|
|
// Convert each <div> with no block-level children into a <p>.
function convertDivs($) {
  $('div').each(function(index, div) {
    var $div = $(div);

    if ($div.children(DIV_TO_P_BLOCK_TAGS).length === 0) {
      convertNodeTo$$1($div, $, 'p');
    }
  });
  return $;
}
|
|
|
|
|
|
// Convert each <span> that is not nested inside a <p> or <div> into a <p>.
function convertSpans($) {
  $('span').each(function(index, span) {
    var $span = $(span);

    if ($span.parents('p, div').length === 0) {
      convertNodeTo$$1($span, $, 'p');
    }
  });
  return $;
} // Loop through the provided doc, and convert any p-like elements to
|
|
|
// actual paragraph tags.
|
|
|
//
|
|
|
// Things fitting this criteria:
|
|
|
// * Multiple consecutive <br /> tags.
|
|
|
// * <div /> tags without block level elements inside of them
|
|
|
// * <span /> tags who are not children of <p /> or <div /> tags.
|
|
|
//
|
|
|
// :param $: A cheerio object to search
|
|
|
// :return cheerio object with new p elements
|
|
|
// (By-reference mutation, though. Returned just for convenience.)
|
|
|
|
|
|
// Convert p-like structures into real <p> tags:
// * runs of consecutive <br /> tags,
// * <div /> tags without block-level children,
// * <span /> tags outside of <p> / <div>.
// Mutates $ in place; returns it for convenience.
function convertToParagraphs$$1($) {
  return convertSpans(convertDivs(brsToPs$$1($)));
}
|
|
|
|
|
|
// Replace $node with a new element of type `tag` (default 'p'),
// carrying over its attributes and inner content.
function convertNodeTo$$1($node, $) {
  var tag =
    arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 'p';
  var node = $node.get(0);

  if (!node) {
    // Empty selection: nothing to convert.
    return $;
  }

  var attrs = getAttrs(node) || {}; // console.log(attrs)

  // Serialize attributes into "key=value" pairs for the replacement tag.
  // NOTE(review): values are not quoted or escaped here — this assumes
  // attribute values contain no spaces or quotes; confirm upstream
  // cleaning guarantees that.
  var attribString = _Reflect$ownKeys(attrs)
    .map(function(key) {
      return ''.concat(key, '=').concat(attrs[key]);
    })
    .join(' ');

  var html;

  if ($.browser) {
    // In the browser, the contents of noscript tags aren't rendered, therefore
    // transforms on the noscript tag (commonly used for lazy-loading) don't work
    // as expected. This test case handles that
    html =
      node.tagName.toLowerCase() === 'noscript' ? $node.text() : $node.html();
  } else {
    html = $node.contents();
  }

  $node.replaceWith(
    '<'
      .concat(tag, ' ')
      .concat(attribString, '>')
      .concat(html, '</')
      .concat(tag, '>')
  );
  return $;
}
|
|
|
|
|
|
// Remove images that explicitly declare tiny heights or widths — most
// likely shims or icons, which aren't useful for reading. Otherwise
// drop any explicit height so the image can scale by width without
// breaking its aspect ratio.
function cleanForHeight($img, $) {
  var height = _parseInt($img.attr('height'), 10);
  var width = _parseInt($img.attr('width'), 10) || 20;
  // A missing/unparsable height is treated as 20 (i.e. "not tiny").
  var effectiveHeight = height || 20;

  if (effectiveHeight < 10 || width < 10) {
    $img.remove();
    return $;
  }

  if (height) {
    $img.removeAttr('height');
  }

  return $;
} // Cleans out images where the source string matches transparent/spacer/etc
|
|
|
// TODO This seems very aggressive - AP
|
|
|
|
|
|
// Drop an image whose src looks like a transparent/spacer/blank shim.
function removeSpacers($img, $) {
  var src = $img.attr('src');

  if (SPACER_RE.test(src)) {
    $img.remove();
  }

  return $;
}
|
|
|
|
|
|
// Apply both image cleaners (tiny dimensions, spacer srcs) to every
// <img> inside the article.
function cleanImages($article, $) {
  var cleanOne = function(index, img) {
    var $img = $(img);
    cleanForHeight($img, $);
    removeSpacers($img, $);
  };

  $article.find('img').each(cleanOne);
  return $;
}
|
|
|
|
|
|
// Tag elements matching `tags` (default KEEP_SELECTORS) with KEEP_CLASS
// so later cleaning passes leave them alone. When a url is supplied,
// same-origin iframes are also kept.
function markToKeep(article, $, url) {
  var tags =
    arguments.length > 3 && arguments[3] !== undefined ? arguments[3] : [];

  if (tags.length === 0) {
    tags = KEEP_SELECTORS;
  }

  if (url) {
    var parsed = URL.parse(url);
    var sameOriginIframe =
      'iframe[src^="' + parsed.protocol + '//' + parsed.hostname + '"]';
    // concat returns a new array — KEEP_SELECTORS is never mutated.
    tags = tags.concat([sameOriginIframe]);
  }

  $(tags.join(','), article).addClass(KEEP_CLASS);
  return $;
}
|
|
|
|
|
|
// Remove junk elements (default STRIP_OUTPUT_TAGS) from the article,
// but ignore any element marked with the mercury-parser-keep class.
function stripJunkTags(article, $) {
  var tags =
    arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : [];

  if (tags.length === 0) {
    tags = STRIP_OUTPUT_TAGS;
  }

  var keepSelector = '.' + KEEP_CLASS;
  $(tags.join(','), article)
    .not(keepSelector)
    .remove();
  return $;
}
|
|
|
|
|
|
// by the title extractor instead. If there's less than 3 of them (<3),
|
|
|
// strip them. Otherwise, turn 'em into H2s.
|
|
|
|
|
|
// H1 tags are presumed titles and handled by the title extractor.
// If there are fewer than 3 of them (<3), strip them; otherwise they are
// probably being used as section headers, so demote them to H2s.
function cleanHOnes$$1(article, $) {
  var $hOnes = $('h1', article);
  var demote = $hOnes.length >= 3;

  $hOnes.each(function(index, node) {
    if (demote) {
      convertNodeTo$$1($(node), $, 'h2');
    } else {
      $(node).remove();
    }
  });

  return $;
}
|
|
|
|
|
|
// Strip every attribute not matching WHITELIST_ATTRS_RE from each
// element in the article, then drop the internal keep-marker class.
function removeAllButWhitelist($article, $) {
  $article.find('*').each(function(index, node) {
    var attrs = getAttrs(node);
    var kept = {};

    _Reflect$ownKeys(attrs).forEach(function(attr) {
      if (WHITELIST_ATTRS_RE.test(attr)) {
        kept[attr] = attrs[attr];
      }
    });

    setAttrs(node, kept);
  }); // Remove the mercury-parser-keep class from result

  $('.' + KEEP_CLASS, $article).removeClass(KEEP_CLASS);
  return $article;
} // function removeAttrs(article, $) {
|
|
|
// REMOVE_ATTRS.forEach((attr) => {
|
|
|
// $(`[${attr}]`, article).removeAttr(attr);
|
|
|
// });
|
|
|
// }
|
|
|
// Remove attributes like style or align
|
|
|
|
|
|
// Remove attributes like style or align.
// Grabbing the parent because at this point $article will be wrapped in
// a div which will have a score set on it.
function cleanAttributes$$1($article, $) {
  var $target = $article.parent().length ? $article.parent() : $article;
  return removeAllButWhitelist($target, $);
}
|
|
|
|
|
|
// Drop paragraphs that contain no text and no embedded media.
function removeEmpty($article, $) {
  $article.find('p').each(function(index, p) {
    var $p = $(p);
    var hasMedia = $p.find('iframe, img').length > 0;

    if (!hasMedia && $p.text().trim() === '') {
      $p.remove();
    }
  });
  return $;
}
|
|
|
|
|
|
// // CONTENT FETCHING CONSTANTS ////
// A list of tags that should never be selected as the top candidate
// for a document.

var NON_TOP_CANDIDATE_TAGS$1 = [
  'br',
  'b',
  'i',
  'label',
  'hr',
  'area',
  'base',
  'basefont',
  'input',
  'img',
  'link',
  'meta',
];
// Case-insensitive match of an entire tag name against the list above.
var NON_TOP_CANDIDATE_TAGS_RE$1 = new RegExp(
  '^('.concat(NON_TOP_CANDIDATE_TAGS$1.join('|'), ')$'),
  'i'
); // A list of selectors that specify, very clearly, either hNews or other
// very content-specific style content, like Blogger templates.
// More examples here: http://microformats.org/wiki/blog-post-formats
// Each entry is a [parentSelector, childSelector] pair.

var HNEWS_CONTENT_SELECTORS$1 = [
  ['.hentry', '.entry-content'],
  // NOTE(review): 'entry' has no leading dot, so it selects <entry>
  // elements rather than class="entry" — confirm this is intentional.
  ['entry', '.entry-content'],
  ['.entry', '.entry_content'],
  ['.post', '.postbody'],
  ['.post', '.post_body'],
  ['.post', '.post-body'],
];
// Class/id fragments suggesting photo or figure content.
var PHOTO_HINTS$1 = ['figure', 'photo', 'image', 'caption'];
var PHOTO_HINTS_RE$1 = new RegExp(PHOTO_HINTS$1.join('|'), 'i'); // A list of strings that denote a positive scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?

var POSITIVE_SCORE_HINTS$1 = [
  'article',
  'articlecontent',
  'instapaper_body',
  'blog',
  'body',
  'content',
  'entry-content-asset',
  'entry',
  'hentry',
  'main',
  'Normal',
  'page',
  'pagination',
  'permalink',
  'post',
  'story',
  'text',
  '[-_]copy', // usatoday
  '\\Bcopy',
]; // The above list, joined into a matching regular expression

var POSITIVE_SCORE_RE$1 = new RegExp(POSITIVE_SCORE_HINTS$1.join('|'), 'i'); // Readability publisher-specific guidelines

var READABILITY_ASSET$1 = new RegExp('entry-content-asset', 'i'); // A list of strings that denote a negative scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?

var NEGATIVE_SCORE_HINTS$1 = [
  'adbox',
  'advert',
  'author',
  'bio',
  'bookmark',
  'bottom',
  'byline',
  'clear',
  'com-',
  'combx',
  'comment',
  'comment\\B',
  'contact',
  'copy',
  'credit',
  'crumb',
  'date',
  'deck',
  'excerpt',
  'featured', // tnr.com has a featured_content which throws us off
  'foot',
  'footer',
  'footnote',
  'graf',
  'head',
  'info',
  'infotext', // newscientist.com copyright
  'instapaper_ignore',
  'jump',
  'linebreak',
  'link',
  'masthead',
  'media',
  'meta',
  'modal',
  'outbrain', // slate.com junk
  'promo',
  'pr_', // autoblog - press release
  'related',
  'respond',
  'roundcontent', // lifehacker restricted content warning
  'scroll',
  'secondary',
  'share',
  'shopping',
  'shoutbox',
  'side',
  'sidebar',
  'sponsor',
  'stamp',
  'sub',
  'summary',
  'tags',
  'tools',
  'widget',
]; // The above list, joined into a matching regular expression

var NEGATIVE_SCORE_RE$1 = new RegExp(NEGATIVE_SCORE_HINTS$1.join('|'), 'i');

// Tags whose text content is scored like a paragraph (see scoreParagraph).
var PARAGRAPH_SCORE_TAGS$1 = new RegExp('^(p|li|span|pre)$', 'i');
// Container tags that carry a small fixed content score (see scoreNode).
var CHILD_CONTENT_TAGS$1 = new RegExp('^(td|blockquote|ol|ul|dl)$', 'i');
// Tags scored negatively: forms and addresses are rarely article content.
var BAD_TAGS$1 = new RegExp('^(address|form)$', 'i');
|
|
|
|
|
|
// Score how "content-like" a node is based purely on its class and id
// attributes. Positive/negative hint matches are worth +/-25; photo hints
// add 10 and the Readability entry-content-asset class adds 25 more.
function getWeight(node) {
  var classes = node.attr('class');
  var id = node.attr('id');
  var score = 0;

  if (id) {
    // if id exists, try to score on both positive and negative
    if (POSITIVE_SCORE_RE$1.test(id)) {
      score += 25;
    }

    if (NEGATIVE_SCORE_RE$1.test(id)) {
      score -= 25;
    }
  }

  if (classes) {
    if (score === 0) {
      // if classes exist and id did not contribute to score
      // try to score on both positive and negative
      if (POSITIVE_SCORE_RE$1.test(classes)) {
        score += 25;
      }

      if (NEGATIVE_SCORE_RE$1.test(classes)) {
        score -= 25;
      }
    } // even if score has been set by id, add score for
    // possible photo matches
    // "try to keep photos if we can"

    if (PHOTO_HINTS_RE$1.test(classes)) {
      score += 10;
    } // add 25 if class matches entry-content-asset,
    // a class apparently instructed for use in the
    // Readability publisher guidelines
    // https://www.readability.com/developers/guidelines

    if (READABILITY_ASSET$1.test(classes)) {
      score += 25;
    }
  }

  return score;
}
|
|
|
|
|
|
// returns the score of a node based on
// the node's score attribute
// returns null if no score set
// (note: a stored score of 0 also yields null, since 0 is falsy)
function getScore($node) {
  return _parseFloat($node.attr('score')) || null;
}
|
|
|
|
|
|
// Award one point for every comma in the text.
function scoreCommas(text) {
  var commas = text.match(/,/g);
  return commas === null ? 0 : commas.length;
}
|
|
|
|
|
|
var idkRe = new RegExp('^(p|pre)$', 'i');

// Award up to 3 bonus points based on text length, one "chunk" per
// 50 characters.
function scoreLength(textLength) {
  var tagName =
    arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 'p';
  var chunks = textLength / 50;

  if (chunks <= 0) {
    return 0;
  }

  // No idea why p or pre are being tamped down here (harder offset of 2
  // vs 1.25) but just following the source for now. Not even sure why
  // tagName is included here, since this is only being called from the
  // context of scoreParagraph.
  var lengthBonus = idkRe.test(tagName) ? chunks - 2 : chunks - 1.25;

  // Clamp the bonus to the [0, 3] range.
  return Math.min(Math.max(lengthBonus, 0), 3);
}
|
|
|
|
|
|
// Score a paragraph node using its length, number of commas, etc.
// Higher is better.
function scoreParagraph$$1(node) {
  var text = node.text().trim();
  var textLength = text.length;

  // If this paragraph is less than 25 characters, don't count it.
  if (textLength < 25) {
    return 0;
  }

  // Base score of 1, plus a point per comma, plus up to 3 points for
  // length (one per 50 characters).
  var score = 1 + scoreCommas(text) + scoreLength(textLength);

  // Articles can end with short paragraphs when people are being clever
  // but they can also end with short paragraphs setting up lists of junk
  // that we strip. This negative tweaks junk setup paragraphs just below
  // the cutoff threshold.
  if (text.slice(-1) === ':') {
    score -= 1;
  }

  return score;
}
|
|
|
|
|
|
// Persist `score` on the node's `score` attribute and return the node
// so calls can be chained. The `$` parameter is unused but kept for
// signature parity with the other scoring helpers.
function setScore($node, $, score) {
  $node.attr('score', score);
  return $node;
}
|
|
|
|
|
|
// Add `amount` to the node's score, initializing the score first when
// none exists yet.
function addScore$$1($node, $, amount) {
  try {
    var newScore = getOrInitScore$$1($node, $) + amount;
    setScore($node, $, newScore);
  } catch (e) {
    // Deliberately ignored; the error originates in scoreNode
    // (e.g. when the selection is empty).
  }

  return $node;
}
|
|
|
|
|
|
// Propagate a quarter of a node's score up to its parent, if it has one.
function addToParent$$1(node, $, score) {
  var parent = node.parent();

  // A cheerio selection object is always truthy, so the original
  // `if (parent)` check never filtered anything; check the selection's
  // length instead so the no-parent case doesn't rely on addScore
  // silently swallowing the resulting error.
  if (parent.length) {
    addScore$$1(parent, $, score * 0.25);
  }

  return node;
}
|
|
|
|
|
|
// Get a node's current score; if none is set, initialize one based on
// the node's tag type (plus, optionally, its class/id weight) and
// propagate a fraction of it to the parent.
function getOrInitScore$$1($node, $) {
  var weightNodes =
    arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : true;
  var existing = getScore($node);

  if (existing) {
    return existing;
  }

  var score = scoreNode$$1($node);

  if (weightNodes) {
    score += getWeight($node);
  }

  // Bubble a quarter of this node's score up to its parent.
  addToParent$$1($node, $, score);
  return score;
}
|
|
|
|
|
|
// Score an individual node based on its tag: paragraph-like tags are
// scored on their text; container tags get fixed values.
function scoreNode$$1($node) {
  var _$node$get = $node.get(0),
    tagName = _$node$get.tagName; // TODO: Consider ordering by most likely.
  // E.g., if divs are a more common tag on a page,
  // could save doing that regex test on every node – AP

  if (PARAGRAPH_SCORE_TAGS$1.test(tagName)) {
    return scoreParagraph$$1($node);
  }

  var lowered = tagName.toLowerCase();

  if (lowered === 'div') {
    return 5;
  }

  if (CHILD_CONTENT_TAGS$1.test(tagName)) {
    return 3;
  }

  if (BAD_TAGS$1.test(tagName)) {
    return -3;
  }

  // Table headers score negatively; everything else is neutral.
  return lowered === 'th' ? -5 : 0;
}
|
|
|
|
|
|
// Convert a span node to a div so it can participate in scoring.
function convertSpans$1($node, $) {
  var el = $node.get(0);

  if (el && el.tagName === 'span') {
    convertNodeTo$$1($node, $, 'div');
  }
}
|
|
|
|
|
|
// Normalize spans to divs, then fold `score` into the node's running
// score.
function addScoreTo($node, $, score) {
  if (!$node) {
    return;
  }

  convertSpans$1($node, $);
  addScore$$1($node, $, score);
}
|
|
|
|
|
|
// Score every not-yet-scored p/pre on the page, pushing the raw
// paragraph score to the parent (full value) and grandparent (half).
function scorePs($, weightNodes) {
  $('p, pre')
    .not('[score]')
    .each(function(index, node) {
      var $node = $(node);

      // Initialize and persist this paragraph's own score.
      $node = setScore($node, $, getOrInitScore$$1($node, $, weightNodes));

      // The raw score for this paragraph, before we add any
      // parent/child scores.
      var rawScore = scoreNode$$1($node);
      var $parent = $node.parent();
      addScoreTo($parent, $, rawScore, weightNodes);

      if ($parent) {
        // Add half of the individual content score to the grandparent.
        addScoreTo($parent.parent(), $, rawScore / 2, weightNodes);
      }
    });

  return $;
} // score content. Parents get the full value of their children's
|
|
|
// content score, grandparents half
|
|
|
|
|
|
function scoreContent$$1($) {
  var weightNodes =
    arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : true;

  // First, look for special hNews based selectors and give them a big
  // boost, if they exist.
  HNEWS_CONTENT_SELECTORS$1.forEach(function(_ref) {
    var parentSelector = _ref[0];
    var childSelector = _ref[1];

    $(''.concat(parentSelector, ' ').concat(childSelector)).each(function(
      index,
      node
    ) {
      addScore$$1($(node).parent(parentSelector), $, 80);
    });
  });

  // Doubling this again: the previous solution caused a bug in which
  // parents weren't retaining scores. This is not ideal, and should be
  // fixed.
  scorePs($, weightNodes);
  scorePs($, weightNodes);
  return $;
}
|
|
|
|
|
|
// Now that we have a top candidate, look through its siblings to see if
// any of them are decently scored. If they are, they may be split parts
// of the content (like two divs, a preamble and a body). Example:
// http://articles.latimes.com/2009/oct/14/business/fi-bigtvs14
function mergeSiblings($candidate, topScore, $) {
  // A detached candidate has no siblings to merge.
  if (!$candidate.parent().length) {
    return $candidate;
  }

  // Siblings must score at least a quarter of the top score (but never
  // less than 10) to be merged in.
  var siblingScoreThreshold = Math.max(10, topScore * 0.25);
  var wrappingDiv = $('<div></div>');
  $candidate
    .parent()
    .children()
    .each(function(index, sibling) {
      var $sibling = $(sibling); // Ignore tags like BR, HR, etc

      if (NON_TOP_CANDIDATE_TAGS_RE$1.test(sibling.tagName)) {
        return null;
      }

      var siblingScore = getScore($sibling);

      if (siblingScore) {
        // The candidate itself always goes into the wrapper.
        if ($sibling.get(0) === $candidate.get(0)) {
          wrappingDiv.append($sibling);
        } else {
          var contentBonus = 0;
          var density = linkDensity($sibling); // If sibling has a very low link density,
          // give it a small bonus

          if (density < 0.05) {
            contentBonus += 20;
          } // If sibling has a high link density,
          // give it a penalty

          if (density >= 0.5) {
            contentBonus -= 20;
          } // If sibling node has the same class as
          // candidate, give it a bonus

          if ($sibling.attr('class') === $candidate.attr('class')) {
            contentBonus += topScore * 0.2;
          }

          var newScore = siblingScore + contentBonus;

          if (newScore >= siblingScoreThreshold) {
            return wrappingDiv.append($sibling);
          }

          // Paragraph siblings get two extra chances: long paragraphs
          // that are mostly link-free...
          if (sibling.tagName === 'p') {
            var siblingContent = $sibling.text();
            var siblingContentLength = textLength(siblingContent);

            if (siblingContentLength > 80 && density < 0.25) {
              return wrappingDiv.append($sibling);
            }

            // ...or short, entirely link-free paragraphs that end like
            // a complete sentence.
            if (
              siblingContentLength <= 80 &&
              density === 0 &&
              hasSentenceEnd(siblingContent)
            ) {
              return wrappingDiv.append($sibling);
            }
          }
        }
      }

      return null;
    });

  // If the only thing we merged was the candidate itself, skip the
  // wrapper entirely and return the candidate unchanged.
  if (
    wrappingDiv.children().length === 1 &&
    wrappingDiv
      .children()
      .first()
      .get(0) === $candidate.get(0)
  ) {
    return $candidate;
  }

  return wrappingDiv;
}
|
|
|
|
|
|
// After we've calculated scores, loop through all of the possible
// candidate nodes we found and find the one with the highest score.
function findTopCandidate$$1($) {
  var $candidate;
  var topScore = 0;
  $('[score]').each(function(index, node) {
    // Ignore tags like BR, HR, etc
    if (NON_TOP_CANDIDATE_TAGS_RE$1.test(node.tagName)) {
      return;
    }

    var $node = $(node);
    var score = getScore($node);

    if (score > topScore) {
      topScore = score;
      $candidate = $node;
    }
  });

  // If we don't have a candidate, return the body or whatever the first
  // element is. A cheerio selection is always truthy, so the original
  // `$('body') || $('*').first()` fallback could never fire — check the
  // selection's length instead so the intended fallback works.
  if (!$candidate) {
    var $body = $('body');
    return $body.length ? $body : $('*').first();
  }

  $candidate = mergeSiblings($candidate, topScore, $);
  return $candidate;
}
|
|
|
|
|
|
// Scoring

// Decide whether a conditionally-cleanable node looks like real content;
// if it looks form-like, too short, too link-heavy, or script-heavy,
// remove it from the document.
function removeUnlessContent($node, $, weight) {
  // Explicitly save entry-content-asset tags, which are
  // noted as valuable in the Publisher guidelines. For now
  // this works everywhere. We may want to consider making
  // this less of a sure-thing later.
  if ($node.hasClass('entry-content-asset')) {
    return;
  }

  var content = normalizeSpaces($node.text());

  // Nodes with 10 or more commas read like prose and are always kept.
  if (scoreCommas(content) < 10) {
    var pCount = $('p', $node).length;
    var inputCount = $('input', $node).length; // Looks like a form, too many inputs.

    if (inputCount > pCount / 3) {
      $node.remove();
      return;
    }

    var contentLength = content.length;
    var imgCount = $('img', $node).length; // Content is too short, and there are no images, so
    // this is probably junk content.

    if (contentLength < 25 && imgCount === 0) {
      $node.remove();
      return;
    }

    var density = linkDensity($node); // Too high of link density, is probably a menu or
    // something similar.
    // console.log(weight, density, contentLength)

    if (weight < 25 && density > 0.2 && contentLength > 75) {
      $node.remove();
      return;
    } // Too high of a link density, despite the score being
    // high.

    if (weight >= 25 && density > 0.5) {
      // Don't remove the node if it's a list and the
      // previous sibling starts with a colon though. That
      // means it's probably content.
      var tagName = $node.get(0).tagName.toLowerCase();
      var nodeIsList = tagName === 'ol' || tagName === 'ul';

      if (nodeIsList) {
        var previousNode = $node.prev();

        if (
          previousNode &&
          normalizeSpaces(previousNode.text()).slice(-1) === ':'
        ) {
          return;
        }
      }

      $node.remove();
      return;
    }

    var scriptCount = $('script', $node).length; // Too many script tags, not enough content.

    if (scriptCount > 0 && contentLength < 150) {
      $node.remove();
    }
  }
} // Given an article, clean it of some superfluous content specified by
|
|
|
// tags. Things like forms, ads, etc.
|
|
|
//
|
|
|
// Tags is an array of tag name's to search through. (like div, form,
|
|
|
// etc)
|
|
|
//
|
|
|
// Return this same doc.
|
|
|
|
|
|
function cleanTags$$1($article, $) {
  $(CLEAN_CONDITIONALLY_TAGS, $article).each(function(index, node) {
    var $node = $(node);

    // Skip anything explicitly marked (or containing something marked)
    // with the keep class.
    var keepSelector = '.'.concat(KEEP_CLASS);

    if ($node.hasClass(KEEP_CLASS) || $node.find(keepSelector).length > 0) {
      return;
    }

    var weight = getScore($node);

    if (!weight) {
      weight = getOrInitScore$$1($node, $);
      setScore($node, $, weight);
    }

    if (weight < 0) {
      // Negative weight: drop the node outright.
      $node.remove();
    } else {
      // Otherwise decide whether the node looks like real content.
      removeUnlessContent($node, $, weight);
    }
  });

  return $;
}
|
|
|
|
|
|
// Remove headers that duplicate the title, precede all paragraphs, or
// carry a negative class/id weight.
function cleanHeaders($article, $) {
  var title =
    arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : '';
  $(HEADER_TAG_LIST, $article).each(function(index, header) {
    var $header = $(header);

    // Remove any headers that appear before all other p tags in the
    // document. This probably means that it was part of the title, a
    // subtitle or something else extraneous like a datestamp or byline,
    // all of which should be handled by other metadata handling.
    // Also remove headers that match the title exactly, and headers
    // with a negative class/id weight (probably junk).
    var shouldRemove =
      $($header, $article).prevAll('p').length === 0 ||
      normalizeSpaces($(header).text()) === title ||
      getWeight($(header)) < 0;

    if (shouldRemove) {
      return $header.remove();
    }

    return $header;
  });

  return $;
}
|
|
|
|
|
|
// Rewrite the html and body tags to divs to avoid later complications
// with multiple body tags.
function rewriteTopLevel$$1(article, $) {
  // Not using `article` as context here because it's problematic when
  // converting the top-level/root node - AP
  ['html', 'body'].forEach(function(tag) {
    $ = convertNodeTo$$1($(tag), $, 'div');
  });

  return $;
}
|
|
|
|
|
|
// Rewrite every occurrence of `attr` (e.g. href/src) inside $content to
// an absolute URL resolved against rootUrl.
function absolutize($, rootUrl, attr, $content) {
  $('['.concat(attr, ']'), $content).each(function(_, node) {
    var url = getAttrs(node)[attr];

    if (url) {
      setAttr(node, attr, URL.resolve(rootUrl, url));
    }
  });
}
|
|
|
|
|
|
// Make every href and src inside $content absolute relative to `url`.
function makeLinksAbsolute$$1($content, $, url) {
  ['href', 'src'].forEach(function(attr) {
    absolutize($, url, attr, $content);
  });

  return $content;
}
|
|
|
|
|
|
// Length of the text after trimming and collapsing runs of whitespace
// to single spaces.
function textLength(text) {
  var collapsed = text.trim().replace(/\s+/g, ' ');
  return collapsed.length;
} // Determines what percentage of the text
|
|
|
// in a node is link text
|
|
|
// Takes a node, returns a float
|
|
|
|
|
|
function linkDensity($node) {
  var totalTextLength = textLength($node.text());
  var linkLength = textLength($node.find('a').text());

  if (totalTextLength > 0) {
    return linkLength / totalTextLength;
  }

  // No text at all but some link text: treat the node as 100% links.
  return linkLength > 0 ? 1 : 0;
}
|
|
|
|
|
|
// Given a list of meta tag names to search for, find a matching meta tag
// in the document and return its (unique, non-empty) value, or null.
function extractFromMeta$$1($, metaNames, cachedNames) {
  var cleanTags =
    arguments.length > 3 && arguments[3] !== undefined ? arguments[3] : true;
  // Only consider names that actually appear in the document's cached
  // list of meta names.
  var foundNames = metaNames.filter(function(name) {
    return cachedNames.indexOf(name) !== -1;
  }); // eslint-disable-next-line no-restricted-syntax

  // What follows is a Babel-transpiled for...of loop over foundNames;
  // _loop returning { v: value } signals an early return of `value`
  // from the outer function.
  var _iteratorNormalCompletion = true;
  var _didIteratorError = false;
  var _iteratorError = undefined;

  try {
    var _loop = function _loop() {
      var name = _step.value;
      var type = 'name';
      var value = 'value';
      var nodes = $('meta['.concat(type, '="').concat(name, '"]')); // Get the unique value of every matching node, in case there
      // are two meta tags with the same name and value.
      // Remove empty values.

      var values = nodes
        .map(function(index, node) {
          return $(node).attr(value);
        })
        .toArray()
        .filter(function(text) {
          return text !== '';
        }); // If we have more than one value for the same name, we have a
      // conflict and can't trust any of them. Skip this name. If we have
      // zero, that means our meta tags had no values. Skip this name
      // also.

      if (values.length === 1) {
        var metaValue; // Meta values that contain HTML should be stripped, as they
        // weren't subject to cleaning previously.

        if (cleanTags) {
          metaValue = stripTags(values[0], $);
        } else {
          var _values = _slicedToArray(values, 1);

          metaValue = _values[0];
        }

        return {
          v: metaValue,
        };
      }
    };

    for (
      var _iterator = _getIterator(foundNames), _step;
      !(_iteratorNormalCompletion = (_step = _iterator.next()).done);
      _iteratorNormalCompletion = true
    ) {
      var _ret = _loop();

      if (_typeof(_ret) === 'object') return _ret.v;
    } // If nothing is found, return null
  } catch (err) {
    _didIteratorError = true;
    _iteratorError = err;
  } finally {
    // Standard transpiled-iterator cleanup: close the iterator when the
    // loop exited early, then rethrow any captured error.
    try {
      if (!_iteratorNormalCompletion && _iterator.return != null) {
        _iterator.return();
      }
    } finally {
      if (_didIteratorError) {
        throw _iteratorError;
      }
    }
  }

  return null;
}
|
|
|
|
|
|
// A node is "good" when it isn't a big container (too many children)
// and doesn't live inside a comment thread.
function isGoodNode($node, maxChildren) {
  // If it has a number of children, it's more likely a container
  // element. Skip it.
  if ($node.children().length > maxChildren) {
    return false;
  }

  // If it looks to be within a comment, skip it.
  return !withinComment$$1($node);
} // Given a list of selectors find content that may
|
|
|
// be extractable from the document. This is for flat
|
|
|
// meta-information, like author, title, date published, etc.
|
|
|
|
|
|
// Given a list of selectors, find content that may be extractable from
// the document. This is for flat meta-information, like author, title,
// date published, etc. Returns the first non-empty match, or null.
function extractFromSelectors$$1($, selectors) {
  var maxChildren =
    arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 1;
  var textOnly =
    arguments.length > 3 && arguments[3] !== undefined ? arguments[3] : true;
  // eslint-disable-next-line no-restricted-syntax
  // Babel-transpiled for...of loop over the selector list.
  var _iteratorNormalCompletion = true;
  var _didIteratorError = false;
  var _iteratorError = undefined;

  try {
    for (
      var _iterator = _getIterator(selectors), _step;
      !(_iteratorNormalCompletion = (_step = _iterator.next()).done);
      _iteratorNormalCompletion = true
    ) {
      var selector = _step.value;
      var nodes = $(selector); // If we didn't get exactly one of this selector, this may be
      // a list of articles or comments. Skip it.

      if (nodes.length === 1) {
        var $node = $(nodes[0]);

        if (isGoodNode($node, maxChildren)) {
          var content = void 0;

          if (textOnly) {
            content = $node.text();
          } else {
            content = $node.html();
          }

          if (content) {
            return content;
          }
        }
      }
    }
  } catch (err) {
    _didIteratorError = true;
    _iteratorError = err;
  } finally {
    // Standard transpiled-iterator cleanup: close the iterator when the
    // loop exited early, then rethrow any captured error.
    try {
      if (!_iteratorNormalCompletion && _iterator.return != null) {
        _iterator.return();
      }
    } finally {
      if (_didIteratorError) {
        throw _iteratorError;
      }
    }
  }

  return null;
}
|
|
|
|
|
|
// Strips all tags from a string of text.
function stripTags(text, $) {
  // Wrapping text in an html element prevents errors when the text has
  // no html.
  var cleanText = $('<span>'.concat(text, '</span>')).text();

  // Fall back to the raw input rather than losing it when the parsed
  // text comes back empty.
  return cleanText === '' ? text : cleanText;
}
|
|
|
|
|
|
// True when any ancestor of $node has "comment" in its class or id.
function withinComment$$1($node) {
  var commentParent = $node
    .parents()
    .toArray()
    .find(function(parent) {
      var attrs = getAttrs(parent);
      // Missing class/id stringify as "undefined", which is harmless
      // for the substring check below.
      var classAndId = ''.concat(attrs.class, ' ').concat(attrs.id);
      return classAndId.includes('comment');
    });

  return commentParent !== undefined;
}
|
|
|
|
|
|
// Given a node, determine if it's article-like enough to return
// param: $node (a cheerio node)
// return: boolean — true when the trimmed text is at least 100 chars
function nodeIsSufficient($node) {
  var trimmed = $node.text().trim();
  return trimmed.length >= 100;
}
|
|
|
|
|
|
// True when the document matches IS_WP_SELECTOR (the WordPress marker
// selector defined elsewhere in this file).
function isWordpress($) {
  return $(IS_WP_SELECTOR).length !== 0;
}
|
|
|
|
|
|
// Normalize attribute access across cheerio nodes (node.attribs is a
// plain object) and browser DOM nodes (node.attributes is a
// NamedNodeMap), returning a plain { name: value } map.
function getAttrs(node) {
  var attribs = node.attribs,
    attributes = node.attributes;

  if (attribs || !attributes) {
    return attribs;
  }

  // Browser path: fold the attribute collection into a plain object,
  // skipping entries without a usable name or value.
  return _Reflect$ownKeys(attributes).reduce(function(acc, index) {
    var attr = attributes[index];

    if (attr.name && attr.value) {
      acc[attr.name] = attr.value;
    }

    return acc;
  }, {});
}
|
|
|
|
|
|
// Set a single attribute, using whichever attribute API the node
// exposes (cheerio attribs object vs. browser setAttribute).
function setAttr(node, attr, val) {
  if (node.attribs) {
    node.attribs[attr] = val;
    return node;
  }

  if (node.attributes) {
    node.setAttribute(attr, val);
  }

  return node;
}
|
|
|
|
|
|
// Replace a node's entire attribute set, handling both cheerio nodes
// (attribs object) and browser DOM nodes (attributes/NamedNodeMap).
function setAttrs(node, attrs) {
  if (node.attribs) {
    node.attribs = attrs;
    return node;
  }

  if (node.attributes) {
    // Clear the existing attributes first...
    while (node.attributes.length > 0) {
      node.removeAttribute(node.attributes[0].name);
    }

    // ...then apply the replacements.
    _Reflect$ownKeys(attrs).forEach(function(key) {
      node.setAttribute(key, attrs[key]);
    });
  }

  return node;
}
|
|
|
|
|
|
// DOM manipulation

// Matches absolute http(s) URLs.
var IS_LINK = new RegExp('https?://', 'i');
// Loosely matches image file extensions. NOTE(review): the leading '.'
// is an unescaped regex dot, so it matches any character preceding the
// extension — confirm whether that is intentional.
var IS_IMAGE = new RegExp('.(png|gif|jpe?g)', 'i');
// Tags removed wholesale during cleaning.
var TAGS_TO_REMOVE = ['script', 'style', 'form'].join(',');
|
|
|
|
|
|
// Convert lazy loaded images into normal images.
// Many sites will have img tags with no source, or an image tag with a src
// attribute that is a placeholder. We need to be able to properly fill in
// the src attribute so the images are no longer lazy loaded.
function convertLazyLoadedImages($) {
  $('img').each(function(_, img) {
    var attrs = getAttrs(img);

    _Reflect$ownKeys(attrs).forEach(function(attr) {
      var value = attrs[attr];

      // Any non-src attribute holding an image URL is assumed to be the
      // lazy-load source; promote it to src.
      if (attr !== 'src' && IS_LINK.test(value) && IS_IMAGE.test(value)) {
        $(img).attr('src', value);
      }
    });
  });
  return $;
}
|
|
|
|
|
|
// Filter predicate for cheerio .filter(): matches HTML comment nodes.
function isComment(index, node) {
  var type = node.type;
  return type === 'comment';
}
|
|
|
|
|
|
// Strip every HTML comment node from the document.
function cleanComments($) {
  $.root()
    .find('*')
    .contents()
    .filter(isComment)
    .remove();
  return $;
}
|
|
|
|
|
|
// Remove scripts, styles, forms, and HTML comments from the document.
function clean($) {
  $(TAGS_TO_REMOVE).remove();
  return cleanComments($);
}
|
|
|
|
|
|
var Resource = {
  // Create a Resource.
  //
  // :param url: The URL for the document we should retrieve.
  // :param preparedResponse: If set, use as the response rather than
  //                          attempting to fetch it ourselves. Expects a
  //                          string.
  // :param parsedUrl: Pre-parsed URL object, passed through to
  //                   fetchResource.
  create: function create(url, preparedResponse, parsedUrl) {
    var _this = this;

    // Babel-regenerator async function: wrap a prepared response in a
    // synthetic 200 OK envelope, or fetch the url, then build a cheerio
    // document from the result.
    return _asyncToGenerator(
      /*#__PURE__*/
      _regeneratorRuntime.mark(function _callee() {
        var result, validResponse;
        return _regeneratorRuntime.wrap(
          function _callee$(_context) {
            while (1) {
              switch ((_context.prev = _context.next)) {
                case 0:
                  // No prepared response? Jump to the fetch path (case 5).
                  if (!preparedResponse) {
                    _context.next = 5;
                    break;
                  }

                  // Synthesize a successful response wrapper around the
                  // caller-provided body.
                  validResponse = {
                    statusMessage: 'OK',
                    statusCode: 200,
                    headers: {
                      'content-type': 'text/html',
                      'content-length': 500,
                    },
                  };
                  result = {
                    body: preparedResponse,
                    response: validResponse,
                  };
                  _context.next = 8;
                  break;

                case 5:
                  // await fetchResource(url, parsedUrl)
                  _context.next = 7;
                  return fetchResource(url, parsedUrl);

                case 7:
                  result = _context.sent;

                case 8:
                  // Fetch failures are flagged and returned as-is rather
                  // than thrown.
                  if (!result.error) {
                    _context.next = 11;
                    break;
                  }

                  result.failed = true;
                  return _context.abrupt('return', result);

                case 11:
                  return _context.abrupt('return', _this.generateDoc(result));

                case 12:
                case 'end':
                  return _context.stop();
              }
            }
          },
          _callee,
          this
        );
      })
    )();
  },
  // Build a cheerio document from a fetched { body, response } pair.
  // Throws when the payload doesn't look like text/html or parses to an
  // empty document.
  generateDoc: function generateDoc(_ref) {
    var content = _ref.body,
      response = _ref.response;
    var contentType = response.headers['content-type']; // TODO: Implement is_text function from
    // https://github.com/ReadabilityHoldings/readability/blob/8dc89613241d04741ebd42fa9fa7df1b1d746303/readability/utils/text.py#L57

    if (!contentType.includes('html') && !contentType.includes('text')) {
      throw new Error('Content does not appear to be text.');
    }

    var $ = this.encodeDoc({
      content: content,
      contentType: contentType,
    });

    if ($.root().children().length === 0) {
      throw new Error('No children, likely a bad parse.');
    }

    // Normalize meta tags, inline lazy-loaded images, and strip
    // scripts/styles/forms/comments.
    $ = normalizeMetaTags($);
    $ = convertLazyLoadedImages($);
    $ = clean($);
    return $;
  },
  // Decode the raw body using the encoding advertised in the headers,
  // re-decoding when the document's own meta tag disagrees.
  encodeDoc: function encodeDoc(_ref2) {
    var content = _ref2.content,
      contentType = _ref2.contentType;
    var encoding = getEncoding(contentType);
    var decodedContent = iconv.decode(content, encoding);
    var $ = cheerio.load(decodedContent); // after first cheerio.load, check to see if encoding matches

    var metaContentType = $('meta[http-equiv=content-type]').attr('content');
    var properEncoding = getEncoding(metaContentType); // if encodings in the header/body dont match, use the one in the body

    if (properEncoding !== encoding) {
      decodedContent = iconv.decode(content, properEncoding);
      $ = cheerio.load(decodedContent);
    }

    return $;
  },
};
|
|
|
|
|
|
// Map every domain in `domains` to the same extractor object.
var merge = function merge(extractor, domains) {
  var byDomain = {};
  domains.forEach(function(domain) {
    byDomain[domain] = extractor;
  });
  return byDomain;
};
|
|
|
|
|
|
// Build a domain -> extractor lookup covering the extractor's primary
// domain plus any supportedDomains it declares.
function mergeSupportedDomains(extractor) {
  if (!extractor.supportedDomains) {
    return merge(extractor, [extractor.domain]);
  }

  // concat flattens the supportedDomains array, so no copy via
  // toConsumableArray is needed.
  var domains = [extractor.domain].concat(extractor.supportedDomains);
  return merge(extractor, domains);
}
|
|
|
|
|
|
// Custom extractor configuration for blogspot.com.
var BloggerExtractor = {
  domain: 'blogspot.com',
  content: {
    // Blogger is insane and does not load its content
    // initially in the page, but it's all there
    // in noscript
    selectors: ['.post-content noscript'],
    // Selectors to remove from the extracted content
    clean: [],
    // Convert the noscript tag to a div
    transforms: {
      noscript: 'div',
    },
  },
  author: {
    selectors: ['.post-author-name'],
  },
  title: {
    selectors: ['.post h2.title'],
  },
  date_published: {
    selectors: ['span.publishdate'],
  },
};
|
|
|
|
|
|
// Custom extractor configuration for nymag.com.
var NYMagExtractor = {
  domain: 'nymag.com',
  content: {
    // Order by most likely. Extractor will stop on first occurrence
    selectors: ['div.article-content', 'section.body', 'article.article'],
    // Selectors to remove from the extracted content
    clean: ['.ad', '.single-related-story'],
    // Object of tranformations to make on matched elements
    // Each key is the selector, each value is the tag to
    // transform to.
    // If a function is given, it should return a string
    // to convert to or nothing (in which case it will not perform
    // the transformation.
    transforms: {
      // Convert h1s to h2s
      h1: 'h2',
      // Convert lazy-loaded noscript images to figures
      noscript: function noscript($node, $) {
        // In the browser, noscript content is raw text, so it must be
        // re-parsed to reach the children; in node they're available
        // directly.
        var $children = $.browser ? $($node.text()) : $node.children();

        // Only convert when the noscript wraps exactly one img.
        if (
          $children.length === 1 &&
          $children.get(0) !== undefined &&
          $children.get(0).tagName.toLowerCase() === 'img'
        ) {
          return 'figure';
        }

        return null;
      },
    },
  },
  title: {
    selectors: ['h1.lede-feature-title', 'h1.headline-primary', 'h1'],
  },
  author: {
    selectors: ['.by-authors', '.lede-feature-author'],
  },
  dek: {
    selectors: ['.lede-feature-teaser'],
  },
  date_published: {
    selectors: [
      ['time.article-timestamp[datetime]', 'datetime'],
      'time.article-timestamp',
    ],
  },
};
|
|
|
|
|
|
// Custom extraction rules for wikipedia.org articles.
var WikipediaExtractor = {
  domain: 'wikipedia.org',
  content: {
    selectors: ['#mw-content-text'],
    // Skip the generic content cleaner; Wikipedia markup is kept as-is.
    defaultCleaner: false,
    // transform top infobox to an image with caption
    transforms: {
      '.infobox img': function infoboxImg($node) {
        var $parent = $node.parents('.infobox'); // Only prepend the first image in .infobox
        if ($parent.children('img').length === 0) {
          $parent.prepend($node);
        }
      },
      '.infobox caption': 'figcaption',
      '.infobox': 'figure',
    },
    // Selectors to remove from the extracted content
    clean: [
      '.mw-editsection',
      'figure tr, figure td, figure tbody',
      '#toc',
      '.navbox',
    ],
  },
  // Static author string: articles have no individual byline.
  author: 'Wikipedia Contributors',
  title: {
    selectors: ['h2.title'],
  },
  date_published: {
    // The "last modified" footer line stands in for a publish date.
    selectors: ['#footer-info-lastmod'],
  },
};
|
|
|
|
|
|
// Custom extraction rules for twitter.com permalink pages.
var TwitterExtractor = {
  domain: 'twitter.com',
  content: {
    transforms: {
      // We're transforming essentially the whole page here.
      // Twitter doesn't have nice selectors, so our initial
      // selector grabs the whole page, then we're re-writing
      // it to fit our needs before we clean it up.
      '.permalink[role=main]': function permalinkRoleMain($node, $) {
        // Collect all tweets into a single container and swap it in
        // for the original page wrapper.
        var tweets = $node.find('.tweet');
        var $tweetContainer = $('<div id="TWEETS_GO_HERE"></div>');
        $tweetContainer.append(tweets);
        $node.replaceWith($tweetContainer);
      },
      // Twitter wraps @ mentions in <s> tags, which
      // renders as a strikethrough; convert them to spans.
      s: 'span',
    },
    selectors: ['.permalink[role=main]'],
    // Skip the generic cleaner; the transform above already reshapes the page.
    defaultCleaner: false,
    clean: ['.stream-item-footer', 'button', '.tweet-details-fixer'],
  },
  author: {
    selectors: ['.tweet.permalink-tweet .username'],
  },
  date_published: {
    // Millisecond epoch timestamp stored in a data attribute.
    selectors: [['.permalink-tweet ._timestamp[data-time-ms]', 'data-time-ms']],
  },
};
|
|
|
|
|
|
// Custom extraction rules for www.nytimes.com articles.
var NYTimesExtractor = {
  domain: 'www.nytimes.com',
  title: {
    selectors: ['h1.g-headline', 'h1[itemprop="headline"]', 'h1.headline'],
  },
  author: {
    selectors: [['meta[name="author"]', 'value'], '.g-byline', '.byline'],
  },
  content: {
    selectors: ['div.g-blocks', 'article#story'],
    transforms: {
      // Resolve NYT's lazy-loaded images by substituting a concrete width
      // into the "{{size}}" placeholder in the src URL.
      'img.g-lazy': function imgGLazy($node) {
        var src = $node.attr('src'); // const widths = $node.attr('data-widths')
        // .slice(1)
        // .slice(0, -1)
        // .split(',');
        // if (widths.length) {
        //   width = widths.slice(-1);
        // } else {
        //   width = '900';
        // }

        // Fixed width; see the commented-out data-widths logic above.
        var width = 640;
        src = src.replace('{{size}}', width);
        $node.attr('src', src);
      },
    },
    clean: [
      '.ad',
      'header#story-header',
      '.story-body-1 .lede.video',
      '.visually-hidden',
      '#newsletter-promo',
      '.promo',
      '.comments-button',
      '.hidden',
      '.comments',
      '.supplemental',
      '.nocontent',
      '.story-footer-links',
    ],
  },
  date_published: {
    selectors: [['meta[name="article:published"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  // null fields fall back to the generic extractor.
  dek: null,
  next_page_url: null,
  excerpt: null,
};
|
|
|
|
|
|
// Rename CustomExtractor
|
|
|
// to fit your publication
|
|
|
// Custom extraction rules for www.theatlantic.com articles.
var TheAtlanticExtractor = {
  domain: 'www.theatlantic.com',
  title: {
    selectors: ['h1.hed'],
  },
  author: {
    selectors: ['article#article .article-cover-extra .metadata .byline a'],
  },
  content: {
    selectors: [
      // Multi-match: lead image plus body, with a body-only fallback.
      ['.article-cover figure.lead-img', '.article-body'],
      '.article-body',
    ],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: [],
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.partner-box', '.callout'],
  },
  date_published: {
    selectors: [['time[itemProp="datePublished"]', 'datetime']],
  },
  // null fields fall back to the generic extractor.
  lead_image_url: null,
  next_page_url: null,
  excerpt: null,
};
|
|
|
|
|
|
// Rename CustomExtractor
|
|
|
// to fit your publication
|
|
|
// (e.g., NYTimesExtractor)
|
|
|
// Custom extraction rules for www.newyorker.com articles.
var NewYorkerExtractor = {
  domain: 'www.newyorker.com',
  title: {
    selectors: ['h1.title'],
  },
  author: {
    selectors: ['.contributors'],
  },
  content: {
    selectors: ['div#articleBody', 'div.articleBody'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: [],
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: [],
  },
  date_published: {
    selectors: [
      ['meta[name="article:published_time"]', 'value'],
      ['time[itemProp="datePublished"]', 'content'],
    ],
    // Dates on the page have no explicit offset; interpret as Eastern time.
    timezone: 'America/New_York',
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  dek: {
    selectors: ['.dek', 'h2.dek'],
  },
  next_page_url: null,
  excerpt: null,
};
|
|
|
|
|
|
// Rename CustomExtractor
|
|
|
// to fit your publication
|
|
|
// (e.g., NYTimesExtractor)
|
|
|
// Custom extraction rules for www.wired.com articles.
var WiredExtractor = {
  domain: 'www.wired.com',
  title: {
    selectors: ['h1.post-title'],
  },
  author: {
    selectors: ['a[rel="author"]'],
  },
  content: {
    selectors: ['article.content'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: [],
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.visually-hidden', 'figcaption img.photo'],
  },
  date_published: {
    selectors: [['meta[itemprop="datePublished"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  dek: {
    selectors: [],
  },
  next_page_url: null,
  excerpt: null,
};
|
|
|
|
|
|
// Rename CustomExtractor
|
|
|
// to fit your publication
|
|
|
// (e.g., NYTimesExtractor)
|
|
|
// Custom extraction rules for www.msn.com articles.
var MSNExtractor = {
  domain: 'www.msn.com',
  title: {
    selectors: ['h1'],
  },
  author: {
    selectors: ['span.authorname-txt'],
  },
  content: {
    selectors: ['div.richtext'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: [],
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['span.caption'],
  },
  date_published: {
    selectors: ['span.time'],
  },
  lead_image_url: {
    selectors: [],
  },
  dek: {
    selectors: [],
  },
  next_page_url: null,
  excerpt: null,
};
|
|
|
|
|
|
// Rename CustomExtractor
|
|
|
// to fit your publication
|
|
|
// (e.g., NYTimesExtractor)
|
|
|
// Custom extraction rules for www.yahoo.com articles.
var YahooExtractor = {
  domain: 'www.yahoo.com',
  title: {
    selectors: ['header.canvas-header'],
  },
  author: {
    selectors: ['span.provider-name'],
  },
  content: {
    selectors: [
      // enter content selectors
      '.content-canvas',
    ],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: [],
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.figure-caption'],
  },
  date_published: {
    selectors: [['time.date[datetime]', 'datetime']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  dek: {
    selectors: [
      // enter dek selectors
    ],
  },
  next_page_url: null,
  excerpt: null,
};
|
|
|
|
|
|
// Rename CustomExtractor
|
|
|
// to fit your publication
|
|
|
// (e.g., NYTimesExtractor)
|
|
|
// Custom extraction rules for www.buzzfeed.com posts.
var BuzzfeedExtractor = {
  domain: 'www.buzzfeed.com',
  title: {
    selectors: ['h1[id="post-title"]'],
  },
  author: {
    // NOTE(review): the fallback was previously the bare selector
    // 'byline__author', a tag selector that can never match an element;
    // it is presumably meant to be the class selector below — confirm
    // against current BuzzFeed markup.
    selectors: ['a[data-action="user/username"]', '.byline__author'],
  },
  content: {
    selectors: [
      // Multi-match: custom header media plus body, with a body-only fallback.
      ['.longform_custom_header_media', '#buzz_sub_buzz'],
      '#buzz_sub_buzz',
    ],
    defaultCleaner: false,
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {
      h2: 'b',
      'div.longform_custom_header_media': function divLongform_custom_header_media(
        $node
      ) {
        // BUG FIX: .has() returns a selection object, which is always
        // truthy — check .length so the header div is only converted to a
        // <figure> when it really contains an image and a source caption.
        if (
          $node.has('img').length > 0 &&
          $node.has('.longform_header_image_source').length > 0
        ) {
          return 'figure';
        }

        // null means: leave the node untransformed.
        return null;
      },
      'figure.longform_custom_header_media .longform_header_image_source':
        'figcaption',
    },
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: [
      '.instapaper_ignore',
      '.suplist_list_hide .buzz_superlist_item .buzz_superlist_number_inline',
      '.share-box',
      '.print',
    ],
  },
  date_published: {
    selectors: ['.buzz-datetime'],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  dek: {
    selectors: [],
  },
  next_page_url: null,
  excerpt: null,
};
|
|
|
|
|
|
// Rename CustomExtractor
|
|
|
// to fit your publication
|
|
|
// (e.g., NYTimesExtractor)
|
|
|
// Custom extraction rules for fandom.wikia.com articles.
var WikiaExtractor = {
  domain: 'fandom.wikia.com',
  title: {
    selectors: ['h1.entry-title'],
  },
  author: {
    selectors: ['.author vcard', '.fn'],
  },
  content: {
    selectors: ['.grid-content', '.entry-content'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: [],
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: [],
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  dek: {
    selectors: [],
  },
  next_page_url: null,
  excerpt: null,
};
|
|
|
|
|
|
// Rename CustomExtractor
|
|
|
// to fit your publication
|
|
|
// (e.g., NYTimesExtractor)
|
|
|
// Custom extraction rules for www.littlethings.com articles.
// Note: no date_published or dek rules; those fall back to generic extraction.
var LittleThingsExtractor = {
  domain: 'www.littlethings.com',
  title: {
    selectors: ['h1.post-title'],
  },
  author: {
    selectors: [['meta[name="author"]', 'value']],
  },
  content: {
    selectors: [
      // enter content selectors
      '.mainContentIntro',
      '.content-wrapper',
    ],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: [],
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: [],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  next_page_url: null,
  excerpt: null,
};
|
|
|
|
|
|
// Rename CustomExtractor
|
|
|
// to fit your publication
|
|
|
// (e.g., NYTimesExtractor)
|
|
|
// Custom extraction rules for www.politico.com articles.
var PoliticoExtractor = {
  domain: 'www.politico.com',
  title: {
    selectors: [
      // enter title selectors
      ['meta[name="og:title"]', 'value'],
    ],
  },
  author: {
    selectors: ['.story-main-content .byline .vcard'],
  },
  content: {
    selectors: [
      // enter content selectors
      '.story-main-content',
      '.content-group',
      '.story-core',
      '.story-text',
    ],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: [],
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['figcaption'],
  },
  date_published: {
    selectors: [['.story-main-content .timestamp time[datetime]', 'datetime']],
  },
  lead_image_url: {
    selectors: [
      // enter lead_image_url selectors
      ['meta[name="og:image"]', 'value'],
    ],
  },
  dek: {
    selectors: [],
  },
  next_page_url: null,
  excerpt: null,
};
|
|
|
|
|
|
// Custom extraction rules for deadspin.com and the other Gawker/Kinja sites.
var DeadspinExtractor = {
  domain: 'deadspin.com',
  // All of these share the same Kinja page template.
  supportedDomains: [
    'jezebel.com',
    'lifehacker.com',
    'kotaku.com',
    'gizmodo.com',
    'jalopnik.com',
    'kinja.com',
  ],
  title: {
    selectors: ['h1.headline'],
  },
  author: {
    selectors: ['.author'],
  },
  content: {
    selectors: ['.post-content', '.entry-content'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {
      // Rewrite lazy-loaded YouTube embeds to direct embed iframes.
      'iframe.lazyload[data-recommend-id^="youtube://"]': function iframeLazyloadDataRecommendIdYoutube(
        $node
      ) {
        // assumes matched iframes carry an id of the form
        // "youtube-<videoId>" — TODO confirm; a missing id would throw here.
        var youtubeId = $node.attr('id').split('youtube-')[1];
        $node.attr('src', 'https://www.youtube.com/embed/'.concat(youtubeId));
      },
    },
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.magnifier', '.lightbox'],
  },
  date_published: {
    selectors: [['time.updated[datetime]', 'datetime']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  dek: {
    selectors: [
      // enter selectors
    ],
  },
  next_page_url: {
    selectors: [
      // enter selectors
    ],
  },
  excerpt: {
    selectors: [
      // enter selectors
    ],
  },
};
|
|
|
|
|
|
// Rename CustomExtractor
|
|
|
// to fit your publication
|
|
|
// (e.g., NYTimesExtractor)
|
|
|
// Custom extraction rules for www.broadwayworld.com articles.
var BroadwayWorldExtractor = {
  domain: 'www.broadwayworld.com',
  title: {
    selectors: ['h1.article-title'],
  },
  author: {
    selectors: ['span[itemprop=author]'],
  },
  content: {
    selectors: ['div[itemprop=articlebody]'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: [],
  },
  date_published: {
    selectors: [['meta[itemprop=datePublished]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  dek: {
    selectors: [],
  },
  next_page_url: {
    selectors: [
      // enter selectors
    ],
  },
  excerpt: {
    selectors: [
      // enter selectors
    ],
  },
};
|
|
|
|
|
|
// Rename CustomExtractor
|
|
|
// to fit your publication
|
|
|
// (e.g., NYTimesExtractor)
|
|
|
// Custom extraction rules for www.apartmenttherapy.com articles.
var ApartmentTherapyExtractor = {
  domain: 'www.apartmenttherapy.com',
  title: {
    selectors: ['h1.headline'],
  },
  author: {
    selectors: ['.PostByline__name'],
  },
  content: {
    selectors: ['div.post__content'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {
      // Replace React lazy-picture placeholders with a plain <img>.
      'div[data-render-react-id="images/LazyPicture"]': function divDataRenderReactIdImagesLazyPicture(
        $node,
        $
      ) {
        // assumes data-props is JSON with a non-empty `sources` array whose
        // first entry has a `src` — TODO confirm; malformed props would throw.
        var data = JSON.parse($node.attr('data-props'));
        var src = data.sources[0].src;
        var $img = $('<img />').attr('src', src);
        $node.replaceWith($img);
      },
    },
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: [],
  },
  date_published: {
    selectors: [['.PostByline__timestamp[datetime]', 'datetime']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  dek: {
    selectors: [],
  },
  next_page_url: {
    selectors: [
      // enter selectors
    ],
  },
  excerpt: {
    selectors: [
      // enter selectors
    ],
  },
};
|
|
|
|
|
|
// Custom extraction rules for medium.com and Medium-hosted publications.
var MediumExtractor = {
  domain: 'medium.com',
  supportedDomains: ['trackchanges.postlight.com'],
  title: {
    selectors: ['h1'],
  },
  author: {
    selectors: [['meta[name="author"]', 'value']],
  },
  content: {
    selectors: [
      ['.section-content'],
      '.section-content',
      'article > div > section',
    ],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {
      // Re-write lazy-loaded youtube videos
      iframe: function iframe($node) {
        // Matches embed.ly thumbnails that proxy a YouTube preview image;
        // capture group 1 is the YouTube video id.
        var ytRe = /https:\/\/i.embed.ly\/.+url=https:\/\/i\.ytimg\.com\/vi\/(\w+)\//;
        var thumb = decodeURIComponent($node.attr('data-thumbnail'));

        if (ytRe.test(thumb)) {
          var _thumb$match = thumb.match(ytRe),
            _thumb$match2 = _slicedToArray(_thumb$match, 2),
            _ = _thumb$match2[0],
            youtubeId = _thumb$match2[1]; // eslint-disable-line

          // Point the iframe at the real YouTube embed and drop everything
          // from the wrapping figure except the iframe and its caption.
          $node.attr('src', 'https://www.youtube.com/embed/'.concat(youtubeId));
          var $parent = $node.parents('figure');
          var $caption = $parent.find('figcaption');
          $parent.empty().append([$node, $caption]);
        }
      },
      // rewrite figures to pull out image and caption, remove rest
      figure: function figure($node) {
        // ignore if figure has an iframe
        if ($node.find('iframe').length > 0) return;
        var $img = $node.find('img').slice(-1)[0];
        var $caption = $node.find('figcaption');
        $node.empty().append([$img, $caption]);
      },
    },
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: [],
  },
  date_published: {
    selectors: [['time[datetime]', 'datetime']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  dek: {
    selectors: [
      // enter selectors
    ],
  },
  next_page_url: {
    selectors: [
      // enter selectors
    ],
  },
  excerpt: {
    selectors: [
      // enter selectors
    ],
  },
};
|
|
|
|
|
|
// Custom extraction rules for www.tmz.com articles.
var WwwTmzComExtractor = {
  domain: 'www.tmz.com',
  title: {
    selectors: ['.post-title-breadcrumb', 'h1', '.headline'],
  },
  // TMZ articles carry no individual byline; use a static author string.
  author: 'TMZ STAFF',
  date_published: {
    selectors: ['.article-posted-date'],
    // Dates on the page have no explicit offset; interpret as Pacific time.
    timezone: 'America/Los_Angeles',
  },
  dek: {
    selectors: [
      // enter selectors
    ],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.article-content', '.all-post-body'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.lightbox-link'],
  },
};
|
|
|
|
|
|
// Custom extraction rules for www.washingtonpost.com articles.
var WwwWashingtonpostComExtractor = {
  domain: 'www.washingtonpost.com',
  title: {
    selectors: ['h1', '#topper-headline-wrapper'],
  },
  author: {
    selectors: ['.pb-byline'],
  },
  date_published: {
    selectors: [['.pb-timestamp[itemprop="datePublished"]', 'content']],
  },
  dek: {
    selectors: [],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.article-body'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {
      // Keep inline-content blocks only when they contain media; wrap the
      // kept ones in <figure>, drop the rest.
      'div.inline-content': function divInlineContent($node) {
        if ($node.has('img,iframe,video').length > 0) {
          return 'figure';
        }

        $node.remove();
        return null;
      },
      '.pb-caption': 'figcaption',
    },
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.interstitial-link', '.newsletter-inline-unit'],
  },
};
|
|
|
|
|
|
// Custom extraction rules for www.huffingtonpost.com articles.
var WwwHuffingtonpostComExtractor = {
  domain: 'www.huffingtonpost.com',
  title: {
    selectors: ['h1.headline__title'],
  },
  author: {
    selectors: ['span.author-card__details__name'],
  },
  date_published: {
    selectors: [
      // Prefer the modified time, then the published time.
      ['meta[name="article:modified_time"]', 'value'],
      ['meta[name="article:published_time"]', 'value'],
    ],
  },
  dek: {
    selectors: ['h2.headline__subtitle'],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['div.entry__body'],
    // Skip the generic cleaner; rely on the explicit clean list below.
    defaultCleaner: false,
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {
      // 'div.top-media': ($node) => {
      //   const $figure = $node.children('figure');
      //   $node.replaceWith($figure);
      // },
    },
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: [
      '.pull-quote',
      '.tag-cloud',
      '.embed-asset',
      '.below-entry',
      '.entry-corrections',
      '#suggested-story',
    ],
  },
};
|
|
|
|
|
|
// Custom extraction rules for newrepublic.com (articles and "Minutes" posts).
var NewrepublicComExtractor = {
  domain: 'newrepublic.com',
  title: {
    selectors: ['h1.article-headline', '.minutes-primary h1.minute-title'],
  },
  author: {
    selectors: ['div.author-list', '.minutes-primary h3.minute-byline'],
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
    // Dates on the page have no explicit offset; interpret as Eastern time.
    timezone: 'America/New_York',
  },
  dek: {
    selectors: ['h2.article-subhead'],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: [
      // Multi-match: cover/lead image plus body, per template.
      ['.article-cover', 'div.content-body'],
      ['.minute-image', '.minutes-primary div.content-body'],
    ],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['aside'],
  },
};
|
|
|
|
|
|
// Custom extraction rules for money.cnn.com articles.
var MoneyCnnComExtractor = {
  domain: 'money.cnn.com',
  title: {
    selectors: ['.article-title'],
  },
  author: {
    selectors: ['.byline a'],
  },
  date_published: {
    selectors: [['meta[name="date"]', 'value']],
    // Dates in the meta tag have no explicit offset; interpret as GMT.
    timezone: 'GMT',
  },
  dek: {
    selectors: ['#storytext h2'],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['#storytext'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.inStoryHeading'],
  },
};
|
|
|
|
|
|
// Custom extraction rules for www.theverge.com (and polygon.com, which
// shares the same Vox Media templates).
var WwwThevergeComExtractor = {
  domain: 'www.theverge.com',
  supportedDomains: ['www.polygon.com'],
  title: {
    selectors: ['h1'],
  },
  author: {
    selectors: [['meta[name="author"]', 'value']],
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  dek: {
    selectors: ['h2.p-dek'],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: [
      // feature template multi-match
      ['.c-entry-hero .e-image', '.c-entry-intro', '.c-entry-content'], // regular post multi-match
      ['.e-image--hero', '.c-entry-content'], // feature template fallback
      '.l-wrapper .l-feature', // regular post fallback
      'div.c-entry-content',
    ],
    // Transform lazy-loaded images
    transforms: {
      noscript: function noscript($node) {
        var $children = $node.children();

        // Promote a noscript that wraps exactly one <img> to a <span> so
        // the real image survives cleaning.
        if ($children.length === 1 && $children.get(0).tagName === 'img') {
          return 'span';
        }

        // null means: leave the node untransformed.
        return null;
      },
    },
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.aside', 'img.c-dynamic-image'],
  },
};
|
|
|
|
|
|
// Custom extraction rules for www.cnn.com articles.
var WwwCnnComExtractor = {
  domain: 'www.cnn.com',
  title: {
    selectors: ['h1.pg-headline', 'h1'],
  },
  author: {
    selectors: ['.metadata__byline__author'],
  },
  date_published: {
    selectors: [['meta[name="pubdate"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: [
      // a more specific selector to grab the lead image and the body
      ['.media__video--thumbnail', '.zn-body-text'], // a fallback for the above
      '.zn-body-text',
      'div[itemprop="articleBody"]',
    ],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {
      // Wrap non-empty paragraph blocks in a real <p>.
      '.zn-body__paragraph, .el__leafmedia--sourced-paragraph': function znBody__paragraphEl__leafmediaSourcedParagraph(
        $node
      ) {
        var $text = $node.html();

        if ($text) {
          return 'p';
        }

        // null means: leave the node untransformed.
        return null;
      },
      // this transform cleans the short, all-link sections linking
      // to related content but not marked as such in any way.
      '.zn-body__paragraph': function znBody__paragraph($node) {
        // BUG FIX: .has() returns a selection object, which is always
        // truthy — check .length (as the Washington Post extractor does)
        // so only paragraphs that actually contain links are considered.
        if ($node.has('a').length > 0) {
          // If all of the paragraph's text comes from its links, it is a
          // related-content stub; drop it.
          if (
            $node.text().trim() ===
            $node
              .find('a')
              .text()
              .trim()
          ) {
            $node.remove();
          }
        }
      },
      '.media__video--thumbnail': 'figure',
    },
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: [],
  },
};
|
|
|
|
|
|
// Custom extraction rules for www.aol.com articles.
var WwwAolComExtractor = {
  domain: 'www.aol.com',
  title: {
    selectors: ['h1.p-article__title'],
  },
  author: {
    selectors: [['meta[name="author"]', 'value']],
  },
  date_published: {
    selectors: ['.p-article__byline__date'],
    // Dates on the page have no explicit offset; interpret as Eastern time.
    timezone: 'America/New_York',
  },
  dek: {
    selectors: [
      // enter selectors
    ],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.article-content'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: [],
  },
};
|
|
|
|
|
|
// Custom extraction rules for www.youtube.com watch pages.
var WwwYoutubeComExtractor = {
  domain: 'www.youtube.com',
  title: {
    selectors: ['.watch-title', 'h1.watch-title-container'],
  },
  author: {
    selectors: ['.yt-user-info'],
  },
  date_published: {
    selectors: [['meta[itemProp="datePublished"]', 'value']],
    // Dates in the meta tag have no explicit offset; interpret as GMT.
    timezone: 'GMT',
  },
  dek: {
    selectors: [
      // enter selectors
    ],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    // Skip the generic cleaner so the injected iframe survives.
    defaultCleaner: false,
    selectors: [['#player-api', '#eow-description']],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {
      // Replace the player container with an embeddable iframe for the
      // video id advertised in the page's meta tags.
      '#player-api': function playerApi($node, $) {
        var videoId = $('meta[itemProp="videoId"]').attr('value');
        $node.html(
          '\n <iframe src="https://www.youtube.com/embed/'.concat(
            videoId,
            '" frameborder="0" allowfullscreen></iframe>'
          )
        );
      },
    },
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: [],
  },
};
|
|
|
|
|
|
// Custom extraction rules for www.theguardian.com articles.
var WwwTheguardianComExtractor = {
  domain: 'www.theguardian.com',
  title: {
    selectors: ['.content__headline'],
  },
  author: {
    selectors: ['p.byline'],
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  dek: {
    selectors: ['.content__standfirst'],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.content__article-body'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.hide-on-mobile', '.inline-icon'],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for www.sbnation.com.
var WwwSbnationComExtractor = {
  domain: 'www.sbnation.com',
  title: { selectors: ['h1.c-page-title'] },
  author: { selectors: [['meta[name="author"]', 'value']] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  dek: { selectors: ['h2.c-entry-summary.p-dek'] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['div.c-entry-content'],
    // Nothing to transform or clean for this site.
    transforms: {},
    clean: [],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for www.bloomberg.com.
// Bloomberg serves several page templates; selector lists are ordered
// so the /graphics/ and /news/ variants are tried in turn.
var WwwBloombergComExtractor = {
  domain: 'www.bloomberg.com',
  title: {
    selectors: [
      '.lede-headline', // /graphics/ template
      'h1.article-title', // /news/ template
      'h1.lede-text-only__hed',
    ],
  },
  author: {
    selectors: [
      ['meta[name="parsely-author"]', 'value'],
      '.byline-details__link', // /graphics/ template
      '.bydek', // /news/ template
      '.author',
    ],
  },
  date_published: {
    selectors: [
      ['time.published-at', 'datetime'],
      ['time[datetime]', 'datetime'],
      ['meta[name="date"]', 'value'],
      ['meta[name="parsely-pub-date"]', 'value'],
    ],
  },
  dek: { selectors: [] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: [
      '.article-body__content', // /graphics/ template
      ['section.copy-block'], // /news/ template
      '.body-copy',
    ],
    transforms: {},
    // Drop in-article newsletter sign-ups and ad slots.
    clean: ['.inline-newsletter', '.page-ad'],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for www.bustle.com.
var WwwBustleComExtractor = {
  domain: 'www.bustle.com',
  title: { selectors: ['h1.post-page__title'] },
  author: { selectors: ['div.content-meta__author'] },
  date_published: {
    selectors: [['time.content-meta__published-date[datetime]', 'datetime']],
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['.post-page__body'],
    // Nothing to transform or clean for this site.
    transforms: {},
    clean: [],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for www.npr.org.
var WwwNprOrgExtractor = {
  domain: 'www.npr.org',
  title: { selectors: ['h1', '.storytitle'] },
  author: { selectors: ['p.byline__name.byline__name--block'] },
  date_published: {
    selectors: [
      ['.dateblock time[datetime]', 'datetime'],
      ['meta[name="date"]', 'value'],
    ],
  },
  lead_image_url: {
    selectors: [
      ['meta[name="og:image"]', 'value'],
      ['meta[name="twitter:image:src"]', 'value'],
    ],
  },
  content: {
    selectors: ['.storytext'],
    // Normalize NPR's image wrappers into semantic figure markup.
    transforms: {
      '.bucketwrap.image': 'figure',
      '.bucketwrap.image .credit-caption': 'figcaption',
    },
    // Remove the click-to-enlarge overlay container.
    clean: ['div.enlarge_measure'],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for www.recode.net.
var WwwRecodeNetExtractor = {
  domain: 'www.recode.net',
  title: { selectors: ['h1.c-page-title'] },
  author: { selectors: [['meta[name="author"]', 'value']] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  dek: { selectors: ['h2.c-entry-summary.p-dek'] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    // Prefer hero image + body together; fall back to body alone.
    selectors: [
      ['figure.e-image--hero', '.c-entry-content'],
      '.c-entry-content',
    ],
    transforms: {},
    clean: [],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for qz.com.
var QzComExtractor = {
  domain: 'qz.com',
  title: { selectors: ['header.item-header.content-width-responsive'] },
  author: { selectors: [['meta[name="author"]', 'value']] },
  date_published: { selectors: ['.timestamp'] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    // Prefer featured image + body together; fall back to body alone.
    selectors: [['figure.featured-image', '.item-body'], '.item-body'],
    transforms: {},
    // Drop the sidebar and low-res progressive-image placeholders.
    clean: ['.article-aside', '.progressive-image-thumbnail'],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for www.dmagazine.com.
var WwwDmagazineComExtractor = {
  domain: 'www.dmagazine.com',
  title: { selectors: ['h1.story__title'] },
  author: { selectors: ['.story__info .story__info__item:first-child'] },
  date_published: {
    selectors: ['.story__info'],
    // D Magazine publishes Dallas local time.
    timezone: 'America/Chicago',
  },
  dek: { selectors: ['.story__subhead'] },
  lead_image_url: {
    selectors: [['article figure a:first-child', 'href']],
  },
  content: {
    selectors: ['.story__content'],
    transforms: {},
    clean: [],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for www.reuters.com.
var WwwReutersComExtractor = {
  domain: 'www.reuters.com',
  title: { selectors: ['h1.article-headline'] },
  author: { selectors: ['.author'] },
  date_published: {
    selectors: [['meta[name="og:article:published_time"]', 'value']],
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['#article-text'],
    // Promote inline subtitles to proper headings.
    transforms: {
      '.article-subtitle': 'h4',
    },
    // The byline author block duplicates the author field; drop it.
    clean: ['#article-byline .author'],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for mashable.com.
var MashableComExtractor = {
  domain: 'mashable.com',
  title: { selectors: ['h1.title'] },
  author: { selectors: ['span.author_name a'] },
  date_published: {
    selectors: [['meta[name="og:article:published_time"]', 'value']],
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['section.article-content.blueprint'],
    // Turn image credit lines into semantic captions.
    transforms: {
      '.image-credit': 'figcaption',
    },
    clean: [],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for www.chicagotribune.com.
var WwwChicagotribuneComExtractor = {
  domain: 'www.chicagotribune.com',
  title: { selectors: ['h1.trb_ar_hl_t'] },
  author: { selectors: ['span.trb_ar_by_nm_au'] },
  date_published: {
    selectors: [['meta[itemprop="datePublished"]', 'value']],
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['div.trb_ar_page'],
    transforms: {},
    clean: [],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for www.vox.com.
var WwwVoxComExtractor = {
  domain: 'www.vox.com',
  title: { selectors: ['h1.c-page-title'] },
  author: { selectors: [['meta[name="author"]', 'value']] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  dek: { selectors: ['.p-dek'] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    // Prefer hero image + body together; fall back to body alone.
    selectors: [
      ['figure.e-image--hero', '.c-entry-content'],
      '.c-entry-content',
    ],
    transforms: {
      // Vox lazy-loads images; the <noscript> holds the real <img>,
      // so swap it in for the dynamic placeholder.
      'figure .e-image__image noscript': function figureEImage__imageNoscript(
        $node
      ) {
        var realImgMarkup = $node.html();
        $node
          .parents('.e-image__image')
          .find('.c-dynamic-image')
          .replaceWith(realImgMarkup);
      },
      // Image metadata becomes a caption.
      'figure .e-image__meta': 'figcaption',
    },
    clean: [],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for news.nationalgeographic.com.
var NewsNationalgeographicComExtractor = {
  domain: 'news.nationalgeographic.com',
  title: { selectors: ['h1', 'h1.main-title'] },
  author: { selectors: ['.byline-component__contributors b span'] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
    // e.g. "Wed Jan 18 10:00:00 EST 2017"
    format: 'ddd MMM DD HH:mm:ss zz YYYY',
    timezone: 'EST',
  },
  dek: { selectors: ['.article__deck'] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: [['.parsys.content', '.__image-lead__'], '.content'],
    transforms: {
      // Hoist the lazy-loaded lead image (stored in a data attribute)
      // into a real <img> at the top of the content.
      '.parsys.content': function parsysContent($node, $) {
        var leadSrc = $node
          .find('.image.parbase.section')
          .find('.picturefill')
          .first()
          .data('platform-src');
        if (leadSrc) {
          $node.prepend(
            $('<img class="__image-lead__" src="'.concat(leadSrc, '"/>'))
          );
        }
      },
    },
    // Large pull quotes duplicate body text; drop them.
    clean: ['.pull-quote.pull-quote--large'],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for www.nationalgeographic.com.
var WwwNationalgeographicComExtractor = {
  domain: 'www.nationalgeographic.com',
  title: { selectors: ['h1', 'h1.main-title'] },
  author: { selectors: ['.byline-component__contributors b span'] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  dek: { selectors: ['.article__deck'] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: [['.parsys.content', '.__image-lead__'], '.content'],
    transforms: {
      // Lead media is lazy-loaded via data attributes. Two layouts exist:
      // an "imageGroup" (two stacked images) and a single picturefill
      // image; rebuild real <img> tags for whichever is present.
      '.parsys.content': function parsysContent($node, $) {
        var $firstChild = $node.children().first();

        if ($firstChild.hasClass('imageGroup')) {
          var $dataAttrContainer = $firstChild
            .find('.media--medium__container')
            .children()
            .first();
          var imgPath1 = $dataAttrContainer.data('platform-image1-path');
          var imgPath2 = $dataAttrContainer.data('platform-image2-path');
          if (imgPath2 && imgPath1) {
            $node.prepend(
              $(
                '<div class="__image-lead__">\n <img src="'
                  .concat(imgPath1, '"/>\n <img src="')
                  .concat(imgPath2, '"/>\n </div>')
              )
            );
          }
        } else {
          var leadSrc = $node
            .find('.image.parbase.section')
            .find('.picturefill')
            .first()
            .data('platform-src');
          if (leadSrc) {
            $node.prepend(
              $('<img class="__image-lead__" src="'.concat(leadSrc, '"/>'))
            );
          }
        }
      },
    },
    // Small pull quotes duplicate body text; drop them.
    clean: ['.pull-quote.pull-quote--small'],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for www.latimes.com.
var WwwLatimesComExtractor = {
  domain: 'www.latimes.com',
  title: { selectors: ['.trb_ar_hl'] },
  author: { selectors: [['meta[name="author"]', 'value']] },
  date_published: {
    selectors: [['meta[itemprop="datePublished"]', 'value']],
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['.trb_ar_main'],
    transforms: {
      // Unwrap the lead-art container down to its <figure>.
      '.trb_ar_la': function trb_ar_la($node) {
        var $figure = $node.find('figure');
        $node.replaceWith($figure);
      },
    },
    // Byline and credit blocks duplicate metadata; drop them.
    clean: ['.trb_ar_by', '.trb_ar_cr'],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for pagesix.com (also used for nypost.com).
var PagesixComExtractor = {
  domain: 'pagesix.com',
  supportedDomains: ['nypost.com'],
  title: { selectors: ['h1 a'] },
  author: { selectors: ['.byline'] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  dek: { selectors: [['meta[name="description"]', 'value']] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    // Prefer featured image + body together; fall back to body alone.
    selectors: [
      ['#featured-image-wrapper', '.entry-content'],
      '.entry-content',
    ],
    // Normalize the featured-image wrapper into semantic figure markup.
    transforms: {
      '#featured-image-wrapper': 'figure',
      '.wp-caption-text': 'figcaption',
    },
    // Drop lightbox/share trigger elements.
    clean: ['.modal-trigger'],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for thefederalistpapers.org.
var ThefederalistpapersOrgExtractor = {
  domain: 'thefederalistpapers.org',
  title: { selectors: ['h1.entry-title'] },
  author: { selectors: ['main span.entry-author-name'] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['.entry-content'],
    transforms: {},
    // Remove inline-styled paragraphs (ads/promos on this site).
    clean: [['p[style]']],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for www.cbssports.com.
var WwwCbssportsComExtractor = {
  domain: 'www.cbssports.com',
  title: { selectors: ['.article-headline'] },
  author: { selectors: ['.author-name'] },
  date_published: {
    selectors: [['.date-original-reading-time time', 'datetime']],
    timezone: 'UTC',
  },
  dek: { selectors: ['.article-subline'] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['.article'],
    transforms: {},
    clean: [],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for www.msnbc.com.
var WwwMsnbcComExtractor = {
  domain: 'www.msnbc.com',
  title: { selectors: ['h1', 'h1.is-title-pane'] },
  author: { selectors: ['.author'] },
  date_published: {
    selectors: [['meta[name="DC.date.issued"]', 'value']],
  },
  dek: { selectors: [['meta[name="description"]', 'value']] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['.pane-node-body'],
    transforms: {
      // The lead image lives outside the body pane; re-use this
      // extractor's own lead_image_url selector to find it and
      // prepend it to the content.
      '.pane-node-body': function paneNodeBody($node, $) {
        var leadImageSelector = WwwMsnbcComExtractor.lead_image_url.selectors[0];
        var selector = leadImageSelector[0];
        var attr = leadImageSelector[1];
        var src = $(selector).attr(attr);
        if (src) {
          $node.prepend('<img src="'.concat(src, '" />'));
        }
      },
    },
    clean: [],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for www.thepoliticalinsider.com.
var WwwThepoliticalinsiderComExtractor = {
  domain: 'www.thepoliticalinsider.com',
  title: { selectors: [['meta[name="sailthru.title"]', 'value']] },
  author: { selectors: [['meta[name="sailthru.author"]', 'value']] },
  date_published: {
    selectors: [['meta[name="sailthru.date"]', 'value']],
    timezone: 'America/New_York',
  },
  dek: {
    // No dek selectors defined for this site yet.
    selectors: [],
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['div#article-body'],
    transforms: {},
    clean: [],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for www.mentalfloss.com.
var WwwMentalflossComExtractor = {
  domain: 'www.mentalfloss.com',
  title: { selectors: ['h1.title', '.title-group', '.inner'] },
  author: { selectors: ['.field-name-field-enhanced-authors'] },
  date_published: {
    selectors: ['.date-display-single'],
    timezone: 'America/New_York',
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['div.field.field-name-body'],
    transforms: {},
    clean: [],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for abcnews.go.com.
var AbcnewsGoComExtractor = {
  domain: 'abcnews.go.com',
  title: { selectors: ['.article-header h1'] },
  author: {
    selectors: ['.authors'],
    // Trim overlay and "By" prefix chrome from the byline.
    clean: ['.author-overlay', '.by-text'],
  },
  date_published: {
    selectors: ['.timestamp'],
    timezone: 'America/New_York',
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['.article-copy'],
    transforms: {},
    clean: [],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for www.nydailynews.com.
var WwwNydailynewsComExtractor = {
  domain: 'www.nydailynews.com',
  title: { selectors: ['h1#ra-headline'] },
  author: { selectors: [['meta[name="parsely-author"]', 'value']] },
  date_published: {
    selectors: [['meta[name="sailthru.date"]', 'value']],
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['article#ra-body'],
    transforms: {},
    // Strip tag lists, related-article rails, editor links and share bars.
    clean: ['dl#ra-tags', '.ra-related', 'a.ra-editor', 'dl#ra-share-bottom'],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for www.cnbc.com.
var WwwCnbcComExtractor = {
  domain: 'www.cnbc.com',
  title: { selectors: ['h1.title'] },
  author: { selectors: [['meta[name="author"]', 'value']] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['div#article_body.content', 'div.story'],
    transforms: {},
    clean: [],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for www.popsugar.com.
var WwwPopsugarComExtractor = {
  domain: 'www.popsugar.com',
  // NOTE(review): 'title-text' has no '.'/'#' prefix so it matches a
  // <title-text> element, not a class — confirm this is intentional.
  title: { selectors: ['h2.post-title', 'title-text'] },
  author: { selectors: [['meta[name="article:author"]', 'value']] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['#content'],
    transforms: {},
    // Drop share widgets, tag lists and reaction bars.
    clean: ['.share-copy-title', '.post-tags', '.reactions'],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for observer.com.
var ObserverComExtractor = {
  domain: 'observer.com',
  title: { selectors: ['h1.entry-title'] },
  author: { selectors: ['.author', '.vcard'] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  dek: { selectors: ['h2.dek'] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['div.entry-content'],
    transforms: {},
    clean: [],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for people.com.
var PeopleComExtractor = {
  domain: 'people.com',
  title: { selectors: [['meta[name="og:title"]', 'value']] },
  author: { selectors: ['a.author.url.fn'] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['div.article-body__inner'],
    transforms: {},
    clean: [],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for www.usmagazine.com.
var WwwUsmagazineComExtractor = {
  domain: 'www.usmagazine.com',
  title: { selectors: ['header h1'] },
  author: { selectors: ['a.article-byline.tracked-offpage'] },
  date_published: {
    timezone: 'America/New_York',
    selectors: ['time.article-published-date'],
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['div.article-body-inner'],
    transforms: {},
    // Drop the related-content module.
    clean: ['.module-related'],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for www.rollingstone.com.
var WwwRollingstoneComExtractor = {
  domain: 'www.rollingstone.com',
  title: { selectors: ['h1.content-title'] },
  author: { selectors: ['a.content-author.tracked-offpage'] },
  date_published: {
    selectors: ['time.content-published-date'],
    timezone: 'America/New_York',
  },
  dek: { selectors: ['.content-description'] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    // Prefer lead container + body together; fall back to body alone.
    selectors: [['.lead-container', '.article-content'], '.article-content'],
    transforms: {},
    // Drop the related-content module.
    clean: ['.module-related'],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for 247sports.com.
var twofortysevensportsComExtractor = {
  domain: '247sports.com',
  title: { selectors: ['title', 'article header h1'] },
  author: { selectors: ['.author'] },
  date_published: {
    selectors: [['time[data-published]', 'data-published']],
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['section.body.article'],
    transforms: {},
    clean: [],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for uproxx.com.
var UproxxComExtractor = {
  domain: 'uproxx.com',
  title: { selectors: ['div.post-top h1'] },
  author: { selectors: ['.post-top .authorname'] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['.post-body'],
    // Normalize image containers into semantic figure markup.
    transforms: {
      'div.image': 'figure',
      'div.image .wp-media-credit': 'figcaption',
    },
    clean: [],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for www.eonline.com.
var WwwEonlineComExtractor = {
  domain: 'www.eonline.com',
  title: { selectors: ['h1.article__title'] },
  author: { selectors: ['.entry-meta__author a'] },
  date_published: {
    selectors: [['meta[itemprop="datePublished"]', 'value']],
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: [
      ['.post-content section, .post-content div.post-content__image'],
    ],
    // Normalize image containers into semantic figure markup.
    transforms: {
      'div.post-content__image': 'figure',
      'div.post-content__image .image__credits': 'figcaption',
    },
    clean: [],
  },
};
|
|
|
|
|
|
// Site-specific extraction rules for www.miamiherald.com.
// No author rule is defined for this site.
var WwwMiamiheraldComExtractor = {
  domain: 'www.miamiherald.com',
  title: { selectors: ['h1.title'] },
  date_published: {
    selectors: ['p.published-date'],
    timezone: 'America/New_York',
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['div.dateline-storybody'],
    transforms: {},
    clean: [],
  },
};
|
|
|
|
|
|
// Extraction rules for www.refinery29.com articles.
var WwwRefinery29ComExtractor = {
  domain: 'www.refinery29.com',
  title: { selectors: ['h1.title'] },
  author: { selectors: ['.contributor'] },
  date_published: {
    selectors: [['meta[name="sailthru.date"]', 'value']],
    timezone: 'America/New_York',
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: [
      ['.full-width-opener', '.article-content'],
      '.article-content',
      '.body',
    ],
    transforms: {
      // Replace lazy-load placeholders with the real markup kept in their
      // <noscript> fallback.
      'div.loading noscript': function divLoadingNoscript($node) {
        var fallbackMarkup = $node.html();
        $node.parents('.loading').replaceWith(fallbackMarkup);
      },
      // Map the site's section wrappers onto semantic elements.
      '.section-image': 'figure',
      '.section-image .content-caption': 'figcaption',
      '.section-text': 'p',
    },
    // Drop the social-share widget from the result.
    clean: ['.story-share'],
  },
};
|
|
|
|
|
|
// Extraction rules for www.macrumors.com articles.
var WwwMacrumorsComExtractor = {
  domain: 'www.macrumors.com',
  title: { selectors: ['h1', 'h1.title'] },
  author: { selectors: ['.author-url'] },
  date_published: {
    selectors: ['.article .byline'],
    // Byline format example: "Wednesday January 18, 2017 11:44 am PST"
    format: 'dddd MMMM D, YYYY h:mm A zz',
    timezone: 'America/Los_Angeles',
  },
  dek: {
    selectors: [['meta[name="description"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.article'],
    // No markup rewriting required for this site.
    transforms: {},
    // Nothing extra needs to be stripped from the result.
    clean: [],
  },
};
|
|
|
|
|
|
// Extraction rules for www.androidcentral.com articles.
var WwwAndroidcentralComExtractor = {
  domain: 'www.androidcentral.com',
  title: { selectors: ['h1', 'h1.main-title'] },
  author: { selectors: ['.meta-by'] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value']],
  },
  lead_image_url: {
    selectors: [['.image-large', 'src']],
  },
  content: {
    selectors: ['.article-body'],
    // No markup rewriting required for this site.
    transforms: {},
    // Strip the intro box and quoted blocks from the result.
    clean: ['.intro', 'blockquote'],
  },
};
|
|
|
|
|
|
// Extraction rules for www.si.com (Sports Illustrated) articles.
var WwwSiComExtractor = {
  domain: 'www.si.com',
  title: { selectors: ['h1', 'h1.headline'] },
  author: {
    selectors: [['meta[name="author"]', 'value']],
  },
  date_published: {
    selectors: ['.timestamp'],
    timezone: 'America/New_York',
  },
  dek: { selectors: ['.quick-hit ul'] },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: [['p', '.marquee_large_2x', '.component.image']],
    transforms: {
      // A <noscript> wrapping a single <img> is really a figure; anything
      // else is left untouched (returning null means "no transform").
      noscript: function noscript($node) {
        var kids = $node.children();
        if (kids.length !== 1) {
          return null;
        }
        return kids.get(0).tagName === 'img' ? 'figure' : null;
      },
    },
    // Strip thumbnails and boilerplate messaging from the result.
    clean: [
      ['.inline-thumb', '.primary-message', '.description', '.instructions'],
    ],
  },
};
|
|
|
|
|
|
// Extraction rules for www.rawstory.com articles.
var WwwRawstoryComExtractor = {
  domain: 'www.rawstory.com',
  title: { selectors: ['.blog-title'] },
  author: { selectors: ['.blog-author a:first-of-type'] },
  date_published: {
    selectors: ['.blog-author a:last-of-type'],
    timezone: 'EST',
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.blog-content'],
    // No markup rewriting required for this site.
    transforms: {},
    // Nothing extra needs to be stripped from the result.
    clean: [],
  },
};
|
|
|
|
|
|
// Extraction rules for www.cnet.com articles.
var WwwCnetComExtractor = {
  domain: 'www.cnet.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value']],
  },
  author: { selectors: ['a.author'] },
  date_published: {
    selectors: ['time'],
    timezone: 'America/Los_Angeles',
  },
  dek: { selectors: ['.article-dek'] },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: [
      ['img.__image-lead__', '.article-main-body'],
      '.article-main-body',
    ],
    transforms: {
      // Pull the image out of its figure wrapper, size it to full width,
      // tag it as the lead image, and re-insert it at the front.
      'figure.image': function figureImage($node) {
        var $img = $node
          .find('img')
          .attr('width', '100%')
          .attr('height', '100%')
          .addClass('__image-lead__');
        $node.remove('.imgContainer').prepend($img);
      },
    },
    // Nothing extra needs to be stripped from the result.
    clean: [],
  },
};
|
|
|
|
|
|
// Extraction rules for www.cinemablend.com articles.
var WwwCinemablendComExtractor = {
  domain: 'www.cinemablend.com',
  title: { selectors: ['.story_title'] },
  author: { selectors: ['.author'] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
    timezone: 'EST',
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['div#wrap_left_content'],
    // No markup rewriting required for this site.
    transforms: {},
    // Nothing extra needs to be stripped from the result.
    clean: [],
  },
};
|
|
|
|
|
|
// Extraction rules for www.today.com articles.
var WwwTodayComExtractor = {
  domain: 'www.today.com',
  title: { selectors: ['h1.entry-headline'] },
  author: {
    selectors: [['meta[name="author"]', 'value']],
  },
  date_published: {
    selectors: [['meta[name="DC.date.issued"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.entry-container'],
    // No markup rewriting required for this site.
    transforms: {},
    // Strip the comment-count label from the result.
    clean: ['.label-comment'],
  },
};
|
|
|
|
|
|
// Extraction rules for www.howtogeek.com articles.
var WwwHowtogeekComExtractor = {
  domain: 'www.howtogeek.com',
  title: { selectors: ['title'] },
  author: { selectors: ['#authorinfobox a'] },
  date_published: {
    selectors: ['#authorinfobox + div li'],
    timezone: 'GMT',
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.thecontent'],
    // No markup rewriting required for this site.
    transforms: {},
    // Nothing extra needs to be stripped from the result.
    clean: [],
  },
};
|
|
|
|
|
|
// Extraction rules for www.al.com articles.
var WwwAlComExtractor = {
  domain: 'www.al.com',
  title: {
    selectors: [['meta[name="title"]', 'value']],
  },
  author: {
    selectors: [['meta[name="article_author"]', 'value']],
  },
  date_published: {
    selectors: [['meta[name="article_date_original"]', 'value']],
    timezone: 'EST',
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.entry-content'],
    // No markup rewriting required for this site.
    transforms: {},
    // Nothing extra needs to be stripped from the result.
    clean: [],
  },
};
|
|
|
|
|
|
// Extraction rules for www.thepennyhoarder.com articles.
var WwwThepennyhoarderComExtractor = {
  domain: 'www.thepennyhoarder.com',
  title: {
    selectors: [['meta[name="dcterms.title"]', 'value']],
  },
  author: {
    selectors: [['link[rel="author"]', 'title']],
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    // Prefer image + body together; fall back to the body alone.
    selectors: [['.post-img', '.post-text'], '.post-text'],
    // No markup rewriting required for this site.
    transforms: {},
    // Nothing extra needs to be stripped from the result.
    clean: [],
  },
};
|
|
|
|
|
|
// Extraction rules for www.westernjournalism.com articles.
var WwwWesternjournalismComExtractor = {
  domain: 'www.westernjournalism.com',
  title: { selectors: ['title', 'h1.entry-title'] },
  author: {
    selectors: [['meta[name="author"]', 'value']],
  },
  date_published: {
    selectors: [['meta[name="DC.date.issued"]', 'value']],
  },
  dek: { selectors: ['.subtitle'] },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    // The body is the sibling immediately after the top share bar.
    selectors: ['div.article-sharing.top + div'],
    // No markup rewriting required for this site.
    transforms: {},
    // Strip small ad notices from the result.
    clean: ['.ad-notice-small'],
  },
};
|
|
|
|
|
|
// Extraction rules for fusion.net articles.
var FusionNetExtractor = {
  domain: 'fusion.net',
  title: { selectors: ['.post-title', '.single-title', '.headline'] },
  author: { selectors: ['.show-for-medium .byline'] },
  date_published: {
    selectors: [['time.local-time', 'datetime']],
  },
  dek: {
    // No dek selectors configured for this site yet.
    selectors: [],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: [
      ['.post-featured-media', '.article-content'],
      '.article-content',
    ],
    // Wrap embedded YouTube players in figure elements.
    transforms: {
      '.fusion-youtube-oembed': 'figure',
    },
    // Nothing extra needs to be stripped from the result.
    clean: [],
  },
};
|
|
|
|
|
|
// Extraction rules for www.americanow.com articles.
var WwwAmericanowComExtractor = {
  domain: 'www.americanow.com',
  title: {
    selectors: ['.title', ['meta[name="title"]', 'value']],
  },
  author: { selectors: ['.byline'] },
  date_published: {
    selectors: [['meta[name="publish_date"]', 'value']],
  },
  dek: {
    // No dek selectors configured for this site yet.
    selectors: [],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: [['.article-content', '.image', '.body'], '.body'],
    // No markup rewriting required for this site.
    transforms: {},
    // Strip video wrappers and mobile-only duplicates from the result.
    clean: ['.article-video-wrapper', '.show-for-small-only'],
  },
};
|
|
|
|
|
|
// Extraction rules for sciencefly.com articles.
var ScienceflyComExtractor = {
  domain: 'sciencefly.com',
  title: {
    selectors: ['.entry-title', '.cb-entry-title', '.cb-single-title'],
  },
  author: { selectors: ['div.cb-author', 'div.cb-author-title'] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  dek: {
    // No dek selectors configured for this site yet.
    selectors: [],
  },
  lead_image_url: {
    selectors: [['div.theiaPostSlider_slides img', 'src']],
  },
  content: {
    selectors: ['div.theiaPostSlider_slides'],
    // No markup rewriting required for this site.
    transforms: {},
    // Nothing extra needs to be stripped from the result.
    clean: [],
  },
};
|
|
|
|
|
|
// Extraction rules for hellogiggles.com articles.
var HellogigglesComExtractor = {
  domain: 'hellogiggles.com',
  title: { selectors: ['.title'] },
  author: { selectors: ['.author-link'] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.entry-content'],
    // No markup rewriting required for this site.
    transforms: {},
    // Nothing extra needs to be stripped from the result.
    clean: [],
  },
};
|
|
|
|
|
|
// Extraction rules for thoughtcatalog.com articles.
var ThoughtcatalogComExtractor = {
  domain: 'thoughtcatalog.com',
  title: {
    selectors: ['h1.title', ['meta[name="og:title"]', 'value']],
  },
  author: {
    selectors: [
      'div.col-xs-12.article_header div.writer-container.writer-container-inline.writer-no-avatar h4.writer-name',
      'h1.writer-name',
    ],
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.entry.post'],
    // No markup rewriting required for this site.
    transforms: {},
    // Strip the trademark sign-off from the result.
    clean: ['.tc_mark'],
  },
};
|
|
|
|
|
|
// Extraction rules for www.nj.com articles.
var WwwNjComExtractor = {
  domain: 'www.nj.com',
  title: {
    selectors: [['meta[name="title"]', 'value']],
  },
  author: {
    selectors: [['meta[name="article_author"]', 'value']],
  },
  date_published: {
    selectors: [['meta[name="article_date_original"]', 'value']],
    timezone: 'America/New_York',
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.entry-content'],
    // No markup rewriting required for this site.
    transforms: {},
    // Nothing extra needs to be stripped from the result.
    clean: [],
  },
};
|
|
|
|
|
|
// Extraction rules for www.inquisitr.com articles.
var WwwInquisitrComExtractor = {
  domain: 'www.inquisitr.com',
  title: {
    selectors: ['h1.entry-title.story--header--title'],
  },
  author: {
    selectors: ['div.story--header--author'],
  },
  date_published: {
    selectors: [['meta[name="datePublished"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    // FIX: the fallback selector was written as '.entry-content.' — the
    // trailing dot makes it an invalid CSS class selector that can never
    // match. Corrected to '.entry-content'.
    selectors: ['article.story', '.entry-content'],
    // No markup rewriting required for this site.
    transforms: {},
    // Strip category labels and share/header chrome from the result.
    clean: [
      '.post-category',
      '.story--header--socials',
      '.story--header--content',
    ],
  },
};
|
|
|
|
|
|
// Extraction rules for www.nbcnews.com articles.
var WwwNbcnewsComExtractor = {
  domain: 'www.nbcnews.com',
  title: { selectors: ['div.article-hed h1'] },
  author: { selectors: ['span.byline_author'] },
  date_published: {
    // Prefer the machine-readable datetime attribute; fall back to the
    // element's text content.
    selectors: [
      ['.flag_article-wrapper time.timestamp_article[datetime]', 'datetime'],
      '.flag_article-wrapper time',
    ],
    timezone: 'America/New_York',
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['div.article-body'],
    // No markup rewriting required for this site.
    transforms: {},
    // Nothing extra needs to be stripped from the result.
    clean: [],
  },
};
|
|
|
|
|
|
// Extraction rules for fortune.com articles.
var FortuneComExtractor = {
  domain: 'fortune.com',
  title: { selectors: ['h1'] },
  author: {
    selectors: [['meta[name="author"]', 'value']],
  },
  date_published: {
    // NOTE(review): this class name looks like a generated CSS-module hash
    // and is likely to break when the site rebuilds its styles.
    selectors: ['.MblGHNMJ'],
    timezone: 'UTC',
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: [['picture', 'article.row'], 'article.row'],
    // No markup rewriting required for this site.
    transforms: {},
    // Nothing extra needs to be stripped from the result.
    clean: [],
  },
};
|
|
|
|
|
|
// Extraction rules for www.linkedin.com published articles.
var WwwLinkedinComExtractor = {
  domain: 'www.linkedin.com',
  title: { selectors: ['.article-title', 'h1'] },
  author: {
    selectors: [
      ['meta[name="article:author"]', 'value'],
      '.entity-name a[rel=author]',
    ],
  },
  date_published: {
    selectors: [['time[itemprop="datePublished"]', 'datetime']],
    timezone: 'America/Los_Angeles',
  },
  dek: {
    // No dek selectors configured for this site yet.
    selectors: [],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: [['header figure', '.prose'], '.prose'],
    // No markup rewriting required for this site.
    transforms: {},
    // Strip the author's entity image from the result.
    clean: ['.entity-image'],
  },
};
|
|
|
|
|
|
// Extraction rules for obamawhitehouse.archives.gov (also whitehouse.gov).
var ObamawhitehouseArchivesGovExtractor = {
  domain: 'obamawhitehouse.archives.gov',
  supportedDomains: ['whitehouse.gov'],
  title: { selectors: ['h1', '.pane-node-title'] },
  author: {
    selectors: ['.blog-author-link', '.node-person-name-link'],
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  dek: { selectors: ['.field-name-field-forall-summary'] },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    // Skip the generic post-extraction cleaner for this site.
    defaultCleaner: false,
    selectors: ['div#content-start', '.pane-node-field-forall-body'],
    // No markup rewriting required for this site.
    transforms: {},
    // Strip the duplicated title pane and custom pane from the result.
    clean: ['.pane-node-title', '.pane-custom.pane-1'],
  },
};
|
|
|
|
|
|
// Extraction rules for www.opposingviews.com articles.
var WwwOpposingviewsComExtractor = {
  domain: 'www.opposingviews.com',
  title: { selectors: ['h1.title'] },
  author: { selectors: ['div.date span span a'] },
  date_published: {
    selectors: [['meta[name="publish_date"]', 'value']],
  },
  dek: {
    // No dek selectors configured for this site yet.
    selectors: [],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.article-content'],
    // No markup rewriting required for this site.
    transforms: {},
    // Strip mobile-only duplicates from the result.
    clean: ['.show-for-small-only'],
  },
};
|
|
|
|
|
|
// Extraction rules for www.prospectmagazine.co.uk articles.
var WwwProspectmagazineCoUkExtractor = {
  domain: 'www.prospectmagazine.co.uk',
  title: { selectors: ['.page-title'] },
  author: { selectors: ['.aside_author .title'] },
  date_published: {
    selectors: ['.post-info'],
    timezone: 'Europe/London',
  },
  dek: { selectors: ['.page-subtitle'] },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: [
      // ['article.type-post div.post_content p'],
      'article .post_content',
    ],
    // No markup rewriting required for this site.
    transforms: {},
    // Nothing extra needs to be stripped from the result.
    clean: [],
  },
};
|
|
|
|
|
|
// Extraction rules for forward.com articles.
var ForwardComExtractor = {
  domain: 'forward.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value']],
  },
  author: {
    selectors: ['.author-name', ['meta[name="sailthru.author"]', 'value']],
  },
  date_published: {
    selectors: [['meta[name="date"]', 'value']],
  },
  dek: {
    // No dek selectors configured for this site yet.
    selectors: [],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: [['.post-item-media-wrap', '.post-item p']],
    // No markup rewriting required for this site.
    transforms: {},
    // Strip donation boxes, interstitial messages, and the subtitle.
    clean: ['.donate-box', '.message', '.subtitle'],
  },
};
|
|
|
|
|
|
// Extraction rules for www.qdaily.com articles.
var WwwQdailyComExtractor = {
  domain: 'www.qdaily.com',
  title: { selectors: ['h2', 'h2.title'] },
  author: { selectors: ['.name'] },
  date_published: {
    selectors: [['.date.smart-date', 'data-origindate']],
  },
  dek: { selectors: ['.excerpt'] },
  lead_image_url: {
    selectors: [['.article-detail-hd img', 'src']],
  },
  content: {
    selectors: ['.detail'],
    // No markup rewriting required for this site.
    transforms: {},
    // Strip lazy-load placeholders. The '.lazylad' and '.lazylood'
    // spellings look like typos of '.lazyload' but may match markup
    // actually used on the site — preserved as-is; TODO confirm.
    clean: ['.lazyload', '.lazylad', '.lazylood'],
  },
};
|
|
|
|
|
|
// Extraction rules for gothamist.com and its sibling "-ist" city sites.
var GothamistComExtractor = {
  domain: 'gothamist.com',
  supportedDomains: [
    'chicagoist.com',
    'laist.com',
    'sfist.com',
    'shanghaiist.com',
    'dcist.com',
  ],
  title: { selectors: ['h1', '.entry-header h1'] },
  author: { selectors: ['.author'] },
  date_published: {
    selectors: ['abbr', 'abbr.published'],
    timezone: 'America/New_York',
  },
  dek: {
    // Preserved from the original config: a single null selector entry.
    selectors: [null],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.entry-body'],
    // Turn each image alignment wrapper into a figure, with its italic
    // credit line as the figcaption.
    transforms: {
      'div.image-none': 'figure',
      '.image-none i': 'figcaption',
      'div.image-left': 'figure',
      '.image-left i': 'figcaption',
      'div.image-right': 'figure',
      '.image-right i': 'figcaption',
    },
    // Strip stray <br>s inside the figures and the gallery widget.
    clean: [
      '.image-none br',
      '.image-left br',
      '.image-right br',
      '.galleryEase',
    ],
  },
};
|
|
|
|
|
|
// Extraction rules for www.fool.com (Motley Fool) articles.
var WwwFoolComExtractor = {
  domain: 'www.fool.com',
  title: { selectors: ['h1'] },
  author: { selectors: ['.author-inline .author-name'] },
  date_published: {
    selectors: [['meta[name="date"]', 'value']],
  },
  dek: { selectors: ['header h2'] },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.article-content'],
    transforms: {
      // Rebuild each captioned image as a bare <figure><img/></figure>,
      // replacing the original caption wrapper.
      '.caption img': function captionImg($node) {
        var src = $node.attr('src');
        var figure = '<figure><img src="' + src + '"/></figure>';
        $node.parent().replaceWith(figure);
      },
      // Any remaining caption wrappers become figcaptions.
      '.caption': 'figcaption',
    },
    // Strip the promotional pitch block from the result.
    clean: ['#pitch'],
  },
};
|
|
|
|
|
|
// Extraction rules for www.slate.com articles.
var WwwSlateComExtractor = {
  domain: 'www.slate.com',
  title: { selectors: ['.hed', 'h1'] },
  author: { selectors: ['a[rel=author]'] },
  date_published: {
    selectors: ['.pub-date'],
    timezone: 'America/New_York',
  },
  dek: { selectors: ['.dek'] },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.body'],
    // No markup rewriting required for this site.
    transforms: {},
    // Strip author bios, pull quotes, newsletter prompts, and the
    // highlighted comment from the result.
    clean: [
      '.about-the-author',
      '.pullquote',
      '.newsletter-signup-component',
      '.top-comment',
    ],
  },
};
|
|
|
|
|
|
// Extraction rules for ici.radio-canada.ca articles.
var IciRadioCanadaCaExtractor = {
  domain: 'ici.radio-canada.ca',
  title: { selectors: ['h1'] },
  author: {
    selectors: [['meta[name="dc.creator"]', 'value']],
  },
  date_published: {
    selectors: [['meta[name="dc.date.created"]', 'value']],
    timezone: 'America/New_York',
  },
  dek: { selectors: ['.bunker-component.lead'] },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: [['.main-multimedia-item', '.news-story-content']],
    // No markup rewriting required for this site.
    transforms: {},
    // Nothing extra needs to be stripped from the result.
    clean: [],
  },
};
|
|
|
|
|
|
// Registry of every bundled site-specific extractor, keyed by its
// exported identifier. Frozen so it cannot be mutated at runtime; the
// domain-keyed lookup used at parse time is derived from this object
// below (see `Extractors`).
var CustomExtractors = /*#__PURE__*/ Object.freeze({
  BloggerExtractor: BloggerExtractor,
  NYMagExtractor: NYMagExtractor,
  WikipediaExtractor: WikipediaExtractor,
  TwitterExtractor: TwitterExtractor,
  NYTimesExtractor: NYTimesExtractor,
  TheAtlanticExtractor: TheAtlanticExtractor,
  NewYorkerExtractor: NewYorkerExtractor,
  WiredExtractor: WiredExtractor,
  MSNExtractor: MSNExtractor,
  YahooExtractor: YahooExtractor,
  BuzzfeedExtractor: BuzzfeedExtractor,
  WikiaExtractor: WikiaExtractor,
  LittleThingsExtractor: LittleThingsExtractor,
  PoliticoExtractor: PoliticoExtractor,
  DeadspinExtractor: DeadspinExtractor,
  BroadwayWorldExtractor: BroadwayWorldExtractor,
  ApartmentTherapyExtractor: ApartmentTherapyExtractor,
  MediumExtractor: MediumExtractor,
  WwwTmzComExtractor: WwwTmzComExtractor,
  WwwWashingtonpostComExtractor: WwwWashingtonpostComExtractor,
  WwwHuffingtonpostComExtractor: WwwHuffingtonpostComExtractor,
  NewrepublicComExtractor: NewrepublicComExtractor,
  MoneyCnnComExtractor: MoneyCnnComExtractor,
  WwwThevergeComExtractor: WwwThevergeComExtractor,
  WwwCnnComExtractor: WwwCnnComExtractor,
  WwwAolComExtractor: WwwAolComExtractor,
  WwwYoutubeComExtractor: WwwYoutubeComExtractor,
  WwwTheguardianComExtractor: WwwTheguardianComExtractor,
  WwwSbnationComExtractor: WwwSbnationComExtractor,
  WwwBloombergComExtractor: WwwBloombergComExtractor,
  WwwBustleComExtractor: WwwBustleComExtractor,
  WwwNprOrgExtractor: WwwNprOrgExtractor,
  WwwRecodeNetExtractor: WwwRecodeNetExtractor,
  QzComExtractor: QzComExtractor,
  WwwDmagazineComExtractor: WwwDmagazineComExtractor,
  WwwReutersComExtractor: WwwReutersComExtractor,
  MashableComExtractor: MashableComExtractor,
  WwwChicagotribuneComExtractor: WwwChicagotribuneComExtractor,
  WwwVoxComExtractor: WwwVoxComExtractor,
  NewsNationalgeographicComExtractor: NewsNationalgeographicComExtractor,
  WwwNationalgeographicComExtractor: WwwNationalgeographicComExtractor,
  WwwLatimesComExtractor: WwwLatimesComExtractor,
  PagesixComExtractor: PagesixComExtractor,
  ThefederalistpapersOrgExtractor: ThefederalistpapersOrgExtractor,
  WwwCbssportsComExtractor: WwwCbssportsComExtractor,
  WwwMsnbcComExtractor: WwwMsnbcComExtractor,
  WwwThepoliticalinsiderComExtractor: WwwThepoliticalinsiderComExtractor,
  WwwMentalflossComExtractor: WwwMentalflossComExtractor,
  AbcnewsGoComExtractor: AbcnewsGoComExtractor,
  WwwNydailynewsComExtractor: WwwNydailynewsComExtractor,
  WwwCnbcComExtractor: WwwCnbcComExtractor,
  WwwPopsugarComExtractor: WwwPopsugarComExtractor,
  ObserverComExtractor: ObserverComExtractor,
  PeopleComExtractor: PeopleComExtractor,
  WwwUsmagazineComExtractor: WwwUsmagazineComExtractor,
  WwwRollingstoneComExtractor: WwwRollingstoneComExtractor,
  twofortysevensportsComExtractor: twofortysevensportsComExtractor,
  UproxxComExtractor: UproxxComExtractor,
  WwwEonlineComExtractor: WwwEonlineComExtractor,
  WwwMiamiheraldComExtractor: WwwMiamiheraldComExtractor,
  WwwRefinery29ComExtractor: WwwRefinery29ComExtractor,
  WwwMacrumorsComExtractor: WwwMacrumorsComExtractor,
  WwwAndroidcentralComExtractor: WwwAndroidcentralComExtractor,
  WwwSiComExtractor: WwwSiComExtractor,
  WwwRawstoryComExtractor: WwwRawstoryComExtractor,
  WwwCnetComExtractor: WwwCnetComExtractor,
  WwwCinemablendComExtractor: WwwCinemablendComExtractor,
  WwwTodayComExtractor: WwwTodayComExtractor,
  WwwHowtogeekComExtractor: WwwHowtogeekComExtractor,
  WwwAlComExtractor: WwwAlComExtractor,
  WwwThepennyhoarderComExtractor: WwwThepennyhoarderComExtractor,
  WwwWesternjournalismComExtractor: WwwWesternjournalismComExtractor,
  FusionNetExtractor: FusionNetExtractor,
  WwwAmericanowComExtractor: WwwAmericanowComExtractor,
  ScienceflyComExtractor: ScienceflyComExtractor,
  HellogigglesComExtractor: HellogigglesComExtractor,
  ThoughtcatalogComExtractor: ThoughtcatalogComExtractor,
  WwwNjComExtractor: WwwNjComExtractor,
  WwwInquisitrComExtractor: WwwInquisitrComExtractor,
  WwwNbcnewsComExtractor: WwwNbcnewsComExtractor,
  FortuneComExtractor: FortuneComExtractor,
  WwwLinkedinComExtractor: WwwLinkedinComExtractor,
  ObamawhitehouseArchivesGovExtractor: ObamawhitehouseArchivesGovExtractor,
  WwwOpposingviewsComExtractor: WwwOpposingviewsComExtractor,
  WwwProspectmagazineCoUkExtractor: WwwProspectmagazineCoUkExtractor,
  ForwardComExtractor: ForwardComExtractor,
  WwwQdailyComExtractor: WwwQdailyComExtractor,
  GothamistComExtractor: GothamistComExtractor,
  WwwFoolComExtractor: WwwFoolComExtractor,
  WwwSlateComExtractor: WwwSlateComExtractor,
  IciRadioCanadaCaExtractor: IciRadioCanadaCaExtractor,
});
|
|
|
|
|
|
// Flatten the frozen registry into a lookup keyed by domain.
// mergeSupportedDomains (defined elsewhere) expands each extractor
// over every domain it declares support for.
var Extractors = _Object$keys(CustomExtractors).reduce(function(lookup, name) {
  return _objectSpread(
    {},
    lookup,
    mergeSupportedDomains(CustomExtractors[name])
  );
}, {});
|
|
|
|
|
|
// CLEAN AUTHOR CONSTANTS
// Strips a leading "by" / "posted by" / "written by" label from a
// byline; capture group 2 keeps the remainder (the author name).
var CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i; // author = re.sub(r'^\s*(posted |written )?by\s*:?\s*(.*)(?i)',
// CLEAN DEK CONSTANTS

// Detects a plain-text http(s):// link; a dek containing one is rejected.
var TEXT_LINK_RE = new RegExp('http(s)?://', 'i'); // An ordered list of meta tag names that denote likely article deks.

// 13-digit string: epoch timestamp in milliseconds.
var MS_DATE_STRING = /^\d{13}$/i;
// 10-digit string: epoch timestamp in seconds.
var SEC_DATE_STRING = /^\d{10}$/i;
// Strips a leading "published:" label; group 1 keeps the rest.
var CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i;
// Inserts a space before a trailing am/pm marker ("8:57pm" -> "8:57 pm").
var TIME_MERIDIAN_SPACE_RE = /(.*\d)(am|pm)(.*)/i;
// Collapses a dotted meridian fragment (".m." -> "m", so "p.m." -> "pm").
var TIME_MERIDIAN_DOTS_RE = /\.m\./i;
|
|
|
// Lowercased month-name abbreviations used to recognize date fragments.
var months = [
  'jan',
  'feb',
  'mar',
  'apr',
  'may',
  'jun',
  'jul',
  'aug',
  'sep',
  'oct',
  'nov',
  'dec',
];
var allMonths = months.join('|');
// hh:mm with an optional meridian, e.g. "8:57 p.m."
var timestamp1 = '[0-9]{1,2}:[0-9]{2,2}( ?[ap].?m.?)?';
// Numeric dates like 11/22/2016 or 11-22-16.
var timestamp2 = '[0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4}';
// Trailing numeric UTC offset, e.g. "-0500".
var timestamp3 = '-[0-9]{3,4}$';
// Matches every date-like fragment in a string (times, numeric dates,
// offsets, bare numbers, month names). cleanDateString() rejoins the
// matches with spaces, discarding the surrounding noise.
var SPLIT_DATE_STRING = new RegExp(
  '('
    .concat(timestamp1, ')|(')
    .concat(timestamp2, ')|(')
    .concat(timestamp3, ')|([0-9]{1,4})|(')
    .concat(allMonths, ')'),
  'ig'
); // 2016-11-22T08:57-500
|
|
|
// Check if datetime string has an offset at the end

var TIME_WITH_OFFSET_RE = /-\d{3,4}$/; // CLEAN TITLE CONSTANTS
// A regular expression that will match separating characters on a
// title, that usually denote breadcrumbs or something similar.
// NOTE: this regex is global (/g); callers using .test() must reset
// lastIndex first, since .test() on a /g regex is stateful.

var TITLE_SPLITTERS_RE = /(: | - | \| )/g;
// Strips a well-known TLD suffix from a hostname to get its "naked"
// domain. The dots are escaped: previously '.com$' etc. matched ANY
// character before "com", so a host like 'intercom' was mangled to
// 'inte'. Only a literal '.com', '.net', '.org', or '.co.uk' should
// be removed.
var DOMAIN_ENDINGS_RE = new RegExp('\\.com$|\\.net$|\\.org$|\\.co\\.uk$', 'g');
|
|
|
|
|
|
// just the name(s): 'David Smith'.
|
|
|
|
|
|
// Strip a leading "by"/"posted by"/"written by" label from a byline,
// keeping just the name(s), then collapse runs of whitespace.
function cleanAuthor(author) {
  var withoutByLabel = author.replace(CLEAN_AUTHOR_RE, '$2');
  return normalizeSpaces(withoutByLabel.trim());
}
|
|
|
|
|
|
// Validate a candidate lead-image URL: return the trimmed URL when it
// is a well-formed web URI, otherwise null.
function clean$1(leadImageUrl) {
  var trimmed = leadImageUrl.trim();
  return validUrl.isWebUri(trimmed) ? trimmed : null;
}
|
|
|
|
|
|
// Return None if the dek wasn't good enough.
|
|
|
|
|
|
// Sanity-check and normalize a candidate dek (article subtitle).
// Returns null when the dek is implausibly sized, duplicates the
// excerpt, or contains a plain-text link; otherwise the stripped,
// whitespace-normalized text.
function cleanDek(dek, _ref) {
  var $ = _ref.$;
  var excerpt = _ref.excerpt;

  // Too short or too long to be a real dek.
  if (dek.length < 5 || dek.length > 1000) return null;

  // A dek that is just the start of the excerpt is not a real dek.
  if (excerpt && excerptContent(excerpt, 10) === excerptContent(dek, 10)) {
    return null;
  }

  var dekText = stripTags(dek, $);

  // Plain text links shouldn't exist in the dek. If we have some,
  // it's not a good dek - bail.
  if (TEXT_LINK_RE.test(dekText)) return null;

  return normalizeSpaces(dekText.trim());
}
|
|
|
|
|
|
// Scrub a noisy date string down to its date-like fragments: keep only
// the pieces SPLIT_DATE_STRING recognizes, rejoin them with spaces,
// then normalize meridians and strip a "published:" label.
function cleanDateString(dateString) {
  var fragments = dateString.match(SPLIT_DATE_STRING) || [];
  var rejoined = fragments.join(' ');
  return rejoined
    .replace(TIME_MERIDIAN_DOTS_RE, 'm')
    .replace(TIME_MERIDIAN_SPACE_RE, '$1 $2 $3')
    .replace(CLEAN_DATE_STRING_RE, '$1')
    .trim();
}
|
|
|
// Build a moment from a date string. Strings that already end in a
// numeric UTC offset (e.g. "2016-11-22T08:57-500") are handed to the
// native Date parser; otherwise parse with the supplied (or inferred)
// format, in the given timezone when one is provided.
function createDate(dateString, timezone, format) {
  if (TIME_WITH_OFFSET_RE.test(dateString)) {
    return moment(new Date(dateString));
  }

  var resolvedFormat = format || parseFormat(dateString);

  if (timezone) {
    return moment.tz(dateString, resolvedFormat, timezone);
  }

  return moment(dateString, resolvedFormat);
} // Take a date published string, and hopefully return a date out of
|
|
|
// it. Return none if we fail.
|
|
|
|
|
|
// Normalize a raw date-published string to an ISO 8601 timestamp, or
// null when no valid date can be parsed.
// Options (second argument): { timezone, format } are forwarded to
// createDate for strings without an embedded offset.
function cleanDatePublished(dateString) {
  var _ref =
      arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {},
    timezone = _ref.timezone,
    format = _ref.format;

  // A 13-digit string is an epoch timestamp in milliseconds and can be
  // handed to Date directly.
  if (MS_DATE_STRING.test(dateString)) {
    return new Date(_parseInt(dateString, 10)).toISOString();
  }

  // A 10-digit string is epoch SECONDS; Date expects milliseconds, so
  // scale it up (previously it was passed through unscaled, producing
  // a bogus January-1970 date).
  if (SEC_DATE_STRING.test(dateString)) {
    return new Date(_parseInt(dateString, 10) * 1000).toISOString();
  }

  var date = createDate(dateString, timezone, format);

  // On failure, scrub the string of surrounding noise and retry once.
  if (!date.isValid()) {
    dateString = cleanDateString(dateString);
    date = createDate(dateString, timezone, format);
  }

  return date.isValid() ? date.toISOString() : null;
}
|
|
|
|
|
|
// Run the full DOM-cleaning pipeline over an extracted article node,
// mutating it in place via the cheerio instance `$` and returning it.
// Options: cleanConditionally (default true) conditionally strips
// junk-looking tags; title is used to de-duplicate headers; url makes
// links absolute; defaultCleaner (default false'able) gates the more
// aggressive image/tag cleaning passes.
// NOTE: the pass order below is significant — e.g. markToKeep must run
// before stripJunkTags so kept iframes survive.
function extractCleanNode(article, _ref) {
  var $ = _ref.$,
    _ref$cleanConditional = _ref.cleanConditionally,
    cleanConditionally =
      _ref$cleanConditional === void 0 ? true : _ref$cleanConditional,
    _ref$title = _ref.title,
    title = _ref$title === void 0 ? '' : _ref$title,
    _ref$url = _ref.url,
    url = _ref$url === void 0 ? '' : _ref$url,
    _ref$defaultCleaner = _ref.defaultCleaner,
    defaultCleaner =
      _ref$defaultCleaner === void 0 ? true : _ref$defaultCleaner;
  // Rewrite the tag name to div if it's a top level node like body or
  // html to avoid later complications with multiple body tags.
  rewriteTopLevel$$1(article, $); // Drop small images and spacer images
  // Only do this is defaultCleaner is set to true;
  // this can sometimes be too aggressive.

  if (defaultCleaner) cleanImages(article, $); // Make links absolute

  makeLinksAbsolute$$1(article, $, url); // Mark elements to keep that would normally be removed.
  // E.g., stripJunkTags will remove iframes, so we're going to mark
  // YouTube/Vimeo videos as elements we want to keep.

  markToKeep(article, $, url); // Drop certain tags like <title>, etc
  // This is -mostly- for cleanliness, not security.

  stripJunkTags(article, $); // H1 tags are typically the article title, which should be extracted
  // by the title extractor instead. If there's less than 3 of them (<3),
  // strip them. Otherwise, turn 'em into H2s.

  cleanHOnes$$1(article, $); // Clean headers

  cleanHeaders(article, $, title); // We used to clean UL's and OL's here, but it was leading to
  // too many in-article lists being removed. Consider a better
  // way to detect menus particularly and remove them.
  // Also optionally running, since it can be overly aggressive.

  if (defaultCleaner) cleanTags$$1(article, $, cleanConditionally); // Remove empty paragraph nodes

  removeEmpty(article, $); // Remove unnecessary attributes

  cleanAttributes$$1(article, $);
  return article;
}
|
|
|
|
|
|
// Clean an extracted title: resolve breadcrumb/domain segments, fall
// back to the page's sole <h1> for absurdly long titles, then strip
// tags and normalize whitespace.
function cleanTitle$$1(title, _ref) {
  var url = _ref.url,
    $ = _ref.$;

  // TITLE_SPLITTERS_RE is a global (/g) regex, and RegExp#test on a
  // global regex advances its lastIndex between calls — without this
  // reset the check alternates between matching and missing on
  // successive titles. Reset before testing.
  TITLE_SPLITTERS_RE.lastIndex = 0;

  // If title has |, :, or - in it, see if
  // we can clean it up.
  if (TITLE_SPLITTERS_RE.test(title)) {
    title = resolveSplitTitle(title, url);
  } // Final sanity check that we didn't get a crazy title.
  // if (title.length > 150 || title.length < 15) {

  if (title.length > 150) {
    // If we did, return h1 from the document if it exists
    var h1 = $('h1');

    if (h1.length === 1) {
      title = h1.text();
    }
  } // strip any html tags in the title text

  return normalizeSpaces(stripTags(title, $).trim());
}
|
|
|
|
|
|
// Try to pull the real headline out of a heavily breadcrumbed title
// (6+ segments once split on separators), e.g.
//   The Best Gadgets on Earth : Bits : Blogs : NYTimes.com
//   NYTimes - Blogs - Bits - The Best Gadgets on Earth
// Returns the best end segment, the whole text, or null when the
// title is not breadcrumbed enough for this heuristic to apply.
function extractBreadcrumbTitle(splitTitle, text) {
  if (splitTitle.length >= 6) {
    // Look to see if we can find a breadcrumb splitter that happens
    // more than once. If we can, we'll be able to better pull out
    // the title.
    var termCounts = splitTitle.reduce(function(acc, titleText) {
      acc[titleText] = acc[titleText] ? acc[titleText] + 1 : 1;
      return acc;
    }, {});

    // Find the most frequent segment/splitter term: [term, count].
    var best = _Reflect$ownKeys(termCounts).reduce(
      function(acc, key) {
        if (acc[1] < termCounts[key]) {
          return [key, termCounts[key]];
        }

        return acc;
      },
      [0, 0]
    );
    var maxTerm = best[0];
    var termCount = best[1];

    // We found a splitter that was used more than once, so it
    // is probably the breadcrumber. Split our title on that instead.
    // Note: max_term should be <= 4 characters, so that " >> "
    // will match, but nothing longer than that.
    if (termCount >= 2 && maxTerm.length <= 4) {
      splitTitle = text.split(maxTerm);
    }

    // Compare the first and last SEGMENTS. Previously the last entry
    // was `splitTitle.slice(-1)` — an array, whose length is always 1
    // — so the final segment could never win the comparison; index
    // into the slice to get the string itself.
    var splitEnds = [splitTitle[0], splitTitle.slice(-1)[0]];
    var longestEnd = splitEnds.reduce(function(acc, end) {
      return acc.length > end.length ? acc : end;
    }, '');

    if (longestEnd.length > 10) {
      return longestEnd;
    }

    return text;
  }

  return null;
}
|
|
|
|
|
|
// Search the ends of the title, looking for bits that fuzzy match
// the URL too closely. If one is found, discard it and return the
// rest. Returns null when neither end resembles the domain.
//
// Strip out the big TLDs - it just makes the matching a bit more
// accurate. Not the end of the world if it doesn't strip right.
function cleanDomainFromTitle(splitTitle, url) {
  var _URL$parse = URL.parse(url),
    host = _URL$parse.host;

  var nakedDomain = host.replace(DOMAIN_ENDINGS_RE, '');

  // Build a spaceless slug from the first segment. A global regex is
  // required here: replace(' ', '') with a string pattern removes only
  // the FIRST space, which broke the fuzzy match for multi-word
  // segments like "The New York".
  var startSlug = splitTitle[0].toLowerCase().replace(/ /g, '');
  var startSlugRatio = wuzzy.levenshtein(startSlug, nakedDomain);

  // Drop the leading segment plus its splitter (split preserves the
  // splitters, hence slice(2)).
  if (startSlugRatio > 0.4 && startSlug.length > 5) {
    return splitTitle.slice(2).join('');
  }

  var endSlug = splitTitle
    .slice(-1)[0]
    .toLowerCase()
    .replace(/ /g, '');
  var endSlugRatio = wuzzy.levenshtein(endSlug, nakedDomain);

  if (endSlugRatio > 0.4 && endSlug.length >= 5) {
    return splitTitle.slice(0, -2).join('');
  }

  return null;
} // Given a title with separators in it (colons, dashes, etc),
|
|
|
// resolve whether any of the segments should be removed.
|
|
|
|
|
|
// Given a title containing separators (colons, dashes, pipes), decide
// whether any segments are breadcrumbs or the site's own name and
// should be dropped. Falls back to the full title when neither
// heuristic fires.
function resolveSplitTitle(title) {
  var url =
    arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : '';

  // Split while preserving the splitters, like:
  // ['The New New York', ' - ', 'The Washington Post']
  var segments = title.split(TITLE_SPLITTERS_RE);

  if (segments.length === 1) return title;

  var breadcrumbed = extractBreadcrumbTitle(segments, title);
  if (breadcrumbed) return breadcrumbed;

  var withoutDomain = cleanDomainFromTitle(segments, url);
  if (withoutDomain) return withoutDomain;

  // Fuzzy ratio didn't find anything, so this title is probably legit.
  return title;
}
|
|
|
|
|
|
// Map of extracted field name -> cleaner function. The root extractor
// looks up each field here to post-process raw extracted values
// before returning them.
var Cleaners = {
  author: cleanAuthor,
  lead_image_url: clean$1,
  dek: cleanDek,
  date_published: cleanDatePublished,
  content: extractCleanNode,
  title: cleanTitle$$1,
};
|
|
|
|
|
|
// likely to be article text.
|
|
|
//
|
|
|
// If strip_unlikely_candidates is True, remove any elements that
|
|
|
// match certain criteria first. (Like, does this element have a
|
|
|
// classname of "comment")
|
|
|
//
|
|
|
// If weight_nodes is True, use classNames and IDs to determine the
|
|
|
// worthiness of nodes.
|
|
|
//
|
|
|
// Returns a cheerio object $
|
|
|
|
|
|
// Score the document and return the cheerio node most likely to hold
// the article body.
// opts.stripUnlikelyCandidates: pre-remove elements matching
//   comment/ad-like criteria before scoring.
// opts.weightNodes: factor class/ID hints into node scores.
function extractBestNode($, opts) {
  if (opts.stripUnlikelyCandidates) {
    $ = stripUnlikelyCandidates($);
  }

  var $paragraphized = convertToParagraphs$$1($);
  var $scored = scoreContent$$1($paragraphized, opts.weightNodes);

  return findTopCandidate$$1($scored);
}
|
|
|
|
|
|
// Generic (non-site-specific) article-content extractor. Starts with
// the strictest cleaning options and, when the result is judged
// insufficient, progressively disables options and re-parses.
var GenericContentExtractor = {
  defaultOpts: {
    stripUnlikelyCandidates: true,
    weightNodes: true,
    cleanConditionally: true,
  },
  // Extract the content for this resource - initially, pass in our
  // most restrictive opts which will return the highest quality
  // content. On each failure, retry with slightly more lax opts.
  //
  // :param return_type: string. If "node", should return the content
  // as a cheerio node rather than as an HTML string.
  //
  // Opts:
  // stripUnlikelyCandidates: Remove any elements that match
  // non-article-like criteria first.(Like, does this element
  // have a classname of "comment")
  //
  // weightNodes: Modify an elements score based on whether it has
  // certain classNames or IDs. Examples: Subtract if a node has
  // a className of 'comment', Add if a node has an ID of
  // 'entry-content'.
  //
  // cleanConditionally: Clean the node to return of some
  // superfluous content. Things like forms, ads, etc.
  extract: function extract(_ref, opts) {
    var $ = _ref.$,
      html = _ref.html,
      title = _ref.title,
      url = _ref.url;
    opts = _objectSpread({}, this.defaultOpts, opts);
    $ = $ || cheerio.load(html); // Cascade through our extraction-specific opts in an ordered fashion,
    // turning them off as we try to extract content.

    var node = this.getContentNode($, title, url, opts);

    if (nodeIsSufficient(node)) {
      return this.cleanAndReturnNode(node, $);
    } // We didn't succeed on first pass, one by one disable our
    // extraction opts and try again.
    // eslint-disable-next-line no-restricted-syntax

    // Babel-compiled for...of over the names of the currently-enabled
    // opts. Each iteration disables one opt, reloads a fresh DOM from
    // the raw html, and retries extraction; the try/finally boilerplate
    // closes the iterator and rethrows any error from the loop body.
    var _iteratorNormalCompletion = true;
    var _didIteratorError = false;
    var _iteratorError = undefined;

    try {
      for (
        var _iterator = _getIterator(
            _Reflect$ownKeys(opts).filter(function(k) {
              return opts[k] === true;
            })
          ),
          _step;
        !(_iteratorNormalCompletion = (_step = _iterator.next()).done);
        _iteratorNormalCompletion = true
      ) {
        var key = _step.value;
        opts[key] = false;
        $ = cheerio.load(html);
        node = this.getContentNode($, title, url, opts);

        if (nodeIsSufficient(node)) {
          break;
        }
      }
    } catch (err) {
      _didIteratorError = true;
      _iteratorError = err;
    } finally {
      try {
        if (!_iteratorNormalCompletion && _iterator.return != null) {
          _iterator.return();
        }
      } finally {
        if (_didIteratorError) {
          throw _iteratorError;
        }
      }
    }

    return this.cleanAndReturnNode(node, $);
  },
  // Get node given current options
  getContentNode: function getContentNode($, title, url, opts) {
    return extractCleanNode(extractBestNode($, opts), {
      $: $,
      cleanConditionally: opts.cleanConditionally,
      title: title,
      url: url,
    });
  },
  // Once we got here, either we're at our last-resort node, or
  // we broke early. Make sure we at least have -something- before we
  // move forward.
  cleanAndReturnNode: function cleanAndReturnNode(node, $) {
    if (!node) {
      return null;
    }

    return normalizeSpaces($.html(node)); // if return_type == "html":
    // return normalize_spaces(node_to_html(node))
    // else:
    // return node
  },
};
|
|
|
|
|
|
// TODO: It would be great if we could merge the meta and selector lists into
|
|
|
// a list of objects, because we could then rank them better. For example,
|
|
|
// .hentry .entry-title is far better suited than <meta title>.
|
|
|
// An ordered list of meta tag names that denote likely article titles. All
|
|
|
// attributes should be lowercase for faster case-insensitive matching. From
|
|
|
// most distinct to least distinct.
|
|
|
// Meta tag names strongly associated with the article headline, most
// distinct first; all lowercase for case-insensitive matching.
var STRONG_TITLE_META_TAGS = [
  'tweetmeme-title',
  'dc.title',
  'rbtitle',
  'headline',
  'title',
]; // og:title is weak because it typically contains context that we don't like,
// for example the source site's name. Gotta get that brand into facebook!

var WEAK_TITLE_META_TAGS = ['og:title']; // An ordered list of XPath Selectors to find likely article titles. From
// most explicit to least explicit.
//
// Note - this does not use classes like CSS. This checks to see if the string
// exists in the className, which is not as accurate as .className (which
// splits on spaces/endlines), but for our purposes it's close enough. The
// speed tradeoff is worth the accuracy hit.

var STRONG_TITLE_SELECTORS = [
  '.hentry .entry-title',
  'h1#articleHeader',
  'h1.articleHeader',
  'h1.article',
  '.instapaper_title',
  '#meebo-title',
];
var WEAK_TITLE_SELECTORS = [
  'article h1',
  '#entry-title',
  '.entry-title',
  '#entryTitle',
  '#entrytitle',
  '.entryTitle',
  '.entrytitle',
  '#articleTitle',
  '.articleTitle',
  // NOTE(review): 'post post-title' looks like it is missing its
  // leading dots (likely intended as '.post .post-title'); as written
  // it matches <post-title> inside a <post> element — confirm before
  // changing, since this string ships in the published selector list.
  'post post-title',
  'h1.title',
  'h2.article',
  'h1',
  'html head title',
  'title',
];
|
|
|
|
|
|
// Generic title extractor: try strong meta tags, then strong DOM
// selectors, then weak meta tags, then weak selectors, returning the
// cleaned first hit; an empty string when nothing matches.
var GenericTitleExtractor = {
  extract: function extract(_ref) {
    var $ = _ref.$,
      url = _ref.url,
      metaCache = _ref.metaCache;

    // Ordered extraction strategies, strongest signal first.
    var strategies = [
      function() {
        return extractFromMeta$$1($, STRONG_TITLE_META_TAGS, metaCache);
      },
      function() {
        return extractFromSelectors$$1($, STRONG_TITLE_SELECTORS);
      },
      function() {
        return extractFromMeta$$1($, WEAK_TITLE_META_TAGS, metaCache);
      },
      function() {
        return extractFromSelectors$$1($, WEAK_TITLE_SELECTORS);
      },
    ];

    for (var i = 0; i < strategies.length; i += 1) {
      var title = strategies[i]();

      if (title) {
        return cleanTitle$$1(title, {
          url: url,
          $: $,
        });
      }
    }

    // If no matches, return an empty string
    return '';
  },
};
|
|
|
|
|
|
// An ordered list of meta tag names that denote likely article authors. All
|
|
|
// attributes should be lowercase for faster case-insensitive matching. From
|
|
|
// most distinct to least distinct.
|
|
|
//
|
|
|
// Note: "author" is too often the -developer- of the page, so it is not
|
|
|
// added here.
|
|
|
// Meta tag names likely to carry the article author, most distinct
// first; all lowercase for case-insensitive matching. ("author" itself
// is excluded — it is too often the page's developer.)
var AUTHOR_META_TAGS = [
  'byl',
  'clmst',
  'dc.author',
  'dcsext.author',
  'dc.creator',
  'rbauthors',
  'authors',
];
// Candidate bylines longer than this are rejected as implausible.
var AUTHOR_MAX_LENGTH = 300; // An ordered list of XPath Selectors to find likely article authors. From
// most explicit to least explicit.
//
// Note - this does not use classes like CSS. This checks to see if the string
// exists in the className, which is not as accurate as .className (which
// splits on spaces/endlines), but for our purposes it's close enough. The
// speed tradeoff is worth the accuracy hit.

var AUTHOR_SELECTORS = [
  '.entry .entry-author',
  '.author.vcard .fn',
  '.author .vcard .fn',
  '.byline.vcard .fn',
  '.byline .vcard .fn',
  '.byline .by .author',
  '.byline .by',
  '.byline .author',
  '.post-author.vcard',
  '.post-author .vcard',
  'a[rel=author]',
  '#by_author',
  '.by_author',
  '#entryAuthor',
  '.entryAuthor',
  '.byline a[href*=author]',
  '#author .authorname',
  '.author .authorname',
  '#author',
  '.author',
  '.articleauthor',
  '.ArticleAuthor',
  '.byline',
]; // An ordered list of Selectors to find likely article authors, with
// regular expression for content.

// Matches bylines that begin with "By" (possibly after whitespace).
var bylineRe = /^[\n\s]*By/i;
// [selector, content-regex] pairs: the selector's text must match the
// regex to be accepted as a byline.
var BYLINE_SELECTORS_RE = [['#byline', bylineRe], ['.byline', bylineRe]];
|
|
|
|
|
|
// Generic author extractor: meta tags first, then DOM selectors, then
// looser [selector, regex] byline pairs; null when nothing plausible
// is found. Every candidate is length-checked and run through
// cleanAuthor before being returned.
var GenericAuthorExtractor = {
  extract: function extract(_ref) {
    var $ = _ref.$,
      metaCache = _ref.metaCache;
    var author; // First, check to see if we have a matching
    // meta tag that we can make use of.

    author = extractFromMeta$$1($, AUTHOR_META_TAGS, metaCache);

    if (author && author.length < AUTHOR_MAX_LENGTH) {
      return cleanAuthor(author);
    } // Second, look through our selectors looking for potential authors.

    author = extractFromSelectors$$1($, AUTHOR_SELECTORS, 2);

    if (author && author.length < AUTHOR_MAX_LENGTH) {
      return cleanAuthor(author);
    } // Last, use our looser regular-expression based selectors for
    // potential authors.
    // eslint-disable-next-line no-restricted-syntax

    // Babel-compiled for...of over BYLINE_SELECTORS_RE; each entry is
    // destructured into [selector, regex]. A selector is only accepted
    // when it matches exactly one node whose text matches the regex.
    var _iteratorNormalCompletion = true;
    var _didIteratorError = false;
    var _iteratorError = undefined;

    try {
      for (
        var _iterator = _getIterator(BYLINE_SELECTORS_RE), _step;
        !(_iteratorNormalCompletion = (_step = _iterator.next()).done);
        _iteratorNormalCompletion = true
      ) {
        var _ref4 = _step.value;

        var _ref3 = _slicedToArray(_ref4, 2);

        var selector = _ref3[0];
        var regex = _ref3[1];
        var node = $(selector);

        if (node.length === 1) {
          var text = node.text();

          if (regex.test(text)) {
            return cleanAuthor(text);
          }
        }
      }
    } catch (err) {
      _didIteratorError = true;
      _iteratorError = err;
    } finally {
      try {
        if (!_iteratorNormalCompletion && _iterator.return != null) {
          _iterator.return();
        }
      } finally {
        if (_didIteratorError) {
          throw _iteratorError;
        }
      }
    }

    return null;
  },
};
|
|
|
|
|
|
// An ordered list of meta tag names that denote
|
|
|
// likely date published dates. All attributes
|
|
|
// should be lowercase for faster case-insensitive matching.
|
|
|
// From most distinct to least distinct.
|
|
|
// Meta tag names likely to carry the publication date, most distinct
// first; all lowercase for case-insensitive matching.
var DATE_PUBLISHED_META_TAGS = [
  'article:published_time',
  'displaydate',
  'dc.date',
  'dc.date.issued',
  'rbpubdate',
  'publish_date',
  'pub_date',
  'pagedate',
  'pubdate',
  'revision_date',
  'doc_date',
  'date_created',
  'content_create_date',
  'lastmodified',
  'created',
  'date',
]; // An ordered list of XPath Selectors to find
// likely date published dates. From most explicit
// to least explicit.

var DATE_PUBLISHED_SELECTORS = [
  '.hentry .dtstamp.published',
  '.hentry .published',
  '.hentry .dtstamp.updated',
  '.hentry .updated',
  '.single .published',
  '.meta .published',
  '.meta .postDate',
  '.entry-date',
  '.byline .date',
  '.postmetadata .date',
  '.article_datetime',
  '.date-header',
  '.story-date',
  '.dateStamp',
  '#story .datetime',
  '.dateline',
  '.pubdate',
]; // An ordered list of compiled regular expressions to find likely date
// published dates from the URL. These should always have the first
// reference be a date string that is parseable by dateutil.parser.parse

var abbrevMonthsStr = '(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)';
// Group 1 of each regex captures the date substring from the URL path.
var DATE_PUBLISHED_URL_RES = [
  // /2012/01/27/ but not /2012/01/293
  new RegExp('/(20\\d{2}/\\d{2}/\\d{2})/', 'i'), // 20120127 or 20120127T but not 2012012733 or 8201201733
  // /[^0-9](20\d{2}[01]\d[0-3]\d)([^0-9]|$)/i,
  // 2012-01-27
  new RegExp('(20\\d{2}-[01]\\d-[0-3]\\d)', 'i'), // /2012/jan/27/
  new RegExp('/(20\\d{2}/'.concat(abbrevMonthsStr, '/[0-3]\\d)/'), 'i'),
];
|
|
|
|
|
|
// Generic date-published extractor: meta tags first (raw value, no tag
// cleaning), then DOM selectors, then a date-like pattern in the URL.
// Returns a cleaned ISO string, or null when nothing parses.
var GenericDatePublishedExtractor = {
  extract: function extract(_ref) {
    var $ = _ref.$,
      url = _ref.url,
      metaCache = _ref.metaCache;

    var datePublished = extractFromMeta$$1(
      $,
      DATE_PUBLISHED_META_TAGS,
      metaCache,
      false
    );

    if (!datePublished) {
      datePublished = extractFromSelectors$$1($, DATE_PUBLISHED_SELECTORS);
    }

    if (!datePublished) {
      datePublished = extractFromUrl(url, DATE_PUBLISHED_URL_RES);
    }

    return datePublished ? cleanDatePublished(datePublished) : null;
  },
};
|
|
|
|
|
|
// import {
|
|
|
// DEK_META_TAGS,
|
|
|
// DEK_SELECTORS,
|
|
|
// DEK_URL_RES,
|
|
|
// } from './constants';
|
|
|
// import { cleanDek } from 'cleaners';
|
|
|
// import {
|
|
|
// extractFromMeta,
|
|
|
// extractFromSelectors,
|
|
|
// } from 'utils/dom';
|
|
|
// Currently there is only one selector for
|
|
|
// deks. We should simply return null here
|
|
|
// until we have a more robust generic option.
|
|
|
// Below is the original source for this, for reference.
|
|
|
// Generic dek extraction is intentionally disabled: the single known
// selector was not robust enough, so this always reports "no dek
// found" (see the commented-out original implementation nearby).
var GenericDekExtractor = {
  extract: function extract() {
    return null;
  },
};
|
|
|
// # First, check to see if we have a matching meta tag that we can make
|
|
|
// # use of.
|
|
|
// dek = self.extract_from_meta('dek', constants.DEK_META_TAGS)
|
|
|
// if not dek:
|
|
|
// # Second, look through our CSS/XPath selectors. This may return
|
|
|
// # an HTML fragment.
|
|
|
// dek = self.extract_from_selectors('dek',
|
|
|
// constants.DEK_SELECTORS,
|
|
|
// text_only=False)
|
|
|
//
|
|
|
// if dek:
|
|
|
// # Make sure our dek isn't in the first few thousand characters
|
|
|
// # of the content, otherwise it's just the start of the article
|
|
|
// # and not a true dek.
|
|
|
// content = self.extract_content()
|
|
|
// content_chunk = normalize_spaces(strip_tags(content[:2000]))
|
|
|
// dek_chunk = normalize_spaces(dek[:100]) # Already has no tags.
|
|
|
//
|
|
|
// # 80% or greater similarity means the dek was very similar to some
|
|
|
// # of the starting content, so we skip it.
|
|
|
// if fuzz.partial_ratio(content_chunk, dek_chunk) < 80:
|
|
|
// return dek
|
|
|
//
|
|
|
// return None
|
|
|
|
|
|
// An ordered list of meta tag names that denote likely article leading images.
|
|
|
// All attributes should be lowercase for faster case-insensitive matching.
|
|
|
// From most distinct to least distinct.
|
|
|
var LEAD_IMAGE_URL_META_TAGS = ['og:image', 'twitter:image', 'image_src'];
var LEAD_IMAGE_URL_SELECTORS = ['link[rel=image_src]'];

// URL substrings suggesting an image is a genuine lead photo…
var POSITIVE_LEAD_IMAGE_URL_HINTS = [
  'upload', 'wp-content', 'large', 'photo', 'wp-image',
];
var POSITIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(
  POSITIVE_LEAD_IMAGE_URL_HINTS.join('|'),
  'i'
);

// …and substrings suggesting it is chrome, decoration, or an ad.
var NEGATIVE_LEAD_IMAGE_URL_HINTS = [
  'spacer', 'sprite', 'blank', 'throbber', 'gradient', 'tile', 'bg',
  'background', 'icon', 'social', 'header', 'hdr', 'advert', 'spinner',
  'loader', 'loading', 'default', 'rating', 'share', 'facebook',
  'twitter', 'theme', 'promo', 'ads', 'wp-includes',
];
var NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(
  NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'),
  'i'
);

// File-extension checks; an optional query string may follow the extension.
var GIF_RE = /\.gif(\?.*)?$/i;
var JPG_RE = /\.jpe?g(\?.*)?$/i;
|
|
|
|
|
|
// Builds a "signature" string for a node from its class and id
// attributes (e.g. "hero-photo lead"); a missing attribute
// contributes an empty string.
function getSig($node) {
  var klass = $node.attr('class') || '';
  var id = $node.attr('id') || '';
  return klass + ' ' + id;
}
// Scores image urls based on a variety of heuristics.
|
|
|
|
|
|
// Scores an image URL via cheap string heuristics: positive/negative
// path hints plus file extension. PNGs are neutral.
function scoreImageUrl(url) {
  var trimmed = url.trim();
  var score = 0;

  if (POSITIVE_LEAD_IMAGE_URL_HINTS_RE.test(trimmed)) score += 20;
  if (NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.test(trimmed)) score -= 20;

  // TODO: We might want to consider removing this as
  // gifs are much more common/popular than they once were
  if (GIF_RE.test(trimmed)) score -= 10;

  // JPGs are more often photographs.
  if (JPG_RE.test(trimmed)) score += 10;

  // PNGs are neutral.
  return score;
}
// Alt attribute usually means non-presentational image.
|
|
|
|
|
|
function scoreAttr($img) {
  // A non-empty alt text suggests a real (non-decorative) image.
  return $img.attr('alt') ? 5 : 0;
}
// Look through our parent and grandparent for figure-like
// container elements, give a bonus if we find them
|
|
|
|
|
|
// Scores an image by figure-like ancestry: +25 for a <figure>
// ancestor, +15 for a parent or grandparent whose class/id matches
// the photo-hints pattern.
function scoreByParents($img) {
  var score = 0;
  var $figParent = $img.parents('figure').first();

  if ($figParent.length === 1) {
    score += 25;
  }

  var $parent = $img.parent();
  var $gParent;

  if ($parent.length === 1) {
    $gParent = $parent.parent();
  }

  [$parent, $gParent].forEach(function($node) {
    // Guard against undefined: when $img has no parent, $gParent is
    // never assigned, and getSig(undefined) would throw a TypeError.
    if ($node && PHOTO_HINTS_RE$1.test(getSig($node))) {
      score += 15;
    }
  });
  return score;
}
// Look at our immediate sibling and see if it looks like it's a
// caption. Bonus if so.
|
|
|
|
|
|
// Scores an image by its immediate next sibling: +25 for a
// <figcaption>, +15 for a caption-like class/id signature.
function scoreBySibling($img) {
  var score = 0;
  var $sibling = $img.next();
  var siblingNode = $sibling.get(0);

  var isFigcaption =
    siblingNode && siblingNode.tagName.toLowerCase() === 'figcaption';

  if (isFigcaption) {
    score += 25;
  }

  if (PHOTO_HINTS_RE$1.test(getSig($sibling))) {
    score += 15;
  }

  return score;
}
|
|
|
// Scores an image by its declared width/height attributes: penalties
// for skinny (<=50px wide) or short (<=50px tall) images, then an
// area-based bonus/penalty for non-sprite images with both dimensions.
function scoreByDimensions($img) {
  var score = 0;

  var width = parseFloat($img.attr('width'));
  var height = parseFloat($img.attr('height'));
  var src = $img.attr('src'); // Penalty for skinny images

  if (width && width <= 50) {
    score -= 50;
  } // Penalty for short images

  if (height && height <= 50) {
    score -= 50;
  }

  // Guard on src: the attribute may be absent, and calling
  // .includes on undefined would throw a TypeError.
  if (width && height && src && !src.includes('sprite')) {
    var area = width * height;

    if (area < 5000) {
      // Smaller than 50 x 100
      score -= 100;
    } else {
      score += Math.round(area / 1000);
    }
  }

  return score;
}
|
|
|
// Earlier images beat later ones: bonus above the document midpoint,
// penalty below it.
function scoreByPosition($imgs, index) {
  var midpoint = $imgs.length / 2;
  return midpoint - index;
}
|
|
|
|
|
|
// Given a resource, tries to find the lead image URL from within
// it. Like content and next page extraction, uses a scoring system
|
|
|
// to determine what the most likely image may be. Short circuits
|
|
|
// on really probable things like og:image meta tags.
|
|
|
//
|
|
|
// Potential signals to still take advantage of:
|
|
|
// * domain
|
|
|
// * weird aspect ratio
|
|
|
|
|
|
var GenericLeadImageUrlExtractor = {
  // Returns the best lead-image URL for the document, or null.
  // `$` is the cheerio handle, `content` the extracted article HTML,
  // `metaCache` is passed through to extractFromMeta$$1 (presumably
  // the cached meta tag names — confirm against its definition), and
  // `html` is the raw document markup.
  extract: function extract(_ref) {
    var $ = _ref.$,
      content = _ref.content,
      metaCache = _ref.metaCache,
      html = _ref.html;
    var cleanUrl;

    // If the parsed document has no <head> (and this is not a
    // browser-backed handle), re-prepend the raw html so meta tags
    // become searchable.
    if (!$.browser && $('head').length === 0) {
      $('*')
        .first()
        .prepend(html);
    } // Check to see if we have a matching meta tag that we can make use of.
    // This runs first because common practice is now to put large images
    // on things like Open Graph or Twitter cards.

    var imageUrl = extractFromMeta$$1(
      $,
      LEAD_IMAGE_URL_META_TAGS,
      metaCache,
      false
    );

    if (imageUrl) {
      cleanUrl = clean$1(imageUrl);
      if (cleanUrl) return cleanUrl;
    } // Next, try to find the "best" image via the content.
    // We'd rather not have to fetch each image and check dimensions,
    // so try to do some analysis and determine them instead.

    var $content = $(content);
    var imgs = $('img', $content).toArray();
    var imgScores = {};
    imgs.forEach(function(img, index) {
      var $img = $(img);
      var src = $img.attr('src');
      if (!src) return;
      // Sum every heuristic: URL hints, alt text, figure-like
      // ancestors, caption-like siblings, declared size, position.
      var score = scoreImageUrl(src);
      score += scoreAttr($img);
      score += scoreByParents($img);
      score += scoreBySibling($img);
      score += scoreByDimensions($img);
      score += scoreByPosition(imgs, index);
      imgScores[src] = score;
    });

    // Pick the highest-scoring src; ties keep the earlier entry, and
    // only a strictly positive score is accepted below.
    var _Reflect$ownKeys$redu = _Reflect$ownKeys(imgScores).reduce(
        function(acc, key) {
          return imgScores[key] > acc[1] ? [key, imgScores[key]] : acc;
        },
        [null, 0]
      ),
      _Reflect$ownKeys$redu2 = _slicedToArray(_Reflect$ownKeys$redu, 2),
      topUrl = _Reflect$ownKeys$redu2[0],
      topScore = _Reflect$ownKeys$redu2[1];

    if (topScore > 0) {
      cleanUrl = clean$1(topUrl);
      if (cleanUrl) return cleanUrl;
    } // If nothing else worked, check to see if there are any really
    // probable nodes in the doc, like <link rel="image_src" />,
    // accepting the first usable src/href/value attribute found.
    // eslint-disable-next-line no-restricted-syntax

    var _iteratorNormalCompletion = true;
    var _didIteratorError = false;
    var _iteratorError = undefined;

    try {
      for (
        var _iterator = _getIterator(LEAD_IMAGE_URL_SELECTORS), _step;
        !(_iteratorNormalCompletion = (_step = _iterator.next()).done);
        _iteratorNormalCompletion = true
      ) {
        var selector = _step.value;
        var $node = $(selector).first();
        var src = $node.attr('src');

        if (src) {
          cleanUrl = clean$1(src);
          if (cleanUrl) return cleanUrl;
        }

        var href = $node.attr('href');

        if (href) {
          cleanUrl = clean$1(href);
          if (cleanUrl) return cleanUrl;
        }

        var value = $node.attr('value');

        if (value) {
          cleanUrl = clean$1(value);
          if (cleanUrl) return cleanUrl;
        }
      }
    } catch (err) {
      _didIteratorError = true;
      _iteratorError = err;
    } finally {
      // Babel-generated iterator cleanup: close the iterator on early
      // exit, then rethrow any error captured above.
      try {
        if (!_iteratorNormalCompletion && _iterator.return != null) {
          _iterator.return();
        }
      } finally {
        if (_didIteratorError) {
          throw _iteratorError;
        }
      }
    }

    return null;
  },
};
|
|
|
// """
|
|
|
// # First, try to find the "best" image via the content.
|
|
|
// # We'd rather not have to fetch each image and check dimensions,
|
|
|
// # so try to do some analysis and determine them instead.
|
|
|
// content = self.extractor.extract_content(return_type="node")
|
|
|
// imgs = content.xpath('.//img')
|
|
|
// img_scores = defaultdict(int)
|
|
|
// logger.debug('Scoring %d images from content', len(imgs))
|
|
|
// for (i, img) in enumerate(imgs):
|
|
|
// img_score = 0
|
|
|
//
|
|
|
// if not 'src' in img.attrib:
|
|
|
// logger.debug('No src attribute found')
|
|
|
// continue
|
|
|
//
|
|
|
// try:
|
|
|
// parsed_img = urlparse(img.attrib['src'])
|
|
|
// img_path = parsed_img.path.lower()
|
|
|
// except ValueError:
|
|
|
// logger.debug('ValueError getting img path.')
|
|
|
// continue
|
|
|
// logger.debug('Image path is %s', img_path)
|
|
|
//
|
|
|
// if constants.POSITIVE_LEAD_IMAGE_URL_HINTS_RE.match(img_path):
|
|
|
// logger.debug('Positive URL hints match. Adding 20.')
|
|
|
// img_score += 20
|
|
|
//
|
|
|
// if constants.NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.match(img_path):
|
|
|
// logger.debug('Negative URL hints match. Subtracting 20.')
|
|
|
// img_score -= 20
|
|
|
//
|
|
|
// # Gifs are more often structure than photos
|
|
|
// if img_path.endswith('gif'):
|
|
|
// logger.debug('gif found. Subtracting 10.')
|
|
|
// img_score -= 10
|
|
|
//
|
|
|
// # JPGs are more often photographs
|
|
|
// if img_path.endswith('jpg'):
|
|
|
// logger.debug('jpg found. Adding 10.')
|
|
|
// img_score += 10
|
|
|
//
|
|
|
// # PNGs are neutral.
|
|
|
//
|
|
|
// # Alt attribute usually means non-presentational image.
|
|
|
// if 'alt' in img.attrib and len(img.attrib['alt']) > 5:
|
|
|
// logger.debug('alt attribute found. Adding 5.')
|
|
|
// img_score += 5
|
|
|
//
|
|
|
// # Look through our parent and grandparent for figure-like
|
|
|
// # container elements, give a bonus if we find them
|
|
|
// parents = [img.getparent()]
|
|
|
// if parents[0] is not None and parents[0].getparent() is not None:
|
|
|
// parents.append(parents[0].getparent())
|
|
|
// for p in parents:
|
|
|
// if p.tag == 'figure':
|
|
|
// logger.debug('Parent with <figure> tag found. Adding 25.')
|
|
|
// img_score += 25
|
|
|
//
|
|
|
// p_sig = ' '.join([p.get('id', ''), p.get('class', '')])
|
|
|
// if constants.PHOTO_HINTS_RE.search(p_sig):
|
|
|
// logger.debug('Photo hints regex match. Adding 15.')
|
|
|
// img_score += 15
|
|
|
//
|
|
|
// # Look at our immediate sibling and see if it looks like it's a
|
|
|
// # caption. Bonus if so.
|
|
|
// sibling = img.getnext()
|
|
|
// if sibling is not None:
|
|
|
// if sibling.tag == 'figcaption':
|
|
|
// img_score += 25
|
|
|
//
|
|
|
// sib_sig = ' '.join([sibling.get('id', ''),
|
|
|
// sibling.get('class', '')]).lower()
|
|
|
// if 'caption' in sib_sig:
|
|
|
// img_score += 15
|
|
|
//
|
|
|
// # Pull out width/height if they were set.
|
|
|
// img_width = None
|
|
|
// img_height = None
|
|
|
// if 'width' in img.attrib:
|
|
|
// try:
|
|
|
// img_width = float(img.get('width'))
|
|
|
// except ValueError:
|
|
|
// pass
|
|
|
// if 'height' in img.attrib:
|
|
|
// try:
|
|
|
// img_height = float(img.get('height'))
|
|
|
// except ValueError:
|
|
|
// pass
|
|
|
//
|
|
|
// # Penalty for skinny images
|
|
|
// if img_width and img_width <= 50:
|
|
|
// logger.debug('Skinny image found. Subtracting 50.')
|
|
|
// img_score -= 50
|
|
|
//
|
|
|
// # Penalty for short images
|
|
|
// if img_height and img_height <= 50:
|
|
|
// # Wide, short images are more common than narrow, tall ones
|
|
|
// logger.debug('Short image found. Subtracting 25.')
|
|
|
// img_score -= 25
|
|
|
//
|
|
|
// if img_width and img_height and not 'sprite' in img_path:
|
|
|
// area = img_width * img_height
|
|
|
//
|
|
|
// if area < 5000: # Smaller than 50x100
|
|
|
// logger.debug('Image with small area found. Subtracting 100.')
|
|
|
// img_score -= 100
|
|
|
// else:
|
|
|
// img_score += round(area/1000.0)
|
|
|
//
|
|
|
// # If the image is higher on the page than other images,
|
|
|
// # it gets a bonus. Penalty if lower.
|
|
|
// logger.debug('Adding page placement bonus of %d.', len(imgs)/2 - i)
|
|
|
// img_score += len(imgs)/2 - i
|
|
|
//
|
|
|
// # Use the raw src here because we munged img_path for case
|
|
|
// # insensitivity
|
|
|
// logger.debug('Final score is %d.', img_score)
|
|
|
// img_scores[img.attrib['src']] += img_score
|
|
|
//
|
|
|
// top_score = 0
|
|
|
// top_url = None
|
|
|
// for (url, score) in img_scores.items():
|
|
|
// if score > top_score:
|
|
|
// top_url = url
|
|
|
// top_score = score
|
|
|
//
|
|
|
// if top_score > 0:
|
|
|
// logger.debug('Using top score image from content. Score was %d', top_score)
|
|
|
// return top_url
|
|
|
//
|
|
|
//
|
|
|
// # If nothing else worked, check to see if there are any really
|
|
|
// # probable nodes in the doc, like <link rel="image_src" />.
|
|
|
// logger.debug('Trying to find lead image in probable nodes')
|
|
|
// for selector in constants.LEAD_IMAGE_URL_SELECTORS:
|
|
|
// nodes = self.resource.extract_by_selector(selector)
|
|
|
// for node in nodes:
|
|
|
// clean_value = None
|
|
|
// if node.attrib.get('src'):
|
|
|
// clean_value = self.clean(node.attrib['src'])
|
|
|
//
|
|
|
// if not clean_value and node.attrib.get('href'):
|
|
|
// clean_value = self.clean(node.attrib['href'])
|
|
|
//
|
|
|
// if not clean_value and node.attrib.get('value'):
|
|
|
// clean_value = self.clean(node.attrib['value'])
|
|
|
//
|
|
|
// if clean_value:
|
|
|
// logger.debug('Found lead image in probable nodes.')
|
|
|
// logger.debug('Node was: %s', node)
|
|
|
// return clean_value
|
|
|
//
|
|
|
// return None
|
|
|
|
|
|
// Adjusts a candidate link's score by its textual similarity to the
// article URL. Run last, and only on already-positive candidates,
// because the sequence match is comparatively expensive.
function scoreSimilarity(score, articleUrl, href) {
  if (score <= 0) {
    return 0;
  }

  var similarity = new difflib.SequenceMatcher(
    null,
    articleUrl,
    href
  ).ratio();

  // The modifier is positive when the URLs differ by less than 20%
  // (a bonus) and grows increasingly negative beyond that.
  var diffPercent = 1.0 - similarity;
  var diffModifier = -(250 * (diffPercent - 0.2));
  return score + diffModifier;
}
|
|
|
|
|
|
// Scores purely-numeric link text: lower page numbers are slightly
// preferred, page 1 is penalized (we already have it), and numbers
// at or below the current page number get a heavy penalty.
function scoreLinkText(linkText, pageNum) {
  var score = 0;

  if (!IS_DIGIT_RE.test(linkText.trim())) {
    return score;
  }

  var linkTextAsNum = _parseInt(linkText, 10);

  if (linkTextAsNum < 2) {
    // If it's the first page, we already got it on the first call.
    score = -30;
  } else {
    // Up to page 10, give a small bonus.
    score = Math.max(0, 10 - linkTextAsNum);
  }

  // If the current page number is at or past this link's page
  // number, that's a very bad sign. Big penalty.
  if (pageNum && pageNum >= linkTextAsNum) {
    score -= 50;
  }

  return score;
}
|
|
|
|
|
|
// A page number in the link URL is a strong signal — except on
// WordPress sites, whose ?p=123 URLs identify separate posts, not
// pages of one article.
function scorePageInLink(pageNum, isWp) {
  return pageNum && !isWp ? 50 : 0;
}
|
|
|
|
|
|
var DIGIT_RE$2 = /\d/;

// A list of words that, if found in link text or URLs, likely mean that
// this link is not a next page link.
var EXTRANEOUS_LINK_HINTS$1 = [
  'print', 'archive', 'comment', 'discuss', 'e-mail', 'email', 'share',
  'reply', 'all', 'login', 'sign', 'single', 'adx', 'entry-unrelated',
];
var EXTRANEOUS_LINK_HINTS_RE$1 = new RegExp(
  EXTRANEOUS_LINK_HINTS$1.join('|'),
  'i'
);

// Match any link text/classname/id that looks like it could mean the next
// page. Things like: next, continue, >, >>, » but not >|, »| as those can
// mean last page.
var NEXT_LINK_TEXT_RE$1 = new RegExp(
  '(next|weiter|continue|>([^|]|$)|»([^|]|$))',
  'i'
);

// Match any link text/classname/id that looks like it is an end link: things
// like "first", "last", "end", etc.
var CAP_LINK_TEXT_RE$1 = new RegExp('(first|last|end)', 'i');

// Match any link text/classname/id that looks like it means the previous
// page.
var PREV_LINK_TEXT_RE$1 = new RegExp('(prev|earl|old|new|<|«)', 'i');
// Match any phrase that looks like it could be page, or paging, or pagination
|
|
|
|
|
|
// If the URL itself contains extraneous values ("comment", "share",
// …), give a penalty.
function scoreExtraneousLinks(href) {
  return EXTRANEOUS_LINK_HINTS_RE$1.test(href) ? -25 : 0;
}
|
|
|
|
|
|
// Signature string of a link's class and id attributes.
function makeSig($link) {
  var klass = $link.attr('class') || '';
  var id = $link.attr('id') || '';
  return klass + ' ' + id;
}
|
|
|
|
|
|
// Walks up the link's ancestors (as many steps as range(0, 4)
// yields), rewarding paging-like classnames/ids and penalizing
// comment-/sponsor-like ones. Each adjustment applies at most once.
function scoreByParents$1($link) {
  var $parent = $link.parent();
  var positiveMatch = false;
  var negativeMatch = false;
  var score = 0;

  _Array$from(range(0, 4)).forEach(function() {
    if ($parent.length === 0) {
      return;
    }

    var parentData = makeSig($parent);

    // 'page' or 'paging' in the ancestor's signature is a good sign.
    if (!positiveMatch && PAGE_RE.test(parentData)) {
      positiveMatch = true;
      score += 25;
    }

    // 'comment' or similar, with nothing content-like to offset it,
    // is a bad sign.
    if (
      !negativeMatch &&
      NEGATIVE_SCORE_RE.test(parentData) &&
      EXTRANEOUS_LINK_HINTS_RE$1.test(parentData) &&
      !POSITIVE_SCORE_RE.test(parentData)
    ) {
      negativeMatch = true;
      score -= 25;
    }

    $parent = $parent.parent();
  });

  return score;
}
|
|
|
|
|
|
// Links reading like "previous"/"older"/« are definitely not the
// next page — disqualify them with a large penalty.
function scorePrevLink(linkData) {
  return PREV_LINK_TEXT_RE$1.test(linkData) ? -200 : 0;
}
|
|
|
|
|
|
// First-pass filter: returns false for links that cannot plausibly
// be the next page (already fetched, identical to the article/base
// URL, off-domain, digit-free, extraneous, or with over-long text).
function shouldScore(
  href,
  articleUrl,
  baseUrl,
  parsedUrl,
  linkText,
  previousUrls
) {
  // skip if we've already fetched this url
  var alreadyFetched = previousUrls.some(function(url) {
    return href === url;
  });

  if (alreadyFetched) {
    return false;
  }

  // If the URL matches the article or base URL, or is empty, skip it.
  if (!href || href === articleUrl || href === baseUrl) {
    return false;
  }

  // Domain mismatch.
  var hostname = parsedUrl.hostname;
  var linkHost = URL.parse(href).hostname;

  if (linkHost !== hostname) {
    return false;
  }

  // If href doesn't contain a digit after removing the base URL,
  // it's certainly not the next page.
  var fragment = href.replace(baseUrl, '');

  if (!DIGIT_RE$2.test(fragment)) {
    return false;
  }

  // This link has extraneous content (like "comment") in its link
  // text, so we skip it.
  if (EXTRANEOUS_LINK_HINTS_RE$1.test(linkText)) {
    return false;
  }

  // Next page link text is never long, skip if it is too long.
  if (linkText.length > 25) {
    return false;
  }

  return true;
}
|
|
|
|
|
|
// If the baseUrl isn't part of this URL, penalize this link. It could
// still be the next page, but the odds are lower. Example:
// http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
function scoreBaseUrl(href, baseRegex) {
  return baseRegex.test(href) ? 0 : -25;
}
|
|
|
|
|
|
// Bonus for link data reading like "next", ">>", etc.
function scoreNextLinkText(linkData) {
  return NEXT_LINK_TEXT_RE$1.test(linkData) ? 50 : 0;
}
|
|
|
|
|
|
function scoreCapLinks(linkData) {
  // Cap links are links like "last", etc.
  if (!CAP_LINK_TEXT_RE$1.test(linkData)) {
    return 0;
  }

  // A cap-style link that also matches the "next" pattern gets a
  // penalty; a cap link on its own scores neutrally here.
  return NEXT_LINK_TEXT_RE$1.test(linkData) ? -65 : 0;
}
|
|
|
|
|
|
// Builds a case-insensitive regex matching URLs that start with
// baseUrl. Regex metacharacters in the URL (., ?, +, $, …) are
// escaped so they match literally — without this, every '.' in the
// hostname matches any character and a '?' makes the preceding
// character optional, producing false matches.
function makeBaseRegex(baseUrl) {
  var escaped = baseUrl.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  return new RegExp('^'.concat(escaped), 'i');
}
|
|
|
|
|
|
// Signature of a link: its text (or the provided override), plus its
// class and id attributes, space-separated.
function makeSig$1($link, linkText) {
  var text = linkText || $link.text();
  var klass = $link.attr('class') || '';
  var id = $link.attr('id') || '';
  return text + ' ' + klass + ' ' + id;
}
|
|
|
|
|
|
// Scores every candidate anchor for next-page likelihood. Returns a
// map of href -> { score, linkText, href }, or null when no link
// survived the first-pass filter.
function scoreLinks(_ref) {
  var links = _ref.links,
    articleUrl = _ref.articleUrl,
    baseUrl = _ref.baseUrl,
    parsedUrl = _ref.parsedUrl,
    $ = _ref.$,
    _ref$previousUrls = _ref.previousUrls,
    previousUrls = _ref$previousUrls === void 0 ? [] : _ref$previousUrls;
  parsedUrl = parsedUrl || URL.parse(articleUrl);
  var baseRegex = makeBaseRegex(baseUrl);
  var isWp = isWordpress($); // Loop through all links, looking for hints that they may be next-page
  // links. Things like having "page" in their textContent, className or
  // id, or being a child of a node with a page-y className or id.
  //
  // After we do that, assign each page a score, and pick the one that
  // looks most like the next page link, as long as its score is strong
  // enough to have decent confidence.

  var scoredPages = links.reduce(function(possiblePages, link) {
    // Remove any anchor data since we don't do a good job
    // standardizing URLs (it's hard), we're going to do
    // some checking with and without a trailing slash
    var attrs = getAttrs(link); // if href is undefined, return

    if (!attrs.href) return possiblePages;
    var href = removeAnchor(attrs.href);
    var $link = $(link);
    var linkText = $link.text();

    if (
      !shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrls)
    ) {
      return possiblePages;
    } // ## PASSED THE FIRST-PASS TESTS. Start scoring. ##

    if (!possiblePages[href]) {
      // First sighting of this href: create a fresh entry.
      possiblePages[href] = {
        score: 0,
        linkText: linkText,
        href: href,
      };
    } else {
      // Duplicate href: accumulate its link text, '|'-separated.
      possiblePages[href].linkText = ''
        .concat(possiblePages[href].linkText, '|')
        .concat(linkText);
    }

    var possiblePage = possiblePages[href];
    var linkData = makeSig$1($link, linkText);
    var pageNum = pageNumFromUrl(href);
    // Sum every heuristic. scoreSimilarity runs last and receives the
    // running score, since it only adjusts positive candidates.
    var score = scoreBaseUrl(href, baseRegex);
    score += scoreNextLinkText(linkData);
    score += scoreCapLinks(linkData);
    score += scorePrevLink(linkData);
    score += scoreByParents$1($link);
    score += scoreExtraneousLinks(href);
    score += scorePageInLink(pageNum, isWp);
    score += scoreLinkText(linkText, pageNum);
    score += scoreSimilarity(score, articleUrl, href);
    // Note: a duplicate href's score is recomputed and overwrites the
    // previous value (only linkText accumulates).
    possiblePage.score = score;
    return possiblePages;
  }, {});
  return _Reflect$ownKeys(scoredPages).length === 0 ? null : scoredPages;
}
|
|
|
|
|
|
// Scores the document's links and returns the most likely next page url,
// for multi-page articles
|
|
|
|
|
|
var GenericNextPageUrlExtractor = {
  // Returns the most likely next-page URL for a multi-page article,
  // or null when no candidate link scores confidently enough (>= 50).
  extract: function extract(_ref) {
    var $ = _ref.$,
      url = _ref.url,
      parsedUrl = _ref.parsedUrl,
      _ref$previousUrls = _ref.previousUrls,
      previousUrls = _ref$previousUrls === void 0 ? [] : _ref$previousUrls;
    parsedUrl = parsedUrl || URL.parse(url);
    var articleUrl = removeAnchor(url);
    var baseUrl = articleBaseUrl(url, parsedUrl);
    // Every anchor with an href is a candidate; scoreLinks filters
    // and scores them.
    var links = $('a[href]').toArray();
    var scoredLinks = scoreLinks({
      links: links,
      articleUrl: articleUrl,
      baseUrl: baseUrl,
      parsedUrl: parsedUrl,
      $: $,
      previousUrls: previousUrls,
    }); // If no links were scored, return null

    if (!scoredLinks) return null; // now that we've scored all possible pages,
    // find the biggest one.

    var topPage = _Reflect$ownKeys(scoredLinks).reduce(
      function(acc, link) {
        var scoredLink = scoredLinks[link];
        return scoredLink.score > acc.score ? scoredLink : acc;
      },
      {
        score: -100,
      }
    ); // If the score is less than 50, we're not confident enough to use it,
    // so we fail.

    if (topPage.score >= 50) {
      return topPage.href;
    }

    return null;
  },
};
|
|
|
|
|
|
var CANONICAL_META_SELECTORS = ['og:url'];

// Hostname of a URL string (e.g. 'example.com').
function parseDomain(url) {
  return URL.parse(url).hostname;
}

// Shapes a URL into the { url, domain } result object.
function result(url) {
  return {
    url: url,
    domain: parseDomain(url),
  };
}
|
|
|
|
|
|
var GenericUrlExtractor = {
  // Resolves the canonical url/domain: <link rel=canonical> first,
  // then the og:url meta tag, falling back to the fetched URL.
  extract: function extract(_ref) {
    var $ = _ref.$,
      url = _ref.url,
      metaCache = _ref.metaCache;
    var $canonical = $('link[rel=canonical]');

    if ($canonical.length !== 0) {
      var href = $canonical.attr('href');
      if (href) return result(href);
    }

    var metaUrl = extractFromMeta$$1($, CANONICAL_META_SELECTORS, metaCache);
    return metaUrl ? result(metaUrl) : result(url);
  },
};
|
|
|
|
|
|
var EXCERPT_META_SELECTORS = ['og:description', 'twitter:description'];

// Collapses runs of whitespace in `content` and ellipsizes it to
// `maxLength` characters (default 200) with a '…'.
function clean$2(content, $) {
  var maxLength =
    arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 200;
  var collapsed = content.replace(/[\s\n]+/g, ' ').trim();
  return ellipsize(collapsed, maxLength, { ellipse: '…' });
}
|
|
|
var GenericExcerptExtractor = {
  // Builds a short excerpt: prefer description meta tags; otherwise
  // clean the first chunk of the extracted content.
  extract: function extract(_ref) {
    var $ = _ref.$,
      content = _ref.content,
      metaCache = _ref.metaCache;
    var excerpt = extractFromMeta$$1($, EXCERPT_META_SELECTORS, metaCache);

    if (excerpt) {
      return clean$2(stripTags(excerpt, $));
    }

    // Fall back to excerpting from the extracted content. Take 5x the
    // target length of raw markup so tags don't starve the excerpt.
    var maxLength = 200;
    var head = content.slice(0, maxLength * 5);
    return clean$2($(head).text(), $, maxLength);
  },
};
|
|
|
|
|
|
var GenericWordCountExtractor = {
  // Counts whitespace-separated tokens in the extracted content's
  // first <div>.
  extract: function extract(_ref) {
    var content = _ref.content;
    var $ = cheerio.load(content);
    var text = normalizeSpaces(
      $('div')
        .first()
        .text()
    );
    return text.split(/\s/).length;
  },
};
|
|
|
|
|
|
var GenericExtractor = {
  // This extractor is the default for all domains
  domain: '*',
  title: GenericTitleExtractor.extract,
  date_published: GenericDatePublishedExtractor.extract,
  author: GenericAuthorExtractor.extract,
  content: GenericContentExtractor.extract.bind(GenericContentExtractor),
  lead_image_url: GenericLeadImageUrlExtractor.extract,
  dek: GenericDekExtractor.extract,
  next_page_url: GenericNextPageUrlExtractor.extract,
  url_and_domain: GenericUrlExtractor.extract,
  excerpt: GenericExcerptExtractor.extract,
  word_count: GenericWordCountExtractor.extract,
  direction: function direction(_ref) {
    var title = _ref.title;
    return stringDirection.getDirection(title);
  },
  // Runs every field extractor in dependency order and assembles the
  // full result object. `options` carries `html` and/or a cheerio
  // handle `$` plus whatever the individual extractors read.
  extract: function extract(options) {
    var html = options.html,
      $ = options.$;

    // Lazily parse the html when the caller didn't supply cheerio.
    // NOTE: options is mutated so downstream extractors see `$`.
    if (html && !$) {
      var loaded = cheerio.load(html);
      options.$ = loaded;
    }

    var title = this.title(options);
    var date_published = this.date_published(options);
    var author = this.author(options);
    // The title is forwarded to the content extractor.
    var content = this.content(
      _objectSpread({}, options, {
        title: title,
      })
    );
    // The remaining extractors receive the extracted content.
    var lead_image_url = this.lead_image_url(
      _objectSpread({}, options, {
        content: content,
      })
    );
    var dek = this.dek(
      _objectSpread({}, options, {
        content: content,
      })
    );
    var next_page_url = this.next_page_url(options);
    var excerpt = this.excerpt(
      _objectSpread({}, options, {
        content: content,
      })
    );
    var word_count = this.word_count(
      _objectSpread({}, options, {
        content: content,
      })
    );
    // Text direction is derived from the title alone.
    var direction = this.direction({
      title: title,
    });

    var _this$url_and_domain = this.url_and_domain(options),
      url = _this$url_and_domain.url,
      domain = _this$url_and_domain.domain;

    return {
      title: title,
      author: author,
      date_published: date_published || null,
      dek: dek,
      lead_image_url: lead_image_url,
      content: content,
      next_page_url: next_page_url,
      url: url,
      domain: domain,
      excerpt: excerpt,
      word_count: word_count,
      direction: direction,
    };
  },
};
|
|
|
|
|
|
// Platform fingerprints: a selector that identifies the platform,
// mapped to the custom extractor for it.
var Detectors = {
  'meta[name="al:ios:app_name"][value="Medium"]': MediumExtractor,
  'meta[name="generator"][value="blogger"]': BloggerExtractor,
};

// Returns the custom extractor whose fingerprint selector matches the
// document, or undefined when none does.
function detectByHtml($) {
  var matched = _Reflect$ownKeys(Detectors).find(function(s) {
    return $(s).length > 0;
  });
  return Detectors[matched];
}
|
|
|
|
|
|
// Chooses the extractor for a URL: exact hostname match first, then
// the two-label base domain, then HTML fingerprinting, and finally
// the generic fallback.
function getExtractor(url, parsedUrl, $) {
  parsedUrl = parsedUrl || URL.parse(url);
  var hostname = parsedUrl.hostname;
  var baseDomain = hostname
    .split('.')
    .slice(-2)
    .join('.');
  return (
    Extractors[hostname] ||
    Extractors[baseDomain] ||
    detectByHtml($) ||
    GenericExtractor
  );
}
|
|
|
|
|
|
// Removes elements matching the extractor-supplied `clean` selectors
// from $content (mutates in place) and returns $content.
function cleanBySelectors($content, $, _ref) {
  var clean = _ref.clean;

  if (clean) {
    $(clean.join(','), $content).remove();
  }

  return $content;
}
// Transform matching elements
|
|
|
|
|
|
// Applies extractor `transforms` to $content and returns it. A string
// value converts every match to that tag name; a function value runs
// per node and may return a tag name to convert the node to.
function transformElements($content, $, _ref2) {
  var transforms = _ref2.transforms;
  if (!transforms) return $content;

  _Reflect$ownKeys(transforms).forEach(function(key) {
    var $matches = $(key, $content);
    var value = transforms[key];

    if (typeof value === 'string') {
      // Straight tag conversion for every match.
      $matches.each(function(index, node) {
        convertNodeTo$$1($(node), $, value);
      });
    } else if (typeof value === 'function') {
      // Run the transform; a string return value means "convert the
      // node to this tag", any other return leaves the node as-is.
      $matches.each(function(index, node) {
        var result = value($(node), $);

        if (typeof result === 'string') {
          convertNodeTo$$1($(node), $, result);
        }
      });
    }
  });

  return $content;
}
|
|
|
|
|
|
// Finds the first usable selector in `selectors`. A plain selector
// must match exactly one node with non-empty text. An array entry is
// either a multi-match selector list (html extraction: every member
// must match) or a [selector, attr] pair (one match with a non-empty
// attribute value).
function findMatchingSelector($, selectors, extractHtml) {
  return selectors.find(function(selector) {
    if (!_Array$isArray(selector)) {
      return (
        $(selector).length === 1 &&
        $(selector)
          .text()
          .trim() !== ''
      );
    }

    if (extractHtml) {
      return selector.reduce(function(acc, s) {
        return acc && $(s).length > 0;
      }, true);
    }

    var s = selector[0];
    var attr = selector[1];
    return (
      $(s).length === 1 &&
      $(s).attr(attr) &&
      $(s)
        .attr(attr)
        .trim() !== ''
    );
  });
}
|
|
|
|
|
|
// Extract a single field (opts.type) from the document using a custom
// extractor's extraction options. Returns:
//   - the hardcoded string when extractionOpts is itself a string,
//   - cleaned HTML (via $.html) when extractHtml is set,
//   - otherwise a trimmed text/attribute value (optionally run through
//     the type's default cleaner),
//   - null when there are no options or no selector matches.
function select(opts) {
  var $ = opts.$,
    type = opts.type,
    extractionOpts = opts.extractionOpts,
    _opts$extractHtml = opts.extractHtml,
    extractHtml = _opts$extractHtml === void 0 ? false : _opts$extractHtml; // Skip if there's no extraction defined for this type

  if (!extractionOpts) return null; // If a string is hardcoded for a type (e.g., Wikipedia
  // contributors), return the string

  if (typeof extractionOpts === 'string') return extractionOpts;
  var selectors = extractionOpts.selectors,
    _extractionOpts$defau = extractionOpts.defaultCleaner,
    defaultCleaner =
      _extractionOpts$defau === void 0 ? true : _extractionOpts$defau;
  var matchingSelector = findMatchingSelector($, selectors, extractHtml);
  if (!matchingSelector) return null; // Declaring result; will contain either
  // text or html, which will be cleaned
  // by the appropriate cleaner type

  // If the selector type requests html as its return type
  // transform and clean the element with provided selectors

  var $content;

  if (extractHtml) {
    // If matching selector is an array, we're considering this a
    // multi-match selection, which allows the parser to choose several
    // selectors to include in the result. Note that all selectors in the
    // array must match in order for this selector to trigger
    if (_Array$isArray(matchingSelector)) {
      // Collect every match into a detached wrapper div so the pieces
      // are emitted as one contiguous fragment.
      $content = $(matchingSelector.join(','));
      var $wrapper = $('<div></div>');
      $content.each(function(index, element) {
        $wrapper.append(element);
      });
      $content = $wrapper;
    } else {
      $content = $(matchingSelector);
    } // Wrap in div so transformation can take place on root element

    $content.wrap($('<div></div>'));
    $content = $content.parent();
    // Order matters here: transforms run before selector-based cleaning,
    // then the type-specific cleaner runs last.
    $content = transformElements($content, $, extractionOpts);
    $content = cleanBySelectors($content, $, extractionOpts);
    $content = Cleaners[type](
      $content,
      _objectSpread({}, opts, {
        defaultCleaner: defaultCleaner,
      })
    );
    return $.html($content);
  }

  var result; // if selector is an array (e.g., ['img', 'src']),
  // extract the attr

  if (_Array$isArray(matchingSelector)) {
    var _matchingSelector = _slicedToArray(matchingSelector, 2),
      selector = _matchingSelector[0],
      attr = _matchingSelector[1];

    result = $(selector)
      .attr(attr)
      .trim();
  } else {
    // NOTE: in the text path, cleaning runs before transforms
    // (the reverse of the html path above).
    var $node = $(matchingSelector);
    $node = cleanBySelectors($node, $, extractionOpts);
    $node = transformElements($node, $, extractionOpts);
    result = $node.text().trim();
  } // Allow custom extractor to skip default cleaner
  // for this type; defaults to true

  if (defaultCleaner) {
    return Cleaners[type](result, _objectSpread({}, opts, extractionOpts));
  }

  return result;
}
|
|
|
|
|
|
// Run the custom extractor for a single field type; when it yields
// nothing and fallback is enabled (the default), defer to the
// GenericExtractor for that type. Returns null when both paths fail.
function extractResult(opts) {
  var type = opts.type;
  var extractor = opts.extractor;
  var fallback = opts.fallback === void 0 ? true : opts.fallback;

  var selected = select(
    _objectSpread({}, opts, {
      extractionOpts: extractor[type],
    })
  );

  // If custom parser succeeds, return its result as-is.
  if (selected) {
    return selected;
  }

  // Otherwise optionally run the generic extraction for this type.
  return fallback ? GenericExtractor[type](opts) : null;
}
|
|
|
|
|
|
// Orchestrates a full extraction pass. `extract(extractor, opts)` runs
// every field type through extractResult in dependency order (title
// before content; content before lead image/excerpt/dek/word count)
// and assembles the final result object. With opts.contentOnly set
// (used when fetching subsequent pages), only content is extracted.
var RootExtractor = {
  extract: function extract() {
    // Compiled default parameter: extractor defaults to GenericExtractor.
    var extractor =
      arguments.length > 0 && arguments[0] !== undefined
        ? arguments[0]
        : GenericExtractor;
    var opts = arguments.length > 1 ? arguments[1] : undefined;
    var _opts = opts,
      contentOnly = _opts.contentOnly,
      extractedTitle = _opts.extractedTitle; // This is the generic extractor. Run its extract method

    if (extractor.domain === '*') return extractor.extract(opts);
    opts = _objectSpread({}, opts, {
      extractor: extractor,
    });

    if (contentOnly) {
      // Subsequent-page mode: reuse the already-extracted title so the
      // content cleaner can strip repeated headings.
      var _content = extractResult(
        _objectSpread({}, opts, {
          type: 'content',
          extractHtml: true,
          title: extractedTitle,
        })
      );

      return {
        content: _content,
      };
    }

    var title = extractResult(
      _objectSpread({}, opts, {
        type: 'title',
      })
    );
    var date_published = extractResult(
      _objectSpread({}, opts, {
        type: 'date_published',
      })
    );
    var author = extractResult(
      _objectSpread({}, opts, {
        type: 'author',
      })
    );
    var next_page_url = extractResult(
      _objectSpread({}, opts, {
        type: 'next_page_url',
      })
    );
    // Content extraction consumes the title extracted above.
    var content = extractResult(
      _objectSpread({}, opts, {
        type: 'content',
        extractHtml: true,
        title: title,
      })
    );
    // The remaining fields are derived from the extracted content.
    var lead_image_url = extractResult(
      _objectSpread({}, opts, {
        type: 'lead_image_url',
        content: content,
      })
    );
    var excerpt = extractResult(
      _objectSpread({}, opts, {
        type: 'excerpt',
        content: content,
      })
    );
    var dek = extractResult(
      _objectSpread({}, opts, {
        type: 'dek',
        content: content,
        excerpt: excerpt,
      })
    );
    var word_count = extractResult(
      _objectSpread({}, opts, {
        type: 'word_count',
        content: content,
      })
    );
    var direction = extractResult(
      _objectSpread({}, opts, {
        type: 'direction',
        title: title,
      })
    );

    // url_and_domain may return null; fall back to null fields.
    var _ref3 = extractResult(
        _objectSpread({}, opts, {
          type: 'url_and_domain',
        })
      ) || {
        url: null,
        domain: null,
      },
      url = _ref3.url,
      domain = _ref3.domain;

    return {
      title: title,
      content: content,
      author: author,
      date_published: date_published,
      lead_image_url: lead_image_url,
      dek: dek,
      next_page_url: next_page_url,
      url: url,
      domain: domain,
      excerpt: excerpt,
      word_count: word_count,
      direction: direction,
    };
  },
};
|
|
|
|
|
|
// Public async wrapper over the compiled generator implementation
// below; the unused `_x` parameter only preserves the function's arity.
function collectAllPages(_x) {
  return _collectAllPages.apply(this, arguments);
}
|
|
|
|
|
|
// Compiled async implementation of collectAllPages: starting from the
// first page's extraction result, follow next_page_url links (capped at
// 26 pages as a runaway guard), append each page's content separated by
// a page header, and return the combined result with page counts and a
// recomputed word count.
//
// Fix: the returned object previously exposed the page count only as
// `pages_rendered`, while the single-page path in Mercury.parse uses
// `rendered_pages` — so consumers reading `rendered_pages` saw
// undefined for paginated articles. Both keys are now emitted;
// `pages_rendered` is kept for backward compatibility.
function _collectAllPages() {
  _collectAllPages = _asyncToGenerator(
    /*#__PURE__*/
    _regeneratorRuntime.mark(function _callee(_ref) {
      var next_page_url,
        html,
        $,
        metaCache,
        result,
        Extractor,
        title,
        url,
        pages,
        previousUrls,
        extractorOpts,
        nextPageResult,
        word_count;
      return _regeneratorRuntime.wrap(
        function _callee$(_context) {
          while (1) {
            switch ((_context.prev = _context.next)) {
              case 0:
                (next_page_url = _ref.next_page_url),
                  (html = _ref.html),
                  ($ = _ref.$),
                  (metaCache = _ref.metaCache),
                  (result = _ref.result),
                  (Extractor = _ref.Extractor),
                  (title = _ref.title),
                  (url = _ref.url);
                // At this point, we've fetched just the first page
                pages = 1;
                previousUrls = [removeAnchor(url)]; // If we've gone over 26 pages, something has
                // likely gone wrong.

              case 3:
                // Loop head: keep fetching while a next page exists and
                // the safety cap has not been reached.
                if (!(next_page_url && pages < 26)) {
                  _context.next = 16;
                  break;
                }

                pages += 1; // eslint-disable-next-line no-await-in-loop

                _context.next = 7;
                return Resource.create(next_page_url);

              case 7:
                $ = _context.sent;
                html = $.html();
                // contentOnly/extractedTitle put RootExtractor into
                // subsequent-page mode (content extraction only).
                extractorOpts = {
                  url: next_page_url,
                  html: html,
                  $: $,
                  metaCache: metaCache,
                  contentOnly: true,
                  extractedTitle: title,
                  previousUrls: previousUrls,
                };
                nextPageResult = RootExtractor.extract(
                  Extractor,
                  extractorOpts
                );
                previousUrls.push(next_page_url);
                // Stitch the new page's content onto the running result,
                // separated by an <hr> and a page header.
                result = _objectSpread({}, result, {
                  content: ''
                    .concat(result.content, '<hr><h4>Page ')
                    .concat(pages, '</h4>')
                    .concat(nextPageResult.content),
                }); // eslint-disable-next-line prefer-destructuring

                next_page_url = nextPageResult.next_page_url;
                _context.next = 3;
                break;

              case 16:
                // Recompute word count over the full combined content.
                word_count = GenericExtractor.word_count({
                  content: '<div>'.concat(result.content, '</div>'),
                });
                return _context.abrupt(
                  'return',
                  _objectSpread({}, result, {
                    total_pages: pages,
                    pages_rendered: pages,
                    // Also expose the key name used by the single-page
                    // path in Mercury.parse (`rendered_pages`).
                    rendered_pages: pages,
                    word_count: word_count,
                  })
                );

              case 18:
              case 'end':
                return _context.stop();
            }
          }
        },
        _callee,
        this
      );
    })
  );
  return _collectAllPages.apply(this, arguments);
}
|
|
|
|
|
|
// Top-level Mercury API object.
//   parse(url, html, opts) -> Promise of the extraction result (or an
//     Errors.badUrl sentinel / failed resource object on error).
//     opts.fetchAllPages (default true): follow next_page_url links.
//     opts.fallback (default true): allow GenericExtractor fallback.
//   browser: true when running under the browser cheerio shim.
//   fetchResource(url): convenience wrapper around Resource.create.
var Mercury = {
  parse: function parse(url, html) {
    // Compiled default parameter: opts defaults to {}.
    var opts =
      arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : {};
    return _asyncToGenerator(
      /*#__PURE__*/
      _regeneratorRuntime.mark(function _callee() {
        var _opts$fetchAllPages,
          fetchAllPages,
          _opts$fallback,
          fallback,
          parsedUrl,
          $,
          Extractor,
          metaCache,
          result,
          _result,
          title,
          next_page_url;

        return _regeneratorRuntime.wrap(
          function _callee$(_context) {
            while (1) {
              switch ((_context.prev = _context.next)) {
                case 0:
                  // Compiled destructuring with defaults for opts.
                  (_opts$fetchAllPages = opts.fetchAllPages),
                    (fetchAllPages =
                      _opts$fetchAllPages === void 0
                        ? true
                        : _opts$fetchAllPages),
                    (_opts$fallback = opts.fallback),
                    (fallback =
                      _opts$fallback === void 0 ? true : _opts$fallback); // if no url was passed and this is the browser version,
                  // set url to window.location.href and load the html
                  // from the current page

                  if (!url && cheerio.browser) {
                    url = window.location.href; // eslint-disable-line no-undef

                    html = html || cheerio.html();
                  }

                  parsedUrl = URL.parse(url);

                  if (validateUrl(parsedUrl)) {
                    _context.next = 5;
                    break;
                  }

                  // Invalid URL: bail out with the badUrl error object.
                  return _context.abrupt('return', Errors.badUrl);

                case 5:
                  _context.next = 7;
                  return Resource.create(url, html, parsedUrl);

                case 7:
                  $ = _context.sent;
                  Extractor = getExtractor(url, parsedUrl, $); // console.log(`Using extractor for ${Extractor.domain}`);
                  // If we found an error creating the resource, return that error

                  if (!$.failed) {
                    _context.next = 11;
                    break;
                  }

                  return _context.abrupt('return', $);

                case 11:
                  // if html still has not been set (i.e., url passed to Mercury.parse),
                  // set html from the response of Resource.create
                  if (!html) {
                    html = $.html();
                  } // Cached value of every meta name in our document.
                  // Used when extracting title/author/date_published/dek

                  metaCache = $('meta')
                    .map(function(_, node) {
                      return $(node).attr('name');
                    })
                    .toArray();
                  result = RootExtractor.extract(Extractor, {
                    url: url,
                    html: html,
                    $: $,
                    metaCache: metaCache,
                    parsedUrl: parsedUrl,
                    fallback: fallback,
                  });
                  (_result = result),
                    (title = _result.title),
                    (next_page_url = _result.next_page_url); // Fetch more pages if next_page_url found

                  if (!(fetchAllPages && next_page_url)) {
                    _context.next = 21;
                    break;
                  }

                  _context.next = 18;
                  return collectAllPages({
                    Extractor: Extractor,
                    next_page_url: next_page_url,
                    html: html,
                    $: $,
                    metaCache: metaCache,
                    result: result,
                    title: title,
                    url: url,
                  });

                case 18:
                  result = _context.sent;
                  _context.next = 22;
                  break;

                case 21:
                  // Single-page result: annotate page counts.
                  // NOTE(review): collectAllPages returns `pages_rendered`
                  // while this path uses `rendered_pages` — inconsistent
                  // key names; verify which one consumers rely on.
                  result = _objectSpread({}, result, {
                    total_pages: 1,
                    rendered_pages: 1,
                  });

                case 22:
                  return _context.abrupt('return', result);

                case 23:
                case 'end':
                  return _context.stop();
              }
            }
          },
          _callee,
          this
        );
      })
    )();
  },
  browser: !!cheerio.browser,
  // A convenience method for getting a resource
  // to work with, e.g., for custom extractor generator
  fetchResource: function fetchResource(url) {
    return Resource.create(url);
  },
};
|
|
|
|
|
|
// Public entry point: CommonJS export of the Mercury parser API.
module.exports = Mercury;
|