'use strict' ;
// Unwrap a CommonJS/ES-module interop export: objects carrying a `default`
// key yield that default export; anything else is returned unchanged.
function _interopDefault(ex) {
  const isModuleLike = ex && typeof ex === 'object' && 'default' in ex;
  return isModuleLike ? ex.default : ex;
}
var _slicedToArray = _interopDefault ( require ( '@babel/runtime-corejs2/helpers/slicedToArray' ) ) ;
var _toConsumableArray = _interopDefault ( require ( '@babel/runtime-corejs2/helpers/toConsumableArray' ) ) ;
var fs = _interopDefault ( require ( 'fs' ) ) ;
var URL = _interopDefault ( require ( 'url' ) ) ;
var inquirer = _interopDefault ( require ( 'inquirer' ) ) ;
var ora = _interopDefault ( require ( 'ora' ) ) ;
var child _process = require ( 'child_process' ) ;
var _Reflect$ownKeys = _interopDefault ( require ( '@babel/runtime-corejs2/core-js/reflect/own-keys' ) ) ;
var _parseInt = _interopDefault ( require ( '@babel/runtime-corejs2/core-js/parse-int' ) ) ;
var defineProperty = _interopDefault ( require ( '@babel/runtime-corejs2/helpers/defineProperty' ) ) ;
var objectSpread = _interopDefault ( require ( '@babel/runtime-corejs2/helpers/objectSpread' ) ) ;
var _parseFloat = _interopDefault ( require ( '@babel/runtime-corejs2/core-js/parse-float' ) ) ;
var iconvLite = _interopDefault ( require ( 'iconv-lite' ) ) ;
var set = _interopDefault ( require ( '@babel/runtime-corejs2/core-js/set' ) ) ;
var _typeof = _interopDefault ( require ( '@babel/runtime-corejs2/helpers/typeof' ) ) ;
var _getIterator = _interopDefault ( require ( '@babel/runtime-corejs2/core-js/get-iterator' ) ) ;
var _Object$freeze = _interopDefault ( require ( '@babel/runtime-corejs2/core-js/object/freeze' ) ) ;
var regenerator = _interopDefault ( require ( '@babel/runtime-corejs2/regenerator' ) ) ;
var objectWithoutProperties = _interopDefault ( require ( '@babel/runtime-corejs2/helpers/objectWithoutProperties' ) ) ;
var asyncToGenerator = _interopDefault ( require ( '@babel/runtime-corejs2/helpers/asyncToGenerator' ) ) ;
var cheerio = _interopDefault ( require ( 'cheerio' ) ) ;
var turndown = _interopDefault ( require ( 'turndown' ) ) ;
var promise = _interopDefault ( require ( '@babel/runtime-corejs2/core-js/promise' ) ) ;
var postmanRequest = _interopDefault ( require ( 'postman-request' ) ) ;
var assign = _interopDefault ( require ( '@babel/runtime-corejs2/core-js/object/assign' ) ) ;
var keys = _interopDefault ( require ( '@babel/runtime-corejs2/core-js/object/keys' ) ) ;
var stringDirection = _interopDefault ( require ( 'string-direction' ) ) ;
var validUrl = _interopDefault ( require ( 'valid-url' ) ) ;
var momentTimezone = _interopDefault ( require ( 'moment-timezone' ) ) ;
var momentParseformat = _interopDefault ( require ( 'moment-parseformat' ) ) ;
var wuzzy = _interopDefault ( require ( 'wuzzy' ) ) ;
var difflib = _interopDefault ( require ( 'difflib' ) ) ;
var from = _interopDefault ( require ( '@babel/runtime-corejs2/core-js/array/from' ) ) ;
var ellipsize = _interopDefault ( require ( 'ellipsize' ) ) ;
var isArray = _interopDefault ( require ( '@babel/runtime-corejs2/core-js/array/is-array' ) ) ;
var _taggedTemplateLiteral = _interopDefault ( require ( '@babel/runtime-corejs2/helpers/taggedTemplateLiteral' ) ) ;
// The class we will use to mark elements we want to keep
// but would normally remove.
var KEEP_CLASS = 'mercury-parser-keep';
// A list of tags to strip from the output if we encounter them.
var STRIP_OUTPUT_TAGS = ['title', 'script', 'noscript', 'link', 'style', 'hr', 'embed', 'iframe', 'object']; // cleanAttributes
// Remove junk tags (scripts, styles, iframes, ...) from `article`, sparing
// any element explicitly marked with the mercury-parser-keep class.
//
// :param article: cheerio node to clean
// :param $: cheerio instance for DOM manipulation
// :param tags: optional tag list; defaults to STRIP_OUTPUT_TAGS
function stripJunkTags(article, $, tags = []) {
  const selectors = tags.length === 0 ? STRIP_OUTPUT_TAGS : tags;
  $(selectors.join(','), article)
    .not('.' + KEEP_CLASS)
    .remove();
  return $;
}
// // CONTENT FETCHING CONSTANTS ////
// return 1 for every comma in text
// Given a node type to search for, and a list of regular expressions,
// An expression that looks to try to find the page digit within a URL, if
// Given a string, return True if it appears to have an ending sentence
// Scoring
// Rewrite the given attribute (e.g. href/src) on every element carrying it
// so relative URLs become absolute, resolving against <base href> when
// present, otherwise against rootUrl.
function absolutize($, rootUrl, attr) {
  var baseUrl = $('base').attr('href');
  $('[' + attr + ']').each(function (_, node) {
    var url = getAttrs(node)[attr];
    if (!url) return;
    setAttr(node, attr, URL.resolve(baseUrl || rootUrl, url));
  });
}
// Make every URL inside srcset attributes absolute, preserving any size
// descriptors and deduplicating the resulting candidate list.
function absolutizeSet($, rootUrl, $content) {
  $('[srcset]', $content).each(function (_, node) {
    var urlSet = getAttrs(node).srcset;
    if (!urlSet) return;
    // a comma should be considered part of the candidate URL unless preceded
    // by a descriptor; descriptors can only contain positive numbers followed
    // immediately by either 'w' or 'x'; space characters inside the URL
    // should be encoded (%20 or +)
    var candidates = urlSet.match(/(?:\s*)(\S+(?:\s*[\d.]+[wx])?)(?:\s*,\s*)?/g);
    if (!candidates) return;
    var absoluteCandidates = candidates.map(function (candidate) {
      // a candidate URL cannot start or end with a comma; descriptors are
      // separated from the URLs by unescaped whitespace
      var parts = candidate.trim().replace(/,$/, '').split(/\s+/);
      parts[0] = URL.resolve(rootUrl, parts[0]);
      return parts.join(' ');
    });
    setAttr(node, 'srcset', _toConsumableArray(new set(absoluteCandidates)).join(', '));
  });
}
// Absolutize href, src, and srcset URLs throughout the document; returns
// $content for chaining (mutation happens in place).
function makeLinksAbsolute$$1($content, $, url) {
  absolutize($, url, 'href');
  absolutize($, url, 'src');
  absolutizeSet($, url, $content);
  return $content;
}
// strips all tags from a string of text
// Given a node, determine if it's article-like enough to return
// Return a plain { name: value } map of a node's attributes. cheerio nodes
// expose a plain `attribs` object, which is returned directly; browser DOM
// nodes expose an `attributes` NamedNodeMap, which is normalized here.
function getAttrs(node) {
  var attribs = node.attribs;
  var attributes = node.attributes;
  if (attribs || !attributes) {
    return attribs;
  }
  return _Reflect$ownKeys(attributes).reduce(function (acc, index) {
    var attr = attributes[index];
    // skip entries lacking a name or value (e.g. NamedNodeMap's length key)
    if (attr.name && attr.value) {
      acc[attr.name] = attr.value;
    }
    return acc;
  }, {});
}
// Set an attribute on a node, handling both cheerio nodes (plain `attribs`
// object) and browser DOM nodes (setAttribute). Returns the node.
function setAttr(node, attr, val) {
  if (node.attribs) {
    node.attribs[attr] = val;
    return node;
  }
  if (node.attributes) {
    node.setAttribute(attr, val);
  }
  return node;
}
// DOM manipulation
// Same interop unwrapping as _interopDefault above, but via the transpiled
// _typeof helper (Symbol-aware typeof).
function _interopDefault$1(ex) {
  if (ex && _typeof(ex) === 'object' && 'default' in ex) {
    return ex['default'];
  }
  return ex;
}
var _regeneratorRuntime = _interopDefault$1 ( regenerator ) ;
var _objectSpread = _interopDefault$1 ( objectSpread ) ;
var _objectWithoutProperties = _interopDefault$1 ( objectWithoutProperties ) ;
var _asyncToGenerator = _interopDefault$1 ( asyncToGenerator ) ;
var URL$1 = _interopDefault$1 ( URL ) ;
var cheerio$1 = _interopDefault$1 ( cheerio ) ;
var TurndownService = _interopDefault$1 ( turndown ) ;
var iconv = _interopDefault$1 ( iconvLite ) ;
var _parseInt$1 = _interopDefault$1 ( _parseInt ) ;
var _slicedToArray$1 = _interopDefault$1 ( _slicedToArray ) ;
var _Promise = _interopDefault$1 ( promise ) ;
var request = _interopDefault$1 ( postmanRequest ) ;
var _Reflect$ownKeys$1 = _interopDefault$1 ( _Reflect$ownKeys ) ;
var _toConsumableArray$1 = _interopDefault$1 ( _toConsumableArray ) ;
var _defineProperty = _interopDefault$1 ( defineProperty ) ;
var _parseFloat$1 = _interopDefault$1 ( _parseFloat ) ;
var _Set = _interopDefault$1 ( set ) ;
var _typeof$1 = _interopDefault$1 ( _typeof ) ;
var _getIterator$1 = _interopDefault$1 ( _getIterator ) ;
var _Object$assign = _interopDefault$1 ( assign ) ;
var _Object$keys = _interopDefault$1 ( keys ) ;
var stringDirection$1 = _interopDefault$1 ( stringDirection ) ;
var validUrl$1 = _interopDefault$1 ( validUrl ) ;
var moment = _interopDefault$1 ( momentTimezone ) ;
var parseFormat = _interopDefault$1 ( momentParseformat ) ;
var wuzzy$1 = _interopDefault$1 ( wuzzy ) ;
var difflib$1 = _interopDefault$1 ( difflib ) ;
var _Array$from = _interopDefault$1 ( from ) ;
var ellipsize$1 = _interopDefault$1 ( ellipsize ) ;
var _Array$isArray = _interopDefault$1 ( isArray ) ;
// Runs of 2+ whitespace characters, except inside <pre>, <code>, or
// <textarea> blocks, where whitespace is significant.
var NORMALIZE_RE$1 = /\s{2,}(?![^<>]*<\/(pre|code|textarea)>)/g;

// Collapse repeated whitespace into single spaces and trim the result.
function normalizeSpaces$1(text) {
  var collapsed = text.replace(NORMALIZE_RE$1, ' ');
  return collapsed.trim();
} // Given a node type to search for, and a list of regular expressions,
// look to see if this extraction can be found in the URL. Expects
// that each expression in r_list will return group(1) as the proper
// string to be cleaned.
// Only used for date_published currently.
// Given a url and a list of regular expressions, return the first capture
// group of the first expression that matches, or null when none match.
// Expects each expression to expose the desired value as group(1).
// Only used for date_published currently.
function extractFromUrl$1(url, regexList) {
  var matchRe = regexList.find(function (re) {
    return re.test(url);
  });
  return matchRe ? matchRe.exec(url)[1] : null;
} // An expression that looks to try to find the page digit within a URL, if
// it exists.
// Matches:
// page=1
// pg=1
// p=1
// paging=12
// pag=7
// pagination/1
// paging/88
// pa/83
// p/11
//
// Does not match:
// pg=102
// page:2
// Group 6 captures the 1-3 digit page number (see examples above).
var PAGE_IN_HREF_RE$1 = new RegExp('(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|/)([0-9]{1,3})', 'i');
// At least one ASCII letter anywhere in the string.
var HAS_ALPHA_RE$1 = /[a-z]/i;
// Entirely ASCII letters.
var IS_ALPHA_RE$1 = /^[a-z]+$/i;
// Entirely digits.
var IS_DIGIT_RE$1 = /^[0-9]+$/i;
// Extracts the charset name from a Content-Type style header value.
var ENCODING_RE$1 = /charset=([\w-]+)\b/;
// Fallback when no (recognized) charset is declared.
var DEFAULT_ENCODING$1 = 'utf-8';
// Pull a probable page number (capture group 6 of PAGE_IN_HREF_RE$1) out
// of a URL. Values of 100 or more are assumed not to be page numbers;
// returns null on no match or an over-large number.
function pageNumFromUrl$1(url) {
  var matches = url.match(PAGE_IN_HREF_RE$1);
  if (!matches) return null;
  var pageNum = _parseInt$1(matches[6], 10);
  return pageNum < 100 ? pageNum : null;
}
// Strip a #fragment and any single trailing slash from a URL.
function removeAnchor$1(url) {
  var withoutFragment = url.split('#')[0];
  return withoutFragment.replace(/\/$/, '');
}
// Decide whether a URL path segment should be kept when deriving the
// article base URL. `index` counts from the END of the path (0 is the
// last segment).
//
// NOTE(review): the original code had an additional branch here intended
// to drop short, purely-numeric first/second segments ("it's probably a
// page number. Remove it.") — but it assigned `true`, making it a no-op
// that contradicted its own comment. The dead branch is removed; runtime
// behavior is unchanged. Short numeric segments are still dropped by the
// final check whenever the last segment contains no letters.
//
// :param segment: a single path segment (no slashes)
// :param index: the segment's position, counted from the end of the path
// :param firstSegmentHasLetters: whether the last path segment had alphas
// :return boolean: true if the segment should be kept
function isGoodSegment$1(segment, index, firstSegmentHasLetters) {
  // If this is the first url_segment and it's just "index", remove it
  if (index === 0 && segment.toLowerCase() === 'index') {
    return false;
  }

  // If our first or second url_segment is smaller than 3 characters,
  // and the first url_segment had no alphas, remove it.
  if (index < 2 && segment.length < 3 && !firstSegmentHasLetters) {
    return false;
  }

  return true;
}
// pagination data exists in it. Useful for comparing to other links
// that might have pagination data within them.
// Take a URL, and return the article base of said URL. That is, no
// pagination data exists in it. Useful for comparing to other links
// that might have pagination data within them.
function articleBaseUrl$1(url, parsed) {
  var parsedUrl = parsed || URL$1.parse(url);
  var protocol = parsedUrl.protocol;
  var host = parsedUrl.host;
  var path = parsedUrl.path;

  var firstSegmentHasLetters = false;
  var cleanedSegments = [];

  path.split('/').reverse().forEach(function (rawSegment, index) {
    var segment = rawSegment;

    // Split off and save anything that looks like a file type.
    if (segment.includes('.')) {
      var pieces = segment.split('.');
      if (IS_ALPHA_RE$1.test(pieces[1])) {
        segment = pieces[0];
      }
    }

    // If our first or second segment has anything looking like a page
    // number, remove it.
    if (index < 2 && PAGE_IN_HREF_RE$1.test(segment)) {
      segment = segment.replace(PAGE_IN_HREF_RE$1, '');
    }

    // The first segment here is actually the LAST bit of the URL; whether
    // it contains letters tells us if we're on something like "/2/".
    if (index === 0) {
      firstSegmentHasLetters = HAS_ALPHA_RE$1.test(segment);
    }

    // If it's not marked for deletion, keep it.
    if (isGoodSegment$1(segment, index, firstSegmentHasLetters)) {
      cleanedSegments.push(segment);
    }
  });

  return protocol + '//' + host + cleanedSegments.reverse().join('/');
} // Given a string, return True if it appears to have an ending sentence
// within it, false otherwise.
var SENTENCE _END _RE$1 = new RegExp ( '.( |$)' ) ;
function hasSentenceEnd$1 ( text ) {
return SENTENCE _END _RE$1 . test ( text ) ;
}
// Return the first `words` whitespace-separated tokens of `content`,
// joined by single spaces. Defaults to 10 words.
function excerptContent$1(content, words = 10) {
  var tokens = content.trim().split(/\s+/);
  return tokens.slice(0, words).join(' ');
} // used in our fetchResource function to
// ensure correctly encoded responses
// Derive a supported character encoding from a Content-Type-style string
// (e.g. "text/html; charset=iso-8859-1"). Falls back to utf-8 when the
// candidate isn't recognized by iconv-lite.
function getEncoding$1(str) {
  var candidate = str;
  var matches = ENCODING_RE$1.exec(candidate);
  if (matches !== null) {
    candidate = matches[1];
  }
  return iconv.encodingExists(candidate) ? candidate : DEFAULT_ENCODING$1;
}
// Browser contexts send no custom headers; in Node we spoof a desktop
// Chrome User-Agent.
var REQUEST_HEADERS = cheerio$1.browser ? {} : {
  'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
};
// The number of milliseconds to attempt to fetch a resource before timing out.
var FETCH_TIMEOUT = 10000;
// Content types that we do not extract content from.
var BAD_CONTENT_TYPES = ['audio/mpeg', 'image/gif', 'image/jpeg', 'image/jpg'];
var BAD_CONTENT_TYPES_RE = new RegExp("^(".concat(BAD_CONTENT_TYPES.join('|'), ")$"), 'i');
// Use this setting as the maximum size an article can be
// for us to attempt parsing. Defaults to 5 MB.
var MAX_CONTENT_LENGTH = 5242880; // Turn the global proxy on or off
// Promisified wrapper around postman-request: resolves with
// { body, response }, rejects with the transport error.
function get(options) {
  return new _Promise(function (resolve, reject) {
    request(options, function (err, response, body) {
      if (err) {
        reject(err);
        return;
      }
      resolve({ body: body, response: response });
    });
  });
} // Evaluate a response to ensure it's something we should be keeping.
// This does not validate in the sense of a response being 200 or not.
// Validation here means that we haven't found reason to bail from
// further processing of this url.
// Evaluate a response to ensure it's something we should be keeping.
// This does not validate in the sense of a response being 200 or not.
// Validation here means that we haven't found reason to bail from
// further processing of this url.
//
// Throws on bad status codes (unless parseNon200 is set), blacklisted
// content types, and over-sized payloads; returns true otherwise.
function validateResponse(response, parseNon200 = false) {
  // statusMessage is only unset in nock-driven tests, in which case the
  // statusCode check (only 200 for OK responses in tests) stands in for it.
  var badStatus =
    (response.statusMessage && response.statusMessage !== 'OK') ||
    response.statusCode !== 200;
  if (badStatus) {
    if (!response.statusCode) {
      throw new Error("Unable to fetch content. Original exception was ".concat(response.error));
    }
    if (!parseNon200) {
      throw new Error("Resource returned a response status code of ".concat(response.statusCode, " and resource was instructed to reject non-200 status codes."));
    }
  }

  var contentType = response.headers['content-type'];
  var contentLength = response.headers['content-length'];

  // Check that the content is not in BAD_CONTENT_TYPES
  if (BAD_CONTENT_TYPES_RE.test(contentType)) {
    throw new Error("Content-type for this resource was ".concat(contentType, " and is not allowed."));
  }

  // Check that the content length is below maximum
  if (contentLength > MAX_CONTENT_LENGTH) {
    throw new Error("Content for this resource was too large. Maximum content length is ".concat(MAX_CONTENT_LENGTH, "."));
  }

  return true;
} // Grabs the last two pieces of the URL and joins them back together
// TODO: This should gracefully handle timeouts and raise the
// proper exceptions on the many failure cases of HTTP.
// TODO: Ensure we are not fetching something enormous. Always return
// unicode content for HTML, with charset conversion.
function fetchResource ( _x , _x2 ) {
return _fetchResource . apply ( this , arguments ) ;
}
// Generated (babel async-to-generator + regenerator) body of fetchResource.
// State map: case 0 reads the optional headers argument and builds request
// options; case 5 receives the awaited get() result; cases 8..13 form the
// try/catch around validateResponse — success returns { body, response },
// failure returns a soft { error, message } object instead of throwing.
function _fetchResource() {
  _fetchResource = _asyncToGenerator(
  /*#__PURE__*/
  _regeneratorRuntime.mark(function _callee(url, parsedUrl) {
    var headers, options, _ref2, response, body, _args = arguments;
    return _regeneratorRuntime.wrap(function _callee$(_context) {
      while (1) {
        switch (_context.prev = _context.next) {
          case 0:
            // optional third argument: extra request headers
            headers = _args.length > 2 && _args[2] !== undefined ? _args[2] : {};
            parsedUrl = parsedUrl || URL$1.parse(encodeURI(url));
            options = _objectSpread({
              url: parsedUrl.href,
              headers: _objectSpread({}, REQUEST_HEADERS, headers),
              timeout: FETCH_TIMEOUT,
              // Accept cookies
              jar: true,
              // Set to null so the response returns as binary and body as buffer
              // https://github.com/request/request#requestoptions-callback
              encoding: null,
              // Accept and decode gzip
              gzip: true,
              // Follow any non-GET redirects
              followAllRedirects: true
            }, typeof window !== 'undefined' ? {} : {
              // Follow GET redirects; this option is for Node only
              followRedirect: true
            });
            _context.next = 5;
            return get(options);
          case 5:
            _ref2 = _context.sent;
            response = _ref2.response;
            body = _ref2.body;
            _context.prev = 8; // begin try block
            validateResponse(response);
            return _context.abrupt("return", {
              body: body,
              response: response
            });
          case 13:
            // catch block: validation failed — return a soft error object
            _context.prev = 13;
            _context.t0 = _context["catch"](8);
            return _context.abrupt("return", {
              error: true,
              message: _context.t0.message
            });
          case 16:
          case "end":
            return _context.stop();
        }
      }
    }, _callee, this, [[8, 13]]);
  }));
  return _fetchResource.apply(this, arguments);
}
// Copy the value of attribute `from$$1` into attribute `to` on every meta
// tag carrying it, then drop the original attribute.
function convertMetaProp($, from$$1, to) {
  $('meta[' + from$$1 + ']').each(function (_, node) {
    var $node = $(node);
    $node.attr(to, $node.attr(from$$1));
    $node.removeAttr(from$$1);
  });
  return $;
} // For ease of use in extracting from meta tags,
// replace the "content" attribute on meta tags with the
// "value" attribute.
//
// In addition, normalize 'property' attributes to 'name' for ease of
// querying later. See, e.g., og or twitter meta tags.
// Normalize meta tags: "content" attributes become "value", and
// "property" attributes become "name" (eases querying og/twitter tags).
function normalizeMetaTags($) {
  var $withValues = convertMetaProp($, 'content', 'value');
  return convertMetaProp($withValues, 'property', 'name');
} // Spacer images to be removed
// Spacer images to be removed (transparent shims, spacer gifs, etc.).
var SPACER_RE$1 = new RegExp('transparent|spacer|blank', 'i');
// The class we will use to mark elements we want to keep
// but would normally remove.
var KEEP_CLASS$1 = 'mercury-parser-keep';
// Embeds (video players) that should survive junk-tag stripping.
var KEEP_SELECTORS$1 = ['iframe[src^="https://www.youtube.com"]', 'iframe[src^="https://www.youtube-nocookie.com"]', 'iframe[src^="http://www.youtube.com"]', 'iframe[src^="https://player.vimeo"]', 'iframe[src^="http://player.vimeo"]', 'iframe[src^="https://www.redditmedia.com"]'];
// A list of tags to strip from the output if we encounter them.
var STRIP_OUTPUT_TAGS$1 = ['title', 'script', 'noscript', 'link', 'style', 'hr', 'embed', 'iframe', 'object'];
// cleanAttributes: the only attributes allowed to survive on output elements.
var WHITELIST_ATTRS$1 = ['src', 'srcset', 'sizes', 'type', 'href', 'class', 'id', 'alt', 'xlink:href', 'width', 'height'];
var WHITELIST_ATTRS_RE$1 = new RegExp("^(".concat(WHITELIST_ATTRS$1.join('|'), ")$"), 'i');
// removeEmpty / cleanConditionally candidate container tags.
var CLEAN_CONDITIONALLY_TAGS$1 = ['ul', 'ol', 'table', 'div', 'button', 'form'].join(',');
// cleanHeaders
var HEADER_TAGS$1 = ['h2', 'h3', 'h4', 'h5', 'h6'];
var HEADER_TAG_LIST$1 = HEADER_TAGS$1.join(','); // // CONTENT FETCHING CONSTANTS ////
// A list of strings that can be considered unlikely candidates when
// extracting content from a resource. These strings are joined together
// and then tested for existence using re:test, so may contain simple,
// non-pipe style regular expression queries if necessary.
var UNLIKELY_CANDIDATES_BLACKLIST$2 = ['ad-break', 'adbox', 'advert', 'addthis', 'agegate', 'aux', 'blogger-labels', 'combx', 'comment', 'conversation', 'disqus', 'entry-unrelated', 'extra', 'foot', // 'form', // This is too generic, has too many false positives
'header', 'hidden', 'loader', 'login', // Note: This can hit 'blogindex'.
'menu', 'meta', 'nav', 'outbrain', 'pager', 'pagination', 'predicta', // readwriteweb inline ad box
'presence_control_external', // lifehacker.com container full of false positives
'popup', 'printfriendly', 'related', 'remove', 'remark', 'rss', 'share', 'shoutbox', 'sidebar', 'sociable', 'sponsor', 'taboola', 'tools'];
// A list of strings that can be considered LIKELY candidates when
// extracting content from a resource. Essentially, the inverse of the
// blacklist above - if something matches both blacklist and whitelist,
// it is kept. This is useful, for example, if something has a className
// of "rss-content entry-content". It matched 'rss', so it would normally
// be removed, however, it's also the entry content, so it should be left
// alone.
//
// These strings are joined together and then tested for existence using
// re:test, so may contain simple, non-pipe style regular expression queries
// if necessary.
var UNLIKELY_CANDIDATES_WHITELIST$2 = ['and', 'article', 'body', 'blogindex', 'column', 'content', 'entry-content-asset', 'format', // misuse of form
'hfeed', 'hentry', 'hatom', 'main', 'page', 'posts', 'shadow'];
// A list of tags which, if found inside, should cause a <div /> to NOT
// be turned into a paragraph tag. Shallow div tags without these elements
// should be turned into <p /> tags.
var DIV_TO_P_BLOCK_TAGS$2 = ['a', 'blockquote', 'dl', 'div', 'img', 'p', 'pre', 'table'].join(',');
// Hints that a node's className/id marks it as a likely top candidate for
// an article container.
//
// TODO: Perhaps have these scale based on their odds of being quality?
var POSITIVE_SCORE_HINTS$2 = ['article', 'articlecontent', 'instapaper_body', 'blog', 'body', 'content', 'entry-content-asset', 'entry', 'hentry', 'main', 'Normal', 'page', 'pagination', 'permalink', 'post', 'story', 'text', '[-_]copy', // usatoday
'\\Bcopy'];
// The above list, joined into a matching regular expression.
var POSITIVE_SCORE_RE$2 = new RegExp(POSITIVE_SCORE_HINTS$2.join('|'), 'i');
// Hints that a node's className/id marks it as an unlikely article
// container (Readability publisher-specific guidelines).
//
// TODO: Perhaps have these scale based on their odds of being quality?
var NEGATIVE_SCORE_HINTS$2 = ['adbox', 'advert', 'author', 'bio', 'bookmark', 'bottom', 'byline', 'clear', 'com-', 'combx', 'comment', 'comment\\B', 'contact', 'copy', 'credit', 'crumb', 'date', 'deck', 'excerpt', 'featured', // tnr.com has a featured_content which throws us off
'foot', 'footer', 'footnote', 'graf', 'head', 'info', 'infotext', // newscientist.com copyright
'instapaper_ignore', 'jump', 'linebreak', 'link', 'masthead', 'media', 'meta', 'modal', 'outbrain', // slate.com junk
'promo', 'pr_', // autoblog - press release
'related', 'respond', 'roundcontent', // lifehacker restricted content warning
'scroll', 'secondary', 'share', 'shopping', 'shoutbox', 'side', 'sidebar', 'sponsor', 'stamp', 'sub', 'summary', 'tags', 'tools', 'widget'];
// The above list, joined into a matching regular expression.
var NEGATIVE_SCORE_RE$2 = new RegExp(NEGATIVE_SCORE_HINTS$2.join('|'), 'i');
// Selector to try to determine if a page is wordpress. Not always successful.
var IS_WP_SELECTOR$1 = 'meta[name=generator][value^=WordPress]';
// Matches pagination-ish words: page, paging, paginate(d), ...
var PAGE_RE$1 = new RegExp('pag(e|ing|inat)', 'i');
// Block-level tags; see http://bit.ly/qneNIT
var BLOCK_LEVEL_TAGS$2 = ['article', 'aside', 'blockquote', 'body', 'br', 'button', 'canvas', 'caption', 'col', 'colgroup', 'dd', 'div', 'dl', 'dt', 'embed', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li', 'map', 'object', 'ol', 'output', 'p', 'pre', 'progress', 'section', 'table', 'tbody', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'ul', 'video'];
var BLOCK_LEVEL_TAGS_RE$2 = new RegExp("^(".concat(BLOCK_LEVEL_TAGS$2.join('|'), ")$"), 'i');
// The removal is implemented as a blacklist and whitelist, this test finds
// blacklisted elements that aren't whitelisted. We do this all in one
// expression-both because it's only one pass, and because this skips the
// serialization for whitelisted nodes.
var candidatesBlacklist$2 = UNLIKELY_CANDIDATES_BLACKLIST$2.join('|');
var CANDIDATES_BLACKLIST$2 = new RegExp(candidatesBlacklist$2, 'i');
var candidatesWhitelist$2 = UNLIKELY_CANDIDATES_WHITELIST$2.join('|');
var CANDIDATES_WHITELIST$2 = new RegExp(candidatesWhitelist$2, 'i');
// Loop through the provided document and remove any non-link nodes
// that are unlikely candidates for article content.
//
// Links are ignored because there are very often links to content
// that are identified as non-body-content, but may be inside
// article-like content.
//
// :param $: a cheerio object to strip nodes from
// :return $: the cleaned cheerio object
function stripUnlikelyCandidates$1($) {
  $('*').not('a').each(function (index, node) {
    var $node = $(node);
    var classes = $node.attr('class');
    var id = $node.attr('id');
    if (!id && !classes) return;
    var classAndId = (classes || '') + ' ' + (id || '');
    // Whitelisted nodes are always kept, even when also blacklisted.
    var keep = CANDIDATES_WHITELIST$2.test(classAndId);
    if (!keep && CANDIDATES_BLACKLIST$2.test(classAndId)) {
      $node.remove();
    }
  });
  return $;
} // Another good candidate for refactoring/optimizing.
// Very imperative code, I don't love it. - AP
// Given cheerio object, convert consecutive <br /> tags into
// <p /> tags instead.
//
// :param $: A cheerio object
// Given cheerio object, convert consecutive <br /> tags into
// <p /> tags instead: a run of brs collapses into one paragraph break.
//
// :param $: A cheerio object
function brsToPs$$1($) {
  var collapsing = false;
  $('br').each(function (index, element) {
    var $element = $(element);
    var next = $element.next().get(0);
    if (next && next.tagName.toLowerCase() === 'br') {
      // Middle of a run: drop this br and keep collapsing.
      collapsing = true;
      $element.remove();
    } else if (collapsing) {
      // Last br of a run: turn it into a paragraph.
      collapsing = false;
      paragraphize$1(element, $, true);
    }
  });
  return $;
} // make sure it conforms to the constraints of a P tag (I.E. does
// not contain any other block tags.)
//
// If the node is a <br />, it treats the following inline siblings
// as if they were its children.
//
// :param node: The node to paragraphize; this is a raw node
// :param $: The cheerio object to handle dom manipulation
// :param br: Whether or not the passed node is a br
// Given a node, make sure it conforms to the constraints of a P tag
// (i.e. does not contain any other block tags).
//
// If the node is a <br />, it treats the following inline siblings
// as if they were its children, moving them into a fresh <p>.
//
// :param node: The node to paragraphize; this is a raw node
// :param $: The cheerio object to handle dom manipulation
// :param br: Whether or not the passed node is a br
function paragraphize$1(node, $, br = false) {
  if (!br) {
    return $;
  }
  var $node = $(node);
  var p = $('<p></p>');
  var sibling = node.nextSibling;
  // Pull each following text node / inline element into the new p,
  // stopping at the first block-level element.
  while (sibling && !(sibling.tagName && BLOCK_LEVEL_TAGS_RE$2.test(sibling.tagName))) {
    var next = sibling.nextSibling;
    $(sibling).appendTo(p);
    sibling = next;
  }
  $node.replaceWith(p);
  $node.remove();
  return $;
}
// Convert <div>s containing no block-level children into <p>s.
function convertDivs$1($) {
  $('div').each(function (index, div) {
    var $div = $(div);
    var hasBlockChildren = $div.children(DIV_TO_P_BLOCK_TAGS$2).length > 0;
    if (!hasBlockChildren) {
      convertNodeTo$$1($div, $, 'p');
    }
  });
  return $;
}
// Convert <span>s that are not nested inside p/div/li/figcaption into <p>s.
function convertSpans$2($) {
  $('span').each(function (index, span) {
    var $span = $(span);
    var insideTextContainer = $span.parents('p, div, li, figcaption').length > 0;
    if (!insideTextContainer) {
      convertNodeTo$$1($span, $, 'p');
    }
  });
  return $;
} // Loop through the provided doc, and convert any p-like elements to
// actual paragraph tags.
//
// Things fitting this criteria:
// * Multiple consecutive <br /> tags.
// * <div /> tags without block level elements inside of them
// * <span /> tags who are not children of <p /> or <div /> tags.
//
// :param $: A cheerio object to search
// :return cheerio object with new p elements
// (By-reference mutation, though. Returned just for convenience.)
// Run the three p-conversion passes (brs, divs, spans) in order.
function convertToParagraphs$$1($) {
  return convertSpans$2(convertDivs$1(brsToPs$$1($)));
}
// Replace $node with a `tag` element (default <p>) carrying the same
// attributes and contents.
function convertNodeTo$$1($node, $, tag = 'p') {
  var node = $node.get(0);
  if (!node) {
    return $;
  }
  var attrs = getAttrs$1(node) || {};
  var attribString = _Reflect$ownKeys$1(attrs).map(function (key) {
    return key + '=' + attrs[key];
  }).join(' ');
  var html;
  if ($.browser) {
    // In the browser, the contents of noscript tags aren't rendered, therefore
    // transforms on the noscript tag (commonly used for lazy-loading) don't work
    // as expected. This test case handles that
    html = node.tagName.toLowerCase() === 'noscript' ? $node.text() : $node.html();
  } else {
    html = $node.contents();
  }
  $node.replaceWith('<' + tag + ' ' + attribString + '>' + html + '</' + tag + '>');
  return $;
}
// Remove images that explicitly have very small heights or widths, because
// they are most likely shims or icons, which aren't very useful for
// reading. Otherwise drop any explicit height so images scale with width
// without breaking aspect ratio.
function cleanForHeight$1($img, $) {
  var height = _parseInt$1($img.attr('height'), 10);
  var width = _parseInt$1($img.attr('width'), 10) || 20;
  var effectiveHeight = height || 20;
  if (effectiveHeight < 10 || width < 10) {
    $img.remove();
  } else if (height) {
    // Never keep an explicit height on images we retain.
    $img.removeAttr('height');
  }
  return $;
} // Cleans out images where the source string matches transparent/spacer/etc
// TODO This seems very aggressive - AP
// TODO This seems very aggressive - AP
function removeSpacers$1($img, $) {
  var src = $img.attr('src');
  if (SPACER_RE$1.test(src)) {
    $img.remove();
  }
  return $;
}
// Run the per-image cleanups (tiny shims, spacer sources) over an article.
function cleanImages$1($article, $) {
  $article.find('img').each(function (index, img) {
    var $img = $(img);
    cleanForHeight$1($img, $);
    removeSpacers$1($img, $);
  });
  return $;
}
// Tag elements matching the given selectors (default KEEP_SELECTORS$1,
// plus same-origin iframes when `url` is given) with the keep class so
// later stripping passes leave them alone.
function markToKeep$1(article, $, url, tags = []) {
  var selectors = (tags.length === 0 ? KEEP_SELECTORS$1 : tags).slice();
  if (url) {
    var parsed = URL$1.parse(url);
    selectors.push('iframe[src^="' + parsed.protocol + '//' + parsed.hostname + '"]');
  }
  $(selectors.join(','), article).addClass(KEEP_CLASS$1);
  return $;
}
// Remove junk output tags from `article`, sparing any element marked with
// the mercury-parser-keep class.
function stripJunkTags$1(article, $, tags = []) {
  const selectors = tags.length === 0 ? STRIP_OUTPUT_TAGS$1 : tags;
  $(selectors.join(','), article)
    .not('.' + KEEP_CLASS$1)
    .remove();
  return $;
} // by the title extractor instead. If there's less than 3 of them (<3),
// strip them. Otherwise, turn 'em into H2s.
// H1s are typically the title, handled by the title extractor: fewer than
// three h1s are stripped outright, otherwise all are demoted to h2.
function cleanHOnes$$1(article, $) {
  var $hOnes = $('h1', article);
  var demote = $hOnes.length >= 3;
  $hOnes.each(function (index, node) {
    if (demote) {
      convertNodeTo$$1($(node), $, 'h2');
    } else {
      $(node).remove();
    }
  });
  return $;
}
// Strip every attribute not matching the whitelist regex from every node,
// then drop the internal keep class from the final markup.
function removeAllButWhitelist$1($article, $) {
  $article.find('*').each(function (index, node) {
    var attrs = getAttrs$1(node);
    var whitelisted = {};
    _Reflect$ownKeys$1(attrs).forEach(function (attr) {
      if (WHITELIST_ATTRS_RE$1.test(attr)) {
        whitelisted[attr] = attrs[attr];
      }
    });
    setAttrs$1(node, whitelisted);
  });
  // Remove the mercury-parser-keep class from result
  $(".".concat(KEEP_CLASS$1), $article).removeClass(KEEP_CLASS$1);
  return $article;
} // Remove attributes like style or align
// Whitelist-filter attributes on the article. Operates on the parent when
// one exists, because $article is usually wrapped in a scoring div by now.
function cleanAttributes$$1($article, $) {
  var $target = $article.parent().length ? $article.parent() : $article;
  return removeAllButWhitelist$1($target, $);
}
// Remove paragraphs that contain no text and no embedded media.
function removeEmpty$1($article, $) {
  $article.find('p').each(function (index, p) {
    var $p = $(p);
    var hasMedia = $p.find('iframe, img').length > 0;
    if (!hasMedia && $p.text().trim() === '') {
      $p.remove();
    }
  });
  return $;
} // // CONTENT FETCHING CONSTANTS ////
// for a document.
// Tags that can never serve as the top content candidate.
var NON_TOP_CANDIDATE_TAGS$1$1 = ['br', 'b', 'i', 'label', 'hr', 'area', 'base', 'basefont', 'input', 'img', 'link', 'meta'];
var NON_TOP_CANDIDATE_TAGS_RE$1$1 = new RegExp("^(".concat(NON_TOP_CANDIDATE_TAGS$1$1.join('|'), ")$"), 'i'); // A list of [parent, child] selector pairs that specify, very clearly, either hNews or other
// very content-specific style content, like Blogger templates.
// More examples here: http://microformats.org/wiki/blog-post-formats
var HNEWS_CONTENT_SELECTORS$1$1 = [['.hentry', '.entry-content'], ['entry', '.entry-content'], ['.entry', '.entry_content'], ['.post', '.postbody'], ['.post', '.post_body'], ['.post', '.post-body']];
// Class/id fragments hinting a node wraps a photo or figure.
var PHOTO_HINTS$1$1 = ['figure', 'photo', 'image', 'caption'];
var PHOTO_HINTS_RE$1$1 = new RegExp(PHOTO_HINTS$1$1.join('|'), 'i'); // A list of strings that denote a positive scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
var POSITIVE_SCORE_HINTS$1$1 = ['article', 'articlecontent', 'instapaper_body', 'blog', 'body', 'content', 'entry-content-asset', 'entry', 'hentry', 'main', 'Normal', 'page', 'pagination', 'permalink', 'post', 'story', 'text', '[-_]copy', // usatoday
'\\Bcopy']; // The above list, joined into a matching regular expression
var POSITIVE_SCORE_RE$1$1 = new RegExp(POSITIVE_SCORE_HINTS$1$1.join('|'), 'i'); // Readability publisher-specific guidelines
var READABILITY_ASSET$1$1 = new RegExp('entry-content-asset', 'i'); // A list of strings that denote a negative scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
var NEGATIVE_SCORE_HINTS$1$1 = ['adbox', 'advert', 'author', 'bio', 'bookmark', 'bottom', 'byline', 'clear', 'com-', 'combx', 'comment', 'comment\\B', 'contact', 'copy', 'credit', 'crumb', 'date', 'deck', 'excerpt', 'featured', // tnr.com has a featured_content which throws us off
'foot', 'footer', 'footnote', 'graf', 'head', 'info', 'infotext', // newscientist.com copyright
'instapaper_ignore', 'jump', 'linebreak', 'link', 'masthead', 'media', 'meta', 'modal', 'outbrain', // slate.com junk
'promo', 'pr_', // autoblog - press release
'related', 'respond', 'roundcontent', // lifehacker restricted content warning
'scroll', 'secondary', 'share', 'shopping', 'shoutbox', 'side', 'sidebar', 'sponsor', 'stamp', 'sub', 'summary', 'tags', 'tools', 'widget']; // The above list, joined into a matching regular expression
var NEGATIVE_SCORE_RE$1$1 = new RegExp(NEGATIVE_SCORE_HINTS$1$1.join('|'), 'i');
// Tags whose text content is scored like a paragraph.
var PARAGRAPH_SCORE_TAGS$1$1 = new RegExp('^(p|li|span|pre)$', 'i');
// Container tags that get a small fixed content score.
var CHILD_CONTENT_TAGS$1$1 = new RegExp('^(td|blockquote|ol|ul|dl)$', 'i');
// Tags that get a negative fixed score.
var BAD_TAGS$1$1 = new RegExp('^(address|form)$', 'i');
// Score a node's class/id against the positive and negative hint regexes.
// Photo hints and the Readability asset class add bonuses regardless.
function getWeight$1(node) {
  var classes = node.attr('class');
  var id = node.attr('id');
  var score = 0;
  if (id) {
    // if id exists, try to score on both positive and negative
    if (POSITIVE_SCORE_RE$1$1.test(id)) score += 25;
    if (NEGATIVE_SCORE_RE$1$1.test(id)) score -= 25;
  }
  if (classes) {
    // Only consult classes when the id contributed nothing.
    if (score === 0) {
      if (POSITIVE_SCORE_RE$1$1.test(classes)) score += 25;
      if (NEGATIVE_SCORE_RE$1$1.test(classes)) score -= 25;
    }
    // Photo hints always add a small bonus — "try to keep photos if we can".
    if (PHOTO_HINTS_RE$1$1.test(classes)) score += 10;
    // entry-content-asset is called out in the Readability publisher
    // guidelines (https://www.readability.com/developers/guidelines).
    if (READABILITY_ASSET$1$1.test(classes)) score += 25;
  }
  return score;
} // returns the score of a node based on
// Read the numeric `score` attribute off a node; null when unset/invalid.
function getScore$1($node) {
  var parsed = _parseFloat$1($node.attr('score'));
  return parsed || null;
} // return 1 for every comma in text
// Count the commas in a string (one scoring point each).
function scoreCommas$1(text) {
  var commas = text.match(/,/g);
  return commas === null ? 0 : commas.length;
}
var idkRe$1 = new RegExp('^(p|pre)$', 'i');
// Length bonus: one point per 50 characters, capped at 3.
// p/pre are tamped down harder than other tags.
function scoreLength$1(textLength) {
  var tagName = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 'p';
  var chunks = textLength / 50;
  if (!(chunks > 0)) {
    return 0;
  }
  // No idea why p or pre are being tamped down here
  // but just following the source for now
  var lengthBonus = idkRe$1.test(tagName) ? chunks - 2 : chunks - 1.25;
  return Math.min(Math.max(lengthBonus, 0), 3);
} // commas, etc. Higher is better.
// Score a paragraph-like node by its text: commas and length raise the
// score; very short text scores 0; a trailing colon is penalized.
function scoreParagraph$$1(node) {
  var text = node.text().trim();
  var textLength = text.length;
  // Paragraphs under 25 characters don't count at all.
  if (textLength < 25) {
    return 0;
  }
  var score = 1;
  score += scoreCommas$1(text);
  score += scoreLength$1(textLength);
  // Short setup paragraphs ending in ':' often introduce junk lists;
  // nudge them just below the cutoff threshold.
  if (text.slice(-1) === ':') {
    score -= 1;
  }
  return score;
}
// Persist a score onto the node as its `score` attribute.
function setScore$1($node, $, score) {
  $node.attr('score', score);
  return $node;
}
// Add `amount` to the node's score, initializing it first if needed.
function addScore$$1($node, $, amount) {
  try {
    var updated = getOrInitScore$$1($node, $) + amount;
    setScore$1($node, $, updated);
  } catch (e) {
    // Deliberately ignored; errors originate in scoreNode and are non-fatal.
  }
  return $node;
}
// Propagate a quarter of a node's score up to its parent.
function addToParent$$1(node, $, score) {
  var parent = node.parent();
  if (parent) {
    addScore$$1(parent, $, score * 0.25);
  }
  return node;
} // if not, initializes a score based on
// the node's tag type
// Return the node's existing score, or compute one from its tag (plus
// class/id weight when weightNodes is true), bubbling part to the parent.
function getOrInitScore$$1($node, $) {
  var weightNodes = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : true;
  var existing = getScore$1($node);
  if (existing) {
    return existing;
  }
  var score = scoreNode$$1($node);
  if (weightNodes) {
    score += getWeight$1($node);
  }
  addToParent$$1($node, $, score);
  return score;
} // just scores based on tag.
// Base score for a node determined purely by its tag name.
function scoreNode$$1($node) {
  var tagName = $node.get(0).tagName;
  // TODO: Consider ordering by most likely.
  // E.g., if divs are a more common tag on a page,
  // Could save doing that regex test on every node – AP
  if (PARAGRAPH_SCORE_TAGS$1$1.test(tagName)) {
    return scoreParagraph$$1($node);
  }
  var lowered = tagName.toLowerCase();
  if (lowered === 'div') {
    return 5;
  }
  if (CHILD_CONTENT_TAGS$1$1.test(tagName)) {
    return 3;
  }
  if (BAD_TAGS$1$1.test(tagName)) {
    return -3;
  }
  if (lowered === 'th') {
    return -5;
  }
  return 0;
}
// Convert a span node into a div (no-op for anything else or empty sets).
function convertSpans$1$1($node, $) {
  var node = $node.get(0);
  if (node && node.tagName === 'span') {
    convertNodeTo$$1($node, $, 'div');
  }
}
// Add score to a node if it exists, converting spans to divs first.
function addScoreTo$1($node, $, score) {
  if (!$node) return;
  convertSpans$1$1($node, $);
  addScore$$1($node, $, score);
}
// Score every unscored <p>/<pre>, then push the raw score to the parent
// and half of it to the grandparent.
function scorePs$1($, weightNodes) {
  $('p, pre').not('[score]').each(function (index, node) {
    // Seed this node's own score (initializing it if absent).
    var $node = $(node);
    $node = setScore$1($node, $, getOrInitScore$$1($node, $, weightNodes));
    var rawScore = scoreNode$$1($node);
    var $parent = $node.parent();
    addScoreTo$1($parent, $, rawScore, weightNodes);
    if ($parent) {
      // Add half of the individual content score to the grandparent.
      addScoreTo$1($parent.parent(), $, rawScore / 2, weightNodes);
    }
  });
  return $;
} // score content. Parents get the full value of their children's
// content score, grandparents half
// Score the whole document: hNews-marked containers get a large fixed
// boost, then paragraph scoring runs over the page.
function scoreContent$$1($) {
  var weightNodes = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : true;
  // First, look for special hNews based selectors and give them a big
  // boost, if they exist.
  HNEWS_CONTENT_SELECTORS$1$1.forEach(function (selectorPair) {
    var _selectorPair = _slicedToArray$1(selectorPair, 2),
        parentSelector = _selectorPair[0],
        childSelector = _selectorPair[1];
    $("".concat(parentSelector, " ").concat(childSelector)).each(function (index, node) {
      addScore$$1($(node).parent(parentSelector), $, 80);
    });
  });
  // Run twice on purpose: a single pass lost parent scores (previous
  // fix caused a bug). Not ideal, and should be fixed.
  scorePs$1($, weightNodes);
  scorePs$1($, weightNodes);
  return $;
} // it to see if any of them are decently scored. If they are, they
// may be split parts of the content (Like two divs, a preamble and
// a body.) Example:
// http://articles.latimes.com/2009/oct/14/business/fi-bigtvs14
// Collect the candidate plus any well-scored siblings into a wrapper div.
// Returns the candidate unchanged when it has no parent or nothing merged.
function mergeSiblings$1($candidate, topScore, $) {
  if (!$candidate.parent().length) {
    return $candidate;
  }
  // Siblings must score at least a quarter of the top score (min 10).
  var siblingScoreThreshold = Math.max(10, topScore * 0.25);
  var wrappingDiv = $('<div></div>');
  $candidate.parent().children().each(function (index, sibling) {
    var $sibling = $(sibling); // Ignore tags like BR, HR, etc
    if (NON_TOP_CANDIDATE_TAGS_RE$1$1.test(sibling.tagName)) {
      return null;
    }
    var siblingScore = getScore$1($sibling);
    if (siblingScore) {
      // The candidate itself always goes into the wrapper.
      if ($sibling.get(0) === $candidate.get(0)) {
        wrappingDiv.append($sibling);
      } else {
        var contentBonus = 0;
        var density = linkDensity$1($sibling); // If sibling has a very low link density,
        // give it a small bonus
        if (density < 0.05) {
          contentBonus += 20;
        } // If sibling has a high link density,
        // give it a penalty
        if (density >= 0.5) {
          contentBonus -= 20;
        } // If sibling node has the same class as
        // candidate, give it a bonus
        if ($sibling.attr('class') === $candidate.attr('class')) {
          contentBonus += topScore * 0.2;
        }
        var newScore = siblingScore + contentBonus;
        if (newScore >= siblingScoreThreshold) {
          return wrappingDiv.append($sibling);
        }
        // Paragraphs get a second chance based on their raw text:
        // long low-link text, or short link-free text ending a sentence.
        if (sibling.tagName === 'p') {
          var siblingContent = $sibling.text();
          var siblingContentLength = textLength$1(siblingContent);
          if (siblingContentLength > 80 && density < 0.25) {
            return wrappingDiv.append($sibling);
          }
          if (siblingContentLength <= 80 && density === 0 && hasSentenceEnd$1(siblingContent)) {
            return wrappingDiv.append($sibling);
          }
        }
      }
    }
    return null;
  });
  // If only the candidate itself was collected, skip the wrapper entirely.
  if (wrappingDiv.children().length === 1 && wrappingDiv.children().first().get(0) === $candidate.get(0)) {
    return $candidate;
  }
  return wrappingDiv;
} // candidate nodes we found and find the one with the highest score.
// Pick the highest-scored node on the page (merging in qualifying
// siblings); fall back to <body> when nothing was scored.
function findTopCandidate$$1($) {
  var $best;
  var topScore = 0;
  $('[score]').each(function (index, node) {
    // Ignore tags like BR, HR, etc
    if (NON_TOP_CANDIDATE_TAGS_RE$1$1.test(node.tagName)) {
      return;
    }
    var $node = $(node);
    var score = getScore$1($node);
    if (score > topScore) {
      topScore = score;
      $best = $node;
    }
  });
  // If we don't have a candidate, return the body
  // or whatever the first element is
  if (!$best) {
    return $('body') || $('*').first();
  }
  return mergeSiblings$1($best, topScore, $);
} // Scoring
function removeUnlessContent$1 ( $node , $ , weight ) {
// Explicitly save entry-content-asset tags, which are
// noted as valuable in the Publisher guidelines. For now
// this works everywhere. We may want to consider making
// this less of a sure-thing later.
if ( $node . hasClass ( 'entry-content-asset' ) ) {
return ;
}
var content = normalizeSpaces$1 ( $node . text ( ) ) ;
if ( scoreCommas$1 ( content ) < 10 ) {
var pCount = $ ( 'p' , $node ) . length ;
var inputCount = $ ( 'input' , $node ) . length ; // Looks like a form, too many inputs.
if ( inputCount > pCount / 3 ) {
$node . remove ( ) ;
return ;
}
var contentLength = content . length ;
var imgCount = $ ( 'img' , $node ) . length ; // Content is too short, and there are no images, so
// this is probably junk content.
if ( contentLength < 25 && imgCount === 0 ) {
$node . remove ( ) ;
return ;
}
var density = linkDensity$1 ( $node ) ; // Too high of link density, is probably a menu or
// something similar.
// console.log(weight, density, contentLength)
if ( weight < 25 && density > 0.2 && contentLength > 75 ) {
$node . remove ( ) ;
return ;
} // Too high of a link density, despite the score being
// high.
if ( weight >= 25 && density > 0.5 ) {
// Don't remove the node if it's a list and the
// previous sibling starts with a colon though. That
// means it's probably content.
var tagName = $node . get ( 0 ) . tagName . toLowerCase ( ) ;
var nodeIsList = tagName === 'ol' || tagName === 'ul' ;
if ( nodeIsList ) {
var previousNode = $node . prev ( ) ;
if ( previousNode && normalizeSpaces$1 ( previousNode . text ( ) ) . slice ( - 1 ) === ':' ) {
return ;
}
}
$node . remove ( ) ;
return ;
}
var scriptCount = $ ( 'script' , $node ) . length ; // Too many script tags, not enough content.
if ( scriptCount > 0 && contentLength < 150 ) {
$node . remove ( ) ;
}
}
} // Given an article, clean it of some superfluous content specified by
// tags. Things like forms, ads, etc.
//
// Tags is an array of tag name's to search through. (like div, form,
// etc)
//
// Return this same doc.
// Conditionally clean container tags (divs, forms, etc.): compute a score
// for each, drop negative-weight nodes, and vet the rest as content.
function cleanTags$$1($article, $) {
  $(CLEAN_CONDITIONALLY_TAGS$1, $article).each(function (index, node) {
    var $node = $(node);
    // Nodes flagged to keep — or containing a kept node — are untouchable.
    if ($node.hasClass(KEEP_CLASS$1) || $node.find(".".concat(KEEP_CLASS$1)).length > 0) {
      return;
    }
    var weight = getScore$1($node);
    if (!weight) {
      weight = getOrInitScore$$1($node, $);
      setScore$1($node, $, weight);
    }
    if (weight < 0) {
      // Negative weight: drop the node outright.
      $node.remove();
    } else {
      // Otherwise determine whether the node seems like content.
      removeUnlessContent$1($node, $, weight);
    }
  });
  return $;
}
// Remove header tags that duplicate the title, precede all paragraphs,
// or carry a negative class/id weight.
function cleanHeaders$1($article, $) {
  var title = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : '';
  $(HEADER_TAG_LIST$1, $article).each(function (index, header) {
    var $header = $(header);
    // Headers before every <p> are usually part of the title, a subtitle,
    // a datestamp, or a byline — all handled by other metadata handling.
    if ($($header, $article).prevAll('p').length === 0) {
      return $header.remove();
    }
    // Headers repeating the title exactly are redundant.
    var headerText = normalizeSpaces$1($(header).text());
    if (headerText === title) {
      return $header.remove();
    }
    // Negatively weighted headers are probably junk.
    if (getWeight$1($(header)) < 0) {
      return $header.remove();
    }
    return $header;
  });
  return $;
} // html to avoid later complications with multiple body tags.
// Convert <html> and <body> into plain divs so later processing never
// deals with multiple body tags.
function rewriteTopLevel$$1(article, $) {
  // Not using a context here because it's problematic when converting
  // the top-level/root node - AP
  var $result = convertNodeTo$$1($('html'), $, 'div');
  $result = convertNodeTo$$1($result('body'), $result, 'div');
  return $result;
}
// Rewrite every `attr` (href/src) on the page to an absolute URL,
// honoring a <base href> when present.
function absolutize$1($, rootUrl, attr) {
  var baseUrl = $('base').attr('href');
  var resolveAgainst = baseUrl || rootUrl;
  $("[".concat(attr, "]")).each(function (_, node) {
    var value = getAttrs$1(node)[attr];
    if (!value) return;
    setAttr$1(node, attr, URL$1.resolve(resolveAgainst, value));
  });
}
// Rewrite every srcset candidate URL inside $content to an absolute URL,
// preserving descriptors and de-duplicating candidates.
function absolutizeSet$1($, rootUrl, $content) {
  $('[srcset]', $content).each(function (_, node) {
    var urlSet = getAttrs$1(node).srcset;
    if (!urlSet) return;
    // a comma should be considered part of the candidate URL unless preceded by a descriptor
    // descriptors can only contain positive numbers followed immediately by either 'w' or 'x'
    // space characters inside the URL should be encoded (%20 or +)
    var candidates = urlSet.match(/(?:\s*)(\S+(?:\s*[\d.]+[wx])?)(?:\s*,\s*)?/g);
    if (!candidates) return;
    var resolved = candidates.map(function (candidate) {
      // a candidate URL cannot start or end with a comma
      // descriptors are separated from the URLs by unescaped whitespace
      var parts = candidate.trim().replace(/,$/, '').split(/\s+/);
      parts[0] = URL$1.resolve(rootUrl, parts[0]);
      return parts.join(' ');
    });
    // Dedupe before writing the attribute back.
    setAttr$1(node, 'srcset', _toConsumableArray$1(new _Set(resolved)).join(', '));
  });
}
// Make every href, src, and srcset in the content absolute against `url`.
function makeLinksAbsolute$$1($content, $, url) {
  absolutize$1($, url, 'href');
  absolutize$1($, url, 'src');
  absolutizeSet$1($, url, $content);
  return $content;
}
// Length of the text with surrounding whitespace trimmed and internal
// whitespace runs collapsed to single spaces.
function textLength$1(text) {
  var normalized = text.trim().replace(/\s+/g, ' ');
  return normalized.length;
} // Determines what percentage of the text
// in a node is link text
// Takes a node, returns a float
// Fraction of a node's text that lives inside <a> tags (0..1).
function linkDensity$1($node) {
  var totalLength = textLength$1($node.text());
  var linkLength = textLength$1($node.find('a').text());
  if (totalLength > 0) {
    return linkLength / totalLength;
  }
  // A node with link text but no other text counts as fully linked.
  if (totalLength === 0 && linkLength > 0) {
    return 1;
  }
  return 0;
} // search for, find a meta tag associated.
function extractFromMeta$$1 ( $ , metaNames , cachedNames ) {
var cleanTags = arguments . length > 3 && arguments [ 3 ] !== undefined ? arguments [ 3 ] : true ;
var foundNames = metaNames . filter ( function ( name ) {
return cachedNames . indexOf ( name ) !== - 1 ;
} ) ; // eslint-disable-next-line no-restricted-syntax
var _iteratorNormalCompletion = true ;
var _didIteratorError = false ;
var _iteratorError = undefined ;
try {
var _loop = function _loop ( ) {
var name = _step . value ;
var type = 'name' ;
var value = 'value' ;
var nodes = $ ( "meta[" . concat ( type , "=\"" ) . concat ( name , "\"]" ) ) ; // Get the unique value of every matching node, in case there
// are two meta tags with the same name and value.
// Remove empty values.
var values = nodes . map ( function ( index , node ) {
return $ ( node ) . attr ( value ) ;
} ) . toArray ( ) . filter ( function ( text ) {
return text !== '' ;
} ) ; // If we have more than one value for the same name, we have a
// conflict and can't trust any of them. Skip this name. If we have
// zero, that means our meta tags had no values. Skip this name
// also.
if ( values . length === 1 ) {
var metaValue ; // Meta values that contain HTML should be stripped, as they
// weren't subject to cleaning previously.
if ( cleanTags ) {
metaValue = stripTags$1 ( values [ 0 ] , $ ) ;
} else {
var _values = _slicedToArray$1 ( values , 1 ) ;
metaValue = _values [ 0 ] ;
}
return {
v : metaValue
} ;
}
} ;
for ( var _iterator = _getIterator$1 ( foundNames ) , _step ; ! ( _iteratorNormalCompletion = ( _step = _iterator . next ( ) ) . done ) ; _iteratorNormalCompletion = true ) {
var _ret = _loop ( ) ;
if ( _typeof$1 ( _ret ) === "object" ) return _ret . v ;
} // If nothing is found, return null
} catch ( err ) {
_didIteratorError = true ;
_iteratorError = err ;
} finally {
try {
if ( ! _iteratorNormalCompletion && _iterator . return != null ) {
_iterator . return ( ) ;
}
} finally {
if ( _didIteratorError ) {
throw _iteratorError ;
}
}
}
return null ;
}
// A node is "good" for flat metadata extraction when it is not a busy
// container (too many children) and does not sit inside a comment thread.
function isGoodNode$1($node, maxChildren) {
  return $node.children().length <= maxChildren && !withinComment$$1($node);
} // Given a a list of selectors find content that may
// be extractable from the document. This is for flat
// meta-information, like author, title, date published, etc.
function extractFromSelectors$$1 ( $ , selectors ) {
var maxChildren = arguments . length > 2 && arguments [ 2 ] !== undefined ? arguments [ 2 ] : 1 ;
var textOnly = arguments . length > 3 && arguments [ 3 ] !== undefined ? arguments [ 3 ] : true ; // eslint-disable-next-line no-restricted-syntax
var _iteratorNormalCompletion = true ;
var _didIteratorError = false ;
var _iteratorError = undefined ;
try {
for ( var _iterator = _getIterator$1 ( selectors ) , _step ; ! ( _iteratorNormalCompletion = ( _step = _iterator . next ( ) ) . done ) ; _iteratorNormalCompletion = true ) {
var selector = _step . value ;
var nodes = $ ( selector ) ; // If we didn't get exactly one of this selector, this may be
// a list of articles or comments. Skip it.
if ( nodes . length === 1 ) {
var $node = $ ( nodes [ 0 ] ) ;
if ( isGoodNode$1 ( $node , maxChildren ) ) {
var content = void 0 ;
if ( textOnly ) {
content = $node . text ( ) ;
} else {
content = $node . html ( ) ;
}
if ( content ) {
return content ;
}
}
}
}
} catch ( err ) {
_didIteratorError = true ;
_iteratorError = err ;
} finally {
try {
if ( ! _iteratorNormalCompletion && _iterator . return != null ) {
_iterator . return ( ) ;
}
} finally {
if ( _didIteratorError ) {
throw _iteratorError ;
}
}
}
return null ;
} // strips all tags from a string of text
// Strip all HTML tags from a string.
function stripTags$1(text, $) {
  // Wrapping in a span prevents errors when the text has no html at all.
  var stripped = $("<span>".concat(text, "</span>")).text();
  // If nothing is left, fall back to the raw input.
  return stripped === '' ? text : stripped;
}
// True when any ancestor's class/id contains the substring 'comment'.
function withinComment$$1($node) {
  var looksLikeComment = function looksLikeComment(parent) {
    var attrs = getAttrs$1(parent);
    var nodeClass = attrs.class;
    var id = attrs.id;
    return "".concat(nodeClass, " ").concat(id).includes('comment');
  };
  return $node.parents().toArray().some(looksLikeComment);
} // Given a node, determine if it's article-like enough to return
// param: node (a cheerio node)
// return: boolean
// A node qualifies as article-like when its trimmed text is 100+ chars.
function nodeIsSufficient$1($node) {
  var trimmed = $node.text().trim();
  return trimmed.length >= 100;
}
// Detect a WordPress page by the presence of the WP marker selector.
function isWordpress$1($) {
  var matches = $(IS_WP_SELECTOR$1);
  return matches.length > 0;
}
// Normalize a node's attributes into a plain {name: value} object.
// cheerio nodes expose `attribs`; browser DOM nodes expose `attributes`.
function getAttrs$1(node) {
  var attribs = node.attribs;
  var attributes = node.attributes;
  if (attribs || !attributes) {
    return attribs;
  }
  var attrs = {};
  _Reflect$ownKeys$1(attributes).forEach(function (index) {
    var attr = attributes[index];
    if (!attr.name || !attr.value) return;
    attrs[attr.name] = attr.value;
  });
  return attrs;
}
// Set a single attribute, handling both cheerio (`attribs` map) and
// browser DOM (`setAttribute`) node shapes.
function setAttr$1(node, attr, val) {
  if (node.attribs) {
    node.attribs[attr] = val;
    return node;
  }
  if (node.attributes) {
    node.setAttribute(attr, val);
  }
  return node;
}
// Replace a node's entire attribute set, handling both cheerio and
// browser DOM node shapes.
function setAttrs$1(node, attrs) {
  if (node.attribs) {
    // cheerio: swap the attribute map wholesale.
    node.attribs = attrs;
    return node;
  }
  if (node.attributes) {
    // browser DOM: clear existing attributes, then set the new ones.
    while (node.attributes.length > 0) {
      node.removeAttribute(node.attributes[0].name);
    }
    _Reflect$ownKeys$1(attrs).forEach(function (key) {
      node.setAttribute(key, attrs[key]);
    });
  }
  return node;
} // DOM manipulation
// Matches absolute http(s) URLs.
var IS_LINK = new RegExp('https?://', 'i');
// NOTE: the leading '.' is an unescaped regex dot, so it matches any
// character preceding the extension.
var IMAGE_RE = '.(png|gif|jpe?g)';
var IS_IMAGE = new RegExp("".concat(IMAGE_RE), 'i');
// Image URL with an optional query string followed by a srcset
// width/density descriptor (e.g. "640w", "2x").
var IS_SRCSET = new RegExp("".concat(IMAGE_RE, "(\\?\\S+)?(\\s*[\\d.]+[wx])"), 'i');
var TAGS_TO_REMOVE = ['script', 'style', 'form'].join(','); // lazy loaded images into normal images.
// Many sites will have img tags with no source, or an image tag with a src
// attribute that a is a placeholer. We need to be able to properly fill in
// the src attribute so the images are no longer lazy loaded.
// Promote lazy-load attributes on <img> tags into real src/srcset values
// so the images load without JavaScript.
function convertLazyLoadedImages($) {
  // If the string parses as JSON with a string `src`, return it; else false.
  var srcFromJSON = function srcFromJSON(str) {
    try {
      var parsed = JSON.parse(str);
      if (typeof parsed.src === 'string') return parsed.src;
    } catch (_) {
      return false;
    }
    return false;
  };
  $('img').each(function (_, img) {
    var attrs = getAttrs$1(img);
    _Reflect$ownKeys$1(attrs).forEach(function (attr) {
      var value = attrs[attr];
      if (attr !== 'srcset' && IS_LINK.test(value) && IS_SRCSET.test(value)) {
        // A srcset-shaped value hiding in some other attribute.
        $(img).attr('srcset', value);
      } else if (attr !== 'src' && attr !== 'srcset' && IS_LINK.test(value) && IS_IMAGE.test(value)) {
        // A plain image URL — or a JSON blob wrapping one — in a
        // lazy-load attribute.
        var extracted = srcFromJSON(value);
        $(img).attr('src', extracted || value);
      }
    });
  });
  return $;
}
// Filter predicate: cheerio marks HTML comment nodes with type 'comment'.
function isComment(index, node) {
  return node.type === 'comment';
}
// Remove every HTML comment node from the document.
function cleanComments($) {
  var allContents = $.root().find('*').contents();
  allContents.filter(isComment).remove();
  return $;
}
// Strip scripts, styles, and forms, then drop HTML comments.
function clean($) {
  $(TAGS_TO_REMOVE).remove();
  return cleanComments($);
}
var Resource = {
// Create a Resource.
//
// :param url: The URL for the document we should retrieve.
// :param response: If set, use as the response rather than
// attempting to fetch it ourselves. Expects a
// string.
// :param headers: Custom headers to be included in the request
create : function ( ) {
var _create = _asyncToGenerator (
/*#__PURE__*/
_regeneratorRuntime . mark ( function _callee ( url , preparedResponse , parsedUrl ) {
var headers ,
result ,
validResponse ,
_args = arguments ;
return _regeneratorRuntime . wrap ( function _callee$ ( _context ) {
while ( 1 ) {
switch ( _context . prev = _context . next ) {
case 0 :
headers = _args . length > 3 && _args [ 3 ] !== undefined ? _args [ 3 ] : { } ;
if ( ! preparedResponse ) {
_context . next = 6 ;
break ;
}
validResponse = {
statusMessage : 'OK' ,
statusCode : 200 ,
headers : {
'content-type' : 'text/html' ,
'content-length' : 500
}
} ;
result = {
body : preparedResponse ,
response : validResponse ,
alreadyDecoded : true
} ;
_context . next = 9 ;
break ;
case 6 :
_context . next = 8 ;
return fetchResource ( url , parsedUrl , headers ) ;
case 8 :
result = _context . sent ;
case 9 :
if ( ! result . error ) {
_context . next = 12 ;
break ;
}
result . failed = true ;
return _context . abrupt ( "return" , result ) ;
case 12 :
return _context . abrupt ( "return" , this . generateDoc ( result ) ) ;
case 13 :
case "end" :
return _context . stop ( ) ;
}
}
} , _callee , this ) ;
} ) ) ;
function create ( _x , _x2 , _x3 ) {
return _create . apply ( this , arguments ) ;
}
return create ;
} ( ) ,
generateDoc : function generateDoc ( _ref ) {
var content = _ref . body ,
response = _ref . response ,
_ref$alreadyDecoded = _ref . alreadyDecoded ,
alreadyDecoded = _ref$alreadyDecoded === void 0 ? false : _ref$alreadyDecoded ;
var _response$headers$con = response . headers [ 'content-type' ] ,
contentType = _response$headers$con === void 0 ? '' : _response$headers$con ; // TODO: Implement is_text function from
// https://github.com/ReadabilityHoldings/readability/blob/8dc89613241d04741ebd42fa9fa7df1b1d746303/readability/utils/text.py#L57
if ( ! contentType . includes ( 'html' ) && ! contentType . includes ( 'text' ) ) {
throw new Error ( 'Content does not appear to be text.' ) ;
}
var $ = this . encodeDoc ( {
content : content ,
contentType : contentType ,
alreadyDecoded : alreadyDecoded
} ) ;
if ( $ . root ( ) . children ( ) . length === 0 ) {
throw new Error ( 'No children, likely a bad parse.' ) ;
}
$ = normalizeMetaTags ( $ ) ;
$ = convertLazyLoadedImages ( $ ) ;
$ = clean ( $ ) ;
return $ ;
} ,
encodeDoc : function encodeDoc ( _ref2 ) {
var content = _ref2 . content ,
contentType = _ref2 . contentType ,
_ref2$alreadyDecoded = _ref2 . alreadyDecoded ,
alreadyDecoded = _ref2$alreadyDecoded === void 0 ? false : _ref2$alreadyDecoded ;
if ( alreadyDecoded ) {
return cheerio$1 . load ( content ) ;
}
var encoding = getEncoding$1 ( contentType ) ;
var decodedContent = iconv . decode ( content , encoding ) ;
var $ = cheerio$1 . load ( decodedContent ) ; // after first cheerio.load, check to see if encoding matches
var contentTypeSelector = cheerio$1 . browser ? 'meta[http-equiv=content-type]' : 'meta[http-equiv=content-type i]' ;
var metaContentType = $ ( contentTypeSelector ) . attr ( 'content' ) || $ ( 'meta[charset]' ) . attr ( 'charset' ) ;
var properEncoding = getEncoding$1 ( metaContentType ) ; // if encodings in the header/body dont match, use the one in the body
if ( metaContentType && properEncoding !== encoding ) {
decodedContent = iconv . decode ( content , properEncoding ) ;
$ = cheerio$1 . load ( decodedContent ) ;
}
return $ ;
}
} ;
var _marked =
/*#__PURE__*/
_regeneratorRuntime . mark ( range ) ;
function range ( ) {
var start ,
end ,
_args = arguments ;
return _regeneratorRuntime . wrap ( function range$ ( _context ) {
while ( 1 ) {
switch ( _context . prev = _context . next ) {
case 0 :
start = _args . length > 0 && _args [ 0 ] !== undefined ? _args [ 0 ] : 1 ;
end = _args . length > 1 && _args [ 1 ] !== undefined ? _args [ 1 ] : 1 ;
case 2 :
if ( ! ( start <= end ) ) {
_context . next = 7 ;
break ;
}
_context . next = 5 ;
return start += 1 ;
case 5 :
_context . next = 2 ;
break ;
case 7 :
case "end" :
return _context . stop ( ) ;
}
}
} , _marked , this ) ;
} // extremely simple url validation as a first step
// A parsed URL is considered valid only when it carries a hostname.
function validateUrl(_ref) {
  var hostname = _ref.hostname;
  return Boolean(hostname);
}
// Build a {domain: extractor} lookup mapping every domain to the extractor.
var merge = function merge(extractor, domains) {
  var lookup = {};
  domains.forEach(function (domain) {
    lookup[domain] = extractor;
  });
  return lookup;
};
// Map an extractor's primary domain — and any supportedDomains — to it.
function mergeSupportedDomains(extractor) {
  var domains = extractor.supportedDomains
    ? [extractor.domain].concat(_toConsumableArray$1(extractor.supportedDomains))
    : [extractor.domain];
  return merge(extractor, domains);
}
// Registry of custom extractors keyed by domain.
var apiExtractors = {};
// Register a custom extractor for its domain(s); returns the registry,
// or an error object when the extractor is missing a domain.
function addExtractor(extractor) {
  var isValid = extractor && extractor.domain;
  if (!isValid) {
    return {
      error: true,
      message: 'Unable to add custom extractor. Invalid parameters.'
    };
  }
  _Object$assign(apiExtractors, mergeSupportedDomains(extractor));
  return apiExtractors;
}
// Custom extractor for blogspot.com (Blogger-hosted blogs).
var BloggerExtractor = {
  domain: 'blogspot.com',
  content: {
    // Blogger is insane and does not load its content
    // initially in the page, but it's all there
    // in noscript
    selectors: ['.post-content noscript'],
    // Selectors to remove from the extracted content
    clean: [],
    // Convert the noscript tag to a div
    transforms: {
      noscript: 'div'
    }
  },
  author: {
    selectors: ['.post-author-name']
  },
  title: {
    selectors: ['.post h2.title']
  },
  date_published: {
    selectors: ['span.publishdate']
  }
};
// Custom extractor for nymag.com.
var NYMagExtractor = {
  domain: 'nymag.com',
  content: {
    // Order by most likely. Extractor will stop on first occurrence
    selectors: ['div.article-content', 'section.body', 'article.article'],
    // Selectors to remove from the extracted content
    clean: ['.ad', '.single-related-story'],
    // Object of transformations to make on matched elements.
    // Each key is the selector, each value is the tag to transform to.
    // If a function is given, it should return a string to convert to,
    // or nothing (in which case it will not perform the transformation).
    transforms: {
      // Convert h1s to h2s
      h1: 'h2',
      // Convert lazy-loaded noscript images to figures
      noscript: function noscript($node, $) {
        // Browser build: noscript content is raw text that must be re-parsed;
        // server build: the children are already parsed nodes.
        var $children = $.browser ? $($node.text()) : $node.children();
        // Only promote to <figure> when the noscript wraps exactly one <img>.
        if ($children.length === 1 && $children.get(0) !== undefined && $children.get(0).tagName.toLowerCase() === 'img') {
          return 'figure';
        }
        return null;
      }
    }
  },
  title: {
    selectors: ['h1.lede-feature-title', 'h1.headline-primary', 'h1']
  },
  author: {
    selectors: ['.by-authors', '.lede-feature-author']
  },
  dek: {
    selectors: ['.lede-feature-teaser']
  },
  date_published: {
    // [selector, attribute] pairs read the attribute instead of the node text.
    selectors: [['time.article-timestamp[datetime]', 'datetime'], 'time.article-timestamp']
  }
};
// Custom extractor for wikipedia.org.
var WikipediaExtractor = {
  domain: 'wikipedia.org',
  content: {
    selectors: ['#mw-content-text'],
    // Skip the generic post-extraction cleaner so wiki markup survives.
    defaultCleaner: false,
    // transform top infobox to an image with caption
    transforms: {
      '.infobox img': function infoboxImg($node) {
        var $parent = $node.parents('.infobox');

        // Only prepend the first image in .infobox
        if ($parent.children('img').length === 0) {
          $parent.prepend($node);
        }
      },
      '.infobox caption': 'figcaption',
      '.infobox': 'figure'
    },
    // Selectors to remove from the extracted content
    clean: ['.mw-editsection', 'figure tr, figure td, figure tbody', '#toc', '.navbox']
  },
  // Static byline: articles have no single author.
  author: 'Wikipedia Contributors',
  title: {
    selectors: ['h2.title']
  },
  date_published: {
    selectors: ['#footer-info-lastmod']
  }
};
// Custom extractor for twitter.com (legacy permalink pages).
var TwitterExtractor = {
  domain: 'twitter.com',
  content: {
    transforms: {
      // We're transforming essentially the whole page here.
      // Twitter doesn't have nice selectors, so our initial
      // selector grabs the whole page, then we're re-writing
      // it to fit our needs before we clean it up.
      '.permalink[role=main]': function permalinkRoleMain($node, $) {
        var tweets = $node.find('.tweet');
        var $tweetContainer = $('<div id="TWEETS_GO_HERE"></div>');
        $tweetContainer.append(tweets);
        $node.replaceWith($tweetContainer);
      },
      // Twitter wraps @ mentions in <s> tags, which
      // render as a strikethrough; neutralize them to spans.
      s: 'span'
    },
    selectors: ['.permalink[role=main]'],
    defaultCleaner: false,
    clean: ['.stream-item-footer', 'button', '.tweet-details-fixer']
  },
  author: {
    selectors: ['.tweet.permalink-tweet .username']
  },
  date_published: {
    selectors: [['.permalink-tweet ._timestamp[data-time-ms]', 'data-time-ms']]
  }
};
// Custom extractor for www.nytimes.com.
var NYTimesExtractor = {
  domain: 'www.nytimes.com',
  title: {
    selectors: ['h1[data-testid="headline"]', 'h1.g-headline', 'h1[itemprop="headline"]', 'h1.headline', 'h1 .balancedHeadline']
  },
  author: {
    selectors: [['meta[name="author"]', 'value'], '.g-byline', '.byline', ['meta[name="byl"]', 'value']]
  },
  content: {
    selectors: ['div.g-blocks', 'section[name="articleBody"]', 'article#story'],
    transforms: {
      // Fill in the lazy-image size placeholder with a concrete width.
      'img.g-lazy': function imgGLazy($node) {
        var src = $node.attr('src');
        var width = 640;
        // assumes src contains a literal '{{size}}' placeholder — TODO confirm
        src = src.replace('{{size}}', width);
        $node.attr('src', src);
      }
    },
    // Selectors removed from the extracted result.
    clean: ['.ad', 'header#story-header', '.story-body-1 .lede.video', '.visually-hidden', '#newsletter-promo', '.promo', '.comments-button', '.hidden', '.comments', '.supplemental', '.nocontent', '.story-footer-links']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], ['meta[name="article:published"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  dek: null,
  next_page_url: null,
  excerpt: null
}; // Rename CustomExtractor
// to fit your publication
// Custom extractor for www.theatlantic.com.
var TheAtlanticExtractor = {
  domain: 'www.theatlantic.com',
  title: {
    selectors: ['h1', '.c-article-header__hed']
  },
  author: {
    selectors: [['meta[name="author"]', 'value'], '.c-byline__author']
  },
  content: {
    selectors: ['article', '.article-body'],
    // No pre-processing transforms needed.
    // NOTE(review): template leftover — empty array here while most other
    // extractors use an empty object; harmless either way.
    transforms: [],
    // Selectors removed from the extracted result.
    clean: ['.partner-box', '.callout', '.c-article-writer__image', '.c-article-writer__content', '.c-letters-cta__text', '.c-footer__logo', '.c-recirculation-link', '.twitter-tweet']
  },
  dek: {
    selectors: [['meta[name="description"]', 'value']]
  },
  date_published: {
    selectors: [['time[itemprop="datePublished"]', 'datetime']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  next_page_url: null,
  excerpt: null
}; // Rename CustomExtractor
// to fit your publication
// (e.g., NYTimesExtractor)
// Custom extractor for www.newyorker.com.
var NewYorkerExtractor = {
  domain: 'www.newyorker.com',
  title: {
    selectors: ['h1[class^="content-header"]', 'h1[class^="ArticleHeader__hed"]', 'h1[class*="ContentHeaderHed"]', ['meta[name="og:title"]', 'value']]
  },
  author: {
    selectors: ['article header div[class^="BylinesWrapper"]', ['meta[name="article:author"]', 'value'], 'div[class^="ArticleContributors"] a[rel="author"]', 'article header div[class*="Byline__multipleContributors"]']
  },
  content: {
    selectors: ['.article__body', 'article.article.main-content', 'main[class^="Layout__content"]'],
    // Turn caption text/credit wrappers into proper figcaptions.
    transforms: {
      '.caption__text': 'figcaption',
      '.caption__credit': 'figcaption'
    },
    // Selectors removed from the extracted result.
    clean: ['footer[class^="ArticleFooter__footer"]', 'aside']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], 'time.content-header__publish-date', ['meta[name="pubdate"]', 'value']],
    // Dates without an explicit zone are interpreted as Eastern time.
    timezone: 'America/New_York'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  dek: {
    selectors: ['div[class^="ContentHeaderDek"]', 'div.content-header__dek', 'h2[class^="ArticleHeader__dek"]']
  },
  next_page_url: null,
  excerpt: null
}; // Rename CustomExtractor
// to fit your publication
// (e.g., NYTimesExtractor)
// Custom extractor for www.wired.com.
var WiredExtractor = {
  domain: 'www.wired.com',
  title: {
    selectors: ['h1[data-testId="ContentHeaderHed"]']
  },
  author: {
    selectors: [['meta[name="article:author"]', 'value'], 'a[rel="author"]']
  },
  content: {
    selectors: ['article.article.main-content', 'article.content'],
    // No pre-processing transforms needed (template leftover empty array).
    transforms: [],
    // Selectors removed from the extracted result.
    clean: ['.visually-hidden', 'figcaption img.photo', '.alert-message']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  dek: {
    selectors: []
  },
  next_page_url: null,
  excerpt: null
}; // Rename CustomExtractor
// to fit your publication
// (e.g., NYTimesExtractor)
// Custom extractor for www.msn.com.
var MSNExtractor = {
  domain: 'www.msn.com',
  title: {
    selectors: ['h1']
  },
  author: {
    selectors: ['span.authorname-txt']
  },
  content: {
    selectors: ['div.richtext'],
    // No pre-processing transforms needed (template leftover empty array).
    transforms: [],
    // Selectors removed from the extracted result.
    clean: ['span.caption']
  },
  date_published: {
    selectors: ['span.time']
  },
  lead_image_url: {
    selectors: []
  },
  dek: {
    selectors: []
  },
  next_page_url: null,
  excerpt: null
}; // Rename CustomExtractor
// to fit your publication
// (e.g., NYTimesExtractor)
// Custom extractor for www.yahoo.com.
var YahooExtractor = {
  domain: 'www.yahoo.com',
  title: {
    selectors: ['header.canvas-header']
  },
  author: {
    selectors: ['span.provider-name']
  },
  content: {
    selectors: ['.content-canvas'],
    // No pre-processing transforms needed (template leftover empty array).
    transforms: [],
    // Selectors removed from the extracted result.
    clean: ['.figure-caption']
  },
  date_published: {
    selectors: [['time.date[datetime]', 'datetime']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  dek: {
    // No dek selectors identified yet.
    selectors: []
  },
  next_page_url: null,
  excerpt: null
}; // Rename CustomExtractor
// to fit your publication
// (e.g., NYTimesExtractor)
// Custom extractor for www.buzzfeed.com (and buzzfeednews.com).
var BuzzfeedExtractor = {
  domain: 'www.buzzfeed.com',
  supportedDomains: ['www.buzzfeednews.com'],
  title: {
    selectors: ['h1.embed-headline-title']
  },
  author: {
    // NOTE(review): 'byline__author' has no leading '.', so it matches a
    // <byline__author> element rather than the class — confirm intent.
    selectors: ['a[data-action="user/username"]', 'byline__author', ['meta[name="author"]', 'value']]
  },
  content: {
    selectors: [['div[class^="featureimage_featureImageWrapper"]', '.js-subbuzz-wrapper'], ['.js-subbuzz-wrapper']],
    defaultCleaner: false,
    transforms: {
      h2: 'b',
      'div.longform_custom_header_media': function divLongform_custom_header_media($node) {
        // FIX: cheerio's .has() returns a selection object, which is always
        // truthy, so the previous `if ($node.has('img') && $node.has(...))`
        // condition could never fail. Test the selection length so the node
        // only becomes a <figure> when it really contains both an image and
        // a source line.
        if ($node.has('img').length > 0 && $node.has('.longform_header_image_source').length > 0) {
          return 'figure';
        }

        return null;
      },
      'figure.longform_custom_header_media .longform_header_image_source': 'figcaption'
    },
    // Selectors removed from the extracted result.
    clean: ['.instapaper_ignore', '.suplist_list_hide .buzz_superlist_item .buzz_superlist_number_inline', '.share-box', '.print', '.js-inline-share-bar', '.js-ad-placement']
  },
  date_published: {
    selectors: [['time[datetime]', 'datetime']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  dek: {
    selectors: ['.embed-headline-description']
  },
  next_page_url: null,
  excerpt: null
}; // Rename CustomExtractor
// to fit your publication
// (e.g., NYTimesExtractor)
// Custom extractor for fandom.wikia.com.
var WikiaExtractor = {
  domain: 'fandom.wikia.com',
  title: {
    selectors: ['h1.entry-title']
  },
  author: {
    // NOTE(review): '.author vcard' targets a <vcard> element inside .author;
    // likely intended '.author .vcard' or '.author.vcard' — confirm before
    // changing.
    selectors: ['.author vcard', '.fn']
  },
  content: {
    selectors: ['.grid-content', '.entry-content'],
    // No pre-processing transforms needed (template leftover empty array).
    transforms: [],
    // Selectors removed from the extracted result.
    clean: []
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  dek: {
    selectors: []
  },
  next_page_url: null,
  excerpt: null
}; // Rename CustomExtractor
// to fit your publication
// (e.g., NYTimesExtractor)
// Custom extractor for www.littlethings.com (no date_published mapping).
var LittleThingsExtractor = {
  domain: 'www.littlethings.com',
  title: {
    selectors: ['h1[class*="PostHeader"]', 'h1.post-title']
  },
  author: {
    selectors: ['div[class^="PostHeader__ScAuthorNameSection"]', ['meta[name="author"]', 'value']]
  },
  content: {
    selectors: ['section[class*="PostMainArticle"]', '.mainContentIntro', '.content-wrapper'],
    // No pre-processing transforms needed (template leftover empty array).
    transforms: [],
    // Selectors removed from the extracted result.
    clean: []
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  next_page_url: null,
  excerpt: null
};
// Custom extractor for www.politico.com.
var PoliticoExtractor = {
  domain: 'www.politico.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value']]
  },
  author: {
    selectors: [['div[itemprop="author"] meta[itemprop="name"]', 'value'], '.story-meta__authors .vcard', '.story-main-content .byline .vcard']
  },
  content: {
    selectors: [['.story-text'], '.story-main-content', '.story-core'],
    transforms: [],
    // Selectors removed from the extracted result.
    clean: ['figcaption', '.story-meta', '.ad']
  },
  date_published: {
    selectors: [['time[itemprop="datePublished"]', 'datetime'], ['.story-meta__details time[datetime]', 'datetime'], ['.story-main-content .timestamp time[datetime]', 'datetime']],
    // Dates without an explicit zone are interpreted as Eastern time.
    timezone: 'America/New_York'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value']]
  }
};
// Custom extractor for deadspin.com and the wider Kinja network of sites.
var DeadspinExtractor = {
  domain: 'deadspin.com',
  supportedDomains: ['jezebel.com', 'lifehacker.com', 'kotaku.com', 'gizmodo.com', 'jalopnik.com', 'kinja.com', 'avclub.com', 'clickhole.com', 'splinternews.com', 'theonion.com', 'theroot.com', 'thetakeout.com', 'theinventory.com'],
  title: {
    selectors: ['header h1', 'h1.headline']
  },
  author: {
    selectors: ['a[data-ga*="Author"]', '.author']
  },
  content: {
    selectors: ['.js_post-content', '.post-content', '.entry-content'],
    transforms: {
      // Point lazy-loaded YouTube embeds straight at the real embed URL.
      'iframe.lazyload[data-recommend-id^="youtube://"]': function iframeLazyloadDataRecommendIdYoutube($node) {
        // assumes the iframe id looks like "youtube-<videoId>" — TODO confirm;
        // a missing/malformed id attribute would throw here.
        var youtubeId = $node.attr('id').split('youtube-')[1];
        $node.attr('src', "https://www.youtube.com/embed/".concat(youtubeId));
      }
    },
    // Selectors removed from the extracted result.
    clean: ['.magnifier', '.lightbox']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], ['time.updated[datetime]', 'datetime']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  dek: {
    selectors: []
  },
  next_page_url: {
    selectors: []
  },
  excerpt: {
    selectors: []
  }
}; // Rename CustomExtractor
// to fit your publication
// (e.g., NYTimesExtractor)
// Custom extractor for www.broadwayworld.com.
var BroadwayWorldExtractor = {
  domain: 'www.broadwayworld.com',
  title: {
    selectors: ['h1[itemprop=headline]', 'h1.article-title']
  },
  author: {
    selectors: ['span[itemprop=author]']
  },
  content: {
    selectors: ['div[itemprop=articlebody]'],
    // No pre-processing transforms needed.
    transforms: {},
    // Selectors removed from the extracted result.
    clean: []
  },
  date_published: {
    selectors: [['meta[itemprop=datePublished]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  dek: {
    selectors: []
  },
  next_page_url: {
    selectors: []
  },
  excerpt: {
    selectors: []
  }
}; // Rename CustomExtractor
// to fit your publication
// (e.g., NYTimesExtractor)
// Custom extractor for www.apartmenttherapy.com.
var ApartmentTherapyExtractor = {
  domain: 'www.apartmenttherapy.com',
  title: {
    selectors: ['h1.headline']
  },
  author: {
    selectors: ['.PostByline__name']
  },
  content: {
    selectors: ['div.post__content'],
    transforms: {
      // Lazy pictures keep their sources as JSON in data-props; replace the
      // React placeholder with a plain <img> pointing at the first source.
      'div[data-render-react-id="images/LazyPicture"]': function divDataRenderReactIdImagesLazyPicture($node, $) {
        // NOTE(review): JSON.parse throws if data-props is missing or
        // malformed — confirm upstream guarantees the attribute.
        var data = JSON.parse($node.attr('data-props'));
        var src = data.sources[0].src;
        var $img = $('<img />').attr('src', src);
        $node.replaceWith($img);
      }
    },
    // Selectors removed from the extracted result.
    clean: []
  },
  date_published: {
    selectors: [['.PostByline__timestamp[datetime]', 'datetime']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  dek: {
    selectors: []
  },
  next_page_url: {
    selectors: []
  },
  excerpt: {
    selectors: []
  }
};
// Custom extractor for medium.com.
var MediumExtractor = {
  domain: 'medium.com',
  title: {
    selectors: ['h1', ['meta[name="og:title"]', 'value']]
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  content: {
    selectors: ['article'],
    transforms: {
      // Allow drop cap character.
      'section span:first-of-type': function sectionSpanFirstOfType($node) {
        var $text = $node.html();

        // Unwrap a single-character drop-cap span into plain text.
        if ($text.length === 1 && /^[a-zA-Z()]+$/.test($text)) {
          $node.replaceWith($text);
        }
      },
      // Re-write lazy-loaded youtube videos
      iframe: function iframe($node) {
        // NOTE(review): the dots in "i.embed.ly" are unescaped, so the regex
        // is slightly looser than the literal host; works in practice.
        var ytRe = /https:\/\/i.embed.ly\/.+url=https:\/\/i\.ytimg\.com\/vi\/(\w+)\//;
        var thumb = decodeURIComponent($node.attr('data-thumbnail'));
        var $parent = $node.parents('figure');

        if (ytRe.test(thumb)) {
          // Recover the video id from the thumbnail URL and point the
          // iframe straight at the YouTube embed.
          var _thumb$match = thumb.match(ytRe),
              _thumb$match2 = _slicedToArray$1(_thumb$match, 2),
              _ = _thumb$match2[0],
              youtubeId = _thumb$match2[1]; // eslint-disable-line

          $node.attr('src', "https://www.youtube.com/embed/".concat(youtubeId));
          var $caption = $parent.find('figcaption');
          $parent.empty().append([$node, $caption]);
          return;
        } // If we can't draw the YouTube preview, remove the figure.

        $parent.remove();
      },
      // rewrite figures to pull out image and caption, remove rest
      figure: function figure($node) {
        // ignore if figure has an iframe
        if ($node.find('iframe').length > 0) return;
        var $img = $node.find('img').slice(-1)[0];
        var $caption = $node.find('figcaption');
        $node.empty().append([$img, $caption]);
      },
      // Remove any smaller images that did not get caught by the generic image
      // cleaner (author photo 48px, leading sentence images 79px, etc.).
      img: function img($node) {
        var width = _parseInt$1($node.attr('width'), 10);
        if (width < 100) $node.remove();
      }
    },
    // Selectors removed from the extracted result.
    clean: ['span a', 'svg']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  dek: null,
  next_page_url: {
    selectors: []
  },
  excerpt: {
    selectors: []
  }
};
// Custom extractor for www.tmz.com (static staff byline; Pacific-time dates).
var WwwTmzComExtractor = {
  domain: 'www.tmz.com',
  title: {
    selectors: ['.post-title-breadcrumb', 'h1', '.headline']
  },
  author: 'TMZ STAFF',
  date_published: {
    selectors: ['.article__published-at', '.article-posted-date'],
    timezone: 'America/Los_Angeles'
  },
  dek: {
    selectors: []
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.article__blocks', '.article-content', '.all-post-body'],
    // No pre-processing transforms needed.
    transforms: {},
    // Selectors removed from the extracted result.
    clean: ['.lightbox-link']
  }
};
// Custom extractor for www.washingtonpost.com.
var WwwWashingtonpostComExtractor = {
  domain: 'www.washingtonpost.com',
  title: {
    selectors: ['h1', '#topper-headline-wrapper']
  },
  author: {
    selectors: ['.pb-author-name']
  },
  date_published: {
    selectors: [['.author-timestamp[itemprop="datePublished"]', 'content']]
  },
  dek: {
    selectors: []
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.article-body'],
    transforms: {
      'div.inline-content': function divInlineContent($node) {
        // Keep media wrappers as figures; drop inline boxes without media.
        // (Correctly checks .has(...).length — .has() itself is always
        // truthy.)
        if ($node.has('img,iframe,video').length > 0) {
          return 'figure';
        }

        $node.remove();
        return null;
      },
      '.pb-caption': 'figcaption'
    },
    // Selectors removed from the extracted result.
    clean: ['.interstitial-link', '.newsletter-inline-unit']
  }
};
// Custom extractor for www.huffingtonpost.com.
var WwwHuffingtonpostComExtractor = {
  domain: 'www.huffingtonpost.com',
  title: {
    selectors: ['h1.headline__title']
  },
  author: {
    selectors: ['span.author-card__details__name']
  },
  date_published: {
    // Modified time is preferred over the original publish time.
    selectors: [['meta[name="article:modified_time"]', 'value'], ['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: ['h2.headline__subtitle']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.entry__body'],
    // Skip the generic cleaner; rely solely on the clean list below.
    defaultCleaner: false,
    transforms: {},
    // Selectors removed from the extracted result.
    clean: ['.pull-quote', '.tag-cloud', '.embed-asset', '.below-entry', '.entry-corrections', '#suggested-story']
  }
};
// Custom extractor for newrepublic.com.
var NewrepublicComExtractor = {
  domain: 'newrepublic.com',
  title: {
    selectors: ['h1.article-headline']
  },
  author: {
    selectors: ['span.AuthorList']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
    // Dates without an explicit zone are interpreted as Eastern time.
    timezone: 'America/New_York'
  },
  dek: {
    selectors: ['h2.article-subhead']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: [['div.article-body']],
    transforms: {},
    // Selectors removed from the extracted result.
    clean: ['aside']
  }
};
// Custom extractor for money.cnn.com.
var MoneyCnnComExtractor = {
  domain: 'money.cnn.com',
  title: {
    selectors: ['.article-title']
  },
  author: {
    selectors: [['meta[name="author"]', 'value'], '.byline a']
  },
  date_published: {
    selectors: [['meta[name="date"]', 'value']],
    timezone: 'GMT'
  },
  dek: {
    selectors: ['#storytext h2']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['#storytext'],
    transforms: {},
    // Selectors removed from the extracted result.
    clean: ['.inStoryHeading']
  }
};
// Custom extractor for www.theverge.com (also covers www.polygon.com).
var WwwThevergeComExtractor = {
  domain: 'www.theverge.com',
  supportedDomains: ['www.polygon.com'],
  title: {
    selectors: ['h1']
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: ['.p-dek']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: [// feature template multi-match
    ['.c-entry-hero .e-image', '.c-entry-intro', '.c-entry-content'], // regular post multi-match
    ['.e-image--hero', '.c-entry-content'], // feature template fallback
    '.l-wrapper .l-feature', // regular post fallback
    'div.c-entry-content'],
    // Transform lazy-loaded images
    transforms: {
      noscript: function noscript($node) {
        var $children = $node.children();

        // Promote a noscript wrapping exactly one <img> to an inline span.
        // NOTE(review): unlike NYMagExtractor's version there is no
        // undefined guard, and tagName is compared lowercase (server-side
        // cheerio) — confirm this never runs in the browser build.
        if ($children.length === 1 && $children.get(0).tagName === 'img') {
          return 'span';
        }

        return null;
      }
    },
    // Selectors removed from the extracted result.
    clean: ['.aside', 'img.c-dynamic-image']
  }
};
// Custom extractor for www.cnn.com.
var WwwCnnComExtractor = {
  domain: 'www.cnn.com',
  title: {
    selectors: ['h1.pg-headline', 'h1']
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: [// a more specific selector to grab the lead image and the body
    ['.media__video--thumbnail', '.zn-body-text'], // a fallback for the above
    '.zn-body-text', 'div[itemprop="articleBody"]'],
    transforms: {
      // Wrap paragraph-styled divs in real <p> tags when they have content.
      '.zn-body__paragraph, .el__leafmedia--sourced-paragraph': function znBody__paragraphEl__leafmediaSourcedParagraph($node) {
        var $text = $node.html();

        if ($text) {
          return 'p';
        }

        return null;
      },
      // this transform cleans the short, all-link sections linking
      // to related content but not marked as such in any way.
      '.zn-body__paragraph': function znBody__paragraph($node) {
        // FIX: cheerio's .has() returns a selection object, which is always
        // truthy — the previous `if ($node.has('a'))` always passed (and so
        // also removed empty paragraphs, since '' === ''). Check the
        // selection length so only paragraphs that actually contain links
        // are considered for removal.
        if ($node.has('a').length > 0) {
          if ($node.text().trim() === $node.find('a').text().trim()) {
            $node.remove();
          }
        }
      },
      '.media__video--thumbnail': 'figure'
    },
    // Selectors removed from the extracted result.
    clean: []
  }
};
// Custom extractor for www.aol.com.
var WwwAolComExtractor = {
  domain: 'www.aol.com',
  title: {
    selectors: ['h1.p-article__title']
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    selectors: ['.p-article__byline__date'],
    // Dates without an explicit zone are interpreted as Eastern time.
    timezone: 'America/New_York'
  },
  dek: {
    selectors: []
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.article-content'],
    transforms: {},
    // Selectors removed from the extracted result.
    clean: []
  }
};
// Custom extractor for www.youtube.com: rebuilds the watch page as an
// embedded player plus the video description.
var WwwYoutubeComExtractor = {
  domain: 'www.youtube.com',
  title: {
    selectors: [['meta[name="title"]', 'value'], '.watch-title', 'h1.watch-title-container']
  },
  author: {
    selectors: [['link[itemprop="name"]', 'content'], '.yt-user-info']
  },
  date_published: {
    selectors: [['meta[itemProp="datePublished"]', 'value']],
    timezone: 'GMT'
  },
  dek: {
    selectors: []
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    // Keep the injected iframe markup intact.
    defaultCleaner: false,
    selectors: ['#player-container-outer', 'ytd-expandable-video-description-body-renderer #description', ['#player-api', '#description']],
    transforms: {
      // Legacy layout: swap the player shell for a plain embed iframe.
      '#player-api': function playerApi($node, $) {
        var videoId = $('meta[itemProp="videoId"]').attr('value');
        $node.html("\n <iframe src=\"https://www.youtube.com/embed/".concat(videoId, "\" frameborder=\"0\" allowfullscreen></iframe>"));
      },
      // Current layout: embed iframe followed by the description text.
      '#player-container-outer': function playerContainerOuter($node, $) {
        var videoId = $('meta[itemProp="videoId"]').attr('value');
        var description = $('meta[itemProp="description"]').attr('value');
        $node.html("\n <iframe src=\"https://www.youtube.com/embed/".concat(videoId, "\" frameborder=\"0\" allowfullscreen></iframe>\n <div><span>").concat(description, "</span></div>"));
      }
    },
    // Selectors removed from the extracted result.
    clean: []
  }
};
// Custom extractor for www.theguardian.com.
var WwwTheguardianComExtractor = {
  domain: 'www.theguardian.com',
  title: {
    selectors: ['h1', '.content__headline']
  },
  author: {
    selectors: ['address[data-link-name="byline"]', 'p.byline']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: ['div[data-gu-name="standfirst"]', '.content__standfirst']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['#maincontent', '.content__article-body'],
    transforms: {},
    // Selectors removed from the extracted result.
    clean: ['.hide-on-mobile', '.inline-icon']
  }
};
// Custom extractor for www.sbnation.com.
var WwwSbnationComExtractor = {
  domain: 'www.sbnation.com',
  title: {
    selectors: ['h1.c-page-title']
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: ['p.c-entry-summary.p-dek', 'h2.c-entry-summary.p-dek']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.c-entry-content'],
    transforms: {},
    // Selectors removed from the extracted result.
    clean: []
  }
};
// Custom extractor for www.bloomberg.com; covers the normal, /graphics/,
// and /news/ page templates.
var WwwBloombergComExtractor = {
  domain: 'www.bloomberg.com',
  title: {
    selectors: [// normal articles
    '.lede-headline', // /graphics/ template
    'h1.article-title', // /news/ template
    'h1[class^="headline"]', 'h1.lede-text-only__hed']
  },
  author: {
    selectors: [['meta[name="parsely-author"]', 'value'], '.byline-details__link', // /graphics/ template
    '.bydek', // /news/ template
    '.author', 'p[class*="author"]']
  },
  date_published: {
    selectors: [['time.published-at', 'datetime'], ['time[datetime]', 'datetime'], ['meta[name="date"]', 'value'], ['meta[name="parsely-pub-date"]', 'value'], ['meta[name="parsely-pub-date"]', 'content']]
  },
  dek: {
    selectors: []
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value'], ['meta[name="og:image"]', 'content']]
  },
  content: {
    selectors: ['.article-body__content', '.body-content', // /graphics/ template
    ['section.copy-block'], // /news/ template
    '.body-copy'],
    transforms: {},
    // Selectors removed from the extracted result.
    clean: ['.inline-newsletter', '.page-ad']
  }
};
// Extraction rules for www.bustle.com articles.
var WwwBustleComExtractor = {
  domain: 'www.bustle.com',
  title: {
    selectors: ['h1', 'h1.post-page__title']
  },
  author: {
    selectors: ['a[href*="profile"]', 'div.content-meta__author']
  },
  date_published: {
    selectors: [['time', 'datetime']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['article', '.post-page__body'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.npr.org articles.
var WwwNprOrgExtractor = {
  domain: 'www.npr.org',
  title: {
    selectors: ['h1', '.storytitle']
  },
  author: {
    selectors: ['p.byline__name.byline__name--block']
  },
  date_published: {
    selectors: [['.dateblock time[datetime]', 'datetime'], ['meta[name="date"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value'], ['meta[name="twitter:image:src"]', 'value']]
  },
  content: {
    selectors: ['.storytext'],
    // String transform values rename the matched element's tag.
    transforms: {
      '.bucketwrap.image': 'figure',
      '.bucketwrap.image .credit-caption': 'figcaption'
    },
    // Drop the "enlarge" UI wrapper from the result.
    clean: ['div.enlarge_measure']
  }
};
// Extraction rules for www.recode.net articles.
var WwwRecodeNetExtractor = {
  domain: 'www.recode.net',
  title: {
    selectors: ['h1.c-page-title']
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: ['h2.c-entry-summary.p-dek']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    // The first entry is a multi-match: hero figure plus entry content together.
    selectors: [['figure.e-image--hero', '.c-entry-content'], '.c-entry-content'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for qz.com articles.
var QzComExtractor = {
  domain: 'qz.com',
  title: {
    selectors: ['article header h1']
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], ['time[datetime]', 'datetime']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value'], ['meta[property="og:image"]', 'content'], ['meta[name="twitter:image"]', 'content']]
  },
  content: {
    selectors: ['#article-content'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.dmagazine.com articles.
var WwwDmagazineComExtractor = {
  domain: 'www.dmagazine.com',
  title: {
    selectors: ['h1.story__title']
  },
  author: {
    selectors: ['.story__info .story__info__item:first-child']
  },
  date_published: {
    // The date is free text inside the story info block; it is parsed with
    // the explicit format/timezone below.
    selectors: ['.story__info'],
    timezone: 'America/Chicago',
    format: 'MMMM D, YYYY h:mm a'
  },
  dek: {
    selectors: ['.story__subhead']
  },
  lead_image_url: {
    // The lead image URL is taken from the anchor wrapping the figure.
    selectors: [['article figure a:first-child', 'href']]
  },
  content: {
    selectors: ['.story__content'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.reuters.com articles.
var WwwReutersComExtractor = {
  domain: 'www.reuters.com',
  title: {
    selectors: ['h1[class*="ArticleHeader-headline-"]', 'h1.article-headline']
  },
  author: {
    selectors: [['meta[name="og:article:author"]', 'value'], '.author']
  },
  date_published: {
    selectors: [['meta[name="og:article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.ArticleBodyWrapper', '#article-text'],
    // Subtitles are plain divs on the page; promote them to headings.
    transforms: {
      '.article-subtitle': 'h4'
    },
    // Strip byline containers that would otherwise appear inside the body.
    clean: ['div[class^="ArticleBody-byline-container-"]', '#article-byline .author']
  }
};
// Extraction rules for mashable.com articles.
var MashableComExtractor = {
  domain: 'mashable.com',
  title: {
    selectors: ['header h1', 'h1.title']
  },
  author: {
    selectors: [['meta[name="article:author"]', 'value'], 'span.author_name a']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['#article', 'section.article-content.blueprint'],
    // Image credit divs become proper figcaptions.
    transforms: {
      '.image-credit': 'figcaption'
    },
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.chicagotribune.com articles.
var WwwChicagotribuneComExtractor = {
  domain: 'www.chicagotribune.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value']]
  },
  author: {
    selectors: ['div.article_byline span:first-of-type']
  },
  date_published: {
    selectors: ['time']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['article'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.vox.com articles.
var WwwVoxComExtractor = {
  domain: 'www.vox.com',
  title: {
    selectors: ['h1.c-page-title']
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: ['.p-dek']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    // Hero figure plus entry content together, with a plain-content fallback.
    selectors: [['figure.e-image--hero', '.c-entry-content'], '.c-entry-content'],
    transforms: {
      // Lazy-loaded figures keep the real <img> inside a <noscript>; swap the
      // dynamic placeholder for that static markup so the image survives.
      'figure .e-image__image noscript': function ($node) {
        var staticMarkup = $node.html();
        $node.parents('.e-image__image').find('.c-dynamic-image').replaceWith(staticMarkup);
      },
      // Image metadata becomes a proper caption.
      'figure .e-image__meta': 'figcaption'
    },
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for news.nationalgeographic.com articles.
var NewsNationalgeographicComExtractor = {
  domain: 'news.nationalgeographic.com',
  title: {
    selectors: ['h1', 'h1.main-title']
  },
  author: {
    selectors: ['.byline-component__contributors b span']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
    format: 'ddd MMM DD HH:mm:ss zz YYYY',
    timezone: 'EST'
  },
  dek: {
    selectors: ['.article__deck']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: [['.parsys.content', '.__image-lead__'], '.content'],
    transforms: {
      // Surface the lazy-loaded hero image as a plain <img> prepended to the
      // article body so it is kept in the extracted content.
      '.parsys.content': function ($node, $) {
        var leadSrc = $node
          .find('.image.parbase.section')
          .find('.picturefill')
          .first()
          .data('platform-src');
        if (leadSrc) {
          $node.prepend($("<img class=\"__image-lead__\" src=\"".concat(leadSrc, "\"/>")));
        }
      }
    },
    // Large pull quotes duplicate body text; drop them.
    clean: ['.pull-quote.pull-quote--large']
  }
};
// Extraction rules for www.nationalgeographic.com articles.
var WwwNationalgeographicComExtractor = {
  domain: 'www.nationalgeographic.com',
  title: {
    selectors: ['h1', 'h1.main-title']
  },
  author: {
    selectors: ['.byline-component__contributors b span']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: ['.Article__Headline__Desc', '.article__deck']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['section.Article__Content', ['.parsys.content', '.__image-lead__'], '.content'],
    transforms: {
      // Surface the lazy-loaded lead image(s) as plain <img> tags prepended to
      // the body. Image-group leads carry two data-* paths; single leads carry
      // one platform-src.
      '.parsys.content': function ($node, $) {
        var $lead = $node.children().first();
        if (!$lead.hasClass('imageGroup')) {
          var leadSrc = $node.find('.image.parbase.section').find('.picturefill').first().data('platform-src');
          if (leadSrc) {
            $node.prepend($("<img class=\"__image-lead__\" src=\"".concat(leadSrc, "\"/>")));
          }
          return;
        }
        var $attrContainer = $lead.find('.media--medium__container').children().first();
        var firstPath = $attrContainer.data('platform-image1-path');
        var secondPath = $attrContainer.data('platform-image2-path');
        if (secondPath && firstPath) {
          $node.prepend($("<div class=\"__image-lead__\">\n <img src=\"".concat(firstPath, "\"/>\n <img src=\"").concat(secondPath, "\"/>\n </div>")));
        }
      }
    },
    // Small pull quotes duplicate body text; drop them.
    clean: ['.pull-quote.pull-quote--small']
  }
};
// Extraction rules for www.latimes.com (new template plus legacy trb_* pages).
var WwwLatimesComExtractor = {
  domain: 'www.latimes.com',
  title: {
    selectors: ['h1.headline', '.trb_ar_hl']
  },
  author: {
    selectors: ['a[data-click="standardBylineAuthorName"]', ['meta[name="author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], ['meta[itemprop="datePublished"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.page-article-body', '.trb_ar_main'],
    transforms: {
      // Collapse the lead-art wrapper down to just its <figure>.
      '.trb_ar_la': function ($node) {
        var $art = $node.find('figure');
        $node.replaceWith($art);
      }
    },
    // Strip the byline and credit blocks from the body.
    clean: ['.trb_ar_by', '.trb_ar_cr']
  }
};
// Extraction rules for pagesix.com; the same rules also apply to nypost.com.
var PagesixComExtractor = {
  domain: 'pagesix.com',
  supportedDomains: ['nypost.com'],
  title: {
    selectors: [['meta[name="og:title"]', 'value']]
  },
  author: {
    selectors: ['.byline']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: [['meta[name="description"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    // Featured image plus entry content together, with a plain fallback.
    selectors: [['#featured-image-wrapper', '.entry-content'], '.entry-content'],
    // Promote the featured-image wrapper and WP captions to semantic markup.
    transforms: {
      '#featured-image-wrapper': 'figure',
      '.wp-caption-text': 'figcaption'
    },
    // Drop modal/lightbox trigger elements.
    clean: ['.modal-trigger']
  }
};
// Extraction rules for thefederalistpapers.org articles.
var ThefederalistpapersOrgExtractor = {
  domain: 'thefederalistpapers.org',
  title: {
    selectors: ['h1.entry-title']
  },
  author: {
    selectors: ['.author-meta-title', 'main span.entry-author-name']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.content'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Strip sharing widgets, comment UI, related-post blocks and inline-styled
    // paragraphs from the result.
    clean: ['header', '.article-sharing', '.after-article', '.type-commenting', '.more-posts', ['p[style]']]
  }
};
// Extraction rules for www.cbssports.com articles.
var WwwCbssportsComExtractor = {
  domain: 'www.cbssports.com',
  title: {
    selectors: ['.Article-headline', '.article-headline']
  },
  author: {
    selectors: ['.ArticleAuthor-nameText', '.author-name']
  },
  date_published: {
    selectors: [['meta[itemprop="datePublished"]', 'value']],
    timezone: 'UTC'
  },
  dek: {
    selectors: ['.Article-subline', '.article-subline']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.article'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.msnbc.com articles.
var WwwMsnbcComExtractor = {
  domain: 'www.msnbc.com',
  title: {
    selectors: ['h1', 'h1.is-title-pane']
  },
  author: {
    selectors: ['.byline-name', '.author']
  },
  date_published: {
    selectors: [['meta[itemprop="datePublished"]', 'value'], ['meta[name="DC.date.issued"]', 'value']]
  },
  dek: {
    selectors: [['meta[name="description"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.article-body__content', '.pane-node-body'],
    transforms: {
      // Legacy pane-based pages have no inline lead image; copy it in from the
      // og:image meta tag declared in lead_image_url above.
      '.pane-node-body': function ($node, $) {
        var pair = WwwMsnbcComExtractor.lead_image_url.selectors[0];
        var selector = pair[0];
        var attr = pair[1];
        var src = $(selector).attr(attr);
        if (src) {
          $node.prepend("<img src=\"".concat(src, "\" />"));
        }
      }
    },
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.thepoliticalinsider.com articles.
var WwwThepoliticalinsiderComExtractor = {
  domain: 'www.thepoliticalinsider.com',
  title: {
    selectors: [['meta[name="sailthru.title"]', 'value']]
  },
  author: {
    selectors: [['meta[name="sailthru.author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="sailthru.date"]', 'value']],
    timezone: 'America/New_York'
  },
  dek: {
    // No dek selector identified for this site yet.
    selectors: []
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div#article-body'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.mentalfloss.com articles.
var WwwMentalflossComExtractor = {
  domain: 'www.mentalfloss.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value'], 'h1.title', '.title-group', '.inner']
  },
  author: {
    selectors: ['a[data-vars-label*="authors"]', '.field-name-field-enhanced-authors']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], '.date-display-single'],
    timezone: 'America/New_York'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['article main', 'div.field.field-name-body'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Drop <small> footnote/credit text from the result.
    clean: ['small']
  }
};
// Extraction rules for abcnews.go.com articles.
var AbcnewsGoComExtractor = {
  domain: 'abcnews.go.com',
  title: {
    selectors: ['div[class*="Article_main__body"] h1', '.article-header h1']
  },
  author: {
    selectors: ['.ShareByline span:nth-child(2)', '.authors'],
    // Strip overlay/"By" decoration from the matched byline text.
    clean: ['.author-overlay', '.by-text']
  },
  date_published: {
    // The byline text contains a human-readable timestamp; parse it with the
    // explicit format/timezone below.
    selectors: ['.ShareByline', '.timestamp'],
    format: 'MMMM D, YYYY h:mm a',
    timezone: 'America/New_York'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['article', '.article-copy'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.nydailynews.com articles.
var WwwNydailynewsComExtractor = {
  domain: 'www.nydailynews.com',
  title: {
    selectors: ['h1.headline', 'h1#ra-headline']
  },
  author: {
    selectors: ['.article_byline span', ['meta[name="parsely-author"]', 'value']]
  },
  date_published: {
    selectors: ['time', ['meta[name="sailthru.date"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['article', 'article#ra-body'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Strip tag lists, related links, editor links and share widgets.
    clean: ['dl#ra-tags', '.ra-related', 'a.ra-editor', 'dl#ra-share-bottom']
  }
};
// Extraction rules for www.cnbc.com articles.
var WwwCnbcComExtractor = {
  domain: 'www.cnbc.com',
  title: {
    selectors: ['h1.title', 'h1.ArticleHeader-headline']
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div#article_body.content', 'div.story', 'div.ArticleBody-articleBody'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.popsugar.com articles.
var WwwPopsugarComExtractor = {
  domain: 'www.popsugar.com',
  title: {
    // 'title-text' matches a custom element tag, not a class.
    selectors: ['h2.post-title', 'title-text']
  },
  author: {
    selectors: [['meta[name="article:author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['#content'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Strip share UI, tag lists and reaction widgets.
    clean: ['.share-copy-title', '.post-tags', '.reactions']
  }
};
// Extraction rules for observer.com articles.
var ObserverComExtractor = {
  domain: 'observer.com',
  title: {
    selectors: ['h1.entry-title']
  },
  author: {
    selectors: ['.author', '.vcard']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: ['h2.dek']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.entry-content'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for people.com articles.
var PeopleComExtractor = {
  domain: 'people.com',
  title: {
    selectors: ['.article-header h1', ['meta[name="og:title"]', 'value']]
  },
  author: {
    selectors: [['meta[name="sailthru.author"]', 'value'], 'a.author.url.fn']
  },
  date_published: {
    selectors: ['.mntl-attribution__item-date', ['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  dek: {
    selectors: ['.article-header h2']
  },
  content: {
    selectors: ['div[class^="loc article-content"]', 'div.article-body__inner'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.usmagazine.com articles.
var WwwUsmagazineComExtractor = {
  domain: 'www.usmagazine.com',
  title: {
    selectors: ['header h1']
  },
  author: {
    selectors: ['a.author', 'a.article-byline.tracked-offpage']
  },
  date_published: {
    timezone: 'America/New_York',
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.article-content'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Drop the related-content module from the result.
    clean: ['.module-related']
  }
};
// Extraction rules for www.rollingstone.com articles.
var WwwRollingstoneComExtractor = {
  domain: 'www.rollingstone.com',
  title: {
    selectors: ['h1.l-article-header__row--title', 'h1.content-title']
  },
  author: {
    selectors: ['a.c-byline__link', 'a.content-author.tracked-offpage']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], 'time.content-published-date'],
    timezone: 'America/New_York'
  },
  dek: {
    selectors: ['h2.l-article-header__row--lead', '.content-description']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.l-article-content', ['.lead-container', '.article-content'], '.article-content'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Strip related-links modules from the result.
    clean: ['.c-related-links-wrapper', '.module-related']
  }
};
// Extraction rules for 247sports.com articles.
var twofortysevensportsComExtractor = {
  domain: '247sports.com',
  title: {
    // Falls back to the document <title> when no article heading is found.
    selectors: ['title', 'article header h1']
  },
  author: {
    selectors: ['.article-cnt__author', '.author']
  },
  date_published: {
    selectors: [['time[data-published]', 'data-published']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.article-body', 'section.body.article'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for uproxx.com articles.
var UproxxComExtractor = {
  domain: 'uproxx.com',
  title: {
    selectors: ['div.entry-header h1']
  },
  author: {
    selectors: [['meta[name="qc:author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.entry-content'],
    // Promote image wrappers and WP media credits to semantic markup.
    transforms: {
      'div.image': 'figure',
      'div.image .wp-media-credit': 'figcaption'
    },
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.eonline.com articles.
var WwwEonlineComExtractor = {
  domain: 'www.eonline.com',
  title: {
    selectors: ['h1.article-detail__title', 'h1.article__title']
  },
  author: {
    selectors: ['.article-detail__meta__author', '.entry-meta__author a']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], ['meta[itemprop="datePublished"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: [['.article-detail__main-content section'], ['.post-content section, .post-content div.post-content__image']],
    // Promote image wrappers and credits to semantic markup.
    transforms: {
      'div.post-content__image': 'figure',
      'div.post-content__image .image__credits': 'figcaption'
    },
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.miamiherald.com articles.
// Note: no author rule is defined for this site.
var WwwMiamiheraldComExtractor = {
  domain: 'www.miamiherald.com',
  title: {
    selectors: ['h1.title']
  },
  date_published: {
    selectors: ['p.published-date'],
    timezone: 'America/New_York'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.dateline-storybody'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.refinery29.com articles.
var WwwRefinery29ComExtractor = {
  domain: 'www.refinery29.com',
  title: {
    selectors: ['h1.title']
  },
  author: {
    selectors: ['.contributor']
  },
  date_published: {
    selectors: [['meta[name="sailthru.date"]', 'value']],
    timezone: 'America/New_York'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: [['.full-width-opener', '.article-content'], '.article-content', '.body'],
    transforms: {
      // Lazy-load placeholders keep the static markup in a <noscript>; swap
      // the whole placeholder for that markup.
      'div.loading noscript': function ($node) {
        var staticHtml = $node.html();
        $node.parents('.loading').replaceWith(staticHtml);
      },
      // Promote section images/captions/text to semantic elements.
      '.section-image': 'figure',
      '.section-image .content-caption': 'figcaption',
      '.section-text': 'p'
    },
    // Drop the share widget from the result.
    clean: ['.story-share']
  }
};
// Extraction rules for www.macrumors.com articles.
var WwwMacrumorsComExtractor = {
  domain: 'www.macrumors.com',
  title: {
    selectors: ['h1', 'h1.title']
  },
  author: {
    selectors: ['article a[rel="author"]', '.author-url']
  },
  date_published: {
    selectors: [['time', 'datetime']],
    timezone: 'America/Los_Angeles'
  },
  dek: {
    selectors: [['meta[name="description"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['article', '.article'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for www.androidcentral.com articles.
var WwwAndroidcentralComExtractor = {
  domain: 'www.androidcentral.com',
  title: {
    selectors: ['h1', 'h1.main-title']
  },
  author: {
    selectors: [['meta[name="parsely-author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: [['meta[name="description"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['#article-body'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Strip intro blocks and blockquotes from the result.
    clean: ['.intro', 'blockquote']
  }
};
// Extraction rules for www.si.com articles.
var WwwSiComExtractor = {
  domain: 'www.si.com',
  title: {
    selectors: ['h1', 'h1.headline']
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="published"]', 'value']],
    timezone: 'America/New_York'
  },
  dek: {
    selectors: ['.m-detail-header--dek']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.m-detail--body', ['p', '.marquee_large_2x', '.component.image']],
    transforms: {
      // A <noscript> wrapping exactly one <img> is really a figure; any other
      // <noscript> is left untouched (null = no transform).
      noscript: function ($node) {
        var kids = $node.children();
        var isLoneImage = kids.length === 1 && kids.get(0).tagName === 'img';
        return isLoneImage ? 'figure' : null;
      }
    },
    // Strip inline thumbnails and their descriptive chrome from the result.
    clean: [['.inline-thumb', '.primary-message', '.description', '.instructions']]
  }
};
// Extraction rules for www.rawstory.com articles.
var WwwRawstoryComExtractor = {
  domain: 'www.rawstory.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value'], '.blog-title']
  },
  author: {
    selectors: ['div.main-post-head .social-author__name', '.blog-author a:first-of-type']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], '.blog-author a:last-of-type'],
    timezone: 'EST'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.post-body', '.blog-content'],
    // No node rewriting needed before the content is consumable.
    transforms: {},
    // Selectors whose matches are stripped from the extracted content.
    clean: []
  }
};
// Extraction rules for CNET (www.cnet.com).
var WwwCnetComExtractor = {
  domain: 'www.cnet.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value']],
  },
  author: {
    selectors: ['span.author', 'a.author'],
  },
  date_published: {
    selectors: ['time'],
    timezone: 'America/Los_Angeles',
  },
  dek: {
    selectors: ['.c-head_dek', '.article-dek'],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: [['img.__image-lead__', '.article-main-body'], '.article-main-body'],
    // Re-home the image out of its figure wrapper and mark it as the lead
    // image before the content is returned.
    transforms: {
      'figure.image': function figureImage($node) {
        var $image = $node.find('img');
        $image.attr('width', '100%');
        $image.attr('height', '100%');
        $image.addClass('__image-lead__');
        $node.remove('.imgContainer').prepend($image);
      },
    },
    // Nothing extra to strip from the result.
    clean: [],
  },
};
// Extraction rules for TODAY (www.today.com).
var WwwTodayComExtractor = {
  domain: 'www.today.com',
  title: {
    selectors: ['h1.article-hero-headline__htag', 'h1.entry-headline'],
  },
  author: {
    selectors: ['span.byline-name', ['meta[name="author"]', 'value']],
  },
  date_published: {
    selectors: ['time[datetime]', ['meta[name="DC.date.issued"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['div.article-body__content', '.entry-container'],
    // No node transforms required for this site.
    transforms: {},
    // Strip the comment-count label from the result.
    clean: ['.label-comment'],
  },
};
// Extraction rules for AL.com (www.al.com).
var WwwAlComExtractor = {
  domain: 'www.al.com',
  title: {
    selectors: [['meta[name="title"]', 'value']],
  },
  author: {
    selectors: [['meta[name="article_author"]', 'value']],
  },
  date_published: {
    selectors: [['meta[name="article_date_original"]', 'value']],
    timezone: 'EST',
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.entry-content'],
    // No node transforms required for this site.
    transforms: {},
    // Nothing extra to strip from the result.
    clean: [],
  },
};
// Extraction rules for The Penny Hoarder (www.thepennyhoarder.com).
var WwwThepennyhoarderComExtractor = {
  domain: 'www.thepennyhoarder.com',
  title: {
    selectors: [['meta[name="dcterms.title"]', 'value']],
  },
  author: {
    selectors: [['link[rel="author"]', 'title']],
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: [['.post-img', '.post-text'], '.post-text', '.single-post-content-inner'],
    // No node transforms required for this site.
    transforms: {},
    // Nothing extra to strip from the result.
    clean: [],
  },
};
// Extraction rules for Western Journalism (www.westernjournalism.com).
var WwwWesternjournalismComExtractor = {
  domain: 'www.westernjournalism.com',
  title: {
    selectors: ['title', 'h1.entry-title'],
  },
  author: {
    selectors: [['meta[name="author"]', 'value']],
  },
  date_published: {
    selectors: [['meta[name="DC.date.issued"]', 'value']],
  },
  dek: {
    selectors: ['.subtitle'],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['div.article-sharing.top + div'],
    // No node transforms required for this site.
    transforms: {},
    // Strip ad notices from the result.
    clean: ['.ad-notice-small'],
  },
};
// Extraction rules for America Now (www.americanow.com).
var WwwAmericanowComExtractor = {
  domain: 'www.americanow.com',
  title: {
    selectors: ['.title', ['meta[name="title"]', 'value']],
  },
  author: {
    selectors: ['.byline'],
  },
  date_published: {
    selectors: [['meta[name="publish_date"]', 'value']],
  },
  dek: {
    // enter selectors
    selectors: [],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: [['.article-content', '.image', '.body'], '.body'],
    // No node transforms required for this site.
    transforms: {},
    // Strip video wrappers and mobile-only elements from the result.
    clean: ['.article-video-wrapper', '.show-for-small-only'],
  },
};
// Extraction rules for ScienceFly (sciencefly.com).
var ScienceflyComExtractor = {
  domain: 'sciencefly.com',
  title: {
    selectors: ['.entry-title', '.cb-entry-title', '.cb-single-title'],
  },
  author: {
    selectors: ['div.cb-author', 'div.cb-author-title'],
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  dek: {
    // enter selectors
    selectors: [],
  },
  lead_image_url: {
    selectors: [['div.theiaPostSlider_slides img', 'src']],
  },
  content: {
    selectors: ['div.theiaPostSlider_slides'],
    // No node transforms required for this site.
    transforms: {},
    // Nothing extra to strip from the result.
    clean: [],
  },
};
// Extraction rules for HelloGiggles (hellogiggles.com).
var HellogigglesComExtractor = {
  domain: 'hellogiggles.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value'], '.title'],
  },
  author: {
    selectors: ['.byline-wrapper span.author_name', '.author-link'],
  },
  date_published: {
    selectors: [
      ['meta[property="article:published_time"]', 'content'],
      ['meta[name="article:published_time"]', 'value'],
    ],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.main-content', '.entry-content'],
    // No node transforms required for this site.
    transforms: {},
    // Nothing extra to strip from the result.
    clean: [],
  },
};
// Extraction rules for Thought Catalog (thoughtcatalog.com).
var ThoughtcatalogComExtractor = {
  domain: 'thoughtcatalog.com',
  title: {
    selectors: ['h1.title', ['meta[name="og:title"]', 'value']],
  },
  author: {
    selectors: [
      'cite a',
      'div.col-xs-12.article_header div.writer-container.writer-container-inline.writer-no-avatar h4.writer-name',
      'h1.writer-name',
    ],
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.entry.post'],
    // No node transforms required for this site.
    transforms: {},
    // Strip watermarks and figure captions from the result.
    clean: ['.tc_mark', 'figcaption'],
  },
};
// Extraction rules for The Inquisitr (www.inquisitr.com).
var WwwInquisitrComExtractor = {
  domain: 'www.inquisitr.com',
  title: {
    selectors: ['h1.entry-title.story--header--title'],
  },
  author: {
    selectors: ['div.story--header--author'],
  },
  date_published: {
    selectors: [['meta[name="datePublished"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    // FIX: the fallback selector was '.entry-content.' — the trailing '.'
    // makes it an invalid CSS class selector, so the fallback could never
    // match (css-select rejects a '.' with no class name after it).
    selectors: ['article.story', '.entry-content'],
    // No node transforms required for this site.
    transforms: {},
    // Strip category labels and story-header chrome from the result.
    clean: ['.post-category', '.story--header--socials', '.story--header--content'],
  },
};
// Extraction rules for NBC News (www.nbcnews.com).
var WwwNbcnewsComExtractor = {
  domain: 'www.nbcnews.com',
  title: {
    selectors: ['div.article-hero-headline h1', 'div.article-hed h1'],
  },
  author: {
    selectors: ['div.article-inline-byline span.byline-name', 'span.byline_author'],
  },
  date_published: {
    selectors: [
      ['meta[name="article:published"]', 'value'],
      ['.flag_article-wrapper time.timestamp_article[datetime]', 'datetime'],
      '.flag_article-wrapper time',
    ],
    timezone: 'America/New_York',
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['div.article-body__content', 'div.article-body'],
    // No node transforms required for this site.
    transforms: {},
    // Nothing extra to strip from the result.
    clean: [],
  },
};
// Extraction rules for Fortune (fortune.com).
var FortuneComExtractor = {
  domain: 'fortune.com',
  title: {
    selectors: ['h1'],
  },
  author: {
    selectors: [['meta[name="author"]', 'value']],
  },
  date_published: {
    selectors: ['.MblGHNMJ'],
    timezone: 'UTC',
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: [['picture', 'article.row'], 'article.row'],
    // No node transforms required for this site.
    transforms: {},
    // Nothing extra to strip from the result.
    clean: [],
  },
};
// Extraction rules for LinkedIn articles (www.linkedin.com).
var WwwLinkedinComExtractor = {
  domain: 'www.linkedin.com',
  title: {
    selectors: ['.article-title', 'h1'],
  },
  author: {
    selectors: [
      '.main-author-card h3',
      ['meta[name="article:author"]', 'value'],
      '.entity-name a[rel=author]',
    ],
  },
  date_published: {
    selectors: ['.base-main-card__metadata', ['time[itemprop="datePublished"]', 'datetime']],
    timezone: 'America/Los_Angeles',
  },
  dek: {
    // enter selectors
    selectors: [],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.article-content__body', ['header figure', '.prose'], '.prose'],
    // No node transforms required for this site.
    transforms: {},
    // Strip entity images from the result.
    clean: ['.entity-image'],
  },
};
// Extraction rules for the Obama White House archives
// (obamawhitehouse.archives.gov), also used for whitehouse.gov.
var ObamawhitehouseArchivesGovExtractor = {
  domain: 'obamawhitehouse.archives.gov',
  supportedDomains: ['whitehouse.gov'],
  title: {
    selectors: ['h1', '.pane-node-title'],
  },
  author: {
    selectors: ['.blog-author-link', '.node-person-name-link'],
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  dek: {
    selectors: ['.field-name-field-forall-summary'],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    // Skip the generic cleaner; only the explicit `clean` list below applies.
    defaultCleaner: false,
    selectors: ['div#content-start', '.pane-node-field-forall-body'],
    // No node transforms required for this site.
    transforms: {},
    // Strip the duplicated title pane and custom pane from the result.
    clean: ['.pane-node-title', '.pane-custom.pane-1'],
  },
};
// Extraction rules for Opposing Views (www.opposingviews.com).
var WwwOpposingviewsComExtractor = {
  domain: 'www.opposingviews.com',
  title: {
    selectors: ['h1.m-detail-header--title', 'h1.title'],
  },
  author: {
    selectors: [['meta[name="author"]', 'value'], 'div.date span span a'],
  },
  date_published: {
    selectors: [
      ['meta[name="published"]', 'value'],
      ['meta[name="publish_date"]', 'value'],
    ],
  },
  dek: {
    // enter selectors
    selectors: [],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.m-detail--body', '.article-content'],
    // No node transforms required for this site.
    transforms: {},
    // Strip mobile-only elements from the result.
    clean: ['.show-for-small-only'],
  },
};
// Extraction rules for Prospect Magazine (www.prospectmagazine.co.uk).
var WwwProspectmagazineCoUkExtractor = {
  domain: 'www.prospectmagazine.co.uk',
  title: {
    selectors: ['.blog-header__title', '.page-title'],
  },
  author: {
    selectors: ['.blog-header__author-link', '.aside_author .title'],
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], '.post-info'],
    timezone: 'Europe/London',
  },
  dek: {
    selectors: ['.blog-header__description', '.page-subtitle'],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.blog__container', 'article .post_content'],
    // No node transforms required for this site.
    transforms: {},
    // Nothing extra to strip from the result.
    clean: [],
  },
};
// Extraction rules for The Forward (forward.com).
var ForwardComExtractor = {
  domain: 'forward.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value']],
  },
  author: {
    selectors: ['.post-author a', '.author-name', ['meta[name="sailthru.author"]', 'value']],
  },
  date_published: {
    selectors: [
      ['meta[name="article:published_time"]', 'value'],
      ['meta[name="date"]', 'value'],
    ],
  },
  dek: {
    // enter selectors
    selectors: [],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.content-container article', ['.post-item-media-wrap', '.post-item p']],
    // No node transforms required for this site.
    transforms: {},
    // Strip bylines, donation boxes, and subtitles from the result.
    clean: ['.post-author', '.donate-box', '.message', '.subtitle'],
  },
};
// Extraction rules for Qdaily (www.qdaily.com).
var WwwQdailyComExtractor = {
  domain: 'www.qdaily.com',
  title: {
    selectors: ['h2', 'h2.title'],
  },
  author: {
    selectors: ['.name'],
  },
  date_published: {
    selectors: [['.date.smart-date', 'data-origindate']],
  },
  dek: {
    selectors: ['.excerpt'],
  },
  lead_image_url: {
    selectors: [['.article-detail-hd img', 'src']],
  },
  content: {
    selectors: ['.detail'],
    // No node transforms required for this site.
    transforms: {},
    // Strip lazy-load placeholders from the result.
    // NOTE(review): '.lazylad' and '.lazylood' look like misspellings of
    // '.lazyload' — confirm against the site's markup before changing them.
    clean: ['.lazyload', '.lazylad', '.lazylood'],
  },
};
// Extraction rules for Gothamist and its sibling city sites.
var GothamistComExtractor = {
  domain: 'gothamist.com',
  supportedDomains: ['chicagoist.com', 'laist.com', 'sfist.com', 'shanghaiist.com', 'dcist.com'],
  title: {
    selectors: ['h1', '.entry-header h1'],
  },
  author: {
    // There are multiple article-metadata and byline-author classes, but the
    // main article's is the 3rd child of the l-container class.
    selectors: ['.article-metadata:nth-child(3) .byline-author', '.author'],
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], 'abbr', 'abbr.published'],
    timezone: 'America/New_York',
  },
  dek: {
    selectors: [null],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.article-body', '.entry-body'],
    // Convert the site's positioned image wrappers into semantic
    // <figure>/<figcaption> markup.
    transforms: {
      'div.image-none': 'figure',
      '.image-none i': 'figcaption',
      'div.image-left': 'figure',
      '.image-left i': 'figcaption',
      'div.image-right': 'figure',
      '.image-right i': 'figcaption',
    },
    // Strip stray <br>s inside figures and gallery widgets from the result.
    clean: ['.image-none br', '.image-left br', '.image-right br', '.galleryEase'],
  },
};
// Extraction rules for The Motley Fool (www.fool.com).
var WwwFoolComExtractor = {
  domain: 'www.fool.com',
  title: {
    selectors: ['h1'],
  },
  author: {
    selectors: [['meta[name="author"]', 'value'], '.author-inline .author-name'],
  },
  date_published: {
    selectors: [['meta[name="date"]', 'value']],
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value'], 'header h2'],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.tailwind-article-body', '.article-content'],
    // Rebuild captioned images as <figure>/<figcaption> markup.
    transforms: {
      '.caption img': function captionImg($node) {
        var imageSrc = $node.attr('src');
        $node.parent().replaceWith('<figure><img src="' + imageSrc + '"/></figure>');
      },
      '.caption': 'figcaption',
    },
    // Strip the promotional pitch block from the result.
    clean: ['#pitch'],
  },
};
// Extraction rules for Slate (www.slate.com).
var WwwSlateComExtractor = {
  domain: 'www.slate.com',
  title: {
    selectors: ['.hed', 'h1'],
  },
  author: {
    selectors: ['a[rel=author]'],
  },
  date_published: {
    selectors: ['.pub-date'],
    timezone: 'America/New_York',
  },
  dek: {
    selectors: ['.dek'],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.body'],
    // No node transforms required for this site.
    transforms: {},
    // Strip author bios, pull quotes, signup forms, and comments.
    clean: ['.about-the-author', '.pullquote', '.newsletter-signup-component', '.top-comment'],
  },
};
// Extraction rules for Radio-Canada (ici.radio-canada.ca).
var IciRadioCanadaCaExtractor = {
  domain: 'ici.radio-canada.ca',
  title: {
    selectors: ['h1'],
  },
  author: {
    selectors: [['meta[name="dc.creator"]', 'value']],
  },
  date_published: {
    selectors: [['meta[name="dc.date.created"]', 'value']],
    // Dates appear like "2019-01-01|10h30" — parse with an explicit format.
    format: 'YYYY-MM-DD|HH[h]mm',
    timezone: 'America/New_York',
  },
  dek: {
    selectors: ['div.lead-container', '.bunker-component.lead'],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['section.document-content-style', ['.main-multimedia-item', '.news-story-content']],
    // No node transforms required for this site.
    transforms: {},
    // Nothing extra to strip from the result.
    clean: [],
  },
};
// Extraction rules for Fortinet (www.fortinet.com).
var WwwFortinetComExtractor = {
  domain: 'www.fortinet.com',
  title: {
    selectors: ['h1'],
  },
  author: {
    selectors: ['.b15-blog-meta__author'],
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['div.responsivegrid.aem-GridColumn.aem-GridColumn--default--12'],
    // Promote a <noscript> wrapping a single <img> to a <figure>.
    transforms: {
      noscript: function noscript($node) {
        var kids = $node.children();
        if (kids.length === 1 && kids.get(0).tagName === 'img') {
          return 'figure';
        }
        return null;
      },
    },
  },
};
// Extraction rules for Fast Company (www.fastcompany.com).
var WwwFastcompanyComExtractor = {
  domain: 'www.fastcompany.com',
  title: {
    selectors: ['h1'],
  },
  author: {
    selectors: [['meta[name="author"]', 'value']],
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  dek: {
    selectors: ['.post__deck'],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['.post__article'],
  },
};
// Extraction rules for Blister Review (blisterreview.com).
var BlisterreviewComExtractor = {
  domain: 'blisterreview.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value'], 'h1.entry-title'],
  },
  author: {
    selectors: ['span.author-name'],
  },
  date_published: {
    selectors: [
      ['meta[name="article:published_time"]', 'value'],
      ['time.entry-date', 'datetime'],
      ['meta[itemprop="datePublished"]', 'content'],
    ],
  },
  dek: {
    // enter selectors
    selectors: [],
  },
  lead_image_url: {
    selectors: [
      ['meta[name="og:image"]', 'value'],
      ['meta[property="og:image"]', 'content'],
      ['meta[itemprop="image"]', 'content'],
      ['meta[name="twitter:image"]', 'content'],
      ['img.attachment-large', 'src'],
    ],
  },
  content: {
    selectors: [
      ['.elementor-section-wrap', '.elementor-text-editor > p, .elementor-text-editor > ul > li, .attachment-large, .wp-caption-text'],
    ],
    // Render caption elements as plain paragraphs.
    transforms: {
      figcaption: 'p',
    },
    // Strip the comments section from the result.
    clean: ['.comments-area'],
  },
};
// Extraction rules for Mynavi News (news.mynavi.jp).
var NewsMynaviJpExtractor = {
  domain: 'news.mynavi.jp',
  title: {
    selectors: [['meta[name="og:title"]', 'value']],
  },
  author: {
    selectors: ['a.articleHeader_name', 'main div.article-author a.article-author__name'],
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['div.article-body', 'main article div'],
    // Swap lazy-load placeholders for the real URL kept in `data-original`.
    transforms: {
      img: function img($node) {
        var src = $node.attr('data-original');
        // FIX: the old guard was `src !== ''`, which also passed when the
        // attribute was absent (src === undefined) and then tried to write an
        // undefined src. Only rewrite when a non-empty value actually exists.
        if (src != null && src !== '') {
          $node.attr('src', src);
        }
      },
    },
    // Nothing extra to strip from the result.
    clean: [],
  },
};
// Extraction rules for ClinicalTrials.gov study records.
var ClinicaltrialsGovExtractor = {
  domain: 'clinicaltrials.gov',
  title: {
    selectors: ['h1.tr-solo_record'],
  },
  author: {
    selectors: ['div#sponsor.tr-info-text'],
  },
  date_published: {
    // selectors: ['span.term[data-term="Last Update Posted"]'],
    selectors: ['div:has(> span.term[data-term="Last Update Posted"])'],
  },
  content: {
    selectors: ['div#tab-body'],
    // No node transforms required for this site.
    transforms: {},
    // Strip alert images from the result.
    clean: ['.usa-alert> img'],
  },
};
// Extraction rules for GitHub repository pages (github.com).
var GithubComExtractor = {
  domain: 'github.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value']],
  },
  author: {
    // enter author selectors
    selectors: [],
  },
  date_published: {
    selectors: [
      ['relative-time[datetime]', 'datetime'],
      ['span[itemprop="dateModified"] relative-time', 'datetime'],
    ],
  },
  dek: {
    selectors: [['meta[name="description"]', 'value'], 'span[itemprop="about"]'],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: [['#readme article']],
    // No node transforms required for this site.
    transforms: {},
    // Nothing extra to strip from the result.
    clean: [],
  },
};
// Extraction rules for Reddit post pages (www.reddit.com).
var WwwRedditComExtractor = {
  domain: 'www.reddit.com',
  title: {
    selectors: ['div[data-test-id="post-content"] h1', 'div[data-test-id="post-content"] h2'],
  },
  author: {
    selectors: ['div[data-test-id="post-content"] a[href*="user/"]'],
  },
  date_published: {
    selectors: [
      'div[data-test-id="post-content"] span[data-click-id="timestamp"]',
      'div[data-test-id="post-content"] a[data-click-id="timestamp"]',
    ],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: [
      ['div[data-test-id="post-content"] p'], // text post
      [
        'div[data-test-id="post-content"] a[target="_blank"]:not([data-click-id="timestamp"])', // external link
        'div[data-test-id="post-content"] div[data-click-id="media"]',
      ], // external link with media preview (YouTube, imgur album, etc...)
      ['div[data-test-id="post-content"] div[data-click-id="media"]'], // Embedded media (Reddit video)
      ['div[data-test-id="post-content"] a'], // external link
      'div[data-test-id="post-content"]',
    ],
    // Link-preview thumbnails are rendered as a CSS background image on a
    // div[role="img"]; lift the URL back onto the nested <img>.
    transforms: {
      'div[role="img"]': function divRoleImg($node) {
        var $image = $node.find('img');
        var background = $node.css('background-image');
        if ($image.length === 1 && background) {
          $image.attr('src', background.match(/\((.*?)\)/)[1].replace(/('|")/g, ''));
          return $image;
        }
        return $node;
      },
    },
    // Strip icons, award badges, and the comments link from the result.
    clean: ['.icon', 'span[id^="PostAwardBadges"]', 'div a[data-test-id="comments-page-link-num-comments"]'],
  },
};
// Extraction rules for OTRS (otrs.com).
var OtrsComExtractor = {
  domain: 'otrs.com',
  title: {
    selectors: ['#main article h1'],
  },
  author: {
    selectors: ['div.dateplusauthor a'],
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['#main article'],
    // Skip the generic cleaner; only the explicit `clean` list below applies.
    defaultCleaner: false,
    transforms: {},
    clean: ['div.dateplusauthor', 'div.gr-12.push-6.footershare', '#atftbx', 'div.category-modul'],
  },
};
// Extraction rules for OSS News (www.ossnews.jp).
var WwwOssnewsJpExtractor = {
  domain: 'www.ossnews.jp',
  title: {
    selectors: ['#alpha-block h1.hxnewstitle'],
  },
  author: null,
  date_published: {
    selectors: ['p.fs12'],
    // Japanese-formatted timestamp, e.g. "2019年01月01日 10:30".
    format: 'YYYY年MM月DD日 HH:mm',
    timezone: 'Asia/Tokyo',
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['#alpha-block .section:has(h1.hxnewstitle)'],
    defaultCleaner: false,
    transforms: {},
    clean: [],
  },
};
// Extraction rules for BUZZAP! (buzzap.jp).
var BuzzapJpExtractor = {
  domain: 'buzzap.jp',
  title: {
    selectors: ['h1.entry-title'],
  },
  author: null,
  date_published: {
    selectors: [['time.entry-date', 'datetime']],
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['div.ctiframe'],
    defaultCleaner: false,
    transforms: {},
    clean: [],
  },
};
// Extraction rules for The Asahi Shimbun (www.asahi.com).
var WwwAsahiComExtractor = {
  domain: 'www.asahi.com',
  title: {
    selectors: ['main h1', '.ArticleTitle h1'],
  },
  author: {
    selectors: [['meta[name="article:author"]', 'value']],
  },
  date_published: {
    selectors: [['meta[name="pubdate"]', 'value']],
  },
  dek: null,
  excerpt: {
    selectors: [['meta[name="og:description"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['main'],
    defaultCleaner: false,
    transforms: {},
    clean: ['div.AdMod', 'div.LoginSelectArea', 'time', 'div.notPrint'],
  },
};
// Extraction rules for Sanwa Supply (www.sanwa.co.jp).
var WwwSanwaCoJpExtractor = {
  domain: 'www.sanwa.co.jp',
  title: {
    selectors: ['#newsContent h1'],
  },
  author: null,
  date_published: {
    selectors: ['dl.date'],
    // Dotted date format, e.g. "2019.01.01".
    format: 'YYYY.MM.DD',
    timezone: 'Asia/Tokyo',
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['#newsContent'],
    defaultCleaner: false,
    transforms: {},
    clean: ['#smartphone', 'div.sns_box', 'div.contentFoot'],
  },
};
// Extraction rules for Elecom (www.elecom.co.jp).
var WwwElecomCoJpExtractor = {
  domain: 'www.elecom.co.jp',
  title: {
    selectors: ['title'],
  },
  author: null,
  date_published: {
    selectors: ['p.section-last'],
    format: 'YYYY.MM.DD',
    timezone: 'Asia/Tokyo',
  },
  dek: null,
  lead_image_url: null,
  content: {
    selectors: ['td.TableMain2'],
    defaultCleaner: false,
    // Let tables reflow instead of keeping fixed widths.
    transforms: {
      table: function table($node) {
        $node.attr('width', 'auto');
      },
    },
    clean: [],
  },
};
// Extraction rules for ScanNetSecurity (scan.netsecurity.ne.jp).
var ScanNetsecurityNeJpExtractor = {
  domain: 'scan.netsecurity.ne.jp',
  title: {
    selectors: ['header.arti-header h1.head'],
  },
  author: null,
  date_published: {
    selectors: [['meta[name="article:modified_time"]', 'value']],
  },
  dek: {
    selectors: ['header.arti-header p.arti-summary'],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['div.arti-content.arti-content--thumbnail'],
    defaultCleaner: false,
    transforms: {},
    clean: ['aside.arti-giga'],
  },
};
// Extraction rules for JVN iPedia (jvndb.jvn.jp).
var JvndbJvnJpExtractor = {
  domain: 'jvndb.jvn.jp',
  title: {
    selectors: ['title'],
  },
  author: null,
  date_published: {
    selectors: ['div.modifytxt:nth-child(2)'],
    format: 'YYYY/MM/DD',
    timezone: 'Asia/Tokyo',
  },
  dek: null,
  lead_image_url: null,
  content: {
    selectors: ['#news-list'],
    defaultCleaner: false,
    transforms: {},
    clean: [],
  },
};
// Extraction rules for Genius lyrics pages (genius.com).
var GeniusComExtractor = {
  domain: 'genius.com',
  title: {
    selectors: ['h1'],
  },
  author: {
    selectors: ['h2 a'],
  },
  date_published: {
    // Page metadata is embedded as JSON; pull the release date out of it.
    selectors: [
      ['meta[itemprop=page_data]', 'value', function (res) {
        var pageData = JSON.parse(res);
        return pageData.song.release_date;
      }],
    ],
  },
  dek: {
    // enter selectors
    selectors: [],
  },
  lead_image_url: {
    // Same embedded JSON carries the album cover art URL.
    selectors: [
      ['meta[itemprop=page_data]', 'value', function (res) {
        var pageData = JSON.parse(res);
        return pageData.song.album.cover_art_url;
      }],
    ],
  },
  content: {
    selectors: ['.lyrics'],
    // No node transforms required for this site.
    transforms: {},
    // Nothing extra to strip from the result.
    clean: [],
  },
};
// Extraction rules for JNSA (www.jnsa.org).
var WwwJnsaOrgExtractor = {
  domain: 'www.jnsa.org',
  title: {
    selectors: ['#wgtitle h2'],
  },
  author: null,
  date_published: null,
  dek: null,
  excerpt: {
    selectors: [['meta[name="og:description"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['#main_area'],
    transforms: {},
    // Strip breadcrumbs and the sidebar from the result.
    clean: ['#pankuzu', '#side'],
  },
};
// Extraction rules for phpspot (phpspot.org).
var PhpspotOrgExtractor = {
  domain: 'phpspot.org',
  title: {
    selectors: ['h3.hl'],
  },
  author: null,
  date_published: {
    selectors: ['h4.hl'],
    // Japanese-formatted date, e.g. "2019年01月01日".
    format: 'YYYY年MM月DD日',
    timezone: 'Asia/Tokyo',
  },
  dek: null,
  lead_image_url: null,
  content: {
    selectors: ['div.entrybody'],
    defaultCleaner: false,
    transforms: {},
    clean: [],
  },
};
// Extraction rules for InfoQ (www.infoq.com).
var WwwInfoqComExtractor = {
  domain: 'www.infoq.com',
  title: {
    selectors: ['h1.heading'],
  },
  author: {
    selectors: ['div.widget.article__authors'],
  },
  date_published: {
    selectors: ['.article__readTime.date'],
    // Japanese-formatted date, e.g. "2019年01月01日".
    format: 'YYYY年MM月DD日',
    timezone: 'Asia/Tokyo',
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value']],
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },
  content: {
    selectors: ['div.article__data'],
    defaultCleaner: false,
    transforms: {},
    clean: [],
  },
};
// Site-specific extraction rules for www.moongift.jp.
var WwwMoongiftJpExtractor = {
  domain: 'www.moongift.jp',
  title: {
    selectors: ['h1.title a']
  },
  author: null,
  date_published: {
    // First non-social meta list item carries the publish date (JST).
    selectors: ['ul.meta li:not(.social):first-of-type'],
    timezone: 'Asia/Tokyo'
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['#main'],
    transforms: {},
    clean: ['ul.mg_service.cf']
  }
};
// Site-specific extraction rules for www.itmedia.co.jp and its sibling
// ITmedia-family sites (shared CMS markup: #cmsTitle/#cmsAbstract/#cmsBody).
var WwwItmediaCoJpExtractor = {
  domain: 'www.itmedia.co.jp',
  supportedDomains: ['www.atmarkit.co.jp', 'techtarget.itmedia.co.jp', 'nlab.itmedia.co.jp'],
  title: {
    selectors: ['#cmsTitle h1']
  },
  author: {
    selectors: ['#byline']
  },
  date_published: {
    // Uses the modified time; no published-time meta tag is targeted here.
    selectors: [['meta[name="article:modified_time"]', 'value']]
  },
  dek: {
    selectors: ['#cmsAbstract h2']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['#cmsBody'],
    // Skip the generic content cleaner; the selected node is used as-is.
    defaultCleaner: false,
    transforms: {},
    clean: ['#snsSharebox']
  }
};
// Site-specific extraction rules for www.publickey1.jp.
var WwwPublickey1JpExtractor = {
  domain: 'www.publickey1.jp',
  title: {
    selectors: ['h1']
  },
  author: {
    selectors: ['.bloggerinchief p:first-of-type', '#subcol p:has(img)']
  },
  date_published: {
    selectors: ['div.pubdate'],
    // Japanese-formatted date string, JST.
    format: 'YYYY年MM月DD日',
    timezone: 'Asia/Tokyo'
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['#maincol'],
    // Skip the generic content cleaner; the selected node is used as-is.
    defaultCleaner: false,
    transforms: {},
    clean: ['#breadcrumbs', 'div.sbm', 'div.ad_footer']
  }
};
// Site-specific extraction rules for takagi-hiromitsu.jp.
var TakagihiromitsuJpExtractor = {
  domain: 'takagi-hiromitsu.jp',
  title: {
    selectors: ['h3']
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    // Falls back on the HTTP-equiv Last-Modified meta tag for a date.
    selectors: [['meta[http-equiv="Last-Modified"]', 'value']]
  },
  dek: null,
  lead_image_url: null,
  content: {
    selectors: ['div.body'],
    // Skip the generic content cleaner; the selected node is used as-is.
    defaultCleaner: false,
    transforms: {},
    clean: []
  }
};
// Site-specific extraction rules for bookwalker.jp.
// Each field lists a newer-layout selector first, then a legacy-layout fallback.
var BookwalkerJpExtractor = {
  domain: 'bookwalker.jp',
  title: {
    selectors: ['h1.p-main__title', 'h1.main-heading']
  },
  author: {
    selectors: ['div.p-author__list', 'div.authors']
  },
  date_published: {
    selectors: ['dl.p-information__data dd:nth-of-type(7)', '.work-info .work-detail:first-of-type .work-detail-contents:last-of-type'],
    timezone: 'Asia/Tokyo'
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    // The inner array is a multi-match group: both nodes are collected together.
    selectors: ['div.p-main__information', ['div.main-info', 'div.main-cover-inner']],
    // Skip the generic content cleaner; the selected nodes are used as-is.
    defaultCleaner: false,
    transforms: {},
    clean: ['span.label.label--trial', 'dt.info-head.info-head--coin', 'dd.info-contents.info-contents--coin', 'div.info-notice.fn-toggleClass']
  }
};
// Site-specific extraction rules for www.yomiuri.co.jp.
var WwwYomiuriCoJpExtractor = {
  domain: 'www.yomiuri.co.jp',
  title: {
    selectors: ['h1.title-article.c-article-title']
  },
  author: null,
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.p-main-contents'],
    transforms: {},
    clean: []
  }
};
// Site-specific extraction rules for japan.cnet.com.
var JapanCnetComExtractor = {
  domain: 'japan.cnet.com',
  title: {
    selectors: ['.leaf-headline-ttl']
  },
  author: {
    selectors: ['.writer']
  },
  date_published: {
    selectors: ['.date'],
    // Japanese-formatted datetime (e.g. 2019年01月01日 09時30分), JST.
    format: 'YYYY年MM月DD日 HH時mm分',
    timezone: 'Asia/Tokyo'
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.article_body'],
    transforms: {},
    clean: []
  }
};
// Site-specific extraction rules for deadline.com.
var DeadlineComExtractor = {
  domain: 'deadline.com',
  title: {
    selectors: ['h1']
  },
  author: {
    selectors: ['section.author h2']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.a-article-grid__main.pmc-a-grid article.pmc-a-grid-item'],
    transforms: {
      // Unwrap embedded tweets: swap the wrapper element for its inner markup.
      '.embed-twitter': function embedTwitter($node) {
        $node.replaceWith($node.html());
      }
    },
    // Drop image captions from the extracted content.
    clean: ['figcaption']
  }
};
// Site-specific extraction rules for www.gizmodo.jp.
var WwwGizmodoJpExtractor = {
  domain: 'www.gizmodo.jp',
  title: {
    selectors: ['h1.p-post-title']
  },
  author: {
    selectors: ['li.p-post-AssistAuthor']
  },
  date_published: {
    selectors: [['li.p-post-AssistTime time', 'datetime']]
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['article.p-post'],
    transforms: {
      // The thumbnail's real URL is wrapped as ...=%27<url>%27; — strip the
      // wrapper so a plain image URL remains in src.
      'img.p-post-thumbnailImage': function imgPPostThumbnailImage($node) {
        var src = $node.attr('src');

        // Fix: an image without a src attribute previously threw a TypeError
        // (calling .replace on undefined) and aborted extraction.
        if (src) {
          $node.attr('src', src.replace(/^.*=%27/, '').replace(/%27;$/, ''));
        }
      }
    },
    clean: ['h1.p-post-title', 'ul.p-post-Assist']
  }
};
// Site-specific extraction rules for getnews.jp.
var GetnewsJpExtractor = {
  domain: 'getnews.jp',
  title: {
    selectors: ['article h1']
  },
  author: {
    // Meta tag first, then the on-page profile span as a fallback.
    selectors: [['meta[name="article:author"]', 'value'], 'span.prof']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], ['ul.cattag-top time', 'datetime']]
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.post-bodycopy'],
    transforms: {},
    clean: []
  }
};
// Site-specific extraction rules for www.lifehacker.jp.
// Each field lists a newer-layout selector first, then a legacy-layout fallback.
var WwwLifehackerJpExtractor = {
  domain: 'www.lifehacker.jp',
  title: {
    selectors: ['h1[class^="article_pArticle_Title"]', 'h1.lh-summary-title']
  },
  author: {
    selectors: [['meta[name="author"]', 'value'], 'p.lh-entryDetailInner--credit']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], ['div.lh-entryDetail-header time', 'datetime']]
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div[class^="article_pArticle_Body__"]', 'div.lh-entryDetail-body'],
    transforms: {
      // Lazy-loaded images wrap their real URL as ...=%27<url>%27; — strip the
      // wrapper so a plain image URL remains in src.
      'img.lazyload': function imgLazyload($node) {
        var src = $node.attr('src');

        // Fix: a lazyload image without a src attribute previously threw a
        // TypeError (calling .replace on undefined) and aborted extraction.
        if (src) {
          $node.attr('src', src.replace(/^.*=%27/, '').replace(/%27;$/, ''));
        }
      }
    },
    clean: ['p.lh-entryDetailInner--credit']
  }
};
// Site-specific extraction rules for sect.iij.ad.jp.
var SectIijAdJpExtractor = {
  domain: 'sect.iij.ad.jp',
  title: {
    selectors: ['div.title-box-inner h1', 'h3']
  },
  author: {
    selectors: ['p.post-author a', 'dl.entrydate dd']
  },
  date_published: {
    selectors: ['time'],
    // Japanese-formatted date string, JST.
    format: 'YYYY年MM月DD日',
    timezone: 'Asia/Tokyo'
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.entry-inner', '#article'],
    transforms: {},
    clean: ['dl.entrydate']
  }
};
// Site-specific extraction rules for www.oreilly.co.jp.
var WwwOreillyCoJpExtractor = {
  domain: 'www.oreilly.co.jp',
  title: {
    selectors: [['meta[name="og:title"]', 'value'], 'h3']
  },
  author: {
    selectors: ['span[itemprop="author"]', 'li[itemprop="author"]']
  },
  date_published: {
    // schema.org datePublished, from either a dd content attr or a meta tag.
    selectors: [['dd[itemprop="datePublished"]', 'content'], ['meta[itemprop="datePublished"]', 'value']],
    timezone: 'Asia/Tokyo'
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image:secure_url"]', 'value'], ['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['section.detail', '#content'],
    // Skip the generic content cleaner; the selected node is used as-is.
    defaultCleaner: false,
    transforms: {},
    clean: ['.social-tools']
  }
};
// Site-specific extraction rules for www.ipa.go.jp.
var WwwIpaGoJpExtractor = {
  domain: 'www.ipa.go.jp',
  title: {
    selectors: ['h1']
  },
  author: null,
  date_published: {
    selectors: ['p.ipar_text_right'],
    // Japanese date with non-zero-padded month/day (e.g. 2019年1月5日), JST.
    format: 'YYYY年M月D日',
    timezone: 'Asia/Tokyo'
  },
  dek: null,
  lead_image_url: null,
  content: {
    selectors: ['#ipar_main'],
    // Skip the generic content cleaner; the selected node is used as-is.
    defaultCleaner: false,
    transforms: {},
    // The date line lives inside the content node, so strip it here.
    clean: ['p.ipar_text_right']
  }
};
// Site-specific extraction rules for weekly.ascii.jp.
var WeeklyAsciiJpExtractor = {
  domain: 'weekly.ascii.jp',
  title: {
    selectors: ['article h1', 'h1[itemprop="headline"]']
  },
  author: {
    selectors: ['p.author']
  },
  date_published: {
    selectors: ['p.date', ['meta[name="odate"]', 'value']],
    // Japanese-formatted datetime (e.g. 2019年01月01日 09:30), JST.
    format: 'YYYY年MM月DD日 HH:mm',
    timezone: 'Asia/Tokyo'
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div#contents_detail', 'div.article'],
    transforms: {},
    clean: []
  }
};
// Site-specific extraction rules for techlog.iij.ad.jp.
var TechlogIijAdJpExtractor = {
  domain: 'techlog.iij.ad.jp',
  title: {
    selectors: ['h1.entry-title']
  },
  author: {
    selectors: ['a[rel="author"]']
  },
  date_published: {
    selectors: [['time.entry-date', 'datetime']]
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.entry-content'],
    // Skip the generic content cleaner; the selected node is used as-is.
    defaultCleaner: false,
    transforms: {},
    clean: ['.wp_social_bookmarking_light']
  }
};
// Site-specific extraction rules for wired.jp.
// Each field lists a newer-layout selector first, then a legacy-layout fallback.
var WiredJpExtractor = {
  domain: 'wired.jp',
  title: {
    selectors: ['h1[data-testid="ContentHeaderHed"]', 'h1.post-title']
  },
  author: {
    selectors: [['meta[name="article:author"]', 'value'], 'p[itemprop="author"]']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], ['time', 'datetime']]
  },
  dek: {
    selectors: ['div[class^="ContentHeaderDek"]', '.post-intro']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div[data-attribute-verso-pattern="article-body"]', 'article.article-detail'],
    transforms: {
      // Lazy-loaded images keep their real location in data-original; resolve
      // it against the placeholder src and promote the result to src.
      'img[data-original]': function imgDataOriginal($node) {
        var resolved = URL$1.resolve($node.attr('src'), $node.attr('data-original'));
        $node.attr('src', resolved);
      }
    },
    clean: ['.post-category', 'time', 'h1.post-title', '.social-area-syncer']
  }
};
// Site-specific extraction rules for japan.zdnet.com.
var JapanZdnetComExtractor = {
  domain: 'japan.zdnet.com',
  title: {
    selectors: ['h1']
  },
  author: {
    // cXense analytics meta tag carries the author name.
    selectors: [['meta[name="cXenseParse:author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.article_body'],
    transforms: {},
    clean: []
  }
};
// Site-specific extraction rules for www.rbbtoday.com.
var WwwRbbtodayComExtractor = {
  domain: 'www.rbbtoday.com',
  title: {
    selectors: ['h1']
  },
  author: {
    selectors: ['.writer.writer-name']
  },
  date_published: {
    selectors: [['header time', 'datetime']]
  },
  dek: {
    selectors: [['meta[name="description"]', 'value'], '.arti-summary']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.arti-content'],
    transforms: {},
    clean: ['.arti-giga']
  }
};
// Site-specific extraction rules for www.lemonde.fr.
var WwwLemondeFrExtractor = {
  domain: 'www.lemonde.fr',
  title: {
    selectors: ['h1.article__title']
  },
  author: {
    selectors: ['.author__name']
  },
  date_published: {
    selectors: [['meta[name="og:article:published_time"]', 'value']]
  },
  dek: {
    selectors: ['.article__desc']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.article__content'],
    transforms: {},
    clean: ['figcaption']
  }
};
// Site-specific extraction rules for www.phoronix.com.
var WwwPhoronixComExtractor = {
  domain: 'www.phoronix.com',
  title: {
    selectors: ['article h1', 'article header']
  },
  author: {
    selectors: ['.author a:first-child']
  },
  date_published: {
    selectors: ['.author'],
    // 1 June 2019 at 08:34 PM EDT
    // NOTE(review): the sample above includes a PM meridiem, but the format
    // string below has no meridiem token — confirm afternoon times parse correctly.
    format: 'D MMMM YYYY at hh:mm',
    timezone: 'America/New_York'
  },
  dek: null,
  lead_image_url: null,
  content: {
    selectors: ['.content'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
// Site-specific extraction rules for pitchfork.com.
var PitchforkComExtractor = {
  domain: 'pitchfork.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value'], 'title']
  },
  author: {
    selectors: [['meta[name="article:author"]', 'value'], '.authors-detail__display-name']
  },
  date_published: {
    selectors: ['div[class^="InfoSliceWrapper-"]', ['.pub-date', 'datetime']]
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value'], '.review-detail__abstract']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value'], ['.single-album-tombstone__art img', 'src']]
  },
  content: {
    selectors: ['div.body__inner-container', '.review-detail__text']
  },
  // Extra, site-specific field: the album review score.
  extend: {
    score: {
      selectors: ['p[class*="Rating"]', '.score']
    }
  }
};
var BiorxivOrgExtractor = {
domain : 'biorxiv.org' ,
title : {
selectors : [ 'h1#page-title' ]
} ,
author : {
selectors : [ 'div.highwire-citation-biorxiv-article-top > div.highwire-cite-authors' ]
} ,
content : {
selectors : [ 'div#abstract-1' ] ,
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms : { } ,
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean : [ ]
}
} ;
// Site-specific extraction rules for epaper.zeit.de.
// The e-paper uses custom/nonstandard element names (subtitle, byline, linkbox,
// box, image-credits), which are normalized to plain HTML below.
var EpaperZeitDeExtractor = {
  domain: 'epaper.zeit.de',
  title: {
    selectors: ['p.title']
  },
  author: {
    selectors: ['.article__author']
  },
  date_published: null,
  excerpt: {
    selectors: ['subtitle']
  },
  lead_image_url: null,
  content: {
    selectors: ['.article'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    // String values rename the matched element to the given tag.
    transforms: {
      'p.title': 'h1',
      '.article__author': 'p',
      byline: 'p',
      linkbox: 'p'
    },
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['image-credits', 'box[type=citation]']
  }
};
// Site-specific extraction rules for www.ladbible.com.
var WwwLadbibleComExtractor = {
  domain: 'www.ladbible.com',
  title: {
    selectors: ['h1']
  },
  author: {
    // Styled-component class names are unstable; match on the substring.
    selectors: ['[class*=Byline]']
  },
  date_published: {
    selectors: ['time'],
    timezone: 'Europe/London'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['[class*=ArticleContainer]'],
    clean: ['time', 'source', 'a[href^="https://www.ladbible.com/"]', 'picture', '[class*=StyledCardBlock]']
  }
};
// Site-specific extraction rules for timesofindia.indiatimes.com.
var TimesofindiaIndiatimesComExtractor = {
  domain: 'timesofindia.indiatimes.com',
  title: {
    selectors: ['h1']
  },
  // Extra, site-specific field: the reporting byline (used instead of author).
  extend: {
    reporter: {
      selectors: ['div.byline'],
      transforms: {}
    }
  },
  date_published: {
    selectors: ['.byline'],
    format: 'MMM D, YYYY, HH:mm z',
    timezone: 'Asia/Kolkata'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.contentwrapper:has(section)'],
    // Skip the generic content cleaner; the selected node is used as-is.
    defaultCleaner: false,
    clean: ['section', 'h1', '.byline', '.img_cptn', '.icon_share_wrap', 'ul[itemtype="https://schema.org/BreadcrumbList"]']
  }
};
// Site-specific extraction rules for ma.ttias.be.
var MaTtiasBeExtractor = {
  domain: 'ma.ttias.be',
  title: {
    selectors: [['meta[name="twitter:title"]', 'value']]
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  content: {
    selectors: [['.content']],
    transforms: {
      // Headings carry "id" attributes whose values would score poorly and get
      // the elements removed, so the ids are cleared in both cases below.
      h2: function h2($elem) {
        $elem.attr('id', null);

        // h1 elements get demoted to h2 by the parser, so push h2 down to h3.
        return 'h3';
      },
      h1: function h1($elem) {
        $elem.attr('id', null);

        // A following h2 is removed unless a paragraph precedes it; the empty
        // paragraph added here satisfies that and is itself cleaned away later.
        $elem.after('<p></p>');
      },
      ul: function ul($elem) {
        // Link lists here look like navigation but are article content; this
        // class keeps the cleaner from stripping them.
        $elem.attr('class', 'entry-content-asset');
      }
    }
  }
};
// Site-specific extraction rules for pastebin.com.
var PastebinComExtractor = {
  domain: 'pastebin.com',
  title: {
    selectors: ['h1']
  },
  author: {
    selectors: ['.username', '.paste_box_line2 .t_us + a']
  },
  date_published: {
    selectors: ['.date', '.paste_box_line2 .t_da + span'],
    timezone: 'America/New_York',
    format: 'MMMM D, YYYY'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.source', '#selectable .text'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    // The paste renders as an ordered list of lines; flatten it into
    // plain paragraphs (string values rename the matched element).
    transforms: {
      ol: 'div',
      li: 'p'
    },
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
/* eslint-disable no-nested-ternary */
/* eslint-disable no-unused-expressions */
// Decodes Abendblatt's obfuscated paywall text in place and marks the node as
// deobfuscated. The obfuscation maps a handful of character codes to literal
// punctuation (177 -> '%', 178 -> '!', 180 -> ';', 181 -> '='), keeps spaces and
// newlines, and shifts every other printable character up by one code point.
// Returns null (the transform keeps the element's tag).
function deobfuscateAbendblattNode($node) {
  if (!$node.hasClass('obfuscated')) return null;
  var text = $node.text();
  var decoded = '';

  for (var i = 0; i < text.length; i += 1) {
    var code = text.charCodeAt(i);

    if (code === 177) {
      decoded += '%';
    } else if (code === 178) {
      decoded += '!';
    } else if (code === 180) {
      decoded += ';';
    } else if (code === 181) {
      decoded += '=';
    } else if (code === 32) {
      decoded += ' ';
    } else if (code === 10) {
      decoded += '\n';
    } else if (code > 33) {
      // Everything else is shifted by one; codes <= 33 (other than the cases
      // above) are dropped entirely.
      decoded += String.fromCharCode(code - 1);
    }
  }

  $node.html(decoded);
  $node.removeClass('obfuscated');
  $node.addClass('deobfuscated');
  return null;
}

// Site-specific extraction rules for www.abendblatt.de.
var WwwAbendblattDeExtractor = {
  domain: 'www.abendblatt.de',
  title: {
    selectors: ['h2.article__header__headline']
  },
  author: {
    selectors: ['span.author-info__name-text']
  },
  date_published: {
    selectors: [['time.teaser-stream-time', 'datetime'], ['time.article__header__date', 'datetime']]
  },
  dek: {
    selectors: [['meta[name="description"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.article__body'],
    // Obfuscated text can appear in either p or div elements; both run the
    // same decoder (previously two identical inline copies of this logic).
    transforms: {
      p: deobfuscateAbendblattNode,
      div: deobfuscateAbendblattNode
    },
    clean: []
  }
};
// Site-specific extraction rules for www.gruene.de.
var WwwGrueneDeExtractor = {
  domain: 'www.gruene.de',
  title: {
    selectors: ['header h1']
  },
  author: null,
  date_published: null,
  dek: null,
  lead_image_url: {
    // Note: property/content form (not the name/value form used elsewhere).
    selectors: [['meta[property="og:image"]', 'content']]
  },
  content: {
    // selectors: ['section'],
    // The inner array is a multi-match group: all listed nodes are collected.
    selectors: [['section header', 'section h2', 'section p', 'section ol']],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['figcaption', 'p[class]']
  }
};
// Site-specific extraction rules for www.engadget.com.
var WwwEngadgetComExtractor = {
  domain: 'www.engadget.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value']]
  },
  author: {
    selectors: ['a.th-meta[data-ylk*="subsec:author"]']
  },
  // Engadget stories have publish dates, but the only representation of them on the page
  // is in a format like "2h ago". There are also these tags with blank values:
  // <meta class="swiftype" name="published_at" data-type="date" value="">
  date_published: {
    selectors: [// enter selectors
    ]
  },
  dek: {
    selectors: ['div[class*="o-title_mark"] div']
  },
  // Engadget stories do have lead images specified by an og:image meta tag, but selecting
  // the value attribute of that tag fails. I believe the "&image" sequence of characters
  // is triggering this inability to select the attribute value.
  lead_image_url: {
    selectors: [// enter selectors
    ]
  },
  content: {
    selectors: [[// Some figures will be inside div.article-text, but some header figures/images
    // will not.
    '#page_body figure:not(div.article-text figure)', 'div.article-text']],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
// Site-specific extraction rules for arstechnica.com.
//
// Articles from this site are often paginated, but I was unable to write a CSS
// selector to find the next page. On the last page, there will be a link with a CSS
// selector indicating that the previous page is next. But the parser appears to find
// the next page without this extractor finding it, as long as the fallback option is
// left at its default value of true.
var ArstechnicaComExtractor = {
  domain: 'arstechnica.com',
  title: {
    selectors: ['title']
  },
  author: {
    selectors: ['*[rel="author"] *[itemprop="name"]']
  },
  date_published: {
    selectors: [['.byline time', 'datetime']]
  },
  dek: {
    selectors: ['h2[itemprop="description"]']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div[itemprop="articleBody"]'],
    transforms: {
      // A significant h2 that doesn't follow a paragraph would be stripped by
      // the generic cleaner; inserting an empty paragraph (itself removed
      // later) keeps the heading in the result.
      h2: function h2($heading) {
        $heading.before('<p></p>');
      }
    },
    clean: [
      // Enlarge links and separators inside image captions.
      'figcaption .enlarge-link',
      'figcaption .sep',
      // Videos could not be transformed into usable elements, so drop them.
      'figure.video',
      // Image galleries that do not work outside the site.
      '.gallery',
      'aside',
      '.sidebar'
    ]
  }
};
// Site-specific extraction rules for www.ndtv.com.
var WwwNdtvComExtractor = {
  domain: 'www.ndtv.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value'], 'h1.entry-title']
  },
  author: {
    selectors: ['span[itemprop="author"] span[itemprop="name"]']
  },
  date_published: {
    selectors: [['span[itemprop="dateModified"]', 'content']]
  },
  dek: {
    selectors: ['h2']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div[itemprop="articleBody"]'],
    transforms: {
      // This site puts a dateline in a 'b' above the first paragraph, and then somehow
      // blends it into the first paragraph with CSS. This transform moves the dateline
      // to the first paragraph.
      '.place_cont': function place_cont($node) {
        if (!$node.parents('p').length) {
          var nextSibling = $node.next('p');

          // Fix: a cheerio selection is always truthy, so the old
          // `if (nextSibling)` check never skipped the empty case and the
          // dateline was removed even when no paragraph followed. Check the
          // selection's length instead.
          if (nextSibling.length) {
            $node.remove();
            nextSibling.prepend($node);
          }
        }
      }
    },
    clean: ['.highlghts_Wdgt', '.ins_instory_dv_caption', 'input', '._world-wrapper .mt20']
  }
};
// Site-specific extraction rules for www.spektrum.de.
var SpektrumExtractor = {
  domain: 'www.spektrum.de',
  title: {
    selectors: ['.content__title']
  },
  author: {
    selectors: ['.content__author__info__name']
  },
  date_published: {
    selectors: ['.content__meta__date'],
    timezone: 'Europe/Berlin'
  },
  dek: {
    selectors: ['.content__intro']
  },
  lead_image_url: {
    selectors: [// This is how the meta tag appears in the original source code.
    ['meta[name="og:image"]', 'value'], // This is how the meta tag appears in the DOM in Chrome.
    // The selector is included here to make the code work within the browser as well.
    ['meta[property="og:image"]', 'content'], // This is the image that is shown on the page.
    // It can be slightly cropped compared to the original in the meta tag.
    '.image__article__top img']
  },
  content: {
    selectors: ['article.content'],
    clean: ['.breadcrumbs', '.hide-for-print', 'aside', 'header h2', '.image__article__top', '.content__author', '.copyright', '.callout-box']
  }
};
// Site-specific extraction rules for postlight.com.
var PostlightComExtractor = {
  domain: 'postlight.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value']]
  },
  author: {
    // Parse.ly analytics meta tag carries the author name.
    selectors: [['meta[name="parsely-author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: ['h2.single-hero__abstract']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['main.post'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['section.pl-post-link', 'aside', 'section.insights_featured_case_studies']
  }
};
// Site-specific extraction rules for www.investmentexecutive.com.
var WwwInvestmentexecutiveComExtractor = {
  domain: 'www.investmentexecutive.com',
  title: {
    selectors: ['h1']
  },
  author: {
    selectors: ['div[itemprop="author"]']
  },
  date_published: {
    selectors: [['meta[itemprop="datePublished"]', 'value']]
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['section.article-body'],
    clean: ['.hidden']
  }
};
// Site-specific extraction rules for www.cbc.ca.
var WwwCbcCaExtractor = {
  domain: 'www.cbc.ca',
  title: {
    selectors: ['h1']
  },
  author: {
    selectors: ['.authorText', '.bylineDetails']
  },
  date_published: {
    selectors: [['.timeStamp[datetime]', 'datetime']]
  },
  dek: {
    selectors: ['.deck']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.story'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
var CustomExtractors =
/*#__PURE__*/
_Object$freeze ( {
BloggerExtractor : BloggerExtractor ,
NYMagExtractor : NYMagExtractor ,
WikipediaExtractor : WikipediaExtractor ,
TwitterExtractor : TwitterExtractor ,
NYTimesExtractor : NYTimesExtractor ,
TheAtlanticExtractor : TheAtlanticExtractor ,
NewYorkerExtractor : NewYorkerExtractor ,
WiredExtractor : WiredExtractor ,
MSNExtractor : MSNExtractor ,
YahooExtractor : YahooExtractor ,
BuzzfeedExtractor : BuzzfeedExtractor ,
WikiaExtractor : WikiaExtractor ,
LittleThingsExtractor : LittleThingsExtractor ,
PoliticoExtractor : PoliticoExtractor ,
DeadspinExtractor : DeadspinExtractor ,
BroadwayWorldExtractor : BroadwayWorldExtractor ,
ApartmentTherapyExtractor : ApartmentTherapyExtractor ,
MediumExtractor : MediumExtractor ,
WwwTmzComExtractor : WwwTmzComExtractor ,
WwwWashingtonpostComExtractor : WwwWashingtonpostComExtractor ,
WwwHuffingtonpostComExtractor : WwwHuffingtonpostComExtractor ,
NewrepublicComExtractor : NewrepublicComExtractor ,
MoneyCnnComExtractor : MoneyCnnComExtractor ,
WwwThevergeComExtractor : WwwThevergeComExtractor ,
WwwCnnComExtractor : WwwCnnComExtractor ,
WwwAolComExtractor : WwwAolComExtractor ,
WwwYoutubeComExtractor : WwwYoutubeComExtractor ,
WwwTheguardianComExtractor : WwwTheguardianComExtractor ,
WwwSbnationComExtractor : WwwSbnationComExtractor ,
WwwBloombergComExtractor : WwwBloombergComExtractor ,
WwwBustleComExtractor : WwwBustleComExtractor ,
WwwNprOrgExtractor : WwwNprOrgExtractor ,
WwwRecodeNetExtractor : WwwRecodeNetExtractor ,
QzComExtractor : QzComExtractor ,
WwwDmagazineComExtractor : WwwDmagazineComExtractor ,
WwwReutersComExtractor : WwwReutersComExtractor ,
MashableComExtractor : MashableComExtractor ,
WwwChicagotribuneComExtractor : WwwChicagotribuneComExtractor ,
WwwVoxComExtractor : WwwVoxComExtractor ,
NewsNationalgeographicComExtractor : NewsNationalgeographicComExtractor ,
WwwNationalgeographicComExtractor : WwwNationalgeographicComExtractor ,
WwwLatimesComExtractor : WwwLatimesComExtractor ,
PagesixComExtractor : PagesixComExtractor ,
ThefederalistpapersOrgExtractor : ThefederalistpapersOrgExtractor ,
WwwCbssportsComExtractor : WwwCbssportsComExtractor ,
WwwMsnbcComExtractor : WwwMsnbcComExtractor ,
WwwThepoliticalinsiderComExtractor : WwwThepoliticalinsiderComExtractor ,
WwwMentalflossComExtractor : WwwMentalflossComExtractor ,
AbcnewsGoComExtractor : AbcnewsGoComExtractor ,
WwwNydailynewsComExtractor : WwwNydailynewsComExtractor ,
WwwCnbcComExtractor : WwwCnbcComExtractor ,
WwwPopsugarComExtractor : WwwPopsugarComExtractor ,
ObserverComExtractor : ObserverComExtractor ,
PeopleComExtractor : PeopleComExtractor ,
WwwUsmagazineComExtractor : WwwUsmagazineComExtractor ,
WwwRollingstoneComExtractor : WwwRollingstoneComExtractor ,
twofortysevensportsComExtractor : twofortysevensportsComExtractor ,
UproxxComExtractor : UproxxComExtractor ,
WwwEonlineComExtractor : WwwEonlineComExtractor ,
WwwMiamiheraldComExtractor : WwwMiamiheraldComExtractor ,
WwwRefinery29ComExtractor : WwwRefinery29ComExtractor ,
WwwMacrumorsComExtractor : WwwMacrumorsComExtractor ,
WwwAndroidcentralComExtractor : WwwAndroidcentralComExtractor ,
WwwSiComExtractor : WwwSiComExtractor ,
WwwRawstoryComExtractor : WwwRawstoryComExtractor ,
WwwCnetComExtractor : WwwCnetComExtractor ,
WwwTodayComExtractor : WwwTodayComExtractor ,
WwwAlComExtractor : WwwAlComExtractor ,
WwwThepennyhoarderComExtractor : WwwThepennyhoarderComExtractor ,
WwwWesternjournalismComExtractor : WwwWesternjournalismComExtractor ,
WwwAmericanowComExtractor : WwwAmericanowComExtractor ,
ScienceflyComExtractor : ScienceflyComExtractor ,
HellogigglesComExtractor : HellogigglesComExtractor ,
ThoughtcatalogComExtractor : ThoughtcatalogComExtractor ,
WwwInquisitrComExtractor : WwwInquisitrComExtractor ,
WwwNbcnewsComExtractor : WwwNbcnewsComExtractor ,
FortuneComExtractor : FortuneComExtractor ,
WwwLinkedinComExtractor : WwwLinkedinComExtractor ,
ObamawhitehouseArchivesGovExtractor : ObamawhitehouseArchivesGovExtractor ,
WwwOpposingviewsComExtractor : WwwOpposingviewsComExtractor ,
WwwProspectmagazineCoUkExtractor : WwwProspectmagazineCoUkExtractor ,
ForwardComExtractor : ForwardComExtractor ,
WwwQdailyComExtractor : WwwQdailyComExtractor ,
GothamistComExtractor : GothamistComExtractor ,
WwwFoolComExtractor : WwwFoolComExtractor ,
WwwSlateComExtractor : WwwSlateComExtractor ,
IciRadioCanadaCaExtractor : IciRadioCanadaCaExtractor ,
WwwFortinetComExtractor : WwwFortinetComExtractor ,
WwwFastcompanyComExtractor : WwwFastcompanyComExtractor ,
BlisterreviewComExtractor : BlisterreviewComExtractor ,
NewsMynaviJpExtractor : NewsMynaviJpExtractor ,
ClinicaltrialsGovExtractor : ClinicaltrialsGovExtractor ,
GithubComExtractor : GithubComExtractor ,
WwwRedditComExtractor : WwwRedditComExtractor ,
OtrsComExtractor : OtrsComExtractor ,
WwwOssnewsJpExtractor : WwwOssnewsJpExtractor ,
BuzzapJpExtractor : BuzzapJpExtractor ,
WwwAsahiComExtractor : WwwAsahiComExtractor ,
WwwSanwaCoJpExtractor : WwwSanwaCoJpExtractor ,
WwwElecomCoJpExtractor : WwwElecomCoJpExtractor ,
ScanNetsecurityNeJpExtractor : ScanNetsecurityNeJpExtractor ,
JvndbJvnJpExtractor : JvndbJvnJpExtractor ,
GeniusComExtractor : GeniusComExtractor ,
WwwJnsaOrgExtractor : WwwJnsaOrgExtractor ,
PhpspotOrgExtractor : PhpspotOrgExtractor ,
WwwInfoqComExtractor : WwwInfoqComExtractor ,
WwwMoongiftJpExtractor : WwwMoongiftJpExtractor ,
WwwItmediaCoJpExtractor : WwwItmediaCoJpExtractor ,
WwwPublickey1JpExtractor : WwwPublickey1JpExtractor ,
TakagihiromitsuJpExtractor : TakagihiromitsuJpExtractor ,
BookwalkerJpExtractor : BookwalkerJpExtractor ,
WwwYomiuriCoJpExtractor : WwwYomiuriCoJpExtractor ,
JapanCnetComExtractor : JapanCnetComExtractor ,
DeadlineComExtractor : DeadlineComExtractor ,
WwwGizmodoJpExtractor : WwwGizmodoJpExtractor ,
GetnewsJpExtractor : GetnewsJpExtractor ,
WwwLifehackerJpExtractor : WwwLifehackerJpExtractor ,
SectIijAdJpExtractor : SectIijAdJpExtractor ,
WwwOreillyCoJpExtractor : WwwOreillyCoJpExtractor ,
WwwIpaGoJpExtractor : WwwIpaGoJpExtractor ,
WeeklyAsciiJpExtractor : WeeklyAsciiJpExtractor ,
TechlogIijAdJpExtractor : TechlogIijAdJpExtractor ,
WiredJpExtractor : WiredJpExtractor ,
JapanZdnetComExtractor : JapanZdnetComExtractor ,
WwwRbbtodayComExtractor : WwwRbbtodayComExtractor ,
WwwLemondeFrExtractor : WwwLemondeFrExtractor ,
WwwPhoronixComExtractor : WwwPhoronixComExtractor ,
PitchforkComExtractor : PitchforkComExtractor ,
BiorxivOrgExtractor : BiorxivOrgExtractor ,
EpaperZeitDeExtractor : EpaperZeitDeExtractor ,
WwwLadbibleComExtractor : WwwLadbibleComExtractor ,
TimesofindiaIndiatimesComExtractor : TimesofindiaIndiatimesComExtractor ,
MaTtiasBeExtractor : MaTtiasBeExtractor ,
PastebinComExtractor : PastebinComExtractor ,
WwwAbendblattDeExtractor : WwwAbendblattDeExtractor ,
WwwGrueneDeExtractor : WwwGrueneDeExtractor ,
WwwEngadgetComExtractor : WwwEngadgetComExtractor ,
ArstechnicaComExtractor : ArstechnicaComExtractor ,
WwwNdtvComExtractor : WwwNdtvComExtractor ,
SpektrumExtractor : SpektrumExtractor ,
PostlightComExtractor : PostlightComExtractor ,
WwwInvestmentexecutiveComExtractor : WwwInvestmentexecutiveComExtractor ,
WwwCbcCaExtractor : WwwCbcCaExtractor
} ) ;
// Build the flat domain → extractor lookup used at parse time: each custom
// extractor is merged in via mergeSupportedDomains (keyed by the domains it
// declares).
var Extractors = _Object$keys(CustomExtractors).reduce(function (lookup, extractorName) {
  var extractor = CustomExtractors[extractorName];
  var merged = mergeSupportedDomains(extractor);
  return _objectSpread({}, lookup, merged);
}, {}); // CLEAN AUTHOR CONSTANTS
// Strips a leading "posted by"/"written by"/"by" prefix; the author name
// itself is captured in $2.
var CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i; // CLEAN DEK CONSTANTS
// A plain-text http(s) link; a dek containing one is rejected.
var TEXT_LINK_RE = new RegExp('http(s)?://', 'i'); // An ordered list of meta tag names that denote likely article deks.
var MS_DATE_STRING = /^\d{13}$/i; // 13 digits: epoch milliseconds
var SEC_DATE_STRING = /^\d{10}$/i; // 10 digits: epoch seconds
var CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i;
var TIME_MERIDIAN_SPACE_RE = /(.*\d)(am|pm)(.*)/i;
var TIME_MERIDIAN_DOTS_RE = /\.m\./i;
var TIME_NOW_STRING = /^\s*(just|right)?\s*now\s*/i;
var timeUnits = ['seconds?', 'minutes?', 'hours?', 'days?', 'weeks?', 'months?', 'years?'];
var allTimeUnits = timeUnits.join('|');
var TIME_AGO_STRING = new RegExp("(\\d+)\\s+(".concat(allTimeUnits, ")\\s+ago"), 'i');
var months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'];
var allMonths = months.join('|');
var timestamp1 = '[0-9]{1,2}:[0-9]{2,2}( ?[ap].?m.?)?';
var timestamp2 = '[0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4}';
var timestamp3 = '-[0-9]{3,4}$';
var SPLIT_DATE_STRING = new RegExp("(".concat(timestamp1, ")|(").concat(timestamp2, ")|(").concat(timestamp3, ")|([0-9]{1,4})|(").concat(allMonths, ")"), 'ig'); // 2016-11-22T08:57-500
// Check if datetime string has an offset at the end
var TIME_WITH_OFFSET_RE = /-\d{3,4}$/; // CLEAN TITLE CONSTANTS
// A regular expression that will match separating characters on a
// title, that usually denote breadcrumbs or something similar.
var TITLE_SPLITTERS_RE = /(: | - | \| )/g;
// FIX: the dots must be escaped. The unescaped version ('.com$', '.co.uk$',
// ...) treated '.' as "any character", so e.g. 'telecom' was stripped down
// to 'tel'. Escaped, only a literal '.com'/'.net'/'.org'/'.co.uk' suffix
// is removed.
var DOMAIN_ENDINGS_RE = new RegExp('\\.com$|\\.net$|\\.org$|\\.co\\.uk$', 'g'); // just the name(s): 'David Smith'.
function cleanAuthor(author) {
  // Drop a leading byline prefix ("By", "Posted by", ...) then collapse
  // whitespace, leaving just the name(s).
  var withoutPrefix = author.replace(CLEAN_AUTHOR_RE, '$2');
  return normalizeSpaces$1(withoutPrefix.trim());
}
function clean$1(leadImageUrl) {
  // A lead image URL is only kept when it is a syntactically valid web URI.
  var trimmed = leadImageUrl.trim();
  if (!validUrl$1.isWebUri(trimmed)) {
    return null;
  }
  return trimmed;
} // Return None if the dek wasn't good enough.
function cleanDek(dek, _ref) {
  var $ = _ref.$;
  var excerpt = _ref.excerpt;
  // Sanity check that we didn't get too short or long of a dek.
  if (dek.length > 1000 || dek.length < 5) {
    return null;
  }
  // Check that dek isn't the same as excerpt
  var isExcerptDuplicate = excerpt && excerptContent$1(excerpt, 10) === excerptContent$1(dek, 10);
  if (isExcerptDuplicate) {
    return null;
  }
  var dekText = stripTags$1(dek, $);
  // Plain text links shouldn't exist in the dek. If we have some, it's
  // not a good dek - bail.
  if (TEXT_LINK_RE.test(dekText)) {
    return null;
  }
  return normalizeSpaces$1(dekText.trim());
}
function cleanDateString(dateString) {
  // Keep only date-like fragments, normalize "a.m."/"p.m." forms, and drop
  // a leading "published:" label.
  var fragments = dateString.match(SPLIT_DATE_STRING) || [];
  var joined = fragments.join(' ');
  return joined.replace(TIME_MERIDIAN_DOTS_RE, 'm').replace(TIME_MERIDIAN_SPACE_RE, '$1 $2 $3').replace(CLEAN_DATE_STRING_RE, '$1').trim();
}
// Build a moment date from a raw date string. Three special shapes are
// handled before generic parsing: a trailing numeric UTC offset
// (e.g. "2016-11-22T08:57-500"), a relative "<n> <units> ago" phrase,
// and a "just now"-style phrase.
function createDate(dateString, timezone, format) {
  if (TIME_WITH_OFFSET_RE.test(dateString)) {
    // String carries its own offset; let the native Date parser honor it.
    return moment(new Date(dateString));
  }
  if (TIME_AGO_STRING.test(dateString)) {
    // Relative date: fragments[1] is the count, fragments[2] the unit;
    // subtract from "now".
    var fragments = TIME_AGO_STRING.exec(dateString);
    return moment().subtract(fragments[1], fragments[2]);
  }
  if (TIME_NOW_STRING.test(dateString)) {
    return moment();
  }
  // Otherwise parse with the caller's format (or a guessed one via
  // parseFormat), in the caller's timezone when provided.
  return timezone ? moment.tz(dateString, format || parseFormat(dateString), timezone) : moment(dateString, format || parseFormat(dateString));
} // Take a date published string, and hopefully return a date out of
// it. Return none if we fail.
function cleanDatePublished(dateString) {
  var _ref = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {};
  var timezone = _ref.timezone;
  var format = _ref.format;
  // Epoch timestamps: 13 digits are milliseconds, 10 digits are seconds.
  if (MS_DATE_STRING.test(dateString)) {
    return new Date(_parseInt$1(dateString, 10)).toISOString();
  }
  if (SEC_DATE_STRING.test(dateString)) {
    return new Date(_parseInt$1(dateString, 10) * 1000).toISOString();
  }
  // Try the string as-is first; if that fails, scrub it down to its
  // date-like fragments and try once more.
  var date = createDate(dateString, timezone, format);
  if (!date.isValid()) {
    date = createDate(cleanDateString(dateString), timezone, format);
  }
  return date.isValid() ? date.toISOString() : null;
}
function extractCleanNode ( article , _ref ) {
var $ = _ref . $ ,
_ref$cleanConditional = _ref . cleanConditionally ,
cleanConditionally = _ref$cleanConditional === void 0 ? true : _ref$cleanConditional ,
_ref$title = _ref . title ,
title = _ref$title === void 0 ? '' : _ref$title ,
_ref$url = _ref . url ,
url = _ref$url === void 0 ? '' : _ref$url ,
_ref$defaultCleaner = _ref . defaultCleaner ,
defaultCleaner = _ref$defaultCleaner === void 0 ? true : _ref$defaultCleaner ; // Rewrite the tag name to div if it's a top level node like body or
// html to avoid later complications with multiple body tags.
rewriteTopLevel$$1 ( article , $ ) ; // Drop small images and spacer images
// Only do this is defaultCleaner is set to true;
// this can sometimes be too aggressive.
if ( defaultCleaner ) cleanImages$1 ( article , $ ) ; // Make links absolute
makeLinksAbsolute$$1 ( article , $ , url ) ; // Mark elements to keep that would normally be removed.
// E.g., stripJunkTags will remove iframes, so we're going to mark
// YouTube/Vimeo videos as elements we want to keep.
markToKeep$1 ( article , $ , url ) ; // Drop certain tags like <title>, etc
// This is -mostly- for cleanliness, not security.
stripJunkTags$1 ( article , $ ) ; // H1 tags are typically the article title, which should be extracted
// by the title extractor instead. If there's less than 3 of them (<3),
// strip them. Otherwise, turn 'em into H2s.
cleanHOnes$$1 ( article , $ ) ; // Clean headers
cleanHeaders$1 ( article , $ , title ) ; // We used to clean UL's and OL's here, but it was leading to
// too many in-article lists being removed. Consider a better
// way to detect menus particularly and remove them.
// Also optionally running, since it can be overly aggressive.
if ( defaultCleaner ) cleanTags$$1 ( article , $ , cleanConditionally ) ; // Remove empty paragraph nodes
removeEmpty$1 ( article , $ ) ; // Remove unnecessary attributes
cleanAttributes$$1 ( article , $ ) ;
return article ;
}
// Clean an extracted title: resolve breadcrumb-style separators, fall back
// to the document's single <h1> for absurdly long titles, then strip HTML
// tags and normalize whitespace.
function cleanTitle$$1(title, _ref) {
  var url = _ref.url,
      $ = _ref.$;
  // FIX: TITLE_SPLITTERS_RE is declared with the /g flag, and
  // RegExp#test on a global regex is stateful (it resumes matching at
  // lastIndex), so successive calls could intermittently skip titles
  // that do contain a splitter. Reset lastIndex so every call starts
  // matching from the beginning of the string.
  TITLE_SPLITTERS_RE.lastIndex = 0;
  // If title has |, :, or - in it, see if
  // we can clean it up.
  if (TITLE_SPLITTERS_RE.test(title)) {
    title = resolveSplitTitle(title, url);
  }
  // Final sanity check that we didn't get a crazy title.
  // if (title.length > 150 || title.length < 15) {
  if (title.length > 150) {
    // If we did, return h1 from the document if it exists
    var h1 = $('h1');
    if (h1.length === 1) {
      title = h1.text();
    }
  }
  // strip any html tags in the title text
  return normalizeSpaces$1(stripTags$1(title, $).trim());
}
function extractBreadcrumbTitle ( splitTitle , text ) {
// This must be a very breadcrumbed title, like:
// The Best Gadgets on Earth : Bits : Blogs : NYTimes.com
// NYTimes - Blogs - Bits - The Best Gadgets on Earth
if ( splitTitle . length >= 6 ) {
// Look to see if we can find a breadcrumb splitter that happens
// more than once. If we can, we'll be able to better pull out
// the title.
var termCounts = splitTitle . reduce ( function ( acc , titleText ) {
acc [ titleText ] = acc [ titleText ] ? acc [ titleText ] + 1 : 1 ;
return acc ;
} , { } ) ;
var _Reflect$ownKeys$redu = _Reflect$ownKeys$1 ( termCounts ) . reduce ( function ( acc , key ) {
if ( acc [ 1 ] < termCounts [ key ] ) {
return [ key , termCounts [ key ] ] ;
}
return acc ;
} , [ 0 , 0 ] ) ,
_Reflect$ownKeys$redu2 = _slicedToArray$1 ( _Reflect$ownKeys$redu , 2 ) ,
maxTerm = _Reflect$ownKeys$redu2 [ 0 ] ,
termCount = _Reflect$ownKeys$redu2 [ 1 ] ; // We found a splitter that was used more than once, so it
// is probably the breadcrumber. Split our title on that instead.
// Note: max_term should be <= 4 characters, so that " >> "
// will match, but nothing longer than that.
if ( termCount >= 2 && maxTerm . length <= 4 ) {
splitTitle = text . split ( maxTerm ) ;
}
var splitEnds = [ splitTitle [ 0 ] , splitTitle . slice ( - 1 ) ] ;
var longestEnd = splitEnds . reduce ( function ( acc , end ) {
return acc . length > end . length ? acc : end ;
} , '' ) ;
if ( longestEnd . length > 10 ) {
return longestEnd ;
}
return text ;
}
return null ;
}
function cleanDomainFromTitle(splitTitle, url) {
  // Search the ends of the title, looking for bits that fuzzy match
  // the URL too closely. If one is found, discard it and return the
  // rest.
  //
  // Strip out the big TLDs - it just makes the matching a bit more
  // accurate. Not the end of the world if it doesn't strip right.
  var host = URL$1.parse(url).host;
  var nakedDomain = host.replace(DOMAIN_ENDINGS_RE, '');
  var startSlug = splitTitle[0].toLowerCase().replace(' ', '');
  var startSlugRatio = wuzzy$1.levenshtein(startSlug, nakedDomain);
  if (startSlugRatio > 0.4 && startSlug.length > 5) {
    // The first segment resembles the site name; drop it plus its splitter.
    return splitTitle.slice(2).join('');
  }
  var endSlug = splitTitle.slice(-1)[0].toLowerCase().replace(' ', '');
  var endSlugRatio = wuzzy$1.levenshtein(endSlug, nakedDomain);
  if (endSlugRatio > 0.4 && endSlug.length >= 5) {
    // The last segment resembles the site name; drop it plus its splitter.
    return splitTitle.slice(0, -2).join('');
  }
  return null;
} // Given a title with separators in it (colons, dashes, etc),
// resolve whether any of the segments should be removed.
function resolveSplitTitle(title) {
  var url = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : '';
  // Splits while preserving splitters, like:
  // ['The New New York', ' - ', 'The Washington Post']
  var splitTitle = title.split(TITLE_SPLITTERS_RE);
  if (splitTitle.length === 1) {
    return title;
  }
  // Prefer the breadcrumb heuristic; fall back to domain similarity.
  var newTitle = extractBreadcrumbTitle(splitTitle, title);
  if (newTitle) {
    return newTitle;
  }
  newTitle = cleanDomainFromTitle(splitTitle, url);
  if (newTitle) {
    return newTitle;
  }
  // Fuzzy ratio didn't find anything, so this title is probably legit.
  // Just return it all.
  return title;
}
// Map of extracted field name → cleaner function. Each raw extraction
// result is passed through the matching cleaner before being returned.
var Cleaners = {
  author: cleanAuthor,
  lead_image_url: clean$1,
  dek: cleanDek,
  date_published: cleanDatePublished,
  content: extractCleanNode,
  title: cleanTitle$$1
}; // likely to be article text.
//
// If strip_unlikely_candidates is True, remove any elements that
// match certain criteria first. (Like, does this element have a
// classname of "comment")
//
// If weight_nodes is True, use classNames and IDs to determine the
// worthiness of nodes.
//
// Returns a cheerio object $
function extractBestNode($, opts) {
  // Optionally prune obviously non-article elements before scoring.
  if (opts.stripUnlikelyCandidates) {
    $ = stripUnlikelyCandidates$1($);
  }
  // Normalize to paragraphs, score the content, then pick the winner.
  $ = convertToParagraphs$$1($);
  $ = scoreContent$$1($, opts.weightNodes);
  return findTopCandidate$$1($);
}
var GenericContentExtractor = {
defaultOpts : {
stripUnlikelyCandidates : true ,
weightNodes : true ,
cleanConditionally : true
} ,
// Extract the content for this resource - initially, pass in our
// most restrictive opts which will return the highest quality
// content. On each failure, retry with slightly more lax opts.
//
// :param return_type: string. If "node", should return the content
// as a cheerio node rather than as an HTML string.
//
// Opts:
// stripUnlikelyCandidates: Remove any elements that match
// non-article-like criteria first.(Like, does this element
// have a classname of "comment")
//
// weightNodes: Modify an elements score based on whether it has
// certain classNames or IDs. Examples: Subtract if a node has
// a className of 'comment', Add if a node has an ID of
// 'entry-content'.
//
// cleanConditionally: Clean the node to return of some
// superfluous content. Things like forms, ads, etc.
extract : function extract ( _ref , opts ) {
var $ = _ref . $ ,
html = _ref . html ,
title = _ref . title ,
url = _ref . url ;
opts = _objectSpread ( { } , this . defaultOpts , opts ) ;
$ = $ || cheerio$1 . load ( html ) ; // Cascade through our extraction-specific opts in an ordered fashion,
// turning them off as we try to extract content.
var node = this . getContentNode ( $ , title , url , opts ) ;
if ( nodeIsSufficient$1 ( node ) ) {
return this . cleanAndReturnNode ( node , $ ) ;
} // We didn't succeed on first pass, one by one disable our
// extraction opts and try again.
// eslint-disable-next-line no-restricted-syntax
var _iteratorNormalCompletion = true ;
var _didIteratorError = false ;
var _iteratorError = undefined ;
try {
for ( var _iterator = _getIterator$1 ( _Reflect$ownKeys$1 ( opts ) . filter ( function ( k ) {
return opts [ k ] === true ;
} ) ) , _step ; ! ( _iteratorNormalCompletion = ( _step = _iterator . next ( ) ) . done ) ; _iteratorNormalCompletion = true ) {
var key = _step . value ;
opts [ key ] = false ;
$ = cheerio$1 . load ( html ) ;
node = this . getContentNode ( $ , title , url , opts ) ;
if ( nodeIsSufficient$1 ( node ) ) {
break ;
}
}
} catch ( err ) {
_didIteratorError = true ;
_iteratorError = err ;
} finally {
try {
if ( ! _iteratorNormalCompletion && _iterator . return != null ) {
_iterator . return ( ) ;
}
} finally {
if ( _didIteratorError ) {
throw _iteratorError ;
}
}
}
return this . cleanAndReturnNode ( node , $ ) ;
} ,
// Get node given current options
getContentNode : function getContentNode ( $ , title , url , opts ) {
return extractCleanNode ( extractBestNode ( $ , opts ) , {
$ : $ ,
cleanConditionally : opts . cleanConditionally ,
title : title ,
url : url
} ) ;
} ,
// Once we got here, either we're at our last-resort node, or
// we broke early. Make sure we at least have -something- before we
// move forward.
cleanAndReturnNode : function cleanAndReturnNode ( node , $ ) {
if ( ! node ) {
return null ;
}
return normalizeSpaces$1 ( $ . html ( node ) ) ;
}
} ; // TODO: It would be great if we could merge the meta and selector lists into
// a list of objects, because we could then rank them better. For example,
// .hentry .entry-title is far better suited than <meta title>.
// An ordered list of meta tag names that denote likely article titles. All
// attributes should be lowercase for faster case-insensitive matching. From
// most distinct to least distinct.
var STRONG_TITLE_META_TAGS = ['tweetmeme-title', 'dc.title', 'rbtitle', 'headline', 'title']; // og:title is weak because it typically contains context that we don't like,
// for example the source site's name. Gotta get that brand into facebook!
var WEAK_TITLE_META_TAGS = ['og:title']; // An ordered list of XPath Selectors to find likely article titles. From
// most explicit to least explicit.
//
// Note - this does not use classes like CSS. This checks to see if the string
// exists in the className, which is not as accurate as .className (which
// splits on spaces/endlines), but for our purposes it's close enough. The
// speed tradeoff is worth the accuracy hit.
var STRONG_TITLE_SELECTORS = ['.hentry .entry-title', 'h1#articleHeader', 'h1.articleHeader', 'h1.article', '.instapaper_title', '#meebo-title'];
// NOTE(review): 'post post-title' (no dots) selects a <post-title> inside a
// <post> element, which looks like a typo for '.post .post-title' — confirm
// against upstream before changing, since it is runtime behavior.
var WEAK_TITLE_SELECTORS = ['article h1', '#entry-title', '.entry-title', '#entryTitle', '#entrytitle', '.entryTitle', '.entrytitle', '#articleTitle', '.articleTitle', 'post post-title', 'h1.title', 'h2.article', 'h1', 'html head title', 'title'];
var GenericTitleExtractor = {
  extract: function extract(_ref) {
    var $ = _ref.$,
        url = _ref.url,
        metaCache = _ref.metaCache;
    // Try each source from strongest to weakest signal: strong meta tags,
    // strong selectors, weak meta tags, then weak selectors. Thunks keep
    // the lookups lazy — later sources only run when earlier ones miss.
    var sources = [
      function () { return extractFromMeta$$1($, STRONG_TITLE_META_TAGS, metaCache); },
      function () { return extractFromSelectors$$1($, STRONG_TITLE_SELECTORS); },
      function () { return extractFromMeta$$1($, WEAK_TITLE_META_TAGS, metaCache); },
      function () { return extractFromSelectors$$1($, WEAK_TITLE_SELECTORS); }
    ];
    for (var i = 0; i < sources.length; i += 1) {
      var title = sources[i]();
      if (title) {
        return cleanTitle$$1(title, {
          url: url,
          $: $
        });
      }
    }
    // If no matches, return an empty string
    return '';
  }
}; // An ordered list of meta tag names that denote likely article authors. All
// attributes should be lowercase for faster case-insensitive matching. From
// most distinct to least distinct.
//
// Note: "author" is too often the -developer- of the page, so it is not
// added here.
var AUTHOR_META_TAGS = ['byl', 'clmst', 'dc.author', 'dcsext.author', 'dc.creator', 'rbauthors', 'authors'];
// Author strings longer than this are almost certainly not a byline.
var AUTHOR_MAX_LENGTH = 300; // An ordered list of XPath Selectors to find likely article authors. From
// most explicit to least explicit.
//
// Note - this does not use classes like CSS. This checks to see if the string
// exists in the className, which is not as accurate as .className (which
// splits on spaces/endlines), but for our purposes it's close enough. The
// speed tradeoff is worth the accuracy hit.
var AUTHOR_SELECTORS = ['.entry .entry-author', '.author.vcard .fn', '.author .vcard .fn', '.byline.vcard .fn', '.byline .vcard .fn', '.byline .by .author', '.byline .by', '.byline .author', '.post-author.vcard', '.post-author .vcard', 'a[rel=author]', '#by_author', '.by_author', '#entryAuthor', '.entryAuthor', '.byline a[href*=author]', '#author .authorname', '.author .authorname', '#author', '.author', '.articleauthor', '.ArticleAuthor', '.byline']; // An ordered list of Selectors to find likely article authors, with
// regular expression for content.
// Matches text that begins with "By" (any case, leading whitespace allowed).
var bylineRe = /^[\n\s]*By/i;
var BYLINE_SELECTORS_RE = [['#byline', bylineRe], ['.byline', bylineRe]];
var GenericAuthorExtractor = {
  // Returns a cleaned author string, or null when no plausible byline
  // (under AUTHOR_MAX_LENGTH characters) is found.
  extract: function extract(_ref) {
    var $ = _ref.$,
        metaCache = _ref.metaCache;
    var author;
    // First, check to see if we have a matching
    // meta tag that we can make use of.
    author = extractFromMeta$$1($, AUTHOR_META_TAGS, metaCache);
    if (author && author.length < AUTHOR_MAX_LENGTH) {
      return cleanAuthor(author);
    }
    // Second, look through our selectors looking for potential authors.
    author = extractFromSelectors$$1($, AUTHOR_SELECTORS, 2);
    if (author && author.length < AUTHOR_MAX_LENGTH) {
      return cleanAuthor(author);
    }
    // Last, use our looser regular-expression based selectors for
    // potential authors.
    // (Babel-generated for...of boilerplate over [selector, regex] pairs.)
    // eslint-disable-next-line no-restricted-syntax
    var _iteratorNormalCompletion = true;
    var _didIteratorError = false;
    var _iteratorError = undefined;
    try {
      for (var _iterator = _getIterator$1(BYLINE_SELECTORS_RE), _step; !(_iteratorNormalCompletion = (_step = _iterator.next()).done); _iteratorNormalCompletion = true) {
        var _step$value = _slicedToArray$1(_step.value, 2),
            selector = _step$value[0],
            regex = _step$value[1];
        var node = $(selector);
        // Only trust a selector that matches exactly one node whose text
        // looks like a "By ..." byline.
        if (node.length === 1) {
          var text = node.text();
          if (regex.test(text)) {
            return cleanAuthor(text);
          }
        }
      }
    } catch (err) {
      _didIteratorError = true;
      _iteratorError = err;
    } finally {
      try {
        if (!_iteratorNormalCompletion && _iterator.return != null) {
          _iterator.return();
        }
      } finally {
        if (_didIteratorError) {
          throw _iteratorError;
        }
      }
    }
    // No byline found anywhere.
    return null;
  }
}; // An ordered list of meta tag names that denote
// likely date published dates. All attributes
// should be lowercase for faster case-insensitive matching.
// From most distinct to least distinct.
var DATE_PUBLISHED_META_TAGS = ['article:published_time', 'displaydate', 'dc.date', 'dc.date.issued', 'rbpubdate', 'publish_date', 'pub_date', 'pagedate', 'pubdate', 'revision_date', 'doc_date', 'date_created', 'content_create_date', 'lastmodified', 'created', 'date']; // An ordered list of XPath Selectors to find
// likely date published dates. From most explicit
// to least explicit.
var DATE_PUBLISHED_SELECTORS = ['.hentry .dtstamp.published', '.hentry .published', '.hentry .dtstamp.updated', '.hentry .updated', '.single .published', '.meta .published', '.meta .postDate', '.entry-date', '.byline .date', '.postmetadata .date', '.article_datetime', '.date-header', '.story-date', '.dateStamp', '#story .datetime', '.dateline', '.pubdate']; // An ordered list of compiled regular expressions to find likely date
// published dates from the URL. These should always have the first
// reference be a date string that is parseable by dateutil.parser.parse
var abbrevMonthsStr = '(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)';
// Matches /2020/01/31/, 2020-01-31, and /2020/jan/31/ style URL dates.
var DATE_PUBLISHED_URL_RES = [new RegExp('/(20\\d{2}/\\d{2}/\\d{2})/', 'i'), new RegExp('(20\\d{2}-[01]\\d-[0-3]\\d)', 'i'), new RegExp("/(20\\d{2}/".concat(abbrevMonthsStr, "/[0-3]\\d)/"), 'i')];
var GenericDatePublishedExtractor = {
  extract: function extract(_ref) {
    var $ = _ref.$,
        url = _ref.url,
        metaCache = _ref.metaCache;
    // Try meta tags first (without tag-cleaning the string), then
    // date-ish selectors, and finally a date embedded in the URL itself.
    var datePublished = extractFromMeta$$1($, DATE_PUBLISHED_META_TAGS, metaCache, false);
    if (!datePublished) {
      datePublished = extractFromSelectors$$1($, DATE_PUBLISHED_SELECTORS);
    }
    if (!datePublished) {
      datePublished = extractFromUrl$1(url, DATE_PUBLISHED_URL_RES);
    }
    // The first raw hit wins; its cleaned value (possibly null) is the
    // final answer, just as in the original early-return cascade.
    return datePublished ? cleanDatePublished(datePublished) : null;
  }
}; // Currently there is only one selector for
// deks. We should simply return null here
// until we have a more robust generic option.
// Below is the original source for this, for reference.
var GenericDekExtractor = {
  // Deliberately a no-op: generic dek extraction always reports "none".
  extract: function extract() {
    return null;
  }
}; // An ordered list of meta tag names that denote likely article leading images.
// All attributes should be lowercase for faster case-insensitive matching.
// From most distinct to least distinct.
var LEAD_IMAGE_URL_META_TAGS = ['og:image', 'twitter:image', 'image_src'];
var LEAD_IMAGE_URL_SELECTORS = ['link[rel=image_src]'];
// URL substrings that suggest a real content image...
var POSITIVE_LEAD_IMAGE_URL_HINTS = ['upload', 'wp-content', 'large', 'photo', 'wp-image'];
var POSITIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(POSITIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i');
// ...and substrings that suggest chrome/decoration/ads instead.
var NEGATIVE_LEAD_IMAGE_URL_HINTS = ['spacer', 'sprite', 'blank', 'throbber', 'gradient', 'tile', 'bg', 'background', 'icon', 'social', 'header', 'hdr', 'advert', 'spinner', 'loader', 'loading', 'default', 'rating', 'share', 'facebook', 'twitter', 'theme', 'promo', 'ads', 'wp-includes'];
var NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i');
// File-extension matchers (query string allowed after the extension).
var GIF_RE = /\.gif(\?.*)?$/i;
var JPG_RE = /\.jpe?g(\?.*)?$/i;
// Build a "class id" signature string for a node; missing attributes
// contribute an empty string, so the space separator is always present.
function getSig($node) {
  var className = $node.attr('class') || '';
  var id = $node.attr('id') || '';
  return className + ' ' + id;
} // Scores image urls based on a variety of heuristics.
function scoreImageUrl(url) {
  var trimmed = url.trim();
  var score = 0;
  // Keyword hints in the URL argue for or against a lead image.
  if (POSITIVE_LEAD_IMAGE_URL_HINTS_RE.test(trimmed)) {
    score += 20;
  }
  if (NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.test(trimmed)) {
    score -= 20;
  }
  // TODO: We might want to consider removing this as
  // gifs are much more common/popular than they once were
  if (GIF_RE.test(trimmed)) {
    score -= 10;
  }
  if (JPG_RE.test(trimmed)) {
    score += 10;
  }
  // PNGs are neutral.
  return score;
} // Alt attribute usually means non-presentational image.
function scoreAttr($img) {
  // A non-empty alt text is a small positive signal.
  return $img.attr('alt') ? 5 : 0;
} // Look through our parent and grandparent for figure-like
// container elements, give a bonus if we find them
function scoreByParents($img) {
  var score = 0;
  // Any enclosing <figure> is a strong photo signal.
  var $figParent = $img.parents('figure').first();
  if ($figParent.length === 1) {
    score += 25;
  }
  var $parent = $img.parent();
  var $gParent;
  if ($parent.length === 1) {
    $gParent = $parent.parent();
  }
  // FIX: when the image has no single parent, $gParent stays undefined
  // and getSig(undefined) would throw a TypeError. Skip missing nodes
  // instead of crashing.
  [$parent, $gParent].forEach(function ($node) {
    if ($node && PHOTO_HINTS_RE$1$1.test(getSig($node))) {
      score += 15;
    }
  });
  return score;
} // Look at our immediate sibling and see if it looks like it's a
// caption. Bonus if so.
function scoreBySibling($img) {
  var score = 0;
  var $sibling = $img.next();
  var siblingEl = $sibling.get(0);
  // A <figcaption> right after the image strongly suggests a real photo.
  if (siblingEl && siblingEl.tagName.toLowerCase() === 'figcaption') {
    score += 25;
  }
  // Photo-ish class/id on the sibling is a weaker hint.
  if (PHOTO_HINTS_RE$1$1.test(getSig($sibling))) {
    score += 15;
  }
  return score;
}
// Score an image by its declared width/height attributes: penalize
// skinny/short images and sprites, reward area (1 point per 1000 px²).
function scoreByDimensions($img) {
  var score = 0;
  // Global parseFloat is spec-identical to the core-js _parseFloat$1 shim.
  var width = parseFloat($img.attr('width'));
  var height = parseFloat($img.attr('height'));
  var src = $img.attr('src');
  // Penalty for skinny images
  if (width && width <= 50) {
    score -= 50;
  }
  // Penalty for short images
  if (height && height <= 50) {
    score -= 50;
  }
  // FIX: src can be undefined (no src attribute); the original called
  // src.includes(...) unconditionally and threw a TypeError in that
  // case. A missing src is treated as "not a sprite".
  if (width && height && !(src || '').includes('sprite')) {
    var area = width * height;
    if (area < 5000) {
      // Smaller than 50 x 100
      score -= 100;
    } else {
      score += Math.round(area / 1000);
    }
  }
  return score;
}
// Small positional bonus: earlier images in the collection score higher
// (the first gets +length/2, the midpoint gets 0, later ones go negative).
function scoreByPosition($imgs, index) {
  var midpoint = $imgs.length / 2;
  return midpoint - index;
} // it. Like content and next page extraction, uses a scoring system
// to determine what the most likely image may be. Short circuits
// on really probable things like og:image meta tags.
//
// Potential signals to still take advantage of:
// * domain
// * weird aspect ratio
var GenericLeadImageUrlExtractor = {
  // Returns the cleaned lead-image URL, or null when nothing plausible
  // is found. Tries meta tags, then scored content <img>s, then
  // really-probable nodes like <link rel="image_src">.
  extract: function extract(_ref) {
    var $ = _ref.$,
        content = _ref.content,
        metaCache = _ref.metaCache,
        html = _ref.html;
    var cleanUrl;
    if (!$.browser && $('head').length === 0) {
      // No <head> parsed (and not a browser-backed document): re-inject
      // the raw html so meta tags become findable.
      // NOTE(review): prepending the full html onto the first element is
      // a workaround inherited from upstream — confirm it is still needed.
      $('*').first().prepend(html);
    }
    // Check to see if we have a matching meta tag that we can make use of.
    // Moving this higher because common practice is now to use large
    // images on things like Open Graph or Twitter cards.
    // images usually have for things like Open Graph.
    var imageUrl = extractFromMeta$$1($, LEAD_IMAGE_URL_META_TAGS, metaCache, false);
    if (imageUrl) {
      cleanUrl = clean$1(imageUrl);
      if (cleanUrl) return cleanUrl;
    }
    // Next, try to find the "best" image via the content.
    // We'd rather not have to fetch each image and check dimensions,
    // so try to do some analysis and determine them instead.
    var $content = $(content);
    var imgs = $('img', $content).toArray();
    var imgScores = {};
    imgs.forEach(function (img, index) {
      var $img = $(img);
      var src = $img.attr('src');
      if (!src) return;
      // Sum every heuristic: url keywords, alt attr, parent/sibling
      // context, declared dimensions, and position in the document.
      var score = scoreImageUrl(src);
      score += scoreAttr($img);
      score += scoreByParents($img);
      score += scoreBySibling($img);
      score += scoreByDimensions($img);
      score += scoreByPosition(imgs, index);
      imgScores[src] = score;
    });
    // Pick the highest-scoring src. (Babel-compiled destructuring of the
    // [url, score] pair produced by the reduce.)
    var _Reflect$ownKeys$redu = _Reflect$ownKeys$1(imgScores).reduce(function (acc, key) {
      return imgScores[key] > acc[1] ? [key, imgScores[key]] : acc;
    }, [null, 0]),
        _Reflect$ownKeys$redu2 = _slicedToArray$1(_Reflect$ownKeys$redu, 2),
        topUrl = _Reflect$ownKeys$redu2[0],
        topScore = _Reflect$ownKeys$redu2[1];
    if (topScore > 0) {
      cleanUrl = clean$1(topUrl);
      if (cleanUrl) return cleanUrl;
    }
    // If nothing else worked, check to see if there are any really
    // probable nodes in the doc, like <link rel="image_src" />.
    // (Babel-generated for...of boilerplate over the selector list.)
    // eslint-disable-next-line no-restricted-syntax
    var _iteratorNormalCompletion = true;
    var _didIteratorError = false;
    var _iteratorError = undefined;
    try {
      for (var _iterator = _getIterator$1(LEAD_IMAGE_URL_SELECTORS), _step; !(_iteratorNormalCompletion = (_step = _iterator.next()).done); _iteratorNormalCompletion = true) {
        var selector = _step.value;
        var $node = $(selector).first();
        // Try src, then href, then value on the matched node.
        var src = $node.attr('src');
        if (src) {
          cleanUrl = clean$1(src);
          if (cleanUrl) return cleanUrl;
        }
        var href = $node.attr('href');
        if (href) {
          cleanUrl = clean$1(href);
          if (cleanUrl) return cleanUrl;
        }
        var value = $node.attr('value');
        if (value) {
          cleanUrl = clean$1(value);
          if (cleanUrl) return cleanUrl;
        }
      }
    } catch (err) {
      _didIteratorError = true;
      _iteratorError = err;
    } finally {
      try {
        if (!_iteratorNormalCompletion && _iterator.return != null) {
          _iterator.return();
        }
      } finally {
        if (_didIteratorError) {
          throw _iteratorError;
        }
      }
    }
    return null;
  }
};
function scoreSimilarity(score, articleUrl, href) {
  // Do this last and only if we have a real candidate, because it's
  // potentially expensive computationally. Compare the link to this
  // URL using difflib to get the % similarity of these URLs. On a
  // sliding scale, subtract points from this link based on
  // similarity.
  if (score <= 0) {
    return 0;
  }
  var similarity = new difflib$1.SequenceMatcher(null, articleUrl, href).ratio();
  // Subtract .1 from diff_percent when calculating modifier,
  // which means that if it's less than 10% different, we give a
  // bonus instead. Ex:
  // 3% different = +17.5 points
  // 10% different = 0 points
  // 20% different = -25 points
  // NOTE(review): the examples above describe a 0.1 breakpoint, but the
  // code uses 0.2 (so 20% different = 0 points) — confirm which is intended.
  var diffPercent = 1.0 - similarity;
  var diffModifier = -(250 * (diffPercent - 0.2));
  return score + diffModifier;
}
function scoreLinkText(linkText, pageNum) {
  // If the link text can be parsed as a number, give it a minor
  // bonus, with a slight bias towards lower numbered pages. This is
  // so that pages that might not have 'next' in their text can still
  // get scored, and sorted properly by score.
  var score = 0;
  if (!IS_DIGIT_RE$1.test(linkText.trim())) {
    return score;
  }
  var linkTextAsNum = _parseInt$1(linkText, 10);
  // If it's the first page, we already got it on the first call.
  // Give it a negative score. Otherwise, up to page 10, give a
  // small bonus.
  score = linkTextAsNum < 2 ? -30 : Math.max(0, 10 - linkTextAsNum);
  // If it appears that the current page number is greater than
  // this links page number, it's a very bad sign. Give it a big
  // penalty.
  if (pageNum && pageNum >= linkTextAsNum) {
    score -= 50;
  }
  return score;
}
function scorePageInLink(pageNum, isWp) {
  // page in the link = bonus. Intentionally ignore wordpress because
  // their ?p=123 link style gets caught by this even though it means
  // separate documents entirely.
  return pageNum && !isWp ? 50 : 0;
}
// Matches any single digit.
var DIGIT_RE$2 = /\d/; // A list of words that, if found in link text or URLs, likely mean that
// this link is not a next page link.
var EXTRANEOUS_LINK_HINTS$1 = ['print', 'archive', 'comment', 'discuss', 'e-mail', 'email', 'share', 'reply', 'all', 'login', 'sign', 'single', 'adx', 'entry-unrelated'];
var EXTRANEOUS_LINK_HINTS_RE$1 = new RegExp(EXTRANEOUS_LINK_HINTS$1.join('|'), 'i'); // Match any link text/classname/id that looks like it could mean the next
// page. Things like: next, continue, >, >>, » but not >|, »| as those can
// mean last page.
var NEXT_LINK_TEXT_RE$1 = new RegExp('(next|weiter|continue|>([^|]|$)|»([^|]|$))', 'i'); // Match any link text/classname/id that looks like it is an end link: things
// like "first", "last", "end", etc.
var CAP_LINK_TEXT_RE$1 = new RegExp('(first|last|end)', 'i'); // Match any link text/classname/id that looks like it means the previous
// page.
var PREV_LINK_TEXT_RE$1 = new RegExp('(prev|earl|old|new|<|«)', 'i'); // Match any phrase that looks like it could be page, or paging, or pagination
// Penalize hrefs whose URL itself contains extraneous terms ("comment",
// "share", ...): such URLs are rarely next-page links.
function scoreExtraneousLinks(href) {
  return EXTRANEOUS_LINK_HINTS_RE$1.test(href) ? -25 : 0;
}
// Build a signature for a node from its class and id attributes,
// space-separated; missing attributes become empty strings.
function makeSig($link) {
  var className = $link.attr('class') || '';
  var id = $link.attr('id') || '';
  return className + ' ' + id;
}
// Walk up to four ancestors of $link, scoring by each ancestor's
// class/id signature: +25 once if any ancestor looks pagination-related
// (PAGE_RE$1), and -25 once if any ancestor looks negative/extraneous
// without also looking positive. Returns the combined parent score.
function scoreByParents$1($link) {
  // If a parent node contains paging-like classname or id, give a
  // bonus. Additionally, if a parent_node contains bad content
  // (like 'sponsor'), give a penalty.
  var $parent = $link.parent();
  var positiveMatch = false;
  var negativeMatch = false;
  var score = 0;
  _Array$from(range(0, 4)).forEach(function () {
    if ($parent.length === 0) {
      return;
    }
    // NOTE(review): makeSig takes a single argument; the extra ' ' is ignored.
    var parentData = makeSig($parent, ' '); // If we have 'page' or 'paging' in our data, that's a good
    // sign. Add a bonus (at most once across all ancestors).
    if (!positiveMatch && PAGE_RE$1.test(parentData)) {
      positiveMatch = true;
      score += 25;
    } // If we have 'comment' or something in our data, and
    // we don't have something like 'content' as well, that's
    // a bad sign. Give a penalty (also at most once).
    if (!negativeMatch && NEGATIVE_SCORE_RE$2.test(parentData) && EXTRANEOUS_LINK_HINTS_RE$1.test(parentData)) {
      if (!POSITIVE_SCORE_RE$2.test(parentData)) {
        negativeMatch = true;
        score -= 25;
      }
    }
    $parent = $parent.parent();
  });
  return score;
}
// Links labeled like "previous"/"older" point backwards; score them low
// enough (-200) that they can never win the next-page contest.
function scorePrevLink(linkData) {
  if (!PREV_LINK_TEXT_RE$1.test(linkData)) {
    return 0;
  }
  return -200;
}
// First-pass filter for candidate next-page links. Rejects URLs already
// visited, the article/base URL itself, cross-domain links, URLs with no
// digit past the base URL, links with extraneous text, and link text
// longer than 25 characters; true means the link is worth scoring.
function shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrls) {
  // Already fetched this page on a previous iteration.
  var alreadySeen = previousUrls.find(function (url) {
    return href === url;
  }) !== undefined;
  if (alreadySeen) {
    return false;
  }
  // Empty hrefs, and links back to the article or its base URL, are out.
  if (!href || href === articleUrl || href === baseUrl) {
    return false;
  }
  // Next pages live on the same host as the article.
  var hostname = parsedUrl.hostname;
  var _URL$parse = URL$1.parse(href),
      linkHost = _URL$parse.hostname;
  if (linkHost !== hostname) {
    return false;
  }
  // Without a digit somewhere past the base URL, this can't be page N.
  var fragment = href.replace(baseUrl, '');
  if (!DIGIT_RE$2.test(fragment)) {
    return false;
  }
  // Link text mentioning "comment", "share", etc. disqualifies it.
  if (EXTRANEOUS_LINK_HINTS_RE$1.test(linkText)) {
    return false;
  }
  // Next-page labels are short; long text means something else.
  if (linkText.length > 25) {
    return false;
  }
  return true;
}
// Penalize hrefs that don't start with the article's base URL. They can
// still be next-page links (some sites paginate on other paths), but the
// odds are lower. Example:
// http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
function scoreBaseUrl(href, baseRegex) {
  return baseRegex.test(href) ? 0 : -25;
}
// Strong bonus for link data containing next-ish markers ("next", ">>", …).
function scoreNextLinkText(linkData) {
  return NEXT_LINK_TEXT_RE$1.test(linkData) ? 50 : 0;
}
// "Cap" links carry first/last/end wording. One that ALSO matches the
// next-link pattern is ambiguous ("last" alongside "next") and gets -65;
// plain cap links and non-cap links score 0. (Regex .test here is
// side-effect free, so evaluating both checks up front is safe.)
function scoreCapLinks(linkData) {
  var looksLikeCap = CAP_LINK_TEXT_RE$1.test(linkData);
  var looksLikeNext = NEXT_LINK_TEXT_RE$1.test(linkData);
  if (looksLikeCap && looksLikeNext) {
    return -65;
  }
  return 0;
}
// Build a case-insensitive regex anchored at the start of baseUrl.
// NOTE(review): baseUrl is interpolated unescaped, so regex
// metacharacters in the URL keep their special meaning — this matches
// the original behavior.
function makeBaseRegex(baseUrl) {
  return new RegExp('^' + baseUrl, 'i');
}
// Signature used for link scoring: "<text> <class> <id>", falling back
// to the node's own text when linkText isn't supplied.
function makeSig$1($link, linkText) {
  var text = linkText || $link.text();
  var className = $link.attr('class') || '';
  var id = $link.attr('id') || '';
  return text + ' ' + className + ' ' + id;
}
function scoreLinks ( _ref ) {
var links = _ref . links ,
articleUrl = _ref . articleUrl ,
baseUrl = _ref . baseUrl ,
parsedUrl = _ref . parsedUrl ,
$ = _ref . $ ,
_ref$previousUrls = _ref . previousUrls ,
previousUrls = _ref$previousUrls === void 0 ? [ ] : _ref$previousUrls ;
parsedUrl = parsedUrl || URL$1 . parse ( articleUrl ) ;
var baseRegex = makeBaseRegex ( baseUrl ) ;
var isWp = isWordpress$1 ( $ ) ; // Loop through all links, looking for hints that they may be next-page
// links. Things like having "page" in their textContent, className or
// id, or being a child of a node with a page-y className or id.
//
// After we do that, assign each page a score, and pick the one that
// looks most like the next page link, as long as its score is strong
// enough to have decent confidence.
var scoredPages = links . reduce ( function ( possiblePages , link ) {
// Remove any anchor data since we don't do a good job
// standardizing URLs (it's hard), we're going to do
// some checking with and without a trailing slash
var attrs = getAttrs$1 ( link ) ; // if href is undefined, return
if ( ! attrs . href ) return possiblePages ;
var href = removeAnchor$1 ( attrs . href ) ;
var $link = $ ( link ) ;
var linkText = $link . text ( ) ;
if ( ! shouldScore ( href , articleUrl , baseUrl , parsedUrl , linkText , previousUrls ) ) {
return possiblePages ;
} // ## PASSED THE FIRST-PASS TESTS. Start scoring. ##
if ( ! possiblePages [ href ] ) {
possiblePages [ href ] = {
score : 0 ,
linkText : linkText ,
href : href
} ;
} else {
possiblePages [ href ] . linkText = "" . concat ( possiblePages [ href ] . linkText , "|" ) . concat ( linkText ) ;
}
var possiblePage = possiblePages [ href ] ;
var linkData = makeSig$1 ( $link , linkText ) ;
var pageNum = pageNumFromUrl$1 ( href ) ;
var score = scoreBaseUrl ( href , baseRegex ) ;
score += scoreNextLinkText ( linkData ) ;
score += scoreCapLinks ( linkData ) ;
score += scorePrevLink ( linkData ) ;
score += scoreByParents$1 ( $link ) ;
score += scoreExtraneousLinks ( href ) ;
score += scorePageInLink ( pageNum , isWp ) ;
score += scoreLinkText ( linkText , pageNum ) ;
score += scoreSimilarity ( score , articleUrl , href ) ;
possiblePage . score = score ;
return possiblePages ;
} , { } ) ;
return _Reflect$ownKeys$1 ( scoredPages ) . length === 0 ? null : scoredPages ;
} // for multi-page articles
var GenericNextPageUrlExtractor = {
extract : function extract ( _ref ) {
var $ = _ref . $ ,
url = _ref . url ,
parsedUrl = _ref . parsedUrl ,
_ref$previousUrls = _ref . previousUrls ,
previousUrls = _ref$previousUrls === void 0 ? [ ] : _ref$previousUrls ;
parsedUrl = parsedUrl || URL$1 . parse ( url ) ;
var articleUrl = removeAnchor$1 ( url ) ;
var baseUrl = articleBaseUrl$1 ( url , parsedUrl ) ;
var links = $ ( 'a[href]' ) . toArray ( ) ;
var scoredLinks = scoreLinks ( {
links : links ,
articleUrl : articleUrl ,
baseUrl : baseUrl ,
parsedUrl : parsedUrl ,
$ : $ ,
previousUrls : previousUrls
} ) ; // If no links were scored, return null
if ( ! scoredLinks ) return null ; // now that we've scored all possible pages,
// find the biggest one.
var topPage = _Reflect$ownKeys$1 ( scoredLinks ) . reduce ( function ( acc , link ) {
var scoredLink = scoredLinks [ link ] ;
return scoredLink . score > acc . score ? scoredLink : acc ;
} , {
score : - 100
} ) ; // If the score is less than 50, we're not confident enough to use it,
// so we fail.
if ( topPage . score >= 50 ) {
return topPage . href ;
}
return null ;
}
} ;
// Meta tag names consulted (in order) for a canonical URL fallback.
var CANONICAL_META_SELECTORS = ['og:url'];
// Extract the hostname ("domain") portion of a URL string.
function parseDomain(url) {
  var parsed = URL$1.parse(url);
  return parsed.hostname;
}
// Package a URL together with its parsed domain for extractor output.
function result(url) {
  return {
    url: url,
    domain: parseDomain(url)
  };
}
// Resolve the canonical URL for a document: prefer <link rel=canonical>,
// then canonical meta tags (og:url), finally the fetched URL itself.
var GenericUrlExtractor = {
  extract: function extract(_ref) {
    var $ = _ref.$,
        url = _ref.url,
        metaCache = _ref.metaCache;
    var $canonical = $('link[rel=canonical]');
    if ($canonical.length !== 0) {
      var href = $canonical.attr('href');
      if (href) {
        return result(href);
      }
    }
    var metaUrl = extractFromMeta$$1($, CANONICAL_META_SELECTORS, metaCache);
    return result(metaUrl || url);
  }
};
// Meta tag names consulted (in order) when extracting an excerpt.
var EXCERPT_META_SELECTORS = ['og:description', 'twitter:description'];
// Collapse whitespace in content and ellipsize it to maxLength (default
// 200 characters). The $ parameter is unused but kept for call-site
// compatibility; maxLength arrives as the third positional argument.
function clean$2(content, $) {
  var maxLength = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 200;
  var collapsed = content.replace(/[\s\n]+/g, ' ').trim();
  return ellipsize$1(collapsed, maxLength, {
    ellipse: '…'
  });
}
// Extract a short excerpt: prefer description meta tags; otherwise
// derive one from the beginning of the extracted content.
var GenericExcerptExtractor = {
  extract: function extract(_ref) {
    var $ = _ref.$,
        content = _ref.content,
        metaCache = _ref.metaCache;
    var excerpt = extractFromMeta$$1($, EXCERPT_META_SELECTORS, metaCache);
    if (excerpt) {
      return clean$2(stripTags$1(excerpt, $));
    }
    // Fall back to excerpting from the extracted content. Slice
    // generously (5x the target length) since tags get stripped.
    var maxLength = 200;
    var shortContent = content.slice(0, maxLength * 5);
    return clean$2($(shortContent).text(), $, maxLength);
  }
};
// Count words by loading the content HTML and splitting the text of its
// first <div> on whitespace.
var getWordCount = function getWordCount(content) {
  var $ = cheerio$1.load(content);
  var text = normalizeSpaces$1($('div').first().text());
  return text.split(/\s/).length;
};
// Fallback word counter: strip tags with a regex, collapse whitespace,
// and count space-separated tokens. Used when the DOM-based counter
// yields a degenerate result.
var getWordCountAlt = function getWordCountAlt(content) {
  var text = content.replace(/<[^>]*>/g, ' ').replace(/\s+/g, ' ').trim();
  return text.split(' ').length;
};
// Word-count extractor: use the DOM-based counter, falling back to the
// regex-based counter when the DOM count is 1 (i.e. the content wasn't
// in a <div> the first counter could find).
var GenericWordCountExtractor = {
  extract: function extract(_ref) {
    var content = _ref.content;
    var count = getWordCount(content);
    return count === 1 ? getWordCountAlt(content) : count;
  }
};
// The default extractor, used for any domain without a custom one.
// Each field delegates to a field-specific generic extractor; extract()
// runs them in dependency order (content needs title; lead image, dek,
// excerpt and word count need content; direction needs title).
var GenericExtractor = {
  // This extractor is the default for all domains
  domain: '*',
  title: GenericTitleExtractor.extract,
  date_published: GenericDatePublishedExtractor.extract,
  author: GenericAuthorExtractor.extract,
  // bound because GenericContentExtractor.extract uses `this` internally
  content: GenericContentExtractor.extract.bind(GenericContentExtractor),
  lead_image_url: GenericLeadImageUrlExtractor.extract,
  dek: GenericDekExtractor.extract,
  next_page_url: GenericNextPageUrlExtractor.extract,
  url_and_domain: GenericUrlExtractor.extract,
  excerpt: GenericExcerptExtractor.extract,
  word_count: GenericWordCountExtractor.extract,
  // Text direction (ltr/rtl) derived from the title string.
  direction: function direction(_ref) {
    var title = _ref.title;
    return stringDirection$1.getDirection(title);
  },
  extract: function extract(options) {
    var html = options.html,
        $ = options.$;
    // Lazily parse the HTML when the caller supplied raw markup only.
    if (html && !$) {
      var loaded = cheerio$1.load(html);
      options.$ = loaded;
    }
    var title = this.title(options);
    var date_published = this.date_published(options);
    var author = this.author(options);
    // Title is passed to the content extractor so it can be stripped
    // from the extracted body.
    var content = this.content(_objectSpread({}, options, {
      title: title
    }));
    var lead_image_url = this.lead_image_url(_objectSpread({}, options, {
      content: content
    }));
    var dek = this.dek(_objectSpread({}, options, {
      content: content
    }));
    var next_page_url = this.next_page_url(options);
    var excerpt = this.excerpt(_objectSpread({}, options, {
      content: content
    }));
    var word_count = this.word_count(_objectSpread({}, options, {
      content: content
    }));
    var direction = this.direction({
      title: title
    });
    var _this$url_and_domain = this.url_and_domain(options),
        url = _this$url_and_domain.url,
        domain = _this$url_and_domain.domain;
    return {
      title: title,
      author: author,
      date_published: date_published || null,
      dek: dek,
      lead_image_url: lead_image_url,
      content: content,
      next_page_url: next_page_url,
      url: url,
      domain: domain,
      excerpt: excerpt,
      word_count: word_count,
      direction: direction
    };
  }
};
var Detectors = {
'meta[name="al:ios:app_name"][value="Medium"]' : MediumExtractor ,
'meta[name="generator"][value="blogger"]' : BloggerExtractor
} ;
// Return the platform-specific extractor whose detector selector matches
// the document, or undefined when none do.
function detectByHtml($) {
  var match = _Reflect$ownKeys$1(Detectors).find(function (selector) {
    return $(selector).length > 0;
  });
  return Detectors[match];
}
// Choose the extractor for a URL: exact hostname first, then base
// domain (last two labels), across API-registered and built-in
// extractors; then HTML-based platform detection; finally the generic
// fallback extractor.
function getExtractor(url, parsedUrl, $) {
  parsedUrl = parsedUrl || URL$1.parse(url);
  var hostname = parsedUrl.hostname;
  var baseDomain = hostname.split('.').slice(-2).join('.');
  return apiExtractors[hostname] || apiExtractors[baseDomain] || Extractors[hostname] || Extractors[baseDomain] || detectByHtml($) || GenericExtractor;
}
// Remove any elements matching the extractor's `clean` selectors from
// $content; no-op when no clean list is configured.
function cleanBySelectors($content, $, _ref) {
  var clean = _ref.clean;
  if (clean) {
    $(clean.join(','), $content).remove();
  }
  return $content;
}
// Apply the extractor's `transforms` map to $content. String values
// convert matching nodes to that tag name; function values run per node
// and convert the node only when they return a string.
function transformElements($content, $, _ref2) {
  var transforms = _ref2.transforms;
  if (!transforms) return $content;
  _Reflect$ownKeys$1(transforms).forEach(function (key) {
    var $matches = $(key, $content);
    var value = transforms[key];
    if (typeof value === 'string') {
      // Direct conversion: every matching node becomes <value>.
      $matches.each(function (index, node) {
        convertNodeTo$$1($(node), $, value);
      });
    } else if (typeof value === 'function') {
      // Let the transform inspect each node; a string return value
      // means "convert this node to that tag".
      $matches.each(function (index, node) {
        var result = value($(node), $);
        if (typeof result === 'string') {
          convertNodeTo$$1($(node), $, result);
        }
      });
    }
  });
  return $content;
}
function findMatchingSelector ( $ , selectors , extractHtml , allowMultiple ) {
return selectors . find ( function ( selector ) {
if ( _Array$isArray ( selector ) ) {
if ( extractHtml ) {
return selector . reduce ( function ( acc , s ) {
return acc && $ ( s ) . length > 0 ;
} , true ) ;
}
var _selector = _slicedToArray$1 ( selector , 2 ) ,
s = _selector [ 0 ] ,
attr = _selector [ 1 ] ;
return ( allowMultiple || ! allowMultiple && $ ( s ) . length === 1 ) && $ ( s ) . attr ( attr ) && $ ( s ) . attr ( attr ) . trim ( ) !== '' ;
}
return ( allowMultiple || ! allowMultiple && $ ( selector ) . length === 1 ) && $ ( selector ) . text ( ) . trim ( ) !== '' ;
} ) ;
}
function select ( opts ) {
var $ = opts . $ ,
type = opts . type ,
extractionOpts = opts . extractionOpts ,
_opts$extractHtml = opts . extractHtml ,
extractHtml = _opts$extractHtml === void 0 ? false : _opts$extractHtml ; // Skip if there's not extraction for this type
if ( ! extractionOpts ) return null ; // If a string is hardcoded for a type (e.g., Wikipedia
// contributors), return the string
if ( typeof extractionOpts === 'string' ) return extractionOpts ;
var selectors = extractionOpts . selectors ,
_extractionOpts$defau = extractionOpts . defaultCleaner ,
defaultCleaner = _extractionOpts$defau === void 0 ? true : _extractionOpts$defau ,
allowMultiple = extractionOpts . allowMultiple ;
var overrideAllowMultiple = type === 'lead_image_url' || allowMultiple ;
var matchingSelector = findMatchingSelector ( $ , selectors , extractHtml , overrideAllowMultiple ) ;
if ( ! matchingSelector ) return null ;
function transformAndClean ( $node ) {
makeLinksAbsolute$$1 ( $node , $ , opts . url || '' ) ;
cleanBySelectors ( $node , $ , extractionOpts ) ;
transformElements ( $node , $ , extractionOpts ) ;
return $node ;
}
function selectHtml ( ) {
// If the selector type requests html as its return type
// transform and clean the element with provided selectors
var $content ; // If matching selector is an array, we're considering this a
// multi-match selection, which allows the parser to choose several
// selectors to include in the result. Note that all selectors in the
// array must match in order for this selector to trigger
if ( _Array$isArray ( matchingSelector ) ) {
$content = $ ( matchingSelector . join ( ',' ) ) ;
var $wrapper = $ ( '<div></div>' ) ;
$content . each ( function ( _ , element ) {
$wrapper . append ( element ) ;
} ) ;
$content = $wrapper ;
} else {
$content = $ ( matchingSelector ) ;
} // Wrap in div so transformation can take place on root element
$content . wrap ( $ ( '<div></div>' ) ) ;
$content = $content . parent ( ) ;
$content = transformAndClean ( $content ) ;
if ( Cleaners [ type ] ) {
Cleaners [ type ] ( $content , _objectSpread ( { } , opts , {
defaultCleaner : defaultCleaner
} ) ) ;
}
if ( allowMultiple ) {
return $content . children ( ) . toArray ( ) . map ( function ( el ) {
return $ . html ( $ ( el ) ) ;
} ) ;
}
return $ . html ( $content ) ;
}
if ( extractHtml ) {
return selectHtml ( matchingSelector ) ;
}
var $match ;
var result ; // if selector is an array (e.g., ['img', 'src']),
// extract the attr
if ( _Array$isArray ( matchingSelector ) ) {
var _matchingSelector = _slicedToArray$1 ( matchingSelector , 3 ) ,
selector = _matchingSelector [ 0 ] ,
attr = _matchingSelector [ 1 ] ,
transform = _matchingSelector [ 2 ] ;
$match = $ ( selector ) ;
$match = transformAndClean ( $match ) ;
result = $match . map ( function ( _ , el ) {
var item = $ ( el ) . attr ( attr ) . trim ( ) ;
return transform ? transform ( item ) : item ;
} ) ;
} else {
$match = $ ( matchingSelector ) ;
$match = transformAndClean ( $match ) ;
result = $match . map ( function ( _ , el ) {
return $ ( el ) . text ( ) . trim ( ) ;
} ) ;
}
result = _Array$isArray ( result . toArray ( ) ) && allowMultiple ? result . toArray ( ) : result [ 0 ] ; // Allow custom extractor to skip default cleaner
// for this type; defaults to true
if ( defaultCleaner && Cleaners [ type ] ) {
return Cleaners [ type ] ( result , _objectSpread ( { } , opts , extractionOpts ) ) ;
}
return result ;
}
// Run `select` for each extended (custom) type declared by an extractor,
// collecting the results keyed by type name.
function selectExtendedTypes(extend, opts) {
  var results = {};
  _Reflect$ownKeys$1(extend).forEach(function (t) {
    if (results[t]) return;
    results[t] = select(_objectSpread({}, opts, {
      type: t,
      extractionOpts: extend[t]
    }));
  });
  return results;
}
// Extract one field using the custom extractor's config for `type`;
// when it yields nothing and fallback is enabled (the default), defer
// to the generic extractor for that field.
function extractResult(opts) {
  var type = opts.type,
      extractor = opts.extractor,
      _opts$fallback = opts.fallback,
      fallback = _opts$fallback === void 0 ? true : _opts$fallback;
  var result = select(_objectSpread({}, opts, {
    extractionOpts: extractor[type]
  }));
  if (result) {
    return result;
  }
  if (fallback) {
    return GenericExtractor[type](opts);
  }
  return null;
}
// Dispatches extraction to a site-specific extractor, falling back per
// field to GenericExtractor (see extractResult). The generic extractor
// (domain '*') short-circuits and runs its own extract() wholesale.
var RootExtractor = {
  extract: function extract() {
    var extractor = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : GenericExtractor;
    var opts = arguments.length > 1 ? arguments[1] : undefined;
    var _opts = opts,
        contentOnly = _opts.contentOnly,
        extractedTitle = _opts.extractedTitle; // This is the generic extractor. Run its extract method
    if (extractor.domain === '*') return extractor.extract(opts);
    opts = _objectSpread({}, opts, {
      extractor: extractor
    });
    // contentOnly is used when fetching subsequent pages of a multi-page
    // article: the title is already known, only content is needed.
    if (contentOnly) {
      var _content = extractResult(_objectSpread({}, opts, {
        type: 'content',
        extractHtml: true,
        title: extractedTitle
      }));
      return {
        content: _content
      };
    }
    var title = extractResult(_objectSpread({}, opts, {
      type: 'title'
    }));
    var date_published = extractResult(_objectSpread({}, opts, {
      type: 'date_published'
    }));
    var author = extractResult(_objectSpread({}, opts, {
      type: 'author'
    }));
    var next_page_url = extractResult(_objectSpread({}, opts, {
      type: 'next_page_url'
    }));
    // Content extraction depends on the title; several later fields
    // depend on the extracted content.
    var content = extractResult(_objectSpread({}, opts, {
      type: 'content',
      extractHtml: true,
      title: title
    }));
    var lead_image_url = extractResult(_objectSpread({}, opts, {
      type: 'lead_image_url',
      content: content
    }));
    var excerpt = extractResult(_objectSpread({}, opts, {
      type: 'excerpt',
      content: content
    }));
    var dek = extractResult(_objectSpread({}, opts, {
      type: 'dek',
      content: content,
      excerpt: excerpt
    }));
    var word_count = extractResult(_objectSpread({}, opts, {
      type: 'word_count',
      content: content
    }));
    var direction = extractResult(_objectSpread({}, opts, {
      type: 'direction',
      title: title
    }));
    var _ref3 = extractResult(_objectSpread({}, opts, {
      type: 'url_and_domain'
    })) || {
      url: null,
      domain: null
    },
        url = _ref3.url,
        domain = _ref3.domain;
    // Custom extractors may declare extra fields via `extend`; those
    // results are merged into the returned object.
    var extendedResults = {};
    if (extractor.extend) {
      extendedResults = selectExtendedTypes(extractor.extend, opts);
    }
    return _objectSpread({
      title: title,
      content: content,
      author: author,
      date_published: date_published,
      lead_image_url: lead_image_url,
      dek: dek,
      next_page_url: next_page_url,
      url: url,
      domain: domain,
      excerpt: excerpt,
      word_count: word_count,
      direction: direction
    }, extendedResults);
  }
};
function collectAllPages ( _x ) {
return _collectAllPages . apply ( this , arguments ) ;
}
// Transpiled async function (regenerator state machine): follows
// next_page_url up to a 26-page cap, fetching each page, extracting its
// content, and appending it to result.content with a page separator.
// Returns the merged result plus total_pages/rendered_pages/word_count.
function _collectAllPages() {
  _collectAllPages = _asyncToGenerator(
  /*#__PURE__*/
  _regeneratorRuntime.mark(function _callee(_ref) {
    var next_page_url, html, $, metaCache, result, Extractor, title, url, pages, previousUrls, extractorOpts, nextPageResult, word_count;
    return _regeneratorRuntime.wrap(function _callee$(_context) {
      while (1) {
        switch (_context.prev = _context.next) {
          case 0:
            next_page_url = _ref.next_page_url, html = _ref.html, $ = _ref.$, metaCache = _ref.metaCache, result = _ref.result, Extractor = _ref.Extractor, title = _ref.title, url = _ref.url; // At this point, we've fetched just the first page
            pages = 1;
            // Track visited URLs so scoring never revisits a page.
            previousUrls = [removeAnchor$1(url)];
            // If we've gone over 26 pages, something has
            // likely gone wrong.
          case 3:
            // Loop head: stop when there is no next page or the cap is hit.
            if (!(next_page_url && pages < 26)) {
              _context.next = 16;
              break;
            }
            pages += 1; // eslint-disable-next-line no-await-in-loop
            _context.next = 7;
            return Resource.create(next_page_url);
          case 7:
            // Resume point after awaiting the fetched page.
            $ = _context.sent;
            html = $.html();
            extractorOpts = {
              url: next_page_url,
              html: html,
              $: $,
              metaCache: metaCache,
              extractedTitle: title,
              previousUrls: previousUrls
            };
            nextPageResult = RootExtractor.extract(Extractor, extractorOpts);
            previousUrls.push(next_page_url);
            // Append the new page's content with an <hr> + page header.
            result = _objectSpread({}, result, {
              content: "".concat(result.content, "<hr><h4>Page ").concat(pages, "</h4>").concat(nextPageResult.content)
            }); // eslint-disable-next-line prefer-destructuring
            next_page_url = nextPageResult.next_page_url;
            _context.next = 3;
            break;
          case 16:
            // Recompute the word count over the full concatenated content.
            word_count = GenericExtractor.word_count({
              content: "<div>".concat(result.content, "</div>")
            });
            return _context.abrupt("return", _objectSpread({}, result, {
              total_pages: pages,
              rendered_pages: pages,
              word_count: word_count
            }));
          case 18:
          case "end":
            return _context.stop();
        }
      }
    }, _callee, this);
  }));
  return _collectAllPages.apply(this, arguments);
}
// Public parser API. parse() is a transpiled async function (regenerator
// state machine): validate the URL, fetch the resource, pick an
// extractor, extract all fields, optionally follow pagination, and
// finally render content as html/markdown/text per contentType.
var Parser = {
  parse: function () {
    var _parse = _asyncToGenerator(
    /*#__PURE__*/
    _regeneratorRuntime.mark(function _callee(url) {
      var _ref,
          html,
          opts,
          _opts$fetchAllPages,
          fetchAllPages,
          _opts$fallback,
          fallback,
          _opts$contentType,
          contentType,
          _opts$headers,
          headers,
          extend,
          customExtractor,
          parsedUrl,
          $,
          Extractor,
          metaCache,
          extendedTypes,
          result,
          _result,
          title,
          next_page_url,
          turndownService,
          _args = arguments;
      return _regeneratorRuntime.wrap(function _callee$(_context) {
        while (1) {
          switch (_context.prev = _context.next) {
            case 0:
              // Destructure options; html may be supplied directly to skip
              // fetching. Defaults: fetchAllPages/fallback true, html type.
              _ref = _args.length > 1 && _args[1] !== undefined ? _args[1] : {}, html = _ref.html, opts = _objectWithoutProperties(_ref, ["html"]);
              _opts$fetchAllPages = opts.fetchAllPages, fetchAllPages = _opts$fetchAllPages === void 0 ? true : _opts$fetchAllPages, _opts$fallback = opts.fallback, fallback = _opts$fallback === void 0 ? true : _opts$fallback, _opts$contentType = opts.contentType, contentType = _opts$contentType === void 0 ? 'html' : _opts$contentType, _opts$headers = opts.headers, headers = _opts$headers === void 0 ? {} : _opts$headers, extend = opts.extend, customExtractor = opts.customExtractor; // if no url was passed and this is the browser version,
              // set url to window.location.href and load the html
              // from the current page
              if (!url && cheerio$1.browser) {
                url = window.location.href; // eslint-disable-line no-undef
                html = html || cheerio$1.html();
              }
              parsedUrl = URL$1.parse(url);
              // Bail out early with an error object on an invalid URL.
              if (validateUrl(parsedUrl)) {
                _context.next = 6;
                break;
              }
              return _context.abrupt("return", {
                error: true,
                message: 'The url parameter passed does not look like a valid URL. Please check your URL and try again.'
              });
            case 6:
              _context.next = 8;
              return Resource.create(url, html, parsedUrl, headers);
            case 8:
              // Resume point after awaiting the fetched resource ($).
              $ = _context.sent;
              // A failed resource is returned as-is (carries error info).
              if (!$.failed) {
                _context.next = 11;
                break;
              }
              return _context.abrupt("return", $);
            case 11:
              // Add custom extractor via cli.
              if (customExtractor) {
                addExtractor(customExtractor);
              }
              Extractor = getExtractor(url, parsedUrl, $); // console.log(`Using extractor for ${Extractor.domain}`);
              // if html still has not been set (i.e., url passed to Parser.parse),
              // set html from the response of Resource.create
              if (!html) {
                html = $.html();
              } // Cached value of every meta name in our document.
              // Used when extracting title/author/date_published/dek
              metaCache = $('meta').map(function (_, node) {
                return $(node).attr('name');
              }).toArray();
              // Caller-supplied extended field definitions.
              extendedTypes = {};
              if (extend) {
                extendedTypes = selectExtendedTypes(extend, {
                  $: $,
                  url: url,
                  html: html
                });
              }
              result = RootExtractor.extract(Extractor, {
                url: url,
                html: html,
                $: $,
                metaCache: metaCache,
                parsedUrl: parsedUrl,
                fallback: fallback,
                contentType: contentType
              });
              _result = result, title = _result.title, next_page_url = _result.next_page_url; // Fetch more pages if next_page_url found
              if (!(fetchAllPages && next_page_url)) {
                _context.next = 25;
                break;
              }
              _context.next = 22;
              return collectAllPages({
                Extractor: Extractor,
                next_page_url: next_page_url,
                html: html,
                $: $,
                metaCache: metaCache,
                result: result,
                title: title,
                url: url
              });
            case 22:
              // Resume point: multi-page result replaces the single page.
              result = _context.sent;
              _context.next = 26;
              break;
            case 25:
              // Single-page article: annotate page counts.
              result = _objectSpread({}, result, {
                total_pages: 1,
                rendered_pages: 1
              });
            case 26:
              // Optional output conversion: markdown (via turndown) or text.
              if (contentType === 'markdown') {
                turndownService = new TurndownService();
                result.content = turndownService.turndown(result.content);
              } else if (contentType === 'text') {
                result.content = $.text($(result.content));
              }
              return _context.abrupt("return", _objectSpread({}, result, extendedTypes));
            case 28:
            case "end":
              return _context.stop();
          }
        }
      }, _callee, this);
    }));
    function parse(_x) {
      return _parse.apply(this, arguments);
    }
    return parse;
  }(),
  // True when running the browser build of cheerio.
  browser: !!cheerio$1.browser,
  // A convenience method for getting a resource
  // to work with, e.g., for custom extractor generator
  fetchResource: function fetchResource(url) {
    return Resource.create(url);
  },
  // Register an additional custom extractor at runtime.
  addExtractor: function addExtractor$$1(extractor) {
    return addExtractor(extractor);
  }
};
var mercury = Parser ;
// Interleave template-literal string parts with their interpolated
// values (tag-function style). Values lacking a usable toString become
// empty strings; with no values at all, the parts are simply joined.
function insertValues(strings) {
  for (var _len = arguments.length, values = new Array(_len > 1 ? _len - 1 : 0), _key = 1; _key < _len; _key++) {
    values[_key - 1] = arguments[_key];
  }
  if (!values.length) {
    return strings.join('');
  }
  return strings.reduce(function (result, part, idx) {
    var value = values[idx];
    var rendered = value && typeof value.toString === 'function' ? value.toString() : '';
    return result + part + rendered;
  }, '');
}
var bodyPattern = /^\n([\s\S]+)\s{2}$/gm ;
var trailingWhitespace = /\s+$/ ;
// Tag-style template renderer: interpolate values, locate the indented
// body, strip a fixed level of leading indentation from each line, trim
// trailing whitespace, and drop the first (blank) line.
function template(strings) {
  for (var _len = arguments.length, values = new Array(_len > 1 ? _len - 1 : 0), _key = 1; _key < _len; _key++) {
    values[_key - 1] = arguments[_key];
  }
  var compiled = insertValues.apply(void 0, [strings].concat(values));
  var matches = compiled.match(bodyPattern) || [];
  var body = matches[0];
  // A full-body match strips up to 4 leading spaces per line; otherwise
  // fall back to the whole compiled string with a 2-space indent.
  var indentLevel = /^\s{0,4}(.+)$/g;
  if (!body) {
    body = compiled;
    indentLevel = /^\s{0,2}(.+)$/g;
  }
  var lines = body.split('\n').slice(1);
  return lines.map(function (line) {
    var stripped = line.replace(indentLevel, '$1');
    return trailingWhitespace.test(stripped) ? stripped.replace(trailingWhitespace, '') : stripped;
  }).join('\n');
}
function _templateObject ( ) {
var data = _taggedTemplateLiteral ( [ "\n export const " , " = {\n domain: '" , "',\n\n title: {\n selectors: [\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n // enter author selectors\n ],\n },\n\n date_published: {\n selectors: [\n // enter selectors\n ],\n },\n\n dek: {\n selectors: [\n // enter selectors\n ],\n },\n\n lead_image_url: {\n selectors: [\n // enter selectors\n ],\n },\n\n content: {\n selectors: [\n // enter content selectors\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ]\n },\n }\n " ] ) ;
_templateObject = function _templateObject ( ) {
return data ;
} ;
return data ;
}
// Render the scaffold source for a new custom extractor.
function extractorTemplate(hostname, name) {
  var parts = _templateObject();
  return template(parts, name, hostname);
}
function _templateObject2 ( ) {
var data = _taggedTemplateLiteral ( [ "\n import assert from 'assert';\n import URL from 'url';\n import cheerio from 'cheerio';\n\n import Parser from 'mercury';\n import getExtractor from 'extractors/get-extractor';\n import { excerptContent } from 'utils/text';\n\n const fs = require('fs');\n\n describe('" , "', () => {\n describe('initial test case', () => {\n let result;\n let url;\n beforeAll(() => {\n url =\n '" , "';\n const html =\n fs.readFileSync('" , "');\n result =\n Parser.parse(url, { html, fallback: false });\n });\n\n it('is selected properly', () => {\n // This test should be passing by default.\n // It sanity checks that the correct parser\n // is being selected for URLs from this domain\n const extractor = getExtractor(url);\n assert.equal(extractor.domain, URL.parse(url).hostname)\n })\n\n " , "\n\n it('returns the content', async () => {\n // To pass this test, fill out the content selector\n // in " , "/index.js.\n // You may also want to make use of the clean and transform\n // options.\n const { content } = await result;\n\n const $ = cheerio.load(content || '');\n\n const first13 = excerptContent($('*').first().text(), 13)\n\n // Update these values with the expected values from\n // the article.\n assert.equal(first13, 'Add the first 13 words of the article here');\n });\n });\n });\n " ] ) ;
_templateObject2 = function _templateObject2 ( ) {
return data ;
} ;
return data ;
}
function _templateObject$1 ( ) {
var data = _taggedTemplateLiteral ( [ "\n it('returns the " , "', async () => {\n // To pass this test, fill out the " , " selector\n // in " , "/index.js.\n const { " , " } = await result\n\n // Update these values with the expected values from\n // the article.\n assert.equal(" , ", " , ")\n });\n " ] ) ;
_templateObject$1 = function _templateObject ( ) {
return data ;
} ;
return data ;
}
// Result keys that never get a generated per-field test: url/domain/content
// are covered by dedicated assertions in the scaffold template, and the rest
// are derived/meta fields not worth asserting by default.
var IGNORE = ['url', 'domain', 'content', 'word_count', 'next_page_url', 'excerpt', 'direction', 'total_pages', 'rendered_pages'];

/**
 * Build the test snippet for one extractor result field.
 * @param key   result field name (e.g. 'title', 'author')
 * @param value expected value observed in the parsed fixture; falsy values
 *              render as '' so the generated assertion compares against an
 *              empty string literal
 * @param dir   extractor directory path referenced in the generated comment
 * @returns the generated test source, or '' for ignored keys
 */
function testFor(key, value, dir) {
  // Direct membership test; the previous IGNORE.find(k => k === key) used
  // find() where includes() expresses the intent.
  if (IGNORE.includes(key)) return '';
  return template(_templateObject$1(), key, key, dir, key, key, value ? "`".concat(value, "`") : "''");
}
// Render the full generated test file for a new custom extractor.
// file: fixture html path; url: article url; dir: extractor directory;
// result: parsed article fields; name: extractor display name.
function extractorTestTemplate(file, url, dir, result, name) {
  // One generated test per result field (ignored fields render as '').
  var fieldTests = _Reflect$ownKeys(result)
    .map(function (key) {
      return testFor(key, result[key], dir);
    })
    .join('\n\n');
  return template(_templateObject2(), name, url, file, fieldTests, dir);
}
// Inquirer prompt shown when no url argument is supplied on the CLI.
var questions = [{
  type: 'input',
  name: 'website',
  message: "Paste a url to an article you'd like to create or extend a parser for:",
  validate: function validate(value) {
    // Accept the input iff url.parse can find a hostname in it.
    var hostname = URL.parse(value).hostname;
    return hostname ? true : false;
  }
}];
// Module-level spinner handle; savePage() marks it succeeded once the
// async fetch resolves.
var spinner;

// Run fn(...args) behind an ora spinner labelled msg. Synchronous results
// succeed the spinner immediately; thenable results are chained into
// savePage (which succeeds the spinner itself).
// NOTE(review): a rejected promise here has no .catch, so the spinner would
// hang on failure — confirm against caller expectations before changing.
function confirm(fn, args, msg, newParser) {
  spinner = ora({
    text: msg
  });
  spinner.start();
  var outcome = fn.apply(void 0, _toConsumableArray(args));
  var isThenable = Boolean(outcome && outcome.then);
  if (!isThenable) {
    spinner.succeed();
  } else {
    outcome.then(function (page) {
      return savePage(page, args, newParser);
    });
  }
  return outcome;
}
// Create dir (behind a spinner labelled msg) unless it already exists.
function confirmCreateDir(dir, msg) {
  if (fs.existsSync(dir)) return;
  confirm(fs.mkdirSync, [dir], msg);
}
// Map an article url to its custom-extractor directory, keyed by hostname.
function getDir(url) {
  var hostname = URL.parse(url).hostname;
  return "./src/extractors/custom/".concat(hostname);
}
// Scaffold (or extend) a custom parser for url: create the extractor and
// fixtures directories on first sight of the hostname, then fetch the page
// as a fixture. newParser tells savePage whether to also generate code.
function scaffoldCustomParser(url) {
  var dir = getDir(url);
  var hostname = URL.parse(url).hostname;
  var newParser = !fs.existsSync(dir);
  if (newParser) {
    confirmCreateDir(dir, "Creating ".concat(hostname, " directory"));
    confirmCreateDir("./fixtures/".concat(hostname), 'Creating fixtures directory');
  }
  confirm(mercury.fetchResource, [url], 'Fetching fixture', newParser);
} // if an arg was given, assume it is a url and skip the prompt
// CLI entry point: a url passed as the first argument skips the prompt.
var urlArg = process.argv[2];
if (!urlArg) {
  inquirer.prompt(questions).then(function (answers) {
    scaffoldCustomParser(answers.website);
  });
} else {
  scaffoldCustomParser(urlArg);
}
/**
 * Write the scaffolded extractor (index.js) and its test (index.test.js),
 * register the extractor in the custom-extractor index, then lint the
 * generated files.
 * @param url    article url the extractor is for
 * @param file   path of the saved html fixture
 * @param result parsed article fields used to seed the generated tests
 */
function generateScaffold(url, file, result) {
  var _URL$parse4 = URL.parse(url),
      hostname = _URL$parse4.hostname;
  // Hoisted: getDir(url) was previously recomputed four times and
  // extractorName(hostname) twice.
  var dir = getDir(url);
  var name = extractorName(hostname);
  var extractor = extractorTemplate(hostname, name);
  var extractorTest = extractorTestTemplate(file, url, dir, result, name);
  fs.writeFileSync("".concat(dir, "/index.js"), extractor);
  fs.writeFileSync("".concat(dir, "/index.test.js"), extractorTest);
  fs.appendFileSync('./src/extractors/custom/index.js', exportString(url));
  // NOTE(review): dir derives from a user-supplied url's hostname and is
  // interpolated into a shell command; hostnames are narrow, but verify if
  // the input source ever widens.
  child_process.exec("npm run lint-fix-quiet -- ".concat(dir, "/*.js"));
}
/**
 * Persist a fetched page as an html fixture and, for a new parser, generate
 * the extractor scaffold. Invoked by confirm() once mercury.fetchResource
 * resolves; _ref is the original [url] args array.
 * @param $         cheerio handle for the fetched document
 * @param _ref      args array whose first element is the article url
 * @param newParser true when no extractor directory existed for this host
 */
function savePage($, _ref, newParser) {
  var _ref2 = _slicedToArray(_ref, 1),
      url = _ref2[0];
  var _URL$parse5 = URL.parse(url),
      hostname = _URL$parse5.hostname;
  spinner.succeed();
  // Timestamped filename keeps multiple fixtures per host distinct.
  var filename = new Date().getTime();
  var file = "./fixtures/".concat(hostname, "/").concat(filename, ".html"); // fix http(s) relative links:
  makeLinksAbsolute$$1($('*').first(), $, url);
  $('[src], [href]').each(function (index, node) {
    var $node = $(node);
    // Fix: the selector matches nodes with either attribute, but only
    // `src` was being rewritten, so protocol-relative hrefs ("//host/...")
    // stayed broken in the saved fixture. Rewrite both attributes.
    ['src', 'href'].forEach(function (attr) {
      var link = $node.attr(attr);
      if (link && link.slice(0, 2) === '//') {
        $node.attr(attr, "http:".concat(link));
      }
    });
  });
  // Strip <script> tags before persisting the fixture to disk.
  var html = stripJunkTags($('*').first(), $, ['script']).html();
  fs.writeFileSync(file, html);
  mercury.parse(url, {
    html: html
  }).then(function (result) {
    if (newParser) {
      confirm(generateScaffold, [url, file, result], 'Generating parser and tests');
      console.log("Your custom site extractor has been set up. To get started building it, run\n yarn watch:test -- ".concat(hostname, "\n -- OR --\n npm run watch:test -- ").concat(hostname));
    } else {
      console.log("\n It looks like you already have a custom parser for this url.\n The page you linked to has been added to ".concat(file, ". Copy and paste\n the following code to use that page in your tests:\n const html = fs.readFileSync('").concat(file, "');"));
    }
  });
}
// Build the re-export line appended to src/extractors/custom/index.js for
// the given article url's hostname.
function exportString(url) {
  var hostname = URL.parse(url).hostname;
  return "export * from './".concat(hostname, "';");
}
// Derive the extractor's identifier from a hostname: capitalize each
// dot-separated segment, join, and append 'Extractor'
// (e.g. 'www.example.com' -> 'WwwExampleComExtractor').
function extractorName(hostname) {
  var segments = hostname.split('.');
  var camelized = segments
    .map(function (segment) {
      return segment.charAt(0).toUpperCase() + segment.slice(1);
    })
    .join('');
  return camelized + 'Extractor';
}
//# sourceMappingURL=generate-custom-parser.js.map