feat: Add custom parser for Reddit (#307)

5 years ago · 7844129fda
parent 13581cd899
commit 7844129fda
16 changed files with 491 additions and 1 deletions
--- a/fixtures/www.reddit.com/1551705199548.html
+++ b/fixtures/www.reddit.com/1551705199548.html
--- a/fixtures/www.reddit.com/1552069905710.html
+++ b/fixtures/www.reddit.com/1552069905710.html
--- a/fixtures/www.reddit.com/1552069933451.html
+++ b/fixtures/www.reddit.com/1552069933451.html
--- a/fixtures/www.reddit.com/1552069947100.html
+++ b/fixtures/www.reddit.com/1552069947100.html
--- a/fixtures/www.reddit.com/1552069958273.html
+++ b/fixtures/www.reddit.com/1552069958273.html
--- a/fixtures/www.reddit.com/1552069973740.html
+++ b/fixtures/www.reddit.com/1552069973740.html
--- a/fixtures/www.reddit.com/1552069996237.html
+++ b/fixtures/www.reddit.com/1552069996237.html
--- a/fixtures/www.reddit.com/1552070031501.html
+++ b/fixtures/www.reddit.com/1552070031501.html
--- a/src/cleaners/constants.js
+++ b/src/cleaners/constants.js
@ -27,6 +27,21 @@ export const SEC_DATE_STRING = /^\d{10}$/i;
 export const CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i;
 export const TIME_MERIDIAN_SPACE_RE = /(.*\d)(am|pm)(.*)/i;
 export const TIME_MERIDIAN_DOTS_RE = /\.m\./i;
+// Matches a date string that consists only of "now", "just now" or
+// "right now" (case-insensitive, surrounding whitespace allowed).
+// Anchored at both ends so that text merely beginning with "now"
+// (e.g. "nowhere ...") is not mistaken for the current time.
+export const TIME_NOW_STRING = /^\s*(just|right)?\s*now\s*$/i;
+// Units accepted in relative "<amount> <unit> ago" strings; the optional
+// trailing "s" allows both singular and plural forms.
+const timeUnits = [
+  'seconds?',
+  'minutes?',
+  'hours?',
+  'days?',
+  'weeks?',
+  'months?',
+  'years?',
+];
+const allTimeUnits = timeUnits.join('|');
+// Matches relative dates such as "5 days ago" anywhere in the string.
+// Capture group 1 is the amount, capture group 2 is the unit.
+export const TIME_AGO_STRING = new RegExp(
+  `(\\d+)\\s+(${allTimeUnits})\\s+ago`,
+  'i'
+);
 const months = [
  'jan',
  'feb',
--- a/src/cleaners/date-published.js
+++ b/src/cleaners/date-published.js
@ -9,6 +9,8 @@ import {
  SEC_DATE_STRING,
  CLEAN_DATE_STRING_RE,
  SPLIT_DATE_STRING,
+  TIME_AGO_STRING,
+  TIME_NOW_STRING,
  TIME_MERIDIAN_SPACE_RE,
  TIME_MERIDIAN_DOTS_RE,
  TIME_WITH_OFFSET_RE,
@ -28,6 +30,15 @@ export function createDate(dateString, timezone, format) {
    return moment(new Date(dateString));
  }

+  if (TIME_AGO_STRING.test(dateString)) {
+    const fragments = TIME_AGO_STRING.exec(dateString);
+    return moment().subtract(fragments[1], fragments[2]);
+  }
+
+  if (TIME_NOW_STRING.test(dateString)) {
+    return moment();
+  }
+
  return timezone
    ? moment.tz(dateString, format || parseFormat(dateString), timezone)
    : moment(dateString, format || parseFormat(dateString));
--- a/src/cleaners/date-published.test.js
+++ b/src/cleaners/date-published.test.js
@ -34,6 +34,70 @@ describe('cleanDatePublished(dateString)', () => {
    });
    assert.equal(datePublished, '2015-08-03T16:45:00.000Z');
  });
+
+  it('can handle dates formatted as "[just|right] now"', () => {
+    // Only the part of the formatted timestamp before the "T" is compared,
+    // since the parsed value and the reference value are created at
+    // slightly different instants.
+    const dayOf = date => date.format().split('T')[0];
+
+    ['now', 'just now', 'right now'].forEach(phrase => {
+      const parsed = cleanDatePublished(phrase);
+      const actualDay = dayOf(moment(parsed));
+      const expectedDay = dayOf(moment());
+      assert.equal(actualDay, expectedDay);
+    });
+  });
+
+  it('can handle dates formatted as "[amount] [time unit] ago"', () => {
+    // Relative dates are approximate by nature:
+    // "X days ago" is not accurate down to the exact time, and
+    // "X months ago" is not accurate down to the exact day.
+    // For that reason only the part of the formatted timestamp before the
+    // "T" is compared.
+    const dayOf = date => date.format().split('T')[0];
+
+    const cases = [
+      { text: '1 hour ago', amount: 1, unit: 'hour' },
+      { text: '5 days ago', amount: 5, unit: 'days' },
+      { text: '10 months ago', amount: 10, unit: 'months' },
+    ];
+
+    cases.forEach(({ text, amount, unit }) => {
+      const parsed = cleanDatePublished(text);
+      const actualDay = dayOf(moment(parsed));
+      const expectedDay = dayOf(moment().subtract(amount, unit));
+      assert.equal(actualDay, expectedDay);
+    });
+  });
 });

 describe('cleanDateString(dateString)', () => {
--- a/src/extractors/custom/README.md
+++ b/src/extractors/custom/README.md
@ -73,6 +73,28 @@ export const ExampleExtractor = {

 This is all you'll need to know to handle most of the fields Mercury parses (titles, authors, date published, etc.). Article content is the exception.

+#### Content selectors
+
+If you pass an array as a content selector, it behaves differently from the array (attribute) selectors used for other fields. For content, an array is treated as a multi-match selector: it lets the parser combine several selectors in the result, and every occurrence of each matching selector is included.
+
+Note that all selectors in the array must match in order for this selector to trigger.
+
+```javascript
+export const ExampleExtractor = {
+    ...
+
+    // Attempt to match both the content and image
+    // before falling back to just the content
+    content: {
+      selectors: [
+        ['.parsys.content', '.__image-lead__'],
+        '.content'
+      ],
+    },
+
+    ...
+```
+
 ### Cleaning content from an article

 An article's content can be more complex than the other fields, meaning you sometimes need to do more than just provide the selector(s) in order to return clean content.
--- a/src/extractors/custom/index.js
+++ b/src/extractors/custom/index.js
@ -92,4 +92,5 @@ export * from './ici.radio-canada.ca';
 export * from './www.fortinet.com';
 export * from './www.fastcompany.com';
 export * from './blisterreview.com';
-export * from './news.mynavi.jp';
+export * from './news.mynavi.jp';
+export * from './www.reddit.com';
--- a/src/extractors/custom/www.reddit.com/index.js
+++ b/src/extractors/custom/www.reddit.com/index.js
@ -0,0 +1,56 @@
+// Custom extractor for posts on www.reddit.com.
+// The title, author, date and content selectors are all scoped to the
+// `div[data-test-id="post-content"]` container.
+export const WwwRedditComExtractor = {
+  domain: 'www.reddit.com',
+
+  title: {
+    selectors: ['div[data-test-id="post-content"] h2'],
+  },
+
+  author: {
+    selectors: ['div[data-test-id="post-content"] a[href*="user/"]'],
+  },
+
+  date_published: {
+    selectors: [
+      'div[data-test-id="post-content"] a[data-click-id="timestamp"]',
+    ],
+  },
+
+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']],
+  },
+
+  content: {
+    // Each nested array is a multi-match selector: all of its entries must
+    // match, and every match is included in the result. The list is ordered
+    // from the most specific post type to the generic fallback.
+    selectors: [
+      ['div[data-test-id="post-content"] p'], // text post
+      [
+        'div[data-test-id="post-content"] a[target="_blank"]:not([data-click-id="timestamp"])', // external link
+        'div[data-test-id="post-content"] div[data-click-id="media"]', // embedded media
+      ], // external link with media preview (YouTube, imgur album, etc...)
+      ['div[data-test-id="post-content"] div[data-click-id="media"]'], // Embedded media (Reddit video)
+      [
+        'div[data-test-id="post-content"] a[target="_blank"]:not([data-click-id="timestamp"])',
+      ], // external link
+      'div[data-test-id="post-content"]',
+    ],
+
+    transforms: {
+      // External link image preview: the picture is set as a CSS
+      // background-image on the wrapper, so copy its URL onto the single
+      // inner <img> and return that <img> in place of the wrapper.
+      'div[role="img"]': $node => {
+        const $img = $node.find('img');
+        const bgImg = $node.css('background-image');
+        // Guard the match: a background-image without a parenthesised URL
+        // (e.g. "none") would otherwise throw on `[1]` of a null match.
+        const urlMatch = bgImg && bgImg.match(/\((.*?)\)/);
+        if ($img.length === 1 && urlMatch) {
+          // Strip the optional quotes around the URL.
+          $img.attr('src', urlMatch[1].replace(/('|")/g, ''));
+          return $img;
+        }
+        return $node;
+      },
+    },
+
+    // Anything matching these selectors is removed from the extracted
+    // content.
+    clean: ['.icon'],
+  },
+};
--- a/src/extractors/custom/www.reddit.com/index.test.js
+++ b/src/extractors/custom/www.reddit.com/index.test.js
@ -0,0 +1,232 @@
+import assert from 'assert';
+import URL from 'url';
+import cheerio from 'cheerio';
+import moment from 'moment-timezone';
+
+import Mercury from 'mercury';
+import getExtractor from 'extractors/get-extractor';
+import { excerptContent } from 'utils/text';
+
+const fs = require('fs');
+
+describe('WwwRedditComExtractor', () => {
+  // Reads a saved Reddit page from disk.
+  const readFixture = path => fs.readFileSync(path);
+
+  // Parses a saved page as though it had been fetched from `uri`.
+  const parseFixture = (path, uri) =>
+    Mercury.parse(uri, { html: readFixture(path) });
+
+  // Parses a saved page and loads the extracted content into cheerio.
+  const loadContent = async (path, uri) => {
+    const { content } = await parseFixture(path, uri);
+    return cheerio.load(content || '');
+  };
+
+  // Keeps only the part of a formatted moment that precedes the "T".
+  const dayOf = date => date.format().split('T')[0];
+
+  describe('initial test case', () => {
+    let result;
+    let url;
+    beforeAll(() => {
+      url =
+        'https://www.reddit.com/r/Showerthoughts/comments/awx46q/vanilla_becoming_the_default_flavour_of_ice_cream/';
+      // Shared text-post fixture, parsed without the generic fallback.
+      result = Mercury.parse(url, {
+        html: readFixture('./fixtures/www.reddit.com/1551705199548.html'),
+        fallback: false,
+      });
+    });
+
+    it('is selected properly', () => {
+      // Sanity check: the extractor chosen for this URL is the one
+      // registered for its hostname.
+      const expectedDomain = URL.parse(url).hostname;
+      assert.equal(getExtractor(url).domain, expectedDomain);
+    });
+
+    it('returns the title', async () => {
+      const parsed = await result;
+
+      assert.equal(
+        parsed.title,
+        `Vanilla becoming the default flavour of ice cream is the greatest underdog story of all time.`
+      );
+    });
+
+    it('returns the author', async () => {
+      const parsed = await result;
+
+      assert.equal(parsed.author, 'u/benyacobi');
+    });
+
+    it('returns the date_published', async () => {
+      const parsed = await result;
+
+      // The expected value is 18 hours before the moment the test runs,
+      // so only the day part of the two timestamps is compared.
+      const actualDay = dayOf(moment(parsed.date_published));
+      const expectedDay = dayOf(moment().subtract(18, 'hours'));
+
+      assert.equal(actualDay, expectedDay);
+    });
+
+    it('returns the lead_image_url', async () => {
+      const parsed = await parseFixture(
+        './fixtures/www.reddit.com/1552069933451.html',
+        'https://www.reddit.com/r/aww/comments/aybw1m/nothing_to_see_here_human/'
+      );
+
+      assert.equal(
+        parsed.lead_image_url,
+        'https://preview.redd.it/jsc4t74psok21.jpg?auto=webp&s=9c9826487e34d399333f65beb64390206fff4125'
+      );
+    });
+
+    it('returns the content for text posts', async () => {
+      const parsed = await result;
+
+      const $ = cheerio.load(parsed.content || '');
+      const firstNodeText = $('*')
+        .first()
+        .text();
+      const first13 = excerptContent(firstNodeText, 13);
+
+      assert.equal(
+        first13,
+        'Edit: thank you for educating me about the ubiquity of vanilla. Still, none'
+      );
+    });
+
+    it('handles posts that only have a title', async () => {
+      const parsed = await parseFixture(
+        './fixtures/www.reddit.com/1552069905710.html',
+        'https://www.reddit.com/r/AskReddit/comments/axtih6/what_is_the_most_worth_it_item_you_have_ever/'
+      );
+
+      assert.equal(parsed.content, '<div></div>');
+    });
+
+    it('handles image posts', async () => {
+      const $ = await loadContent(
+        './fixtures/www.reddit.com/1552069933451.html',
+        'https://www.reddit.com/r/aww/comments/aybw1m/nothing_to_see_here_human/'
+      );
+
+      const image = $(
+        'img[src="https://preview.redd.it/jsc4t74psok21.jpg?width=960&crop=smart&auto=webp&s=54349b21ff628e8c22c053509e86ba84ff9751d3"]'
+      );
+
+      assert.equal(image.length, 1);
+    });
+
+    it('handles video posts', async () => {
+      const $ = await loadContent(
+        './fixtures/www.reddit.com/1552069947100.html',
+        'https://www.reddit.com/r/HumansBeingBros/comments/aybtf7/thanks_human/'
+      );
+
+      const video = $(
+        'video > source[src="https://v.redd.it/kwhzxoz5rok21/HLSPlaylist.m3u8"]'
+      );
+
+      assert.equal(video.length, 1);
+    });
+
+    it('handles external link posts with image preview', async () => {
+      const $ = await loadContent(
+        './fixtures/www.reddit.com/1552069958273.html',
+        'https://www.reddit.com/r/todayilearned/comments/aycizd/til_that_when_jrr_tolkiens_son_michael_signed_up/'
+      );
+
+      const link = $(
+        'a[href="https://www.1843magazine.com/culture/look-closer/tolkiens-drawings-reveal-a-wizard-at-work"]'
+      );
+      const image = $(
+        'img[src="https://b.thumbs.redditmedia.com/gWPmq95XmEzQns6B-H6_l4kBNFeuhScpVDYPjvPsdDs.jpg"]'
+      );
+
+      assert.equal(link.length, 2);
+      assert.equal(image.length, 1);
+    });
+
+    it('handles external image posts with image preview', async () => {
+      const $ = await loadContent(
+        './fixtures/www.reddit.com/1552070031501.html',
+        'https://www.reddit.com/r/gifs/comments/4vv0sa/leonardo_dicaprio_scaring_jonah_hill_on_the/'
+      );
+
+      const link = $('a[href="http://i.imgur.com/Qcx1DSD.gifv"]');
+      const image = $(
+        'img[src="https://external-preview.redd.it/sKJFPLamiRPOW5u7NTch3ykbFYMwqI5Qr0zlCINMTfU.gif?format=png8&s=56ecd472f8b8b2ee741b3b1cb76cb3a5110a85f9"]'
+      );
+
+      assert.equal(link.length, 1);
+      assert.equal(image.length, 1);
+    });
+
+    it('handles external link posts with embedded media', async () => {
+      const $ = await loadContent(
+        './fixtures/www.reddit.com/1552069973740.html',
+        'https://www.reddit.com/r/videos/comments/5gafop/rick_astley_never_gonna_give_you_up_sped_up_every/'
+      );
+
+      const link = $('a[href="https://youtu.be/dQw4w9WgXcQ"]');
+      const embed = $(
+        'iframe[src="https://www.redditmedia.com/mediaembed/5gafop?responsive=true"]'
+      );
+
+      assert.equal(link.length, 1);
+      assert.equal(embed.length, 1);
+    });
+  });
+});
--- a/src/utils/dom/constants.js
+++ b/src/utils/dom/constants.js
@ -11,6 +11,7 @@ export const KEEP_SELECTORS = [
  'iframe[src^="http://www.youtube.com"]',
  'iframe[src^="https://player.vimeo"]',
  'iframe[src^="http://player.vimeo"]',
+  'iframe[src^="https://www.redditmedia.com"]',
 ];

 // A list of tags to strip from the output if we encounter them.