You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
90 lines
2.5 KiB
JavaScript
90 lines
2.5 KiB
JavaScript
/**
 * Custom extractor definition for pages served from medium.com.
 *
 * Each field lists the selectors used to locate its value. An entry is either
 * a plain CSS selector string or a `[selector, attribute]` pair (presumably
 * telling the consumer which attribute of the matched node to read — confirm
 * against the extractor runtime). `content` additionally declares per-selector
 * `transforms`, each called with a matched node wrapper (`$node`), and a
 * `clean` list of selectors to strip from the result.
 */
export const MediumExtractor = {
  domain: 'medium.com',

  title: {
    selectors: ['h1', ['meta[name="og:title"]', 'value']],
  },

  author: {
    selectors: [['meta[name="author"]', 'value']],
  },

  content: {
    selectors: ['article'],

    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {
      // Allow drop cap character: a span holding exactly one letter or
      // parenthesis is replaced by its bare text.
      'section span:first-of-type': $node => {
        const $text = $node.html();
        if ($text.length === 1 && /^[a-zA-Z()]+$/.test($text)) {
          $node.replaceWith($text);
        }
      },

      // Re-write lazy-loaded youtube videos
      iframe: $node => {
        // Captures the video id from an embed.ly-proxied YouTube thumbnail URL.
        // The id class includes '-' because YouTube ids may contain dashes,
        // which `\w` alone does not match.
        const ytRe = /https:\/\/i\.embed\.ly\/.+url=https:\/\/i\.ytimg\.com\/vi\/([\w-]+)\//;
        const $parent = $node.parents('figure');

        let thumb = '';
        try {
          thumb = decodeURIComponent($node.attr('data-thumbnail') || '');
        } catch (err) {
          // Malformed percent-encoding: treat it as "no usable thumbnail" so
          // the figure is dropped below instead of aborting the transform.
          thumb = '';
        }

        const match = thumb.match(ytRe);
        if (match) {
          const youtubeId = match[1];
          $node.attr('src', `https://www.youtube.com/embed/${youtubeId}`);
          // Grab the caption before emptying the figure so it can be re-added.
          const $caption = $parent.find('figcaption');
          $parent.empty().append([$node, $caption]);
          return;
        }

        // If we can't draw the YouTube preview, remove the figure.
        $parent.remove();
      },

      // rewrite figures to pull out image and caption, remove rest
      figure: $node => {
        // ignore if figure has an iframe
        if ($node.find('iframe').length > 0) return;

        // Keep only the last image found inside the figure.
        const $img = $node.find('img').slice(-1)[0];
        const $caption = $node.find('figcaption');

        $node.empty().append([$img, $caption]);
      },

      // Remove any smaller images that did not get caught by the generic image
      // cleaner (author photo 48px, leading sentence images 79px, etc.).
      img: $node => {
        // A missing or non-numeric width parses to NaN, which is never < 100,
        // so such images are kept.
        const width = parseInt($node.attr('width'), 10);
        if (width < 100) $node.remove();
      },
    },

    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['span a', 'svg'],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  dek: null,

  next_page_url: {
    selectors: [
      // enter selectors
    ],
  },

  excerpt: {
    selectors: [
      // enter selectors
    ],
  },
};
|