feat: added twitter custom extractor

Squashed commit of the following:

commit 8116f14364869b72a8afabfcb44b2ac154caed96
Author: Adam Pash <adam.pash@gmail.com>
Date:   Thu Sep 15 16:27:27 2016 -0400

    feat: added twitter custom extractor

commit e478eb1b0bcdcb65fdd5fa64e37be92b6defd702
Author: Adam Pash <adam.pash@gmail.com>
Date:   Thu Sep 15 16:22:54 2016 -0400

    fix: made custom extractors and cleaners adhere to underscore keys
pull/3/head
Adam Pash 8 years ago
parent d60d396c98
commit 396313aeae

4
.gitignore vendored

@ -1,3 +1,7 @@
node_modules
build
npm-debug.log
dist
TODO.md
fixtures
read

76
dist/iris.js vendored

@ -810,7 +810,7 @@ var NYMagExtractor = {
selectors: ['.lede-feature-teaser']
},
datePublished: {
date_published: {
selectors: ['time.article-timestamp[datetime]', 'time.article-timestamp']
}
};
@ -840,7 +840,7 @@ var BloggerExtractor = {
selectors: ['h2.title']
},
datePublished: {
date_published: {
selectors: ['span.publishdate']
}
};
@ -872,16 +872,55 @@ var WikipediaExtractor = {
selectors: ['h2.title']
},
datePublished: {
date_published: {
selectors: ['#footer-info-lastmod']
}
};
// Custom extraction rules for twitter.com permalink (single tweet) pages.
var TwitterExtractor = {
  domain: 'twitter.com',

  content: {
    transforms: {
      // There are no tidy content selectors on Twitter, so the content
      // selector below matches practically the entire page. This transform
      // reshapes that page: every tweet is moved into one new wrapper
      // element, which then takes the place of the matched node. Cleaning
      // happens afterwards.
      '.permalink[role=main]': function permalinkRoleMain($node, $) {
        var $wrapper = $('<div id="TWEETS_GO_HERE"></div>');
        $wrapper.append($node.find('.tweet'));
        $node.replaceWith($wrapper);
      },

      // The "@" of a mention sits inside an <s> tag, which would render
      // struck through; turn those into plain spans.
      s: 'span'
    },

    selectors: [
      '.permalink[role=main]'
    ],

    // Opt out of the stock cleaner for this field.
    defaultCleaner: false,

    // Page furniture to strip from the collected tweets.
    clean: [
      '.stream-item-footer',
      'button',
      '.tweet-details-fixer'
    ]
  },

  author: {
    selectors: [
      '.tweet.permalink-tweet .username'
    ]
  },

  date_published: {
    selectors: [
      '.tweet.permalink-tweet .metadata'
    ]
  }
};
var Extractors = {
'nymag.com': NYMagExtractor,
'blogspot.com': BloggerExtractor,
'wikipedia.org': WikipediaExtractor
'wikipedia.org': WikipediaExtractor,
'twitter.com': TwitterExtractor
};
// Spacer images to be removed
@ -2470,9 +2509,9 @@ function resolveSplitTitle(title) {
var Cleaners = {
author: cleanAuthor,
leadImageUrl: clean$1,
lead_image_url: clean$1,
dek: cleanDek,
datePublished: cleanDatePublished,
date_published: cleanDatePublished,
content: extractCleanNode,
title: cleanTitle
};
@ -3842,7 +3881,9 @@ function select(opts) {
$content = transformElements($content, $, extractionOpts);
$content = cleanBySelectors($content, $, extractionOpts);
$content = Cleaners[type]($content, opts);
if (defaultCleaner) {
$content = Cleaners[type]($content, opts);
}
return $.html($content);
}
@ -4029,12 +4070,13 @@ var Iris = {
_ref$fetchAllPages = _ref.fetchAllPages;
fetchAllPages = _ref$fetchAllPages === undefined ? true : _ref$fetchAllPages;
Extractor = getExtractor(url);
// console.log(`Using extractor for ${Extractor.domain}`);
_context.next = 6;
console.log('Using extractor for ' + Extractor.domain);
_context.next = 7;
return Resource.create(url, html);
case 6:
case 7:
$ = _context.sent;
html = $.html();
@ -4052,11 +4094,11 @@ var Iris = {
// Fetch more pages if next_page_url found
if (!(fetchAllPages && next_page_url)) {
_context.next = 19;
_context.next = 20;
break;
}
_context.next = 16;
_context.next = 17;
return collectAllPages({
Extractor: Extractor,
next_page_url: next_page_url,
@ -4068,21 +4110,21 @@ var Iris = {
url: url
});
case 16:
case 17:
result = _context.sent;
_context.next = 20;
_context.next = 21;
break;
case 19:
case 20:
result = _extends({}, result, {
total_pages: 1,
rendered_pages: 1
});
case 20:
case 21:
return _context.abrupt('return', result);
case 21:
case 22:
case 'end':
return _context.stop();
}

2
dist/iris.js.map vendored

File diff suppressed because one or more lines are too long

@ -7,9 +7,9 @@ import cleanTitle from './title';
const Cleaners = {
author: cleanAuthor,
leadImageUrl: cleanImage,
lead_image_url: cleanImage,
dek: cleanDek,
datePublished: cleanDatePublished,
date_published: cleanDatePublished,
content: cleanContent,
title: cleanTitle,
};

@ -1,11 +1,13 @@
import NYMagExtractor from './custom/nymag.com';
import BloggerExtractor from './custom/blogspot.com';
import WikipediaExtractor from './custom/wikipedia.org';
import TwitterExtractor from './custom/twitter.com';
// Registry of the site-specific (custom) extractors, keyed by the domain
// string each one applies to.
// NOTE(review): the lookup code is not visible here; presumably the key must
// match the domain derived from the URL being parsed. Confirm before adding
// entries in a different key format.
const Extractors = {
'nymag.com': NYMagExtractor,
'blogspot.com': BloggerExtractor,
'wikipedia.org': WikipediaExtractor,
'twitter.com': TwitterExtractor,
};
export default Extractors;

@ -30,7 +30,7 @@ const BloggerExtractor = {
],
},
datePublished: {
date_published: {
selectors: [
'span.publishdate',
],

@ -57,7 +57,7 @@ const NYMagExtractor = {
],
},
datePublished: {
date_published: {
selectors: [
'time.article-timestamp[datetime]',
'time.article-timestamp',

@ -0,0 +1,50 @@
// Page-level rewrite used as the transform for the content selector.
// Twitter offers no tidy content selectors, so the selector matches nearly
// the whole page; this pulls every tweet out of it, parks them in a single
// new wrapper element and puts that wrapper where the page node used to be.
// Cleaning runs after this step.
const collectTweets = ($node, $) => {
  const $tweets = $node.find('.tweet');
  const $wrapper = $('<div id="TWEETS_GO_HERE"></div>');

  $wrapper.append($tweets);
  $node.replaceWith($wrapper);
};

// Custom extraction rules for twitter.com permalink (single tweet) pages.
const TwitterExtractor = {
  domain: 'twitter.com',

  content: {
    transforms: {
      '.permalink[role=main]': collectTweets,

      // The "@" of a mention is wrapped in <s>, which would show up as
      // strikethrough text, so those tags become plain spans.
      s: 'span',
    },

    selectors: ['.permalink[role=main]'],

    // Opt out of the stock cleaner for this field.
    defaultCleaner: false,

    // Page furniture to strip from the collected tweets.
    clean: ['.stream-item-footer', 'button', '.tweet-details-fixer'],
  },

  author: {
    selectors: ['.tweet.permalink-tweet .username'],
  },

  date_published: {
    selectors: ['.tweet.permalink-tweet .metadata'],
  },
};
export default TwitterExtractor;

@ -0,0 +1,18 @@
import assert from 'assert';
import fs from 'fs';
import Iris from 'iris';
// Integration test for the twitter.com custom extractor: parses a saved
// tweet page and checks the extracted title, author and publication date.
describe('TwitterExtractor', () => {
  // NOTE(review): the description text looks carried over from another
  // extractor's test; the fixture here is a single tweet.
  // The callback is declared with the standard `async () => {}` form;
  // `(async) () => {}` is not valid JavaScript and `await` would not parse.
  it('works with a feature story', async () => {
    // The HTML comes from a saved fixture file and is handed to the parser
    // together with the tweet's URL.
    const html = fs.readFileSync('./fixtures/twitter.com/tweet.html');
    const uri = 'https://twitter.com/KingBeyonceStan/status/745276948213968896';

    const { title, author, date_published } = await Iris.parse(uri, html);

    assert.equal(title, 'Lina Morgana on Twitter');
    assert.equal(author, '@KingBeyonceStan');
    assert.equal(date_published, '2016-06-21T08:27:00.000Z');
  });
});

@ -33,7 +33,7 @@ const WikipediaExtractor = {
],
},
datePublished: {
date_published: {
selectors: [
'#footer-info-lastmod',
],

@ -73,7 +73,9 @@ export function select(opts) {
$content = transformElements($content, $, extractionOpts);
$content = cleanBySelectors($content, $, extractionOpts);
$content = Cleaners[type]($content, opts);
if (defaultCleaner) {
$content = Cleaners[type]($content, opts);
}
return $.html($content);
}

@ -158,7 +158,7 @@ describe('select(opts)', () => {
`;
const $ = cheerio.load(html);
const opts = {
type: 'datePublished',
type: 'date_published',
$,
extractionOpts: {
selectors: ['time[datetime]'],

Loading…
Cancel
Save