feat: added twitter custom extractor
Squashed commit of the following: commit 8116f14364869b72a8afabfcb44b2ac154caed96 Author: Adam Pash <adam.pash@gmail.com> Date: Thu Sep 15 16:27:27 2016 -0400 feat: added twitter custom extractor commit e478eb1b0bcdcb65fdd5fa64e37be92b6defd702 Author: Adam Pash <adam.pash@gmail.com> Date: Thu Sep 15 16:22:54 2016 -0400 fix: made custom extractors and cleaners adhere to underscore keys
pull/3/head
parent
d60d396c98
commit
396313aeae
@ -1,3 +1,7 @@
|
||||
node_modules
|
||||
build
|
||||
npm-debug.log
|
||||
dist
|
||||
TODO.md
|
||||
fixtures
|
||||
read
|
||||
|
File diff suppressed because one or more lines are too long
@ -1,11 +1,13 @@
|
||||
import NYMagExtractor from './custom/nymag.com';
|
||||
import BloggerExtractor from './custom/blogspot.com';
|
||||
import WikipediaExtractor from './custom/wikipedia.org';
|
||||
import TwitterExtractor from './custom/twitter.com';
|
||||
|
||||
// Registry of site-specific custom extractors, keyed by the domain
// string each extractor is registered under.
const Extractors = {
  'nymag.com': NYMagExtractor,
  'blogspot.com': BloggerExtractor,
  'wikipedia.org': WikipediaExtractor,
  'twitter.com': TwitterExtractor,
};
|
||||
|
||||
export default Extractors;
|
||||
|
@ -0,0 +1,50 @@
|
||||
// Custom extractor configuration for twitter.com pages.
// Each top-level key other than `domain` describes how one field
// (content, author, date_published) is located in the page.
const TwitterExtractor = {
  domain: 'twitter.com',

  content: {
    transforms: {
      // We're transforming essentially the whole page here.
      // Twitter doesn't have nice selectors, so our initial
      // selector grabs the whole page, then we're re-writing
      // it to fit our needs before we clean it up.
      //
      // $node: the element matched by the selector key.
      // $: factory used to build the replacement container element.
      '.permalink[role=main]': ($node, $) => {
        const tweets = $node.find('.tweet');
        const $tweetContainer = $('<div id="TWEETS_GO_HERE"></div>');

        // Gather every tweet into the fresh container, then swap the
        // container in for the original node.
        $tweetContainer.append(tweets);
        $node.replaceWith($tweetContainer);
      },

      // Twitter wraps the "@" of a username in an <s> tag, which
      // renders as a strikethrough, so <s> is mapped to <span>.
      s: 'span',
    },

    selectors: [
      '.permalink[role=main]',
    ],

    // NOTE(review): presumably opts this extractor out of the default
    // content cleaning pass - confirm against the consumer of this flag.
    defaultCleaner: false,

    // Selectors listed for cleaning out of the extracted content.
    clean: [
      '.stream-item-footer',
      'button',
      '.tweet-details-fixer',
    ],
  },

  author: {
    selectors: [
      '.tweet.permalink-tweet .username',
    ],
  },

  date_published: {
    selectors: [
      '.tweet.permalink-tweet .metadata',
    ],
  },
};
|
||||
|
||||
|
||||
export default TwitterExtractor;
|
@ -0,0 +1,18 @@
|
||||
import assert from 'assert';
|
||||
import fs from 'fs';
|
||||
|
||||
import Iris from 'iris';
|
||||
|
||||
// Fixture-driven test: parses a saved tweet page with Iris.parse and
// checks the extracted title, author and publication date.
describe('TwitterExtractor', () => {
  // The callback must be a real async arrow function (`async () => {}`)
  // for the `await` below to be legal.
  it('works with a feature story', async () => {
    const html = fs.readFileSync('./fixtures/twitter.com/tweet.html');
    const uri = 'https://twitter.com/KingBeyonceStan/status/745276948213968896';

    const { title, author, date_published } = await Iris.parse(uri, html);

    assert.equal(title, 'Lina Morgana on Twitter');
    assert.equal(author, '@KingBeyonceStan');
    assert.equal(date_published, '2016-06-21T08:27:00.000Z');
  });
});
|
||||
|
Loading…
Reference in New Issue