import os
import unittest

from readability import Document
import timeout_decorator


# Directory of sample documents, resolved relative to this module's location.
SAMPLES = os.path.join(os.path.dirname(__file__), "samples")


def load_sample(filename: str) -> str:
    """Helper to get the content out of the sample files.

    Args:
        filename: Name of a file inside the ``SAMPLES`` directory.

    Returns:
        The complete content of the file, read in text mode.
    """
    # No explicit encoding is passed to open(), so the platform default
    # text encoding is used when decoding the sample.
    with open(os.path.join(SAMPLES, filename)) as f:
        html = f.read()
    return html


class TestArticleOnly(unittest.TestCase):
    """The option to not get back a full html doc should work

    Given a full html document, the call can request just divs of processed
    content. In this way the developer can then wrap the article however they
    want in their own view or application.
    """

    def test_si_sample(self):
        """Using the si sample, load article with only opening body element"""
        sample = load_sample("si-game.sample.html")
        doc = Document(
            sample,
            url="http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html",
        )
        res = doc.summary()
        # NOTE(review): the string literal opened on the next line is never
        # closed, and the lines that follow hold only quote characters, a run
        # of digits and two orphaned statements with no visible ``def``.
        # Presumably the text inside the literals (and possibly whole test
        # definitions) was lost from this copy -- TODO restore from the
        # upstream file. The span is kept exactly as found.
        self.assertEqual("

' " " "

1234567890123456789012345

" " " "" ) doc = Document(sample) doc.summary()

    def test_correct_cleanup(self):
        """Summary must contain "punctuation" but neither "comment" nor "aside"."""
        # NOTE(review): the sample below is plain text with no markup;
        # presumably tags were stripped from this copy -- confirm against the
        # upstream file before trusting the assertions.
        sample = """

test section

Lot of text here.

More text is written here, and contains punctuation and dots.

spam spam spam

The comment is also helpful, but it's still not the correct item to be extracted.

It's even longer than the article itself!"

"""
        doc = Document(sample)
        s = doc.summary()
        assert "punctuation" in s
        assert not "comment" in s
        assert not "aside" in s

    # Many spaces make some regexes run forever
    @timeout_decorator.timeout(seconds=3, use_signals=False)
    def test_many_repeated_spaces(self):
        """Summarise a sample containing 1,000,000 consecutive spaces.

        The decorator limits the test to 3 seconds; the summary must still
        contain "foo".
        """
        long_space = " " * 1000000
        # NOTE(review): the two literals around ``long_space`` span several
        # lines inside ordinary double quotes; presumably their markup was
        # stripped from this copy -- TODO restore. Kept exactly as found.
        sample = "

foo" + long_space + "

"
        doc = Document(sample)
        s = doc.summary()
        assert "foo" in s

    def test_not_self_closing(self):
        """Compare the summary of a small sample with an expected literal."""
        # NOTE(review): both literals below contain only "foobar" surrounded
        # by blank lines; presumably the markup they held was stripped from
        # this copy -- TODO restore. Kept exactly as found.
        sample = '

foobar

'
        doc = Document(sample)
        assert (
            '

foobar

'
            == doc.summary()
        )

    def test_utf8_kanji(self):
        """Using the UTF-8 kanji sample, load article which is written in kanji"""
        sample = load_sample("utf-8-kanji.sample.html")
        doc = Document(sample)
        # No assertion on ``res`` is visible here; as shown, the test only
        # exercises summary() on this sample.
        res = doc.summary()