Tweaks after the code reorg

0.3.0.dev
Richard Harding 12 years ago
parent f8315d011c
commit 5cb4b8b8c0

@ -69,6 +69,7 @@ Optional `Document` keyword argument:
- attributes:
- debug: output debug messages
- min_text_length:
- multipage: whether to try to parse and combine multi-page articles
- retry_length:
- url: will allow adjusting links to be absolute

@ -1,6 +1,8 @@
import os
import unittest
from helpers import load_regression_data
from helpers import REGRESSION_DATA
from readability_lxml.readability import Document
from readability_lxml import readability as r
from readability_lxml import urlfetch
@ -143,13 +145,28 @@ class TestFindBaseUrl(unittest.TestCase):
self._run_urls(specs)
class TestMultiPageHelpers(unittest.TestCase):
    """Tests for the helper functions used by multi-page support."""

    def test_find_next_page_url(self):
        """A lone pagination anchor in the body is found as the next page."""
        # Imported locally, exactly as the original test does.
        from lxml.html import document_fromstring

        markup = '\n<html><body><a href="/?page=2">2</a></body></html>\n'
        tree = document_fromstring(markup)

        # Called with an empty set of parsed URLs and no base URL; the test
        # expects the anchor's href to come back unchanged.
        next_url = r.find_next_page_url(set(), None, tree)
        self.assertEqual(
            '/?page=2',
            next_url,
            'Should find out page 2 url in the body.',
        )
class TestFindNextPageLink(unittest.TestCase):
def _test_page(self, url, html_path, expected):
    """Assert the next-page URL found in a regression fixture.

    Loads the fixture at ``html_path`` via ``load_regression_data``, parses
    it with ``r.parse`` using ``url``, and checks that
    ``r.find_next_page_url`` returns ``expected``.

    Args:
        url: URL of the page under test; it is also the only member of the
            set of already-parsed URLs passed to the lookup.
        html_path: fixture path handed to ``load_regression_data``.
        expected: value the lookup is expected to return.
    """
    html = load_regression_data(html_path)
    doc = r.parse(html, url)
    parsed_urls = {url}
    # Single lookup through find_next_page_url. A preceding call through
    # the older find_next_page_link name only produced a value that was
    # overwritten straight away, so it is not made here.
    actual = r.find_next_page_url(parsed_urls, url, doc)
    self.assertEqual(expected, actual)
def test_basic(self):
@ -178,7 +195,8 @@ class TestMultiPage(unittest.TestCase):
def _make_basic_urldict(self):
    """Map each follow-up page URL of the basic article to its fixture.

    Returns:
        A dict covering pages '2' and '3'. Keys are the page URLs; values
        are the fixture file names joined onto ``REGRESSION_DATA`` with
        ``os.path.join``.
    """
    url_fmt = 'http://basic.com/article.html?pagewanted=%s'
    file_fmt = 'basic-multi-page-%s.html'
    # Built in one pass; only the REGRESSION_DATA-joined paths are ever
    # returned, so no intermediate list of bare file names is created.
    return {
        url_fmt % page: os.path.join(REGRESSION_DATA, file_fmt % page)
        for page in ('2', '3')
    }
def test_basic(self):
@ -187,7 +205,11 @@ class TestMultiPage(unittest.TestCase):
fetcher = urlfetch.MockUrlFetch(urldict)
options = {
'url': 'http://basic.com/article.html',
'multipage': True,
'urlfetch': fetcher
}
doc = Document(html, **options)
doc.summary()
res = doc.summary()
self.assertIn('Page 2', res, 'Should find the page 2 heading')
self.assertIn('Page 3', res, 'Should find the page 3 heading')

Loading…
Cancel
Save