Refactor code for easier testing

Conflicts:

	src/readability_lxml/readability.py
0.3.0.dev
Jerry Charumilind 13 years ago committed by Richard Harding
parent 8cadc4a958
commit 3fe416a5d1

File diff suppressed because it is too large Load Diff

@ -0,0 +1,22 @@
import urllib2
class UrlFetch(object):
    """
    A class for fetching URLs.  This provides a layer of abstraction that can
    be easily replaced for testing.
    """

    def urlread(self, url):
        """
        Open ``url`` with ``urllib2.urlopen`` and return the full response
        body as returned by the response's ``read()``.

        Any exception raised while opening or reading propagates to the
        caller unchanged.
        """
        response = urllib2.urlopen(url)
        try:
            return response.read()
        finally:
            # Close explicitly instead of relying on garbage collection to
            # release the underlying connection.
            response.close()
class MockUrlFetch(UrlFetch):
    """
    A UrlFetch replacement that serves page bodies from local files instead
    of opening the URL.

    ``urldict`` maps each URL to the path of the file holding its content.
    """

    def __init__(self, urldict):
        self._urldict = urldict

    def urlread(self, url):
        """
        Return the contents of the file registered for ``url``.

        The path is looked up with ``self._urldict[url]``, so a URL that was
        not registered raises whatever that lookup raises (``KeyError`` for
        a plain dict).
        """
        with open(self._urldict[url], 'r') as fixture:
            contents = fixture.read()
        return contents

@ -0,0 +1,52 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<title>A Simple Multi-Page Article For Testing : Page 2</title>
</head>
<body>
<h1>A Simple Multi-Page Article For Testing : Page 2</h1>
<p>
Nunc non blandit velit. Maecenas suscipit sem sed velit tristique
facilisis. Quisque condimentum, nisi vitae dictum euismod, diam
risus vehicula nibh, in scelerisque lorem risus et risus. Aliquam
erat volutpat. Pellentesque habitant morbi tristique senectus et
netus et malesuada fames ac turpis egestas. Donec blandit venenatis
feugiat. Ut quis turpis ac urna consectetur sagittis. Vestibulum
aliquet eros et orci placerat vitae tempus tellus pretium. Quisque
rutrum sapien quis nibh facilisis quis posuere ipsum elementum. In
ac pretium justo. Sed egestas luctus mollis. Donec rutrum leo a
turpis facilisis commodo. Nam quis quam eget mi malesuada
scelerisque. Pellentesque semper condimentum sagittis. Nam
lobortis, tortor ut placerat viverra, ante felis vehicula sem,
blandit ultricies purus urna eget elit. Pellentesque habitant morbi
tristique senectus et netus et malesuada fames ac turpis egestas.
Sed vel nulla sollicitudin dolor adipiscing dapibus aliquam vitae
leo. Phasellus at turpis tempus lectus pellentesque faucibus.
</p>
<p>
Quisque egestas congue metus quis semper. Integer in ornare nunc.
Nunc in est eget risus pulvinar tincidunt. Nullam eu tempus tortor.
Suspendisse potenti. Aliquam erat volutpat. Praesent sem leo,
molestie a dignissim eget, aliquet sit amet est. Suspendisse sed
libero in urna tincidunt viverra. Maecenas posuere risus non elit
adipiscing a tristique nibh aliquet. Nullam varius risus vitae
turpis lacinia pharetra bibendum magna aliquam. Nam consectetur
mattis lectus, vitae hendrerit lectus iaculis ut. Curabitur commodo
pharetra nibh mollis pulvinar. Nulla in metus dui, vitae ultrices
nibh. Cum sociis natoque penatibus et magnis dis parturient montes,
nascetur ridiculus mus. Cras sed condimentum mi. Morbi vitae velit
in neque tincidunt imperdiet quis quis orci. Proin molestie, erat
convallis vulputate consectetur, diam odio interdum arcu, non
semper neque ante a dolor.
</p>
<ul id="pageNumbers">
<li>
<a title="Page 1" href="/article.html">1</a>
</li>
<li> 2 </li>
<li>
<a title="Page 3" href="/article.html?pagewanted=3">3</a>
</li>
</ul>
</body>
</html>

@ -0,0 +1,60 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<title>A Simple Multi-Page Article For Testing</title>
</head>
<body>
<h1>A Simple Multi-Page Article For Testing</h1>
<p>
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla et
laoreet ligula. Nulla facilisi. Morbi condimentum molestie enim in
fermentum. Phasellus sit amet vehicula turpis. Sed eu dolor tortor,
et accumsan purus. Aliquam velit nisl, facilisis quis suscipit in,
porttitor at lorem. Ut adipiscing suscipit augue, id interdum arcu
ultricies et. Etiam risus sapien, suscipit et ultricies vel,
suscipit posuere velit. Proin est orci, sollicitudin at luctus
feugiat, consectetur a justo. Etiam nec sem vel massa consectetur
vulputate non interdum est. Donec sem dui, ultricies a adipiscing
eu, placerat sed sem.
</p>
<p>
Nunc lacinia varius justo, at lacinia felis ultricies vel. Proin
vestibulum vehicula eleifend. Ut vitae risus eros. Pellentesque
habitant morbi tristique senectus et netus et malesuada fames ac
turpis egestas. In hac habitasse platea dictumst. Vivamus magna
libero, blandit vitae hendrerit porta, dapibus eget eros. Nunc
turpis felis, facilisis eu vestibulum sed, porta a ipsum. Vivamus
est velit, molestie sed molestie quis, tincidunt a diam. Quisque et
neque a ante fermentum tempus in at nunc. Nunc sit amet egestas
nisi.
</p>
<p>
Proin in lacus dolor, sit amet molestie quam. Morbi nisi turpis,
pharetra at consequat tristique, convallis nec turpis. Vestibulum
sit amet magna vitae sem bibendum tincidunt. Maecenas quis tortor
eget velit mollis tempor vel a nisl. Vivamus posuere tristique
ante, cursus rhoncus tortor malesuada eu. Praesent faucibus viverra
orci ac porttitor. Maecenas dui purus, aliquam sed aliquam nec,
dignissim vitae libero. Nunc at mauris et ante accumsan
pellentesque. In placerat pretium suscipit. Phasellus tellus est,
venenatis eu consectetur non, vehicula vel metus. Curabitur
venenatis sem fringilla ante elementum eget faucibus nulla tempus.
Aenean convallis sapien et dolor lobortis interdum. Phasellus odio
risus, sagittis ut elementum ut, porttitor non libero. Integer
fringilla magna quis augue dapibus malesuada. Nulla consectetur
nisi mi. Suspendisse faucibus lobortis ornare. Nunc venenatis
tortor in urna pulvinar pulvinar. Sed et mi nec justo hendrerit
cursus ac nec mauris. Morbi et ante a lorem iaculis rutrum vitae eu
massa.
</p>
<ul id="pageNumbers">
<li> 1 </li>
<li>
<a title="Page 2" href="/article.html?pagewanted=2">2</a>
</li>
<li>
<a title="Page 3" href="/article.html?pagewanted=3">3</a>
</li>
</ul>
</body>
</html>

@ -3,6 +3,7 @@ import unittest
from helpers import load_regression_data
from readability_lxml.readability import Document
from readability_lxml import readability as r
from readability_lxml import urlfetch
class TestReadabilityDocument(unittest.TestCase):
@ -144,14 +145,49 @@ class TestFindBaseUrl(unittest.TestCase):
class TestFindNextPageLink(unittest.TestCase):
    """Checks ``r.find_next_page_link`` against saved regression pages."""

    def _test_page(self, url, html_path, expected):
        """
        Load the regression file ``html_path``, parse it with ``url`` as its
        address, and assert that the next-page link found is ``expected``.
        """
        html = load_regression_data(html_path)
        doc = r.parse(html, url)
        # Seed the set of parsed URLs with the page's own URL.
        parsed_urls = {url}
        actual = r.find_next_page_link(parsed_urls, url, doc)
        self.assertEqual(expected, actual)

    def test_basic(self):
        self._test_page(
            'http://basic.com/article.html',
            'basic-multi-page.html',
            'http://basic.com/article.html?pagewanted=2'
        )

    def test_nytimes(self):
        # This better work for the New York Times.
        self._test_page(
            'http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html',
            'nytimes-next-page.html',
            'http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=2&_r=1'
        )
class TestMultiPage(unittest.TestCase):
    """
    Runs the whole readable-page pipeline on a multi-page article.

    The article used is deliberately simple so that the test keeps passing
    when the algorithm is tuned.
    """

    def _make_basic_urldict(self):
        """
        Build the URL -> fixture filename mapping for pages 2 and 3 of the
        basic article.
        """
        return {
            'http://basic.com/article.html?pagewanted=%s' % page:
                'basic-multi-page-%s.html' % page
            for page in ('2', '3')
        }

    def test_basic(self):
        html = load_regression_data('basic-multi-page.html')
        # Follow-up pages are served by the mock fetcher from its mapping.
        fetcher = urlfetch.MockUrlFetch(self._make_basic_urldict())
        doc = Document(
            html,
            url='http://basic.com/article.html',
            urlfetch=fetcher
        )
        doc.summary()

Loading…
Cancel
Save