Clean up tests/changes to merge into 0.3.0.dev

0.3.0.dev
Richard Harding 12 years ago
parent eefb8e1125
commit d708744822

@ -59,10 +59,11 @@ def norm_title(title):
def get_title(doc):
    """Return the normalized text of the document's <title> element.

    Falls back to '[no-title]' when the document has no <title> element
    or when that element carries no text.

    :param doc: parsed document exposing an ElementTree-style ``find``.
    :returns: ``norm_title`` applied to the title text, or '[no-title]'.
    """
    title_node = doc.find('.//title')
    # Compare against None explicitly: element truthiness reflects the
    # number of child elements, so a <title> holding only text is falsy.
    if title_node is None:
        return '[no-title]'

    title = title_node.text
    # An empty <title></title> yields None/'' here; do not normalize that.
    if not title:
        return '[no-title]'

    return norm_title(title)
@ -74,10 +75,11 @@ def add_match(collection, text, orig):
def shorten_title(doc):
title = doc.find('.//title').text
if not title:
title_node = doc.find('.//title')
if not title_node:
return ''
title = title_node.text
title = orig = norm_title(title)
candidates = set()

@ -28,14 +28,14 @@ class TestArticleOnly(unittest.TestCase):
sample,
url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
res = doc.summary()
self.assertEqual('<html><body><div><div class', res[0:27])
self.assertEqual('<html><body id="page"><div><div class', res[0:37])
def test_si_sample_html_partial(self):
    """Using the si sample, make sure we can get the article alone."""
    sample = load_sample('si-game.sample.html')
    doc = Document(
        sample,
        url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
    res = doc.summary(enclose_with_html_tag=False)
    # The partial summary is expected to open with the id="page" wrapper.
    # The slice length is derived from the expected prefix so the two
    # cannot drift apart.
    expected_prefix = '<div id="page"><div class="'
    self.assertEqual(expected_prefix, res[:len(expected_prefix)])
def test_si_sample_full_summary(self):
"""We should parse the doc and get a full summary with confidence"""
@ -50,7 +50,6 @@ class TestArticleOnly(unittest.TestCase):
'res should have an titile attrib')
self.assertTrue(hasattr(res, 'short_title'),
'res should have an short_title attrib')
self.assertEqual('<div><div class="', res.html[0:17])
self.assertEqual('<div id="page"><div class="', res.html[0:27])
self.assertTrue(res.confidence > 50,
'The confidence score should be larger than 50: ' + str(res.confidence))

@ -233,3 +233,21 @@ class TestMultiPage(unittest.TestCase):
for i in deletions:
print('unexpected deletion: %s' % i.xpath('string()'))
self.fail('readability result does not match expected')
class TestIsSuspectedDuplicate(unittest.TestCase):
    """Check ``r.is_suspected_duplicate`` against the regression fixtures."""

    def setUp(self):
        """Load the article fixture that every candidate page is compared to."""
        super(TestIsSuspectedDuplicate, self).setUp()
        article_markup = load_regression_data('duplicate-page-article.html')
        self._article = r.fragment_fromstring(article_markup)

    def test_unique(self):
        """The 'unique' fixture must not be reported as a duplicate."""
        candidate = r.fragment_fromstring(
            load_regression_data('duplicate-page-unique.html'))
        verdict = r.is_suspected_duplicate(self._article, candidate)
        self.assertFalse(verdict)

    def test_duplicate(self):
        """The 'duplicate' fixture must be reported as a duplicate."""
        candidate = r.fragment_fromstring(
            load_regression_data('duplicate-page-duplicate.html'))
        verdict = r.is_suspected_duplicate(self._article, candidate)
        self.assertTrue(verdict)

@ -20,5 +20,5 @@ def process_article(article):
sample = load_sample(article)
doc = Document(sample)
res = doc.summary()
failed_msg = "Failed to process the article: " + article
assert '<html><body><div><div class' == res[0:27], failed_msg
failed_msg = "Failed to process the article: " + res[0:37]
assert '<html><body id="page"><div><div class' == res[0:37], failed_msg

Loading…
Cancel
Save