Fix the multipage test to compare against the actual expected content

0.3.0.dev
Richard Harding 12 years ago
parent 816c66482e
commit cfc6f94634

@ -9,7 +9,6 @@ import urlfetch
from collections import namedtuple
from lxml.etree import tostring
from lxml.etree import tounicode
from lxml.html.diff import htmldiff
from lxml.html import document_fromstring
from lxml.html import fragment_fromstring
@ -448,7 +447,11 @@ def get_raw_article(candidates, best_candidate, enclose_with_html_tag=True):
# We don't want to append directly to output, but the div
# in html->body->div
if enclose_with_html_tag:
output.getchildren()[0].getchildren()[0].append(sibling)
if sibling.tag == 'body':
for elem in sibling.getchildren():
output.getchildren()[0].getchildren()[0].append(elem)
else:
output.getchildren()[0].getchildren()[0].append(sibling)
else:
output.append(sibling)
@ -824,8 +827,20 @@ def append_next_page(parsed_urls, page_url, doc, options):
# page_doc is a singular element containing the page article elements. We
# want to add its children to the main article document to which we are
# appending a page.
for elem in page_doc:
doc.append(elem)
if doc.tag == 'html':
children = doc.getchildren()
if children[0].tag == 'head':
import ipdb; ipdb.set_trace()
for elem in page_doc:
doc.getchildren()[1].append(elem)
else:
import ipdb; ipdb.set_trace()
for elem in page_doc:
doc.getchildren()[0].append(elem)
else:
import ipdb; ipdb.set_trace()
for elem in page_doc:
doc.append(elem)
if next_page_url is not None:
append_next_page(parsed_urls, next_page_url, doc, options)

@ -1,6 +1,9 @@
import os
import unittest
from lxml.html import document_fromstring
from lxml.html.diff import htmldiff
from helpers import load_regression_data
from helpers import REGRESSION_DATA
from readability_lxml.readability import Document
@ -209,7 +212,24 @@ class TestMultiPage(unittest.TestCase):
'urlfetch': fetcher
}
doc = Document(html, **options)
res = doc.summary()
res = doc.summary_with_metadata()
self.assertIn('Page 2', res.html, 'Should find the page 2 heading')
self.assertIn('Page 3', res.html, 'Should find the page 3 heading')
expected_html = load_regression_data('basic-multi-page-expected.html')
diff_html = htmldiff(expected_html, res.html)
diff_doc = document_fromstring(diff_html)
insertions = diff_doc.xpath('//ins')
deletions = diff_doc.xpath('//del')
if len(insertions) != 0:
for i in insertions:
print('unexpected insertion: %s' % i.xpath('string()'))
self.fail('readability result does not match expected')
self.assertIn('Page 2', res, 'Should find the page 2 heading')
self.assertIn('Page 3', res, 'Should find the page 3 heading')
if len(deletions) != 0:
for i in deletions:
print('unexpected deletion: %s' % i.xpath('string()'))
self.fail('readability result does not match expected')

@ -1,60 +0,0 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<title>A Simple Multi-Page Article For Testing : Page 3</title>
</head>
<body>
<h1>A Simple Multi-Page Article For Testing : Page 3</h1>
<p>
Nullam laoreet, nibh non faucibus dictum, tellus libero varius
erat, lobortis varius est massa quis metus. Donec vitae justo
lacus, nec convallis metus. Suspendisse potenti. Nunc et rutrum
justo. Maecenas ultrices ipsum in magna fermentum eleifend. Fusce
sagittis pretium aliquam. Vestibulum et gravida lorem. Sed turpis
quam, placerat ac ultrices eu, tempor sit amet elit. Curabitur eu
imperdiet velit. Quisque pharetra ornare nunc, a volutpat metus
aliquam quis. Vivamus semper aliquam cursus. Nullam ac nibh nulla,
luctus pharetra nunc. Etiam ut sapien sem. Fusce vehicula, sem sit
amet viverra pretium, magna tortor suscipit nisi, id interdum lorem
orci in tellus. Vivamus vel ipsum eros. Fusce porttitor convallis
ultricies. Etiam in risus diam, viverra suscipit felis. Duis vitae
imperdiet est.
</p>
<p>
Nunc nunc magna, facilisis blandit venenatis ut, scelerisque ac
tortor. Cras condimentum fermentum lectus ac convallis. Suspendisse
cursus, lacus sit amet sodales molestie, dui erat varius velit, non
tincidunt metus dui sed nulla. Aliquam lacus orci, convallis ut
pellentesque ac, molestie et dolor. Ut pretium enim ut nunc auctor
eget placerat magna luctus. Duis mollis ligula a orci ultrices in
facilisis felis feugiat. Morbi eget odio eget erat pulvinar
placerat sed nec erat. Duis dignissim, dolor a lacinia commodo,
metus erat laoreet dui, in lacinia felis lacus vitae nulla. Fusce
imperdiet condimentum volutpat. Vivamus ut lacus a eros cursus
scelerisque non sit amet orci. Phasellus id quam odio. Nulla
adipiscing venenatis lorem nec feugiat. Aenean sit amet nisl odio,
tincidunt scelerisque nisl. Curabitur ut nisl a dui facilisis
vulputate. Mauris eu elit et felis hendrerit blandit. Cras magna
dolor, imperdiet eget rutrum tempus, euismod nec augue.
</p>
<p>
Ut in sem sit amet felis scelerisque elementum. Suspendisse vitae
neque magna, in laoreet felis. Aenean elit ligula, tempor in
vestibulum ac, porttitor nec lacus. Aenean urna mi, dictum feugiat
placerat eget, congue nec dolor. Etiam pellentesque dictum nulla id
vulputate. Etiam sit amet vehicula purus. Integer quis mi nisl,
gravida malesuada enim. Donec malesuada felis nisi. Etiam id magna
a libero pulvinar ullamcorper in nec neque. Duis pulvinar massa nec
magna scelerisque vitae vulputate ipsum luctus.
</p>
<ul id="pageNumbers">
<li> 1 </li>
<li>
<a title="Page 1" href="/article.html">1</a>
</li>
<li>
<a title="Page 2" href="/article.html?pagewanted=2">2</a>
</li>
</ul>
</body>
</html>
Loading…
Cancel
Save