Correct sibling appending and allow document-only output

- Fix the appending of siblings so they go to the correct nested element
- Add a document_only flag so that you can get a DOM tree you can nest
yourself without html/body tags.
pull/15/head
Richard Harding 12 years ago
parent edccec5d3b
commit 5a98e2c1b8

@ -33,3 +33,8 @@ Usage::
Command-line usage::
python -m readability.readability -u http://pypi.python.org/pypi/readability-lxml
Document() kwarg options:
url=xxx will run make_links_absolute()

@ -98,7 +98,6 @@ class Document:
ruthless = True
while True:
self._html(True)
for i in self.tags(self.html, 'script', 'style'):
i.drop_tree()
for i in self.tags(self.html, 'body'):
@ -111,7 +110,8 @@ class Document:
best_candidate = self.select_best_candidate(candidates)
if best_candidate:
article = self.get_article(candidates, best_candidate)
article = self.get_article(candidates, best_candidate,
document_only=document_only)
else:
if ruthless:
logging.debug("ruthless removal did not work. ")
@ -136,12 +136,15 @@ class Document:
logging.exception('error getting summary: ' )
raise Unparseable(str(e)), None, sys.exc_info()[2]
def get_article(self, candidates, best_candidate):
def get_article(self, candidates, best_candidate, document_only=False):
# Now that we have the top candidate, look through its siblings for content that might also be related.
# Things like preambles, content split by ads that we removed, etc.
sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
output = document_fromstring('<div/>')
# document_only: start from a bare <div> fragment; otherwise create a new html document with an html->body->div
if document_only:
output = fragment_fromstring('<div/>')
else:
output = document_fromstring('<div/>')
best_elem = best_candidate['elem']
for sibling in best_elem.getparent().getchildren():
#if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text
@ -163,7 +166,12 @@ class Document:
append = True
if append:
output.append(sibling)
# We don't want to append directly to output, but to the div
# in html->body->div
if document_only:
output.append(sibling)
else:
output.getchildren()[0].getchildren()[0].append(sibling)
#if output is not None:
# output.append(best_elem)
return output
@ -454,13 +462,7 @@ class Document:
if not (self.options['attributes']):
#el.attrib = {} #FIXME:Checkout the effects of disabling this
pass
# There can be two nodes here. We really want to tounicode only one of
# them.
# To start with let's hack it to get the longest tree as our document.
if len(node.getchildren()) > 1:
children = node.getchildren()
sorted_list = sorted(children, key=len, reverse=True)
node = sorted_list[0]
return clean_attributes(tounicode(node))

@ -21,19 +21,19 @@ class TestArticleOnly(unittest.TestCase):
"""
def setUp(self):
""""""
pass
def tearDown(self):
""""""
pass
def test_si_sample(self):
"""Using the si sample, load article with only opening body element"""
sample = load_sample('si-game.sample.html')
doc = Document(
sample,
url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
res = doc.summary()
self.assertEqual('<html><body><div><div class', res[0:27])
def test_si_sample_doc_only(self):
"""Using the si sample, make sure we can get the article alone."""
sample = load_sample('si-game.sample.html')
doc = Document(sample)
doc = Document(sample, url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
res = doc.summary(document_only=True)
self.assertEqual('<div class="', res[0:12])
self.assertEqual('<div><div class="', res[0:17])

Loading…
Cancel
Save