From 5a98e2c1b85b16699fda4e2e91905d5837ef8314 Mon Sep 17 00:00:00 2001 From: Richard Harding Date: Mon, 16 Apr 2012 20:55:13 -0400 Subject: [PATCH] Correct appending and allow for document only - Fix the appending of siblings to the correct nested element - Add a document only flag so that you can get a dom tree you can nest yourself without html/body tags. --- README | 5 +++++ readability/readability.py | 28 +++++++++++++++------------- tests/test_article_only.py | 22 +++++++++++----------- 3 files changed, 31 insertions(+), 24 deletions(-) diff --git a/README b/README index 36bb28c..acc96fa 100644 --- a/README +++ b/README @@ -33,3 +33,8 @@ Usage:: Command-line usage:: python -m readability.readability -u http://pypi.python.org/pypi/readability-lxml + + +Document() kwarg options: + url=xxx will run make_links_absolute() + diff --git a/readability/readability.py b/readability/readability.py index b409c59..ae760c5 100755 --- a/readability/readability.py +++ b/readability/readability.py @@ -98,7 +98,6 @@ class Document: ruthless = True while True: self._html(True) - for i in self.tags(self.html, 'script', 'style'): i.drop_tree() for i in self.tags(self.html, 'body'): @@ -111,7 +110,8 @@ class Document: best_candidate = self.select_best_candidate(candidates) if best_candidate: - article = self.get_article(candidates, best_candidate) + article = self.get_article(candidates, best_candidate, + document_only=document_only) else: if ruthless: logging.debug("ruthless removal did not work. ") @@ -136,12 +136,15 @@ class Document: logging.exception('error getting summary: ' ) raise Unparseable(str(e)), None, sys.exc_info()[2] - def get_article(self, candidates, best_candidate): + def get_article(self, candidates, best_candidate, document_only=False): # Now that we have the top candidate, look through its siblings for content that might also be related. # Things like preambles, content split by ads that we removed, etc. 
- sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2]) - output = document_fromstring('<div/>') + # create a new html document with a html->body->div + if document_only: + output = fragment_fromstring('<div/>') + else: + output = document_fromstring('<div/>') best_elem = best_candidate['elem'] for sibling in best_elem.getparent().getchildren(): #if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text @@ -163,7 +166,12 @@ class Document: append = True if append: - output.append(sibling) + # We don't want to append directly to output, but the div + # in html->body->div + if document_only: + output.append(sibling) + else: + output.getchildren()[0].getchildren()[0].append(sibling) #if output is not None: # output.append(best_elem) return output @@ -454,13 +462,7 @@ class Document: if not (self.options['attributes']): #el.attrib = {} #FIXME:Checkout the effects of disabling this pass - # There can be two nodes here. We really want to tounicode only one of - # them. - # To start with let's hack it to get the longest tree as our document. - if len(node.getchildren()) > 1: - children = node.getchildren() - sorted_list = sorted(children, key=len, reverse=True) - node = sorted_list[0] + return clean_attributes(tounicode(node)) diff --git a/tests/test_article_only.py b/tests/test_article_only.py index 41bfd85..28240bd 100644 --- a/tests/test_article_only.py +++ b/tests/test_article_only.py @@ -21,19 +21,19 @@ class TestArticleOnly(unittest.TestCase): """ - def setUp(self): - """""" - pass - - def tearDown(self): - """""" - pass - def test_si_sample(self): + """Using the si sample, load article with only opening body element""" + sample = load_sample('si-game.sample.html') + doc = Document( + sample, + url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html') + res = doc.summary() + self.assertEqual('