Correct appending and allow for document only

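- Fix the appending of siblings so they are added to the correct nested element (the div inside html->body) rather than to the document root.
- Add a `document_only` flag so that you can get a DOM tree without the html/body wrapper tags, which you can then nest yourself.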
12 years ago · 5a98e2c1b8
parent edccec5d3b
commit 5a98e2c1b8
3 changed files with 31 additions and 24 deletions
--- a/5
+++ b/5
@@ -33,3 +33,8 @@ Usage::
 Command-line usage::

    python -m readability.readability -u http://pypi.python.org/pypi/readability-lxml
+
+
+Document() kwarg options:
+    url=xxx will run make_links_absolute()
+
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -98,7 +98,6 @@ class Document:
            ruthless = True
            while True:
                self._html(True)
-
                for i in self.tags(self.html, 'script', 'style'):
                    i.drop_tree()
                for i in self.tags(self.html, 'body'):
@@ -111,7 +110,8 @@ class Document:
                best_candidate = self.select_best_candidate(candidates)

                if best_candidate:
-                    article = self.get_article(candidates, best_candidate)
+                    article = self.get_article(candidates, best_candidate,
+                            document_only=document_only)
                else:
                    if ruthless:
                        logging.debug("ruthless removal did not work. ")
@@ -136,12 +136,15 @@ class Document:
            logging.exception('error getting summary: ' )
            raise Unparseable(str(e)), None, sys.exc_info()[2]

-    def get_article(self, candidates, best_candidate):
+    def get_article(self, candidates, best_candidate, document_only=False):
        # Now that we have the top candidate, look through its siblings for content that might also be related.
        # Things like preambles, content split by ads that we removed, etc.
-
        sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
-        output = document_fromstring('<div/>')
+        # create a new html document with a html->body->div
+        if document_only:
+            output = fragment_fromstring('<div/>')
+        else:
+            output = document_fromstring('<div/>')
        best_elem = best_candidate['elem']
        for sibling in best_elem.getparent().getchildren():
            #if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text
@@ -163,7 +166,12 @@ class Document:
                    append = True

            if append:
-                output.append(sibling)
+                # We don't want to append directly to output, but the div
+                # in html->body->div
+                if document_only:
+                    output.append(sibling)
+                else:
+                    output.getchildren()[0].getchildren()[0].append(sibling)
        #if output is not None:
        #    output.append(best_elem)
        return output
@@ -454,13 +462,7 @@ class Document:
            if not (self.options['attributes']):
                #el.attrib = {} #FIXME:Checkout the effects of disabling this
                pass
-        # There can be two nodes here. We really want to tounicode only one of
-        # them.
-        # To start with let's hack it to get the longest tree as our document.
-        if len(node.getchildren()) > 1:
-            children = node.getchildren()
-            sorted_list = sorted(children, key=len, reverse=True)
-            node = sorted_list[0]
+
        return clean_attributes(tounicode(node))


--- a/tests/test_article_only.py
+++ b/tests/test_article_only.py
@@ -21,19 +21,19 @@ class TestArticleOnly(unittest.TestCase):

    """

-    def setUp(self):
-        """"""
-        pass
-
-    def tearDown(self):
-        """"""
-        pass
-
    def test_si_sample(self):
+        """Using the si sample, load article with only opening body element"""
+        sample = load_sample('si-game.sample.html')
+        doc = Document(
+            sample,
+            url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
+        res = doc.summary()
+        self.assertEqual('<html><body><div><div class', res[0:27])
+
+    def test_si_sample_doc_only(self):
        """Using the si sample, make sure we can get the article alone."""
        sample = load_sample('si-game.sample.html')
-        doc = Document(sample)
+        doc = Document(sample, url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
        res = doc.summary(document_only=True)
-
-        self.assertEqual('<div class="', res[0:12])
+        self.assertEqual('<div><div class="', res[0:17])