Release version 0.7. Better HTML5 support and an important bugfix.

pull/74/merge
Yuri Baburov 6 years ago
parent 537de2b8f6
commit 0e50b53d05

@@ -11,7 +11,7 @@ env:
 before_install:
 # work around https://github.com/travis-ci/travis-ci/issues/8363
-- pyenv global system 3.5
+- pyenv global system 3.6
 install:
 - travis_retry pip install -U pip wheel tox

@@ -1,10 +1,9 @@
 # Makefile to help automate tasks
 WD := $(shell pwd)
-PY := .env/bin/python
-PIP := .env/bin/pip
-PEP8 := .env/bin/pep8
-NOSE := .env/bin/nosetests
+PY := .venv/bin/python
+PIP := .venv/bin/pip
+PEP8 := .venv/bin/pep8
+NOSE := .venv/bin/nosetests
 # ###########
 # Tests rule!
@@ -22,16 +21,17 @@ $(NOSE):
 .PHONY: all
 all: venv develop
-venv: bin/python
-bin/python:
-	virtualenv .env
+venv: .venv/bin/python
+.venv/bin/python:
+	virtualenv .venv
 .PHONY: clean_venv
 clean_venv:
-	rm -rf .env
+	rm -rf .venv
-develop: .env/lib/python*/site-packages/readability-lxml.egg-link
-.env/lib/python*/site-packages/readability-lxml.egg-link:
+develop: .venv/lib/python*/site-packages/readability-lxml.egg-link
+.venv/lib/python*/site-packages/readability-lxml.egg-link:
 	$(PY) setup.py develop

@@ -35,13 +35,15 @@ Usage
 Change Log
 ----------
-- 0.3 Added Document.encoding, positive\_keywords and
-  negative\_keywords
-- 0.4 Added Videos loading and allowed more images per paragraph
-- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and
-  3.4
+- 0.7 Improved HTML5 tags handling. Heuristics were changed for a lot of sites. Fixed an important
+  bug with stripping unwanted HTML nodes (only the first matching node was removed before).
 - 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3
   and 3.4
+- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and
+  3.4
+- 0.4 Added Videos loading and allowed more images per paragraph
+- 0.3 Added Document.encoding, positive\_keywords and
+  negative\_keywords
 
 Licensing
 =========
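The bugfix called out in the 0.7 entry concerns node removal during cleanup. As a purely illustrative sketch in plain lxml (not the library's actual code), the difference between the old and the fixed behaviour looks roughly like this:

# Illustrative sketch of the bug class described above: dropping only the
# first matching node leaves later duplicates in the tree.
import lxml.html

html = "<div><span class='ad'>x</span><p>text</p><span class='ad'>y</span></div>"

# Buggy pattern: only the first unwanted node is removed.
doc = lxml.html.fromstring(html)
bad = doc.find(".//span[@class='ad']")
if bad is not None:
    bad.drop_tree()            # the second matching span survives

# Fixed pattern: take a snapshot of all matches, then drop each one.
doc = lxml.html.fromstring(html)
for el in doc.findall(".//span[@class='ad']"):
    el.drop_tree()             # no matching span remains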

@@ -381,13 +381,13 @@ class Document:
     def score_node(self, elem):
         content_score = self.class_weight(elem)
         name = elem.tag.lower()
-        if name == "div":
+        if name in ["div", "article"]:
             content_score += 5
         elif name in ["pre", "td", "blockquote"]:
             content_score += 3
-        elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form"]:
+        elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form", "aside"]:
             content_score -= 3
-        elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th"]:
+        elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th", "header", "footer", "nav"]:
             content_score -= 5
         return {
             'content_score': content_score,
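To make the effect of the new branches concrete, here is a small standalone sketch that mirrors only the tag-name part of score_node shown above; the class_weight contribution is left out and the function name is ours, not the library's:

# Sketch mirroring the tag scores in the diff above; not the library's API.
def tag_score(name):
    name = name.lower()
    if name in ["div", "article"]:
        return 5      # <article> now scores like <div>
    elif name in ["pre", "td", "blockquote"]:
        return 3
    elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form", "aside"]:
        return -3     # <aside> is now penalized like lists and forms
    elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th", "header", "footer", "nav"]:
        return -5     # page chrome tags are now penalized like headings
    return 0

assert tag_score("article") == 5
assert tag_score("nav") == -5
assert tag_score("p") == 0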
@@ -463,7 +463,7 @@ class Document:
         allowed = {}
         # Conditionally clean <table>s, <ul>s, and <div>s
-        for el in self.reverse_tags(node, "table", "ul", "div"):
+        for el in self.reverse_tags(node, "table", "ul", "div", "aside", "header", "footer", "section"):
             if el in allowed:
                 continue
             weight = self.class_weight(el)
@@ -577,7 +577,7 @@ class Document:
             if siblings and sum(siblings) > 1000:
                 to_remove = False
                 log.debug("Allowing %s" % describe(el))
-                for desnode in self.tags(el, "table", "ul", "div"):
+                for desnode in self.tags(el, "table", "ul", "div", "section"):
                     allowed[desnode] = True
             if to_remove:
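For orientation, the iteration helpers used by these two cleanup loops, self.tags and self.reverse_tags, are assumed to behave roughly like the sketch below (generators over matching descendants, with reverse_tags visiting them in reverse document order); this is a sketch, not the library's verbatim code:

# Assumed behaviour of the helpers (sketch): tags() yields matching
# descendants of node in document order, reverse_tags() in reverse order
# so inner containers are cleaned before their ancestors.
def tags(node, *tag_names):
    for tag_name in tag_names:
        for e in node.findall('.//%s' % tag_name):
            yield e

def reverse_tags(node, *tag_names):
    for tag_name in tag_names:
        for e in reversed(node.findall('.//%s' % tag_name)):
            yield e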

@@ -14,7 +14,7 @@ if sys.platform == 'darwin':
 setup(
     name="readability-lxml",
-    version="0.6.2",
+    version="0.7",
     author="Yuri Baburov",
     author_email="burchik@gmail.com",
     description="fast html to text parser (article readability tool) with python3 support",
@@ -43,6 +43,5 @@ setup(
         "Programming Language :: Python :: 3.4",
         "Programming Language :: Python :: 3.5",
         "Programming Language :: Python :: 3.6",
     ],
 )
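A minimal post-upgrade smoke test, assuming the 0.7 package is installed in the active environment (for example via pip install -U readability-lxml) and relying only on the Document(...).summary() entry point exercised by the test below:

# Sketch: check the installed version and run the documented entry point.
import pkg_resources
from readability import Document

print(pkg_resources.get_distribution("readability-lxml").version)  # expect "0.7"

html = "<html><body><article><p>Some article text.</p></article></body></html>"
print(Document(html).summary())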

@@ -61,3 +61,34 @@ class TestArticleOnly(unittest.TestCase):
         )
         doc = Document(sample)
         doc.summary()
+
+    def test_correct_cleanup(self):
+        sample = """
+        <html>
+            <body>
+                <section>test section</section>
+                <article class="">
+                    <p>Lot of text here.</p>
+                    <div id="advertisement"><a href="link">Ad</a></div>
+                    <p>More text is written here, and contains punctuation and dots.</p>
+                </article>
+                <aside id="comment1"/>
+                <div id="comment2">
+                    <a href="asd">spam</a>
+                    <a href="asd">spam</a>
+                    <a href="asd">spam</a>
+                </div>
+                <div id="comment3"/>
+                <aside id="comment4">A small comment.</aside>
+                <div id="comment5"><p>The comment is also helpful, but it's
+                    still not the correct item to be extracted.</p>
+                    <p>It's even longer than the article itself!"</p></div>
+            </body>
+        </html>
+        """
+        doc = Document(sample)
+        s = doc.summary()
+        #print(s)
+        assert('punctuation' in s)
+        assert(not 'comment' in s)
+        assert(not 'aside' in s)
