Merge pull request #64 from martinth/master

Added python 3 support (Supported: python 2.6, 2.7, 3.3, 3.4). Thanks a lot to @martinth
9 years ago · 154658798b
parent 83a7ce67c1 386e48d29b
commit 154658798b
10 changed files with 84 additions and 24 deletions
--- a/.gitignore
+++ b/.gitignore
@ -9,3 +9,6 @@ dist
 /man
 nosetests.xml
 .coverage
+.tox
+.idea
+.cache
--- a/readability/compat/init.py
+++ b/readability/compat/init.py
@ -0,0 +1,6 @@
+"""
+This module contains compatibility helpers for Python 2/3 interoperability.
+
+It mainly exists because there are certain incompatibilities in the Python
+syntax that can only be solved by conditionally importing different functions.
+"""
--- a/readability/compat/three.py
+++ b/readability/compat/three.py
@ -0,0 +1,6 @@
+def raise_with_traceback(exc_type, traceback, *args, **kwargs):
+    """
+    Raise a new exception of type `exc_type` with an existing `traceback`. All
+    additional (keyword-)arguments are forwarded to `exc_type`
+    """
+    raise exc_type(*args, **kwargs).with_traceback(traceback)
--- a/readability/compat/two.py
+++ b/readability/compat/two.py
@ -0,0 +1,6 @@
+def raise_with_traceback(exc_type, traceback, *args, **kwargs):
+    """
+    Raise a new exception of type `exc_type` with an existing `traceback`. All
+    additional (keyword-)arguments are forwarded to `exc_type`
+    """
+    raise exc_type(*args, **kwargs), None, traceback
--- a/readability/encoding.py
+++ b/readability/encoding.py
@ -1,27 +1,33 @@
 import re
 import chardet
+import sys

 def get_encoding(page):
    # Regex for XML and HTML Meta charset declaration
-    charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
-    pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
-    xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
+    charset_re = re.compile(br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
+    pragma_re = re.compile(br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
+    xml_re = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]')

    declared_encodings = (charset_re.findall(page) +
            pragma_re.findall(page) +
            xml_re.findall(page))

    # Try any declared encodings
-    if len(declared_encodings) > 0:
-        for declared_encoding in declared_encodings:
-            try:
-                page.decode(custom_decode(declared_encoding))
-                return custom_decode(declared_encoding)
-            except UnicodeDecodeError:
-                pass
+    for declared_encoding in declared_encodings:
+        try:
+            if sys.version_info[0] == 3:
+                # declared_encoding will actually be bytes but .decode() only
+                # accepts `str` type. Decode blindly with ascii because no one should
+                # ever use non-ascii characters in the name of an encoding.
+                declared_encoding = declared_encoding.decode('ascii', 'replace')
+
+            page.decode(custom_decode(declared_encoding))
+            return custom_decode(declared_encoding)
+        except UnicodeDecodeError:
+            pass

    # Fallback to chardet if declared encodings fail
-    text = re.sub('</?[^>]*>\s*', ' ', page)
+    text = re.sub(b'</?[^>]*>\s*', b' ', page)
    enc = 'utf-8'
    if not text.strip() or len(text) < 10:
        return enc # can't guess
--- a/readability/htmls.py
+++ b/readability/htmls.py
@ -8,8 +8,11 @@ from .encoding import get_encoding

 utf8_parser = lxml.html.HTMLParser(encoding='utf-8')

+if sys.version_info[0] == 2:
+    str = unicode
+
 def build_doc(page):
-    if isinstance(page, unicode):
+    if isinstance(page, str):
        enc = None
        page_unicode = page
    else:
@ -33,7 +36,7 @@ def normalize_entities(cur_title):
        u'\u00BB': '"',
        u'&quot;': '"',
    }
-    for c, r in entities.iteritems():
+    for c, r in entities.items():
        if c in cur_title:
            cur_title = cur_title.replace(c, r)

@ -105,7 +108,7 @@ def shorten_title(doc):

 def get_body(doc):
    [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
-    raw_html = unicode(tostring(doc.body or doc))
+    raw_html = str(tostring(doc.body or doc))
    cleaned = clean_attributes(raw_html)
    try:
        #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
--- a/readability/readability.py
+++ b/readability/readability.py
@ -1,4 +1,5 @@
 #!/usr/bin/env python
+from __future__ import print_function
 import logging
 import re
 import sys
@ -19,6 +20,8 @@ from .htmls import shorten_title

 log = logging.getLogger()

+if sys.version_info[0] == 2:
+    str = unicode

 REGEXES = {
    'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
@ -80,11 +83,12 @@ regexp_type = type(re.compile('hello, world'))
 def compile_pattern(elements):
    if not elements:
        return None
-    if isinstance(elements, regexp_type):
+    elif isinstance(elements, regexp_type):
        return elements
-    if isinstance(elements, basestring):
+    else:
+        # assume string or string like object
        elements = elements.split(',')
-    return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
+        return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)

 class Document:
    """Class to build a etree document out of html."""
@ -194,9 +198,13 @@ class Document:
                    continue
                else:
                    return cleaned_article
-        except StandardError, e:
+        except Exception as e:
            log.exception('error getting summary: ')
-            raise Unparseable(str(e)), None, sys.exc_info()[2]
+            if sys.version_info[0] == 2:
+                from .compat.two import raise_with_traceback
+            else:
+                from .compat.three import raise_with_traceback
+            raise_with_traceback(Unparseable, sys.exc_info()[2], str(e))

    def get_article(self, candidates, best_candidate, html_partial=False):
        # Now that we have the top candidate, look through its siblings for
@ -389,7 +397,7 @@ class Document:
            # This results in incorrect results in case there is an <img>
            # buried within an <a> for example
            if not REGEXES['divToPElementsRe'].search(
-                    unicode(''.join(map(tostring, list(elem))))):
+                    str(''.join(map(str, map(tostring, list(elem)))))):
                #self.debug("Altering %s to p" % (describe(elem)))
                elem.tag = "p"
                #print "Fixed element "+describe(elem)
@ -612,18 +620,18 @@ def main():

    file = None
    if options.url:
-        import urllib
-        file = urllib.urlopen(options.url)
+        import urllib.request, urllib.parse, urllib.error
+        file = urllib.request.urlopen(options.url)
    else:
        file = open(args[0], 'rt')
    enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING
    try:
-        print Document(file.read(),
+        print(Document(file.read(),
            debug=options.verbose,
            url=options.url,
            positive_keywords = options.positive_keywords,
            negative_keywords = options.negative_keywords,
-        ).summary().encode(enc, 'replace')
+        ).summary().encode(enc, 'replace'))
    finally:
        file.close()

--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1 @@
+-e .
--- a/setup.py
+++ b/setup.py
@ -1,4 +1,5 @@
 #!/usr/bin/env python
+from __future__ import print_function
 from setuptools import setup, find_packages
 import sys

--- a/tox.ini
+++ b/tox.ini
@ -0,0 +1,20 @@
+# Tox (http://tox.testrun.org/) is a tool for running tests
+# in multiple virtualenvs. This configuration file will run the
+# test suite on all supported python versions. To use it, "pip install tox"
+# and then run "tox" from this directory.
+
+[tox]
+envlist = py26, py27, py33, py34
+
+[testenv]
+deps=pytest
+# This creates the virtual envs with --site-packages so that packages
+# that are already installed will be reused. This is especially useful on
+# Windows. We use lxml; instead of compiling it locally (which in turn
+# requires a compiler and the build dependencies), you can download
+# a wheel from http://www.lfd.uci.edu/~gohlke/pythonlibs/#lxml and install it via
+# $PYTHONDIR\Scripts\pip.exe install *.whl
+sitepackages=True
+commands =
+    pip install -r requirements.txt
+    py.test