Adds Python 3.4 support.

Code now supports Python 2.6, 2.7 and 3.4. PYthon 3.3 isn't support because of some issues with the parser and the difference between old and new `raise` syntax.
9 years ago · aa4132f57a
parent 13cca1dd19
commit aa4132f57a
4 changed files with 35 additions and 16 deletions
--- a/readability/htmls.py
+++ b/readability/htmls.py
@ -8,8 +8,11 @@ from .encoding import get_encoding

 utf8_parser = lxml.html.HTMLParser(encoding='utf-8')

+if sys.version_info[0] == 2:
+    str = unicode
+
 def build_doc(page):
-    if isinstance(page, unicode):
+    if isinstance(page, str):
        enc = None
        page_unicode = page
    else:
@ -33,7 +36,7 @@ def normalize_entities(cur_title):
        u'\u00BB': '"',
        u'&quot;': '"',
    }
-    for c, r in entities.iteritems():
+    for c, r in list(entities.items()):
        if c in cur_title:
            cur_title = cur_title.replace(c, r)

@ -105,7 +108,7 @@ def shorten_title(doc):

 def get_body(doc):
    [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
-    raw_html = unicode(tostring(doc.body or doc))
+    raw_html = str(tostring(doc.body or doc))
    cleaned = clean_attributes(raw_html)
    try:
        #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
--- a/readability/readability.py
+++ b/readability/readability.py
@ -1,4 +1,5 @@
 #!/usr/bin/env python
+from __future__ import print_function
 import logging
 import re
 import sys
@ -20,6 +21,8 @@ from .htmls import shorten_title
 logging.basicConfig(level=logging.INFO)
 log = logging.getLogger()

+if sys.version_info[0] == 2:
+    str = unicode

 REGEXES = {
    'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
@ -81,11 +84,12 @@ regexp_type = type(re.compile('hello, world'))
 def compile_pattern(elements):
    if not elements:
        return None
-    if isinstance(elements, regexp_type):
+    elif isinstance(elements, regexp_type):
        return elements
-    if isinstance(elements, basestring):
+    else:
+        # assume string or string like object
        elements = elements.split(',')
-    return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
+        return re.compile('|'.join([re.escape(x.lower()) for x in elements]), re.U)

 class Document:
    """Class to build a etree document out of html."""
@ -195,9 +199,20 @@ class Document:
                    continue
                else:
                    return cleaned_article
-        except StandardError, e:
+        except Exception as e:
            log.exception('error getting summary: ')
-            raise Unparseable(str(e)), None, sys.exc_info()[2]
+            if sys.version_info[0] == 2:
+                # This is the only reason why we can't support Python 3.3:
+                # 3.3s parser fails to accept the old syntax (although this
+                # code never runs) which would require write this line as:
+                # write this line as
+                #    Unparseable(str(e))
+                # but then we loose the traceback information. 3.4 on the
+                # other hand accepts the old syntax and would only complain
+                # at runtime.
+                raise Unparseable(str(e)), None, sys.exc_info()[2]
+            else:
+                raise Unparseable(str(e)).with_traceback(sys.exc_info()[2])

    def get_article(self, candidates, best_candidate, html_partial=False):
        # Now that we have the top candidate, look through its siblings for
@ -247,7 +262,7 @@ class Document:
        return output

    def select_best_candidate(self, candidates):
-        sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
+        sorted_candidates = sorted(list(candidates.values()), key=lambda x: x['content_score'], reverse=True)
        for candidate in sorted_candidates[:5]:
            elem = candidate['elem']
            self.debug("Top 5 : %6.3f %s" % (
@ -388,7 +403,7 @@ class Document:
            # This results in incorrect results in case there is an <img>
            # buried within an <a> for example
            if not REGEXES['divToPElementsRe'].search(
-                    unicode(''.join(map(tostring, list(elem))))):
+                    str(''.join(map(str, map(tostring, list(elem)))))):
                #self.debug("Altering %s to p" % (describe(elem)))
                elem.tag = "p"
                #print "Fixed element "+describe(elem)
@ -609,18 +624,18 @@ def main():

    file = None
    if options.url:
-        import urllib
-        file = urllib.urlopen(options.url)
+        import urllib.request, urllib.parse, urllib.error
+        file = urllib.request.urlopen(options.url)
    else:
        file = open(args[0], 'rt')
    enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING
    try:
-        print Document(file.read(),
+        print(Document(file.read(),
            debug=options.verbose,
            url=options.url,
            positive_keywords = options.positive_keywords,
            negative_keywords = options.negative_keywords,
-        ).summary().encode(enc, 'replace')
+        ).summary().encode(enc, 'replace'))
    finally:
        file.close()

--- a/setup.py
+++ b/setup.py
@ -1,4 +1,5 @@
 #!/usr/bin/env python
+from __future__ import print_function
 from setuptools import setup, find_packages
 import sys

@ -8,7 +9,7 @@ if sys.platform == 'darwin':
    mac_ver = platform.mac_ver()[0]
    mac_ver_no = int(mac_ver.split('.')[1])
    if mac_ver_no < 9:
-        print "Using lxml<2.4"
+        print("Using lxml<2.4")
        lxml_requirement = "lxml<2.4"

 setup(
--- a/tox.ini
+++ b/tox.ini
@ -4,7 +4,7 @@
 # and then run "tox" from this directory.

 [tox]
-envlist = py26, py27
+envlist = py26, py27, py34

 [testenv]
 deps=pytest