diff --git a/readability/htmls.py b/readability/htmls.py
index 536b21b..526fbce 100644
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -8,8 +8,11 @@ from .encoding import get_encoding
 
 utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
 
+if sys.version_info[0] == 2:
+    str = unicode
+
 def build_doc(page):
-    if isinstance(page, unicode):
+    if isinstance(page, str):
         enc = None
         page_unicode = page
     else:
@@ -33,7 +36,7 @@ def normalize_entities(cur_title):
         u'\u00BB': '"',
         u'&quot;': '"',
     }
-    for c, r in entities.iteritems():
+    for c, r in list(entities.items()):
         if c in cur_title:
             cur_title = cur_title.replace(c, r)
 
@@ -105,7 +108,7 @@ def shorten_title(doc):
 
 def get_body(doc):
     [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
-    raw_html = unicode(tostring(doc.body or doc))
+    raw_html = str(tostring(doc.body or doc))
     cleaned = clean_attributes(raw_html)
     try:
         #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
diff --git a/readability/readability.py b/readability/readability.py
index 255e877..c6391d7 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+from __future__ import print_function
 import logging
 import re
 import sys
@@ -20,6 +21,8 @@ from .htmls import shorten_title
 
 logging.basicConfig(level=logging.INFO)
 log = logging.getLogger()
+if sys.version_info[0] == 2:
+    str = unicode
 
 REGEXES = {
     'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
@@ -81,11 +84,12 @@ regexp_type = type(re.compile('hello, world'))
 def compile_pattern(elements):
     if not elements:
         return None
-    if isinstance(elements, regexp_type):
+    elif isinstance(elements, regexp_type):
         return elements
-    if isinstance(elements, basestring):
+    else:
+        # assume string or string like object
         elements = elements.split(',')
-    return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
+        return re.compile('|'.join([re.escape(x.lower()) for x in elements]), re.U)
 
 class Document:
     """Class to build a etree document out of html."""
@@ -195,9 +199,20 @@ class Document:
                     continue
                 else:
                     return cleaned_article
-        except StandardError, e:
+        except Exception as e:
             log.exception('error getting summary: ')
-            raise Unparseable(str(e)), None, sys.exc_info()[2]
+            if sys.version_info[0] == 2:
+                # This is the only reason why we can't support Python 3.3:
+                # 3.3's parser fails to accept the old raise syntax (although
+                # this code never runs there), which would require writing
+                # this line as
+                #     raise Unparseable(str(e))
+                # but then we lose the traceback information. 3.4, on the
+                # other hand, accepts the old syntax and would only complain
+                # at runtime.
+                raise Unparseable(str(e)), None, sys.exc_info()[2]
+            else:
+                raise Unparseable(str(e)).with_traceback(sys.exc_info()[2])
 
     def get_article(self, candidates, best_candidate, html_partial=False):
         # Now that we have the top candidate, look through its siblings for
@@ -247,7 +262,7 @@ class Document:
         return output
 
     def select_best_candidate(self, candidates):
-        sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
+        sorted_candidates = sorted(list(candidates.values()), key=lambda x: x['content_score'], reverse=True)
         for candidate in sorted_candidates[:5]:
             elem = candidate['elem']
             self.debug("Top 5 : %6.3f %s" % (
@@ -388,7 +403,7 @@ class Document:
             # This results in incorrect results in case there is an <img>
             # buried within an <a> for example
             if not REGEXES['divToPElementsRe'].search(
-                    unicode(''.join(map(tostring, list(elem))))):
+                    str(''.join(map(str, map(tostring, list(elem)))))):
                 #self.debug("Altering %s to p" % (describe(elem)))
                 elem.tag = "p"
                 #print "Fixed element "+describe(elem)
@@ -609,18 +624,18 @@ def main():
 
     file = None
     if options.url:
-        import urllib
-        file = urllib.urlopen(options.url)
+        import urllib.request, urllib.parse, urllib.error
+        file = urllib.request.urlopen(options.url)
     else:
         file = open(args[0], 'rt')
     enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING
     try:
-        print Document(file.read(),
+        print(Document(file.read(),
             debug=options.verbose,
             url=options.url,
             positive_keywords = options.positive_keywords,
             negative_keywords = options.negative_keywords,
-        ).summary().encode(enc, 'replace')
+        ).summary().encode(enc, 'replace'))
     finally:
         file.close()
 
diff --git a/setup.py b/setup.py
index 5d472d2..6f4cbbf 100755
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+from __future__ import print_function
 from setuptools import setup, find_packages
 import sys
 
@@ -8,7 +9,7 @@ if sys.platform == 'darwin':
     mac_ver = platform.mac_ver()[0]
     mac_ver_no = int(mac_ver.split('.')[1])
     if mac_ver_no < 9:
-        print "Using lxml<2.4"
+        print("Using lxml<2.4")
         lxml_requirement = "lxml<2.4"
 
 setup(
diff --git a/tox.ini b/tox.ini
index e6fced9..f7c6e93 100644
--- a/tox.ini
+++ b/tox.ini
@@ -4,7 +4,7 @@
 # and then run "tox" from this directory.
 
 [tox]
-envlist = py26, py27
+envlist = py26, py27, py34
 
 [testenv]
 deps=pytest
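
Note on the raise/traceback hunk in readability.py above: the patch keeps the Python 2
three-argument raise inline behind a runtime version check, which is why the comment rules
out Python 3.3. A common alternative (not used in this patch; shown only as a minimal sketch,
with the helper name reraise and the module name compat.py being illustrative) is to hide the
old syntax inside exec() so the Python 3 compiler never sees it, which is essentially the
approach six.reraise() takes:

    # compat.py -- hypothetical helper, sketched after the approach of six.reraise();
    # not part of this patch.
    import sys

    if sys.version_info[0] == 2:
        # The three-argument raise is a syntax error on Python 3, so it is kept
        # inside a string and only compiled by the Python 2 interpreter.
        exec("def reraise(tp, value, tb=None):\n"
             "    raise tp, value, tb\n")
    else:
        def reraise(tp, value, tb=None):
            # Python 3: attach the traceback explicitly before re-raising.
            if value.__traceback__ is not tb:
                raise value.with_traceback(tb)
            raise value

With such a helper, the except block in summary() could re-raise on both versions without any
inline version check, e.g. reraise(Unparseable, Unparseable(str(e)), sys.exc_info()[2]).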