diff --git a/readability/htmls.py b/readability/htmls.py
index 536b21b..526fbce 100644
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -8,8 +8,11 @@ from .encoding import get_encoding
utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
+if sys.version_info[0] == 2:
+ str = unicode
+
def build_doc(page):
- if isinstance(page, unicode):
+ if isinstance(page, str):
enc = None
page_unicode = page
else:
@@ -33,7 +36,7 @@ def normalize_entities(cur_title):
u'\u00BB': '"',
u'"': '"',
}
- for c, r in entities.iteritems():
+ for c, r in list(entities.items()):
if c in cur_title:
cur_title = cur_title.replace(c, r)
@@ -105,7 +108,7 @@ def shorten_title(doc):
def get_body(doc):
[ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
- raw_html = unicode(tostring(doc.body or doc))
+ raw_html = str(tostring(doc.body or doc))
cleaned = clean_attributes(raw_html)
try:
#BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
diff --git a/readability/readability.py b/readability/readability.py
index 255e877..c6391d7 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python
+from __future__ import print_function
import logging
import re
import sys
@@ -20,6 +21,8 @@ from .htmls import shorten_title
logging.basicConfig(level=logging.INFO)
log = logging.getLogger()
+if sys.version_info[0] == 2:
+ str = unicode
REGEXES = {
'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
@@ -81,11 +84,12 @@ regexp_type = type(re.compile('hello, world'))
def compile_pattern(elements):
if not elements:
return None
- if isinstance(elements, regexp_type):
+ elif isinstance(elements, regexp_type):
return elements
- if isinstance(elements, basestring):
+ else:
+        # assume a string or string-like object
elements = elements.split(',')
- return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
+ return re.compile('|'.join([re.escape(x.lower()) for x in elements]), re.U)
class Document:
"""Class to build a etree document out of html."""
@@ -195,9 +199,20 @@ class Document:
continue
else:
return cleaned_article
- except StandardError, e:
+ except Exception as e:
log.exception('error getting summary: ')
- raise Unparseable(str(e)), None, sys.exc_info()[2]
+ if sys.version_info[0] == 2:
+ # This is the only reason why we can't support Python 3.3:
+            # 3.3's parser fails to accept the old syntax (although this
+            # code never runs), which would require writing this
+            # line as
+ # Unparseable(str(e))
+            # but then we lose the traceback information. 3.4 on the
+ # other hand accepts the old syntax and would only complain
+ # at runtime.
+ raise Unparseable(str(e)), None, sys.exc_info()[2]
+ else:
+ raise Unparseable(str(e)).with_traceback(sys.exc_info()[2])
def get_article(self, candidates, best_candidate, html_partial=False):
# Now that we have the top candidate, look through its siblings for
@@ -247,7 +262,7 @@ class Document:
return output
def select_best_candidate(self, candidates):
- sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
+ sorted_candidates = sorted(list(candidates.values()), key=lambda x: x['content_score'], reverse=True)
for candidate in sorted_candidates[:5]:
elem = candidate['elem']
self.debug("Top 5 : %6.3f %s" % (
@@ -388,7 +403,7 @@ class Document:
# This results in incorrect results in case there is an
# buried within an for example
if not REGEXES['divToPElementsRe'].search(
- unicode(''.join(map(tostring, list(elem))))):
+ str(''.join(map(str, map(tostring, list(elem)))))):
#self.debug("Altering %s to p" % (describe(elem)))
elem.tag = "p"
#print "Fixed element "+describe(elem)
@@ -609,18 +624,18 @@ def main():
file = None
if options.url:
- import urllib
- file = urllib.urlopen(options.url)
+ import urllib.request, urllib.parse, urllib.error
+ file = urllib.request.urlopen(options.url)
else:
file = open(args[0], 'rt')
enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING
try:
- print Document(file.read(),
+ print(Document(file.read(),
debug=options.verbose,
url=options.url,
positive_keywords = options.positive_keywords,
negative_keywords = options.negative_keywords,
- ).summary().encode(enc, 'replace')
+ ).summary().encode(enc, 'replace'))
finally:
file.close()
diff --git a/setup.py b/setup.py
index 5d472d2..6f4cbbf 100755
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python
+from __future__ import print_function
from setuptools import setup, find_packages
import sys
@@ -8,7 +9,7 @@ if sys.platform == 'darwin':
mac_ver = platform.mac_ver()[0]
mac_ver_no = int(mac_ver.split('.')[1])
if mac_ver_no < 9:
- print "Using lxml<2.4"
+ print("Using lxml<2.4")
lxml_requirement = "lxml<2.4"
setup(
diff --git a/tox.ini b/tox.ini
index e6fced9..f7c6e93 100644
--- a/tox.ini
+++ b/tox.ini
@@ -4,7 +4,7 @@
# and then run "tox" from this directory.
[tox]
-envlist = py26, py27
+envlist = py26, py27, py34
[testenv]
deps=pytest