diff --git a/.gitignore b/.gitignore
index 84fca1f..16a2c86 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,6 @@ dist
 /man
 nosetests.xml
 .coverage
+.tox
+.idea
+.cache
diff --git a/readability/compat/__init__.py b/readability/compat/__init__.py
new file mode 100644
index 0000000..ed4d350
--- /dev/null
+++ b/readability/compat/__init__.py
@@ -0,0 +1,6 @@
+"""
+This module contains compatibility helpers for Python 2/3 interoperability.
+
+It mainly exists because there are certain incompatibilities in the Python
+syntax that can only be solved by conditionally importing different functions.
+"""
diff --git a/readability/compat/three.py b/readability/compat/three.py
new file mode 100644
index 0000000..2635157
--- /dev/null
+++ b/readability/compat/three.py
@@ -0,0 +1,6 @@
+def raise_with_traceback(exc_type, traceback, *args, **kwargs):
+    """
+    Raise a new exception of type `exc_type` with an existing `traceback`. All
+    additional (keyword-)arguments are forwarded to `exc_type`
+    """
+    raise exc_type(*args, **kwargs).with_traceback(traceback)
diff --git a/readability/compat/two.py b/readability/compat/two.py
new file mode 100644
index 0000000..642ecb7
--- /dev/null
+++ b/readability/compat/two.py
@@ -0,0 +1,6 @@
+def raise_with_traceback(exc_type, traceback, *args, **kwargs):
+    """
+    Raise a new exception of type `exc_type` with an existing `traceback`. All
+    additional (keyword-)arguments are forwarded to `exc_type`
+    """
+    raise exc_type(*args, **kwargs), None, traceback
diff --git a/readability/encoding.py b/readability/encoding.py
index fb4761d..b91c3e2 100644
--- a/readability/encoding.py
+++ b/readability/encoding.py
@@ -1,27 +1,33 @@
 import re
 import chardet
+import sys
 
 def get_encoding(page):
     # Regex for XML and HTML Meta charset declaration
-    charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
-    pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
-    xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
+    charset_re = re.compile(br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
+    pragma_re = re.compile(br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
+    xml_re = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
 
     declared_encodings = (charset_re.findall(page) +
             pragma_re.findall(page) +
             xml_re.findall(page))
 
     # Try any declared encodings
-    if len(declared_encodings) > 0:
-        for declared_encoding in declared_encodings:
-            try:
-                page.decode(custom_decode(declared_encoding))
-                return custom_decode(declared_encoding)
-            except UnicodeDecodeError:
-                pass
+    for declared_encoding in declared_encodings:
+        try:
+            if sys.version_info[0] == 3:
+                # declared_encoding will actually be bytes but .decode() only
+                # accepts `str` type. Decode blindly with ascii because no one should
+                # ever use non-ascii characters in the name of an encoding.
+                declared_encoding = declared_encoding.decode('ascii', 'replace')
+
+            page.decode(custom_decode(declared_encoding))
+            return custom_decode(declared_encoding)
+        except UnicodeDecodeError:
+            pass
 
     # Fallback to chardet if declared encodings fail
-    text = re.sub('</?[^>]*>\s*', ' ', page)
+    text = re.sub(b'</?[^>]*>\s*', b' ', page)
     enc = 'utf-8'
     if not text.strip() or len(text) < 10:
         return enc # can't guess
diff --git a/readability/htmls.py b/readability/htmls.py
index 536b21b..292b4bb 100644
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -8,8 +8,11 @@ from .encoding import get_encoding
 
 utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
 
+if sys.version_info[0] == 2:
+    str = unicode
+
 def build_doc(page):
-    if isinstance(page, unicode):
+    if isinstance(page, str):
         enc = None
         page_unicode = page
     else:
@@ -33,7 +36,7 @@ def normalize_entities(cur_title):
         u'\u00BB': '"',
         u'&quot;': '"',
     }
-    for c, r in entities.iteritems():
+    for c, r in entities.items():
         if c in cur_title:
             cur_title = cur_title.replace(c, r)
 
@@ -105,7 +108,7 @@ def shorten_title(doc):
 
 def get_body(doc):
     [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
-    raw_html = unicode(tostring(doc.body or doc))
+    raw_html = str(tostring(doc.body or doc))
     cleaned = clean_attributes(raw_html)
     try:
         #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
diff --git a/readability/readability.py b/readability/readability.py
index ec4835c..993d972 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+from __future__ import print_function
 import logging
 import re
 import sys
@@ -19,6 +20,8 @@ from .htmls import shorten_title
 
 log = logging.getLogger()
 
+if sys.version_info[0] == 2:
+    str = unicode
 
 REGEXES = {
     'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
@@ -80,11 +83,12 @@ regexp_type = type(re.compile('hello, world'))
 def compile_pattern(elements):
     if not elements:
         return None
-    if isinstance(elements, regexp_type):
+    elif isinstance(elements, regexp_type):
         return elements
-    if isinstance(elements, basestring):
+    else:
+        # assume string or string like object
         elements = elements.split(',')
-    return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
+        return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
 
 class Document:
     """Class to build a etree document out of html."""
@@ -194,9 +198,13 @@ class Document:
                     continue
                 else:
                     return cleaned_article
-        except StandardError, e:
+        except Exception as e:
             log.exception('error getting summary: ')
-            raise Unparseable(str(e)), None, sys.exc_info()[2]
+            if sys.version_info[0] == 2:
+                from .compat.two import raise_with_traceback
+            else:
+                from .compat.three import raise_with_traceback
+            raise_with_traceback(Unparseable, sys.exc_info()[2], str(e))
 
     def get_article(self, candidates, best_candidate, html_partial=False):
         # Now that we have the top candidate, look through its siblings for
@@ -389,7 +397,7 @@ class Document:
             # This results in incorrect results in case there is an <img>
             # buried within an <a> for example
             if not REGEXES['divToPElementsRe'].search(
-                    unicode(''.join(map(tostring, list(elem))))):
+                    str(''.join(map(str, map(tostring, list(elem)))))):
                 #self.debug("Altering %s to p" % (describe(elem)))
                 elem.tag = "p"
                 #print "Fixed element "+describe(elem)
@@ -612,18 +620,18 @@ def main():
 
     file = None
     if options.url:
-        import urllib
-        file = urllib.urlopen(options.url)
+        import urllib.request, urllib.parse, urllib.error
+        file = urllib.request.urlopen(options.url)
     else:
         file = open(args[0], 'rt')
     enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING
     try:
-        print Document(file.read(),
+        print(Document(file.read(),
            debug=options.verbose,
            url=options.url,
            positive_keywords = options.positive_keywords,
            negative_keywords = options.negative_keywords,
-        ).summary().encode(enc, 'replace')
+        ).summary().encode(enc, 'replace'))
     finally:
         file.close()
 
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..d6e1198
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+-e .
diff --git a/setup.py b/setup.py
index b55fae5..27c422d 100755
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+from __future__ import print_function
 from setuptools import setup, find_packages
 import sys
 
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..50b4a74
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,20 @@
+# Tox (http://tox.testrun.org/) is a tool for running tests
+# in multiple virtualenvs. This configuration file will run the
+# test suite on all supported python versions. To use it, "pip install tox"
+# and then run "tox" from this directory.
+
+[tox]
+envlist = py26, py27, py33, py34
+
+[testenv]
+deps=pytest
+# This creates the virtual envs with --site-packages so packages that are
+# already installed will be reused. This is especially useful on Windows.
+# Since we use lxml, instead of compiling it locally (which in turn requires
+# a compiler and the build dependencies), you can download it from
+# http://www.lfd.uci.edu/~gohlke/pythonlibs/#lxml and install it via
+# $PYTHONDIR\Scripts\pip.exe install *.whl
+sitepackages=True
+commands =
+    pip install -r requirements.txt
+    py.test
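
Reviewer note: the sketch below is a minimal, self-contained illustration of how the version-conditional `raise_with_traceback` helper added under `readability/compat/` is meant to be used by `Document.summary()`. Python 2's three-argument `raise exc, None, tb` form is a syntax error under Python 3, and `.with_traceback()` does not exist under Python 2, so the patch keeps one implementation per interpreter and imports the matching one at the call site. The `Unparseable` stand-in, the `summarize()` wrapper, and the inlined Python 3 helper are demo-only assumptions, not part of the patch itself.

```python
import sys
import traceback


class Unparseable(Exception):
    """Demo stand-in for readability.readability.Unparseable."""


def raise_with_traceback(exc_type, tb, *args, **kwargs):
    """Python 3 variant, mirroring readability/compat/three.py (inlined for the demo)."""
    raise exc_type(*args, **kwargs).with_traceback(tb)


def summarize(html):
    """Hypothetical stand-in for the error handling in Document.summary()."""
    try:
        # Simulate the parser blowing up somewhere deep in the extraction code.
        raise ValueError("no parseable content in %r" % html)
    except Exception as e:
        # In the patch this helper is imported from readability.compat.two or
        # readability.compat.three depending on sys.version_info[0].
        raise_with_traceback(Unparseable, sys.exc_info()[2], str(e))


if __name__ == "__main__":
    try:
        summarize("<p>")
    except Unparseable:
        # The Unparseable's own traceback still ends at the original
        # `raise ValueError` line inside summarize(), because the captured
        # traceback object was re-attached to the new exception.
        traceback.print_exc()
```

Under Python 2 the same call site picks up `readability.compat.two.raise_with_traceback` instead, which attaches the traceback via the three-argument `raise` statement.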