Merge pull request #64 from martinth/master

Added Python 3 support (supported: Python 2.6, 2.7, 3.3, 3.4).
Thanks a lot to @martinth
Yuri Baburov 9 years ago
commit 154658798b

.gitignore

@@ -9,3 +9,6 @@ dist
 /man
 nosetests.xml
 .coverage
+.tox
+.idea
+.cache

readability/compat/__init__.py
@@ -0,0 +1,6 @@
+"""
+This module contains compatibility helpers for Python 2/3 interoperability.
+It mainly exists because there are certain incompatibilities in the Python
+syntax that can only be solved by conditionally importing different functions.
+"""

readability/compat/three.py
@@ -0,0 +1,6 @@
+def raise_with_traceback(exc_type, traceback, *args, **kwargs):
+    """
+    Raise a new exception of type `exc_type` with an existing `traceback`. All
+    additional (keyword-)arguments are forwarded to `exc_type`.
+    """
+    raise exc_type(*args, **kwargs).with_traceback(traceback)

readability/compat/two.py
@@ -0,0 +1,6 @@
+def raise_with_traceback(exc_type, traceback, *args, **kwargs):
+    """
+    Raise a new exception of type `exc_type` with an existing `traceback`. All
+    additional (keyword-)arguments are forwarded to `exc_type`.
+    """
+    raise exc_type(*args, **kwargs), None, traceback
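
Note for context: the two implementations must live in separate files because neither body can even be parsed by the other interpreter; the Python 2 three-expression raise is a SyntaxError on Python 3, and `.with_traceback()` does not exist on Python 2. A minimal sketch of how the helper is selected and used, assuming the `readability/compat` package layout from this PR (`Unparseable`, `do_parse` and `parse` here are illustrative stand-ins, not the PR's code):

    import sys
    import traceback

    # Select the implementation whose syntax the running interpreter can parse.
    if sys.version_info[0] == 2:
        from readability.compat.two import raise_with_traceback
    else:
        from readability.compat.three import raise_with_traceback

    class Unparseable(ValueError):
        """Local stand-in for readability's own Unparseable error."""

    def do_parse(html):
        # Hypothetical inner step that fails somewhere deep in the call stack.
        raise ValueError("no candidates found")

    def parse(html):
        try:
            return do_parse(html)
        except Exception as e:
            # Re-raise as Unparseable but keep the original traceback, so the
            # failing frame inside do_parse() still shows up in error reports.
            raise_with_traceback(Unparseable, sys.exc_info()[2], str(e))

    try:
        parse("<html></html>")
    except Unparseable:
        traceback.print_exc()  # traceback still includes the do_parse() frame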

readability/encoding.py
@@ -1,27 +1,33 @@
 import re
 import chardet
+import sys
 
 def get_encoding(page):
     # Regex for XML and HTML Meta charset declaration
-    charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
-    pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
-    xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
+    charset_re = re.compile(br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
+    pragma_re = re.compile(br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
+    xml_re = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
 
     declared_encodings = (charset_re.findall(page) +
                           pragma_re.findall(page) +
                           xml_re.findall(page))
 
     # Try any declared encodings
-    if len(declared_encodings) > 0:
-        for declared_encoding in declared_encodings:
-            try:
-                page.decode(custom_decode(declared_encoding))
-                return custom_decode(declared_encoding)
-            except UnicodeDecodeError:
-                pass
+    for declared_encoding in declared_encodings:
+        try:
+            if sys.version_info[0] == 3:
+                # declared_encoding will actually be bytes but .decode() only
+                # accepts `str` type. Decode blindly with ascii because no one should
+                # ever use non-ascii characters in the name of an encoding.
+                declared_encoding = declared_encoding.decode('ascii', 'replace')
+            page.decode(custom_decode(declared_encoding))
+            return custom_decode(declared_encoding)
+        except UnicodeDecodeError:
+            pass
 
     # Fallback to chardet if declared encodings fail
-    text = re.sub('</?[^>]*>\s*', ' ', page)
+    text = re.sub(b'</?[^>]*>\s*', b' ', page)
     enc = 'utf-8'
     if not text.strip() or len(text) < 10:
         return enc  # can't guess
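
Context on the `br'...'` change: under Python 3 the page arrives as bytes, and `re` refuses to match a str pattern against bytes input, so the patterns themselves must be bytes; `findall` then yields bytes groups, which is why the declared encoding is blindly ASCII-decoded before it can be passed to `page.decode()`. A small self-contained sketch (the page content is hypothetical):

    import re

    # On Python 3 the raw page is bytes, so the pattern must be bytes as well:
    charset_re = re.compile(br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)

    page = b'<html><head><meta charset="iso-8859-1"></head></html>'
    matches = charset_re.findall(page)
    print(matches)  # [b'iso-8859-1'] -- the captured group is bytes, not str

    # Mixing a str pattern with bytes input raises TypeError on Python 3:
    # re.findall(r'charset=(.+?)"', page)
    #   -> TypeError: cannot use a string pattern on a bytes-like object

    # Hence the ASCII decode before the name is usable as an encoding:
    declared = matches[0].decode('ascii', 'replace')
    page.decode(declared)  # validates that the declared encoding actually works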

readability/htmls.py
@@ -8,8 +8,11 @@ from .encoding import get_encoding
 utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
 
+if sys.version_info[0] == 2:
+    str = unicode
+
 def build_doc(page):
-    if isinstance(page, unicode):
+    if isinstance(page, str):
         enc = None
         page_unicode = page
     else:
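
The `str = unicode` rebinding makes the single name `str` mean "text type" on both interpreters, so one `isinstance(page, str)` check works everywhere. The trick in isolation (a sketch; `is_text` is an illustrative helper, not part of the diff):

    import sys

    if sys.version_info[0] == 2:
        str = unicode  # noqa: F821 -- rebinds the module-global name, Python 2 only

    def is_text(value):
        # After the rebinding, this checks `unicode` on Python 2 and the
        # built-in `str` on Python 3 with identical source.
        return isinstance(value, str)

    print(is_text(u'caf\u00e9'))   # True on both versions (decoded text)
    print(is_text(b'raw bytes'))   # False on both versions (encoded bytes)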
@@ -33,7 +36,7 @@ def normalize_entities(cur_title):
         u'\u00BB': '"',
         u'&quot;': '"',
     }
-    for c, r in entities.iteritems():
+    for c, r in entities.items():
         if c in cur_title:
             cur_title = cur_title.replace(c, r)
@@ -105,7 +108,7 @@ def shorten_title(doc):
 def get_body(doc):
     [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
-    raw_html = unicode(tostring(doc.body or doc))
+    raw_html = str(tostring(doc.body or doc))
     cleaned = clean_attributes(raw_html)
     try:
         #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?

readability/readability.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+from __future__ import print_function
 import logging
 import re
 import sys
@@ -19,6 +20,8 @@ from .htmls import shorten_title
 log = logging.getLogger()
 
+if sys.version_info[0] == 2:
+    str = unicode
+
 REGEXES = {
     'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
@@ -80,11 +83,12 @@ regexp_type = type(re.compile('hello, world'))
 def compile_pattern(elements):
     if not elements:
         return None
-    if isinstance(elements, regexp_type):
+    elif isinstance(elements, regexp_type):
         return elements
-    if isinstance(elements, basestring):
+    else:
+        # assume string or string like object
         elements = elements.split(',')
-    return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
+        return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
 
 class Document:
     """Class to build a etree document out of html."""
@@ -194,9 +198,13 @@ class Document:
                     continue
                 else:
                     return cleaned_article
-        except StandardError, e:
+        except Exception as e:
             log.exception('error getting summary: ')
-            raise Unparseable(str(e)), None, sys.exc_info()[2]
+            if sys.version_info[0] == 2:
+                from .compat.two import raise_with_traceback
+            else:
+                from .compat.three import raise_with_traceback
+            raise_with_traceback(Unparseable, sys.exc_info()[2], str(e))
 
     def get_article(self, candidates, best_candidate, html_partial=False):
         # Now that we have the top candidate, look through its siblings for
@@ -389,7 +397,7 @@ class Document:
             # This results in incorrect results in case there is an <img>
             # buried within an <a> for example
             if not REGEXES['divToPElementsRe'].search(
-                    unicode(''.join(map(tostring, list(elem))))):
+                    str(''.join(map(str, map(tostring, list(elem)))))):
                 #self.debug("Altering %s to p" % (describe(elem)))
                 elem.tag = "p"
                 #print "Fixed element "+describe(elem)
@@ -612,18 +620,18 @@ def main():
     file = None
     if options.url:
-        import urllib
-        file = urllib.urlopen(options.url)
+        import urllib.request, urllib.parse, urllib.error
+        file = urllib.request.urlopen(options.url)
     else:
         file = open(args[0], 'rt')
     enc = sys.__stdout__.encoding or 'utf-8'  # XXX: this hack could not always work, better to set PYTHONIOENCODING
     try:
-        print Document(file.read(),
+        print(Document(file.read(),
             debug=options.verbose,
             url=options.url,
             positive_keywords = options.positive_keywords,
             negative_keywords = options.negative_keywords,
-        ).summary().encode(enc, 'replace')
+        ).summary().encode(enc, 'replace'))
     finally:
         file.close()
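
Background: Python 3 split the old `urllib` module into `urllib.request`, `urllib.parse` and `urllib.error`, which the new imports reflect. Where a single code path must resolve on both major versions, the usual idiom is a guarded import; a sketch of that common alternative (not what this diff does):

    # Guarded import: resolves to the right module layout on either version.
    try:
        from urllib.request import urlopen   # Python 3
    except ImportError:
        from urllib import urlopen           # Python 2

    print(urlopen.__module__)  # 'urllib.request' on 3.x, 'urllib' on 2.x

    # Usage mirroring main() (URL is hypothetical; commented out to stay offline):
    # fh = urlopen('http://example.com/')
    # page = fh.read()  # bytes on Python 3, which suits get_encoding() above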

setup.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+from __future__ import print_function
 from setuptools import setup, find_packages
 import sys

tox.ini
@@ -0,0 +1,20 @@
+# Tox (http://tox.testrun.org/) is a tool for running tests
+# in multiple virtualenvs. This configuration file will run the
+# test suite on all supported python versions. To use it, "pip install tox"
+# and then run "tox" from this directory.
+
+[tox]
+envlist = py26, py27, py33, py34
+
+[testenv]
+deps=pytest
+# This creates the virtual envs with --site-packages so that packages which
+# are already installed will be reused. This is especially useful on Windows.
+# Since we use lxml, instead of compiling it locally (which in turn requires
+# a compiler and the build dependencies) you can download it from
+# http://www.lfd.uci.edu/~gohlke/pythonlibs/#lxml and install it via
+# $PYTHONDIR\Scripts\pip.exe install *.whl
+sitepackages=True
+commands =
+    pip install -r requirements.txt
+    py.test