diff --git a/readability/encoding.py b/readability/encoding.py index c3bf26d..c6040eb 100644 --- a/readability/encoding.py +++ b/readability/encoding.py @@ -45,7 +45,6 @@ def get_encoding(page): except UnicodeDecodeError: log.info('Encoding "%s", specified in the document as "%s" ' 'didn\'t work' % (encoding, declared_encoding)) - print "Content encoding didn't work:", encoding # Fallback to chardet if declared encodings fail text = re.sub(']*>\s*', ' ', page) diff --git a/readability/readability.py b/readability/readability.py index 8807756..1167e34 100755 --- a/readability/readability.py +++ b/readability/readability.py @@ -17,8 +17,7 @@ from .htmls import shorten_title from encoding import get_encoding from debug import describe, text_content, open_in_browser -log = logging.getLogger('readbility.readability') -StandardError = Exception in python3 +log = logging.getLogger(__file__) REGEXES = { 'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I), @@ -190,7 +189,7 @@ class Document: continue else: return cleaned_article - except StandardError as e: + except Exception as e: log.exception('error getting summary: ') raise Unparseable(str(e)), None, sys.exc_info()[2] @@ -635,9 +634,8 @@ def main(): file = urllib.urlopen(options.url) else: file = open(args[0], 'rt') - output_encoding = sys.__stdout__.encoding or 'utf-8' - # XXX: a hack, better set PYTHONIOENCODING explicitly html = file.read() # bytes object + encoding = get_encoding(html) html = html.decode(encoding) try: @@ -648,6 +646,9 @@ def main(): result = 'Title: ' + doc.short_title() + '
' + doc.summary() open_in_browser(result) else: + # XXX: a hack, better to set PYTHONIOENCODING explicitly + output_encoding = sys.__stdout__.encoding or 'utf-8' + print 'Title:', doc.short_title().encode(output_encoding, 'replace') print doc.summary().encode(output_encoding, 'replace') finally: