Several updates from the dev version.

dev
Yuri Baburov 9 years ago
parent 40e430c27d
commit e8f86bdcf9

@ -45,7 +45,6 @@ def get_encoding(page):
except UnicodeDecodeError:
log.info('Encoding "%s", specified in the document as "%s" '
'didn\'t work' % (encoding, declared_encoding))
print "Content encoding didn't work:", encoding
# Fallback to chardet if declared encodings fail
text = re.sub('</?[^>]*>\s*', ' ', page)

@ -17,8 +17,7 @@ from .htmls import shorten_title
from encoding import get_encoding
from debug import describe, text_content, open_in_browser
log = logging.getLogger('readbility.readability')
StandardError = Exception in python3
log = logging.getLogger(__file__)
REGEXES = {
'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
@ -190,7 +189,7 @@ class Document:
continue
else:
return cleaned_article
except StandardError as e:
except Exception as e:
log.exception('error getting summary: ')
raise Unparseable(str(e)), None, sys.exc_info()[2]
@ -635,9 +634,8 @@ def main():
file = urllib.urlopen(options.url)
else:
file = open(args[0], 'rt')
output_encoding = sys.__stdout__.encoding or 'utf-8'
# XXX: a hack, better set PYTHONIOENCODING explicitly
html = file.read() # bytes object
encoding = get_encoding(html)
html = html.decode(encoding)
try:
@ -648,6 +646,9 @@ def main():
result = 'Title: ' + doc.short_title() + '<br/>' + doc.summary()
open_in_browser(result)
else:
# XXX: a hack, better to set PYTHONIOENCODING explicitly
output_encoding = sys.__stdout__.encoding or 'utf-8'
print 'Title:', doc.short_title().encode(output_encoding, 'replace')
print doc.summary().encode(output_encoding, 'replace')
finally:

Loading…
Cancel
Save