From 386e48d29b28e1c988cf676a33bbbfb5a41b038a Mon Sep 17 00:00:00 2001 From: Martin Thurau Date: Thu, 30 Apr 2015 11:47:32 +0200 Subject: [PATCH] Fixes checking of declared encodings in get_encoding. In Python 3, .decode() on bytes requires the name of the encoding to be a str type, which means we have to convert the extracted encoding before we can use it. --- readability/encoding.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/readability/encoding.py b/readability/encoding.py index 1c1e505..b91c3e2 100644 --- a/readability/encoding.py +++ b/readability/encoding.py @@ -1,5 +1,6 @@ import re import chardet +import sys def get_encoding(page): # Regex for XML and HTML Meta charset declaration @@ -12,13 +13,18 @@ def get_encoding(page): xml_re.findall(page)) # Try any declared encodings - if len(declared_encodings) > 0: - for declared_encoding in declared_encodings: - try: - page.decode(custom_decode(declared_encoding)) - return custom_decode(declared_encoding) - except UnicodeDecodeError: - pass + for declared_encoding in declared_encodings: + try: + if sys.version_info[0] == 3: + # declared_encoding will actually be bytes but .decode() only + # accepts `str` type. Decode blindly with ascii because no one should + # ever use non-ascii characters in the name of an encoding. + declared_encoding = declared_encoding.decode('ascii', 'replace') + + page.decode(custom_decode(declared_encoding)) + return custom_decode(declared_encoding) + except UnicodeDecodeError: + pass # Fallback to chardet if declared encodings fail text = re.sub(b'</?[^>]*>\s*', b' ', page)