Fixes checking of declared encodings in get_encoding.

In Python 3, .decode() on bytes requires the name of the encoding to be a str, which means we have to convert the extracted encoding from bytes to str before we can use it.
9 years ago · 386e48d29b
parent 046d2c10c3
commit 386e48d29b
1 changed files with 13 additions and 7 deletions
--- a/readability/encoding.py
+++ b/readability/encoding.py
@@ -1,5 +1,6 @@
 import re
 import chardet
+import sys

 def get_encoding(page):
    # Regex for XML and HTML Meta charset declaration
@@ -12,13 +13,18 @@ def get_encoding(page):
            xml_re.findall(page))

    # Try any declared encodings
-    if len(declared_encodings) > 0:
-        for declared_encoding in declared_encodings:
-            try:
-                page.decode(custom_decode(declared_encoding))
-                return custom_decode(declared_encoding)
-            except UnicodeDecodeError:
-                pass
+    for declared_encoding in declared_encodings:
+        try:
+            if sys.version_info[0] == 3:
+                # declared_encoding will actually be bytes but .decode() only
+                # accepts `str` type. Decode blindly with ascii because no one should
+                # ever use non-ascii characters in the name of an encoding.
+                declared_encoding = declared_encoding.decode('ascii', 'replace')
+
+            page.decode(custom_decode(declared_encoding))
+            return custom_decode(declared_encoding)
+        except UnicodeDecodeError:
+            pass

    # Fallback to chardet if declared encodings fail
    text = re.sub(b'</?[^>]*>\s*', b' ', page)