From 386e48d29b28e1c988cf676a33bbbfb5a41b038a Mon Sep 17 00:00:00 2001
From: Martin Thurau <martin.thurau@gmail.com>
Date: Thu, 30 Apr 2015 11:47:32 +0200
Subject: [PATCH] Fixes checking of declared encodings in get_encoding.

In Python 3, .decode() on bytes requires the name of the encoding to be a str, which means we have to convert the extracted encoding name before we can use it.
---
 readability/encoding.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/readability/encoding.py b/readability/encoding.py
index 1c1e505..b91c3e2 100644
--- a/readability/encoding.py
+++ b/readability/encoding.py
@@ -1,5 +1,6 @@
 import re
 import chardet
+import sys
 
 def get_encoding(page):
     # Regex for XML and HTML Meta charset declaration
@@ -12,13 +13,18 @@ def get_encoding(page):
             xml_re.findall(page))
 
     # Try any declared encodings
-    if len(declared_encodings) > 0:
-        for declared_encoding in declared_encodings:
-            try:
-                page.decode(custom_decode(declared_encoding))
-                return custom_decode(declared_encoding)
-            except UnicodeDecodeError:
-                pass
+    for declared_encoding in declared_encodings:
+        try:
+            if sys.version_info[0] == 3:
+                # declared_encoding will actually be bytes but .decode() only
+                # accepts `str` type. Decode blindly with ascii because no one should
+                # ever use non-ascii characters in the name of an encoding.
+                declared_encoding = declared_encoding.decode('ascii', 'replace')
+
+            page.decode(custom_decode(declared_encoding))
+            return custom_decode(declared_encoding)
+        except UnicodeDecodeError:
+            pass
 
     # Fallback to chardet if declared encodings fail
     text = re.sub(b'</?[^>]*>\s*', b' ', page)