Fixes checking of declared encodings in get_encoding.

In Python 3, .decode() on bytes requires the name of the encoding to be a str, which means we have to convert the extracted encoding before we can use it.
pull/64/head
Martin Thurau 9 years ago
parent 046d2c10c3
commit 386e48d29b

@ -1,5 +1,6 @@
import re
import chardet
import sys
def get_encoding(page):
# Regex for XML and HTML Meta charset declaration
@ -12,13 +13,18 @@ def get_encoding(page):
xml_re.findall(page))
# Try any declared encodings
if len(declared_encodings) > 0:
for declared_encoding in declared_encodings:
try:
page.decode(custom_decode(declared_encoding))
return custom_decode(declared_encoding)
except UnicodeDecodeError:
pass
for declared_encoding in declared_encodings:
try:
if sys.version_info[0] == 3:
# declared_encoding will actually be bytes but .decode() only
# accepts `str` type. Decode blindly with ascii because no one should
# ever use non-ascii characters in the name of an encoding.
declared_encoding = declared_encoding.decode('ascii', 'replace')
page.decode(custom_decode(declared_encoding))
return custom_decode(declared_encoding)
except UnicodeDecodeError:
pass
# Fallback to chardet if declared encodings fail
text = re.sub(b'</?[^>]*>\s*', b' ', page)

Loading…
Cancel
Save