|
|
|
@ -3,9 +3,9 @@ import chardet
|
|
|
|
|
import sys
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Matches an HTML `<meta ... charset=...>` declaration and captures the
# charset value without its surrounding quotes. Case-insensitive; compiled
# from a bytes pattern, so it only matches bytes input.
RE_CHARSET = re.compile(
    br'<meta.*?charset=["\']*(.+?)["\'>]',
    re.IGNORECASE,
)
|
|
|
|
|
# Matches a `<meta ... content=...charset=...>` pragma and captures the
# charset value. Case-insensitive; bytes pattern, so bytes input only.
# NOTE(review): `charset=` must follow `content=` directly (only optional
# quotes and one optional `;` in between), so the common form
# `content="text/html; charset=..."` is not matched by this pattern.
RE_PRAGMA = re.compile(
    br'<meta.*?content=["\']*;?charset=(.+?)["\'>]',
    re.IGNORECASE,
)
|
|
|
|
|
# Matches an XML declaration (`<?xml ... encoding=...`) at the very start of
# the input and captures the encoding value. Case-sensitive; bytes pattern,
# so bytes input only.
RE_XML = re.compile(
    br'^<\?xml.*?encoding=["\']*(.+?)["\'>]',
)
|
|
|
|
|
# Matches an HTML `<meta ... charset=...>` declaration and captures the
# charset value without its surrounding quotes. Case-insensitive; compiled
# from a str pattern, so it only matches text (str) input.
RE_CHARSET = re.compile(
    r'<meta.*?charset=["\']*(.+?)["\'>]',
    re.IGNORECASE,
)
|
|
|
|
|
# Matches a `<meta ... content=...charset=...>` pragma and captures the
# charset value. Case-insensitive; str pattern, so text (str) input only.
# NOTE(review): `charset=` must follow `content=` directly (only optional
# quotes and one optional `;` in between), so the common form
# `content="text/html; charset=..."` is not matched by this pattern.
RE_PRAGMA = re.compile(
    r'<meta.*?content=["\']*;?charset=(.+?)["\'>]',
    re.IGNORECASE,
)
|
|
|
|
|
# Matches an XML declaration (`<?xml ... encoding=...`) at the very start of
# the input and captures the encoding value. Case-sensitive; str pattern,
# so text (str) input only.
RE_XML = re.compile(
    r'^<\?xml.*?encoding=["\']*(.+?)["\'>]',
)
|
|
|
|
|
|
|
|
|
|
CHARSETS = {
|
|
|
|
|
'big5': 'big5hkscs',
|
|
|
|
@ -34,6 +34,7 @@ def get_encoding(page):
|
|
|
|
|
# Try any declared encodings
|
|
|
|
|
for declared_encoding in declared_encodings:
|
|
|
|
|
try:
|
|
|
|
|
# Python3 only
|
|
|
|
|
if sys.version_info[0] == 3:
|
|
|
|
|
# declared_encoding will actually be bytes but .decode() only
|
|
|
|
|
# accepts `str` type. Decode blindly with ascii because no one should
|
|
|
|
@ -41,17 +42,16 @@ def get_encoding(page):
|
|
|
|
|
declared_encoding = declared_encoding.decode('ascii', 'replace')
|
|
|
|
|
|
|
|
|
|
encoding = fix_charset(declared_encoding)
|
|
|
|
|
|
|
|
|
|
# Now let's decode the page
|
|
|
|
|
page.decode(encoding)
|
|
|
|
|
# It worked!
|
|
|
|
|
return encoding
|
|
|
|
|
except (UnicodeDecodeError, LookupError):
|
|
|
|
|
except UnicodeDecodeError:
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
# Fallback to chardet if declared encodings fail
|
|
|
|
|
# Remove all HTML tags, and leave only text for chardet
|
|
|
|
|
text = re.sub(br'(\s*</?[^>]*>)+\s*', b' ', page).strip()
|
|
|
|
|
text = re.sub(r'(\s*</?[^>]*>)+\s*', ' ', page).strip()
|
|
|
|
|
enc = 'utf-8'
|
|
|
|
|
if len(text) < 10:
|
|
|
|
|
return enc # can't guess
|
|
|
|
|