You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
python-readability/readability/encoding.py

63 lines
2.0 KiB
Python

import re
import chardet
import sys
RE_CHARSET = re.compile(br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
RE_PRAGMA = re.compile(br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
RE_XML = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
CHARSETS = {
'big5': 'big5hkscs',
'gb2312': 'gb18030',
'ascii': 'utf-8',
'maccyrillic': 'cp1251',
'win1251': 'cp1251',
'win-1251': 'cp1251',
'windows-1251': 'cp1251',
}
def fix_charset(encoding):
"""Overrides encoding when charset declaration
or charset determination is a subset of a larger
charset. Created because of issues with Chinese websites"""
encoding = encoding.lower()
return CHARSETS.get(encoding, encoding)
def get_encoding(page):
# Regex for XML and HTML Meta charset declaration
declared_encodings = (RE_CHARSET.findall(page) +
RE_PRAGMA.findall(page) +
RE_XML.findall(page))
# Try any declared encodings
for declared_encoding in declared_encodings:
try:
if sys.version_info[0] == 3:
# declared_encoding will actually be bytes but .decode() only
# accepts `str` type. Decode blindly with ascii because no one should
# ever use non-ascii characters in the name of an encoding.
declared_encoding = declared_encoding.decode('ascii', 'replace')
encoding = fix_charset(declared_encoding)
# Now let's decode the page
page.decode(encoding)
# It worked!
return encoding
except (UnicodeDecodeError, LookupError):
pass
# Fallback to chardet if declared encodings fail
# Remove all HTML tags, and leave only text for chardet
text = re.sub(br'(\s*</?[^>]*>)+\s*', b' ', page).strip()
enc = 'utf-8'
if len(text) < 10:
return enc # can't guess
res = chardet.detect(text)
enc = res['encoding'] or 'utf-8'
#print '->', enc, "%.2f" % res['confidence']
enc = fix_charset(enc)
return enc