|
|
|
import re
|
|
|
|
import chardet
|
|
|
|
import sys
|
|
|
|
|
|
|
|
|
|
|
|
# All three patterns operate on bytes and capture the encoding name as every
# byte up to the first quote character or ``>`` that follows it.  ``.`` does
# not match newlines here, so each declaration must sit on a single line.

# HTML ``<meta ... charset=...>``, case-insensitive.  Because ``.*?`` may skip
# other attributes, this also matches ``content="text/html; charset=..."``.
RE_CHARSET = re.compile(br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)

# HTML ``<meta ... content="charset=...">`` (optionally ``;charset=``) where
# ``charset=`` directly follows the opening of the ``content`` value.
RE_PRAGMA = re.compile(br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)

# XML declaration ``<?xml ... encoding="..."?>``.  Case-sensitive, and only
# recognised at the very start of the input.
RE_XML = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
|
|
|
|
|
|
|
|
# Maps a lower-cased encoding name (as declared by a page or reported by
# chardet) to the codec name that should be used in its place.
CHARSETS = {
    # Prefer the larger charset over the declared subset.
    "big5": "big5hkscs",
    "gb2312": "gb18030",
    "ascii": "utf-8",
    # NOTE(review): presumably works around Cyrillic pages being detected as
    # MacCyrillic -- confirm against chardet behaviour.
    "maccyrillic": "cp1251",
    # Spelling variants of Windows-1251 all resolve to the same codec name.
    "win1251": "cp1251",
    "win-1251": "cp1251",
    "windows-1251": "cp1251",
}
|
|
|
|
|
|
|
|
|
|
|
|
def fix_charset(encoding):
    """Return the codec name that should be used in place of *encoding*.

    Overrides the encoding when the declared or detected charset is a subset
    of a larger charset (created because of issues with Chinese websites).

    :param encoding: encoding name, in any letter case; surrounding
        whitespace is ignored.
    :returns: the replacement listed in ``CHARSETS`` if there is one,
        otherwise the stripped, lower-cased input.
    """
    # Strip before the lookup so that a name such as "gb2312 " (e.g. taken
    # from an unquoted attribute) still hits the override table.
    encoding = encoding.strip().lower()
    return CHARSETS.get(encoding, encoding)
|
|
|
|
|
|
|
|
|
|
|
|
def get_encoding(page):
    """Guess the character encoding of a raw HTML/XML document.

    Encodings declared in the document itself (``<meta>`` charset
    declarations and the XML declaration) are tried first, in that order.
    The first one that decodes the whole page without error wins.  If none
    does, the markup is stripped and the remaining text is handed to
    ``chardet``.

    :param page: the undecoded document, as bytes (the patterns used here
        are bytes patterns).
    :returns: an encoding name, after ``fix_charset`` overrides.  Falls back
        to ``"utf-8"`` when there is too little text to guess from or
        chardet reports no encoding.  A name coming from chardet is not
        checked against the page.
    """
    # Regex for XML and HTML Meta charset declaration
    declared_encodings = (
        RE_CHARSET.findall(page) + RE_PRAGMA.findall(page) + RE_XML.findall(page)
    )

    # Try any declared encodings
    tried = set()
    for declared_encoding in declared_encodings:
        if sys.version_info[0] == 3:
            # declared_encoding will actually be bytes but .decode() only
            # accepts `str` type. Decode blindly with ascii because no one
            # should ever use non-ascii characters in the name of an encoding.
            declared_encoding = declared_encoding.decode("ascii", "replace")

        # Drop surrounding whitespace so the override table is consulted with
        # a clean name and the returned name carries no stray spaces.
        encoding = fix_charset(declared_encoding.strip())

        # Several patterns can capture the same declaration; decoding the
        # whole page again for a candidate that already failed is wasted work.
        if encoding in tried:
            continue
        tried.add(encoding)

        try:
            # Now let's decode the page
            page.decode(encoding)
        except (LookupError, ValueError):
            # LookupError: malformed declarations are not valid codec names,
            # e.g. "utf-8, ie=edge, chrome=1" raises
            # "LookupError: unknown encoding".
            # ValueError: covers UnicodeDecodeError (its subclass) when the
            # page is not valid in this encoding, and codec names Python
            # rejects outright, such as ones containing a NUL character.
            # Either way, move on to the next candidate or to chardet.
            continue
        # It worked!
        return encoding

    # Fallback to chardet if declared encodings fail
    # Remove all HTML tags, and leave only text for chardet
    text = re.sub(br"(\s*</?[^>]*>)+\s*", b" ", page).strip()
    if len(text) < 10:
        return "utf-8"  # can't guess

    res = chardet.detect(text)
    # chardet may report no encoding at all; default to utf-8 in that case.
    return fix_charset(res["encoding"] or "utf-8")
|