From 3a43a3fe7e65698bfbbf22f19b3cf70425d34bb6 Mon Sep 17 00:00:00 2001
From: Mark Perdomo
Date: Tue, 13 May 2014 15:09:47 +0800
Subject: [PATCH] Added code to check declared encodings first and check them from kennethreitz/requests/utils.py. Also I added some superset encodings I have found in Chinese pages that are mishandled by chardet/character declarations.

---
 readability/encoding.py | 45 ++++++++++++++++++++++++++++++++---------
 1 file changed, 36 insertions(+), 9 deletions(-)

diff --git a/readability/encoding.py b/readability/encoding.py
index d05b7f4..a72c34d 100644
--- a/readability/encoding.py
+++ b/readability/encoding.py
@@ -2,20 +2,47 @@ import re
 import chardet
 
 def get_encoding(page):
+    # Regex for XML and HTML Meta charset declaration
+    charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
+    pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
+    xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
+
+    declared_encodings = (charset_re.findall(page) +
+            pragma_re.findall(page) +
+            xml_re.findall(page))
+
+    # Try any declared encodings
+    if len(declared_encodings) > 0:
+        for declared_encoding in declared_encodings:
+            try:
+                page.decode(custom_decode(declared_encoding))
+                return custom_decode(declared_encoding)
+            except UnicodeDecodeError:
+                pass
+
+    # Fallback to chardet if declared encodings fail
     text = re.sub('</?[^>]*>\s*', ' ', page)
     enc = 'utf-8'
     if not text.strip() or len(text) < 10:
         return enc # can't guess
-    try:
-        diff = text.decode(enc, 'ignore').encode(enc)
-        sizes = len(diff), len(text)
-        if abs(len(text) - len(diff)) < max(sizes) * 0.01: # 99% of utf-8
-            return enc
-    except UnicodeDecodeError:
-        pass
     res = chardet.detect(text)
     enc = res['encoding']
     #print '->', enc, "%.2f" % res['confidence']
-    if enc == 'MacCyrillic':
-        enc = 'cp1251'
+    enc = custom_decode(enc)
     return enc
+
+def custom_decode(encoding):
+    """Overrides encoding when charset declaration
+       or charset determination is a subset of a larger
+       charset.  Created because of issues with Chinese websites"""
+    encoding = encoding.lower()
+    alternates = {
+        'big5': 'big5hkscs',
+        'gb2312': 'gb18030',
+        'ascii': 'utf-8',
+        'MacCyrillic': 'cp1251',
+    }
+    if encoding in alternates:
+        return alternates[encoding]
+    else:
+        return encoding
\ No newline at end of file