cleaned code

4 years ago · 6872c948e4
parent 14d4474f33
commit 6872c948e4
1 changed file with 6 additions and 6 deletions
--- a/readability/encoding.py
+++ b/readability/encoding.py
@@ -3,9 +3,9 @@ import chardet
 import sys


-RE_CHARSET = re.compile(br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
-RE_PRAGMA = re.compile(br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
-RE_XML = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
+RE_CHARSET = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
+RE_PRAGMA = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
+RE_XML = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')

 CHARSETS = {
    'big5': 'big5hkscs',
@@ -34,6 +34,7 @@ def get_encoding(page):
    # Try any declared encodings
    for declared_encoding in declared_encodings:
        try:
+            # Python3 only
            if sys.version_info[0] == 3:
                # declared_encoding will actually be bytes but .decode() only
                # accepts `str` type. Decode blindly with ascii because no one should
@@ -41,17 +42,16 @@ def get_encoding(page):
                declared_encoding = declared_encoding.decode('ascii', 'replace')

            encoding = fix_charset(declared_encoding)
-
            # Now let's decode the page
            page.decode(encoding)
            # It worked!
            return encoding
-        except (UnicodeDecodeError, LookupError):
+        except UnicodeDecodeError:
            pass

    # Fallback to chardet if declared encodings fail
    # Remove all HTML tags, and leave only text for chardet
-    text = re.sub(br'(\s*</?[^>]*>)+\s*', b' ', page).strip()
+    text = re.sub(r'(\s*</?[^>]*>)+\s*', ' ', page).strip()
    enc = 'utf-8'
    if len(text) < 10:
        return enc # can't guess