Fixes regex declarations in get_encoding.

Since get_encoding() is only called when the input is *not* already Unicode, we need to declare the regexes as bytes patterns so they continue to work in Python 3, where applying a str pattern to a bytes object raises a TypeError.
9 years ago · 046d2c10c3
parent ce7ca26835
commit 046d2c10c3
1 changed files with 4 additions and 4 deletions
--- a/readability/encoding.py
+++ b/readability/encoding.py
@ -3,9 +3,9 @@ import chardet

 def get_encoding(page):
    # Regex for XML and HTML Meta charset declaration
-    charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
-    pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
-    xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
+    charset_re = re.compile(br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
+    pragma_re = re.compile(br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
+    xml_re = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]')

    declared_encodings = (charset_re.findall(page) +
            pragma_re.findall(page) +
@ -21,7 +21,7 @@ def get_encoding(page):
                pass

    # Fallback to chardet if declared encodings fail
-    text = re.sub('</?[^>]*>\s*', ' ', page)
+    text = re.sub(b'</?[^>]*>\s*', b' ', page)
    enc = 'utf-8'
    if not text.strip() or len(text) < 10:
        return enc # can't guess