Fixes regex declaration in get_encoding.

Since get_encoding() is only called when the input is *not* already unicode, we need to declare the regexes as bytes patterns so they continue to work in Python 3.
pull/64/head
Martin Thurau 9 years ago
parent ce7ca26835
commit 046d2c10c3

@ -3,9 +3,9 @@ import chardet
def get_encoding(page):
# Regex for XML and HTML Meta charset declaration
charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
charset_re = re.compile(br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
pragma_re = re.compile(br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
xml_re = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
declared_encodings = (charset_re.findall(page) +
pragma_re.findall(page) +
@ -21,7 +21,7 @@ def get_encoding(page):
pass
# Fallback to chardet if declared encodings fail
text = re.sub('</?[^>]*>\s*', ' ', page)
text = re.sub(b'</?[^>]*>\s*', b' ', page)
enc = 'utf-8'
if not text.strip() or len(text) < 10:
return enc # can't guess

Loading…
Cancel
Save