Defaulting to utf-8 when chardet returns None

On articles like this one, chardet.detect() reports an encoding of None:
http://news.zing.vn/nhip-song-tre/thay-giao-gay-sot-tung-bo-luat-tinh-yeu/a291427.html
This causes an exception later on, when encoding.lower() is called on that None value.
pull/56/head
Nathan Breit 10 years ago
parent 0c2f29ed0d
commit 75e2e0cb3a

@@ -26,7 +26,7 @@ def get_encoding(page):
if not text.strip() or len(text) < 10: if not text.strip() or len(text) < 10:
return enc # can't guess return enc # can't guess
res = chardet.detect(text) res = chardet.detect(text)
enc = res['encoding'] enc = res['encoding'] or 'utf-8'
#print '->', enc, "%.2f" % res['confidence'] #print '->', enc, "%.2f" % res['confidence']
enc = custom_decode(enc) enc = custom_decode(enc)
return enc return enc
@@ -45,4 +45,4 @@ def custom_decode(encoding):
if encoding in alternates: if encoding in alternates:
return alternates[encoding] return alternates[encoding]
else: else:
return encoding return encoding

Loading…
Cancel
Save