From 75e2e0cb3ab86f4a1231a141c724c19fdb4839c9 Mon Sep 17 00:00:00 2001
From: Nathan Breit
Date: Thu, 18 Dec 2014 18:48:22 -0800
Subject: [PATCH] Defaulting to utf-8 when chardet returns None

On articles like this one, chardet returns None:
http://news.zing.vn/nhip-song-tre/thay-giao-gay-sot-tung-bo-luat-tinh-yeu/a291427.html
This causes exceptions later on when encoding.lower() is called.
---
 readability/encoding.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/readability/encoding.py b/readability/encoding.py
index a72c34d..fb4761d 100644
--- a/readability/encoding.py
+++ b/readability/encoding.py
@@ -26,7 +26,7 @@ def get_encoding(page):
     if not text.strip() or len(text) < 10:
         return enc # can't guess
     res = chardet.detect(text)
-    enc = res['encoding']
+    enc = res['encoding'] or 'utf-8'
     #print '->', enc, "%.2f" % res['confidence']
     enc = custom_decode(enc)
     return enc
@@ -45,4 +45,4 @@ def custom_decode(encoding):
     if encoding in alternates:
         return alternates[encoding]
     else:
-        return encoding
\ No newline at end of file
+        return encoding