From 04423589420a563cc0244e2b5a282c279e48c47e Mon Sep 17 00:00:00 2001 From: Dario <4755+dariobig@users.noreply.github.com> Date: Sat, 16 Nov 2019 19:44:34 -0500 Subject: [PATCH] Catch LookupError in case of bad encoding string I've seen cases where a bad encoding string results in an error. Catching LookupError should solve the problem by falling back to `chardet` or `utf-8`. Here's one case: ``` textPayload: "Traceback (most recent call last): File "/opt/conda/lib/python3.7/site-packages/readability/readability.py", line 189, in summary self._html(True) File "/opt/conda/lib/python3.7/site-packages/readability/readability.py", line 132, in _html self.html = self._parse(self.input) File "/opt/conda/lib/python3.7/site-packages/readability/readability.py", line 141, in _parse doc, self.encoding = build_doc(input) File "/opt/conda/lib/python3.7/site-packages/readability/htmls.py", line 17, in build_doc encoding = get_encoding(page) or 'utf-8' File "/opt/conda/lib/python3.7/site-packages/readability/encoding.py", line 46, in get_encoding page.decode(encoding) LookupError: unknown encoding: utf-8, ie=edge, chrome=1 ``` --- readability/encoding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/readability/encoding.py b/readability/encoding.py index cc14320..ebacded 100644 --- a/readability/encoding.py +++ b/readability/encoding.py @@ -46,7 +46,7 @@ def get_encoding(page): page.decode(encoding) # It worked! return encoding - except UnicodeDecodeError: + except (UnicodeDecodeError, LookupError): pass # Fallback to chardet if declared encodings fail