From 15f3692e68f8bd66affc898f8ce31aca86c3e886 Mon Sep 17 00:00:00 2001 From: Raphael Cohen Date: Tue, 4 Feb 2020 16:15:50 +0100 Subject: [PATCH] fix: Decodes bytes if needed in get_body --- readability/htmls.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/readability/htmls.py b/readability/htmls.py index 17a75c7..b2eb3ce 100644 --- a/readability/htmls.py +++ b/readability/htmls.py @@ -134,7 +134,9 @@ def get_body(doc): elem.drop_tree() # tostring() always return utf-8 encoded string # FIXME: isn't better to use tounicode? - raw_html = str_(tostring(doc.body or doc)) + raw_html = tostring(doc.body or doc) + if isinstance(raw_html, bytes): + raw_html = raw_html.decode() cleaned = clean_attributes(raw_html) try: # BeautifulSoup(cleaned) #FIXME do we really need to try loading it?