Merge pull request #147 from anekos/fix/UnicodeDecodeError-on-python2

Fix UnicodeDecodeError on python2
4 years ago · 1e3b8504bb
parent e4a699bbb0 667114463d
commit 1e3b8504bb
2 changed files with 7 additions and 3 deletions
--- a/readability/compat/__init__.py
+++ b/readability/compat/__init__.py
@@ -5,11 +5,16 @@ It mainly exists because there are certain incompatibilities in the Python
 syntax that can only be solved by conditionally importing different functions.
 """
 import sys
+from lxml.etree import tostring

 if sys.version_info[0] == 2:
    bytes_ = str
    str_ = unicode
+    def tostring_(s):
+        return tostring(s, encoding='utf-8').decode('utf-8')

 elif sys.version_info[0] == 3:
    bytes_ = bytes
    str_ = str
+    def tostring_(s):
+        return tostring(s, encoding='utf-8')
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -4,7 +4,6 @@ import logging
 import re
 import sys

-from lxml.etree import tostring
 from lxml.etree import tounicode
 from lxml.html import document_fromstring
 from lxml.html import fragment_fromstring
@@ -15,7 +14,7 @@ from .htmls import build_doc
 from .htmls import get_body
 from .htmls import get_title
 from .htmls import shorten_title
-from .compat import str_, bytes_
+from .compat import str_, bytes_, tostring_
 from .debug import describe, text_content


@@ -464,7 +463,7 @@ class Document:
            # This results in incorrect results in case there is an <img>
            # buried within an <a> for example
            if not REGEXES["divToPElementsRe"].search(
-                str_(b"".join(map(lambda it: tostring(it, encoding="utf-8"), list(elem))))
+                str_(b"".join(map(tostring_, list(elem))))
            ):
                # log.debug("Altering %s to p" % (describe(elem)))
                elem.tag = "p"