Move UTF-8 element serialization into readability.compat as tostring_().

What this patch does:
  * readability/compat/__init__.py gains a version-specific tostring_(s)
    helper. On Python 2 it serializes with lxml's tostring(encoding='utf-8')
    and decodes the result to unicode; on Python 3 it returns the
    tostring(encoding='utf-8') result as-is.
  * readability/readability.py stops importing tostring from lxml.etree,
    imports tostring_ from .compat instead, and uses it in place of the
    inline lambda inside the divToPElementsRe check.

NOTE(review): on Python 3 the call site still wraps the joined bytes in
str_() (which is str there), so the regex is searched against the bytes
repr ("b'...'") rather than decoded text -- confirm this is intended.

NOTE(review): this patch was recovered from a copy whose newlines had been
collapsed. Blank context lines and indentation were reconstructed to match
the hunk line counts, and the <img>/<a> tokens in the last hunk's comment
context were restored by inference -- confirm against the target files
before applying.

diff --git a/readability/compat/__init__.py b/readability/compat/__init__.py
index 900f819..c648633 100644
--- a/readability/compat/__init__.py
+++ b/readability/compat/__init__.py
@@ -5,11 +5,16 @@ It mainly exists because their are certain incompatibilities in the Python
 syntax that can only be solved by conditionally importing different functions.
 """
 import sys
+from lxml.etree import tostring
 
 if sys.version_info[0] == 2:
     bytes_ = str
     str_ = unicode
+    def tostring_(s):
+        return tostring(s, encoding='utf-8').decode('utf-8')
 
 elif sys.version_info[0] == 3:
     bytes_ = bytes
     str_ = str
+    def tostring_(s):
+        return tostring(s, encoding='utf-8')
diff --git a/readability/readability.py b/readability/readability.py
index eca389d..6676d57 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -4,7 +4,6 @@ import logging
 import re
 import sys
 
-from lxml.etree import tostring
 from lxml.etree import tounicode
 from lxml.html import document_fromstring
 from lxml.html import fragment_fromstring
@@ -15,7 +14,7 @@ from .htmls import build_doc
 from .htmls import get_body
 from .htmls import get_title
 from .htmls import shorten_title
-from .compat import str_, bytes_
+from .compat import str_, bytes_, tostring_
 from .debug import describe, text_content
 
 
@@ -464,7 +463,7 @@ class Document:
             # This results in incorrect results in case there is an <img>
             # buried within an <a> for example
             if not REGEXES["divToPElementsRe"].search(
-                str_(b"".join(map(lambda it: tostring(it, encoding="utf-8"), list(elem))))
+                str_(b"".join(map(tostring_, list(elem))))
             ):
                 # log.debug("Altering %s to p" % (describe(elem)))
                 elem.tag = "p"