Fix UnicodeDecodeError on python2

pull/147/head
anekos 4 years ago
parent e4a699bbb0
commit 667114463d

@@ -5,11 +5,16 @@ It mainly exists because there are certain incompatibilities in the Python
syntax that can only be solved by conditionally importing different functions. syntax that can only be solved by conditionally importing different functions.
""" """
import sys import sys
from lxml.etree import tostring
if sys.version_info[0] == 2: if sys.version_info[0] == 2:
bytes_ = str bytes_ = str
str_ = unicode str_ = unicode
def tostring_(s):
return tostring(s, encoding='utf-8').decode('utf-8')
elif sys.version_info[0] == 3: elif sys.version_info[0] == 3:
bytes_ = bytes bytes_ = bytes
str_ = str str_ = str
def tostring_(s):
return tostring(s, encoding='utf-8')

@@ -4,7 +4,6 @@ import logging
import re import re
import sys import sys
from lxml.etree import tostring
from lxml.etree import tounicode from lxml.etree import tounicode
from lxml.html import document_fromstring from lxml.html import document_fromstring
from lxml.html import fragment_fromstring from lxml.html import fragment_fromstring
@@ -15,7 +14,7 @@ from .htmls import build_doc
from .htmls import get_body from .htmls import get_body
from .htmls import get_title from .htmls import get_title
from .htmls import shorten_title from .htmls import shorten_title
from .compat import str_, bytes_ from .compat import str_, bytes_, tostring_
from .debug import describe, text_content from .debug import describe, text_content
@@ -464,7 +463,7 @@ class Document:
# This results in incorrect results in case there is an <img> # This results in incorrect results in case there is an <img>
# buried within an <a> for example # buried within an <a> for example
if not REGEXES["divToPElementsRe"].search( if not REGEXES["divToPElementsRe"].search(
str_(b"".join(map(lambda it: tostring(it, encoding="utf-8"), list(elem)))) str_(b"".join(map(tostring_, list(elem))))
): ):
# log.debug("Altering %s to p" % (describe(elem))) # log.debug("Altering %s to p" % (describe(elem)))
elem.tag = "p" elem.tag = "p"

Loading…
Cancel
Save