Merge pull request #147 from anekos/fix/UnicodeDecodeError-on-python2

Fix UnicodeDecodeError on python2
pull/148/head
Yuri Baburov 4 years ago committed by GitHub
commit 1e3b8504bb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -5,11 +5,16 @@ It mainly exists because there are certain incompatibilities in the Python
syntax that can only be solved by conditionally importing different functions.
"""
import sys
from lxml.etree import tostring
# Pick the byte/text type aliases and the tostring_ helper according to the
# interpreter's major version. Nothing is defined here if the major version
# is neither 2 nor 3.
if sys.version_info[0] == 2:
# Python 2 branch: bytes_ aliases str and str_ aliases unicode.
bytes_ = str
str_ = unicode
def tostring_(s):
# Serialize s via lxml's tostring with encoding='utf-8', then decode the
# result from UTF-8 so this branch returns decoded text.
return tostring(s, encoding='utf-8').decode('utf-8')
elif sys.version_info[0] == 3:
# Python 3 branch: bytes_ aliases bytes and str_ aliases str.
bytes_ = bytes
str_ = str
def tostring_(s):
# Return lxml's encoding='utf-8' serialization of s as-is; unlike the
# Python 2 branch there is no decode step here.
# NOTE(review): presumably this yields bytes, which the b"".join(...) call
# site relies on -- confirm against lxml's documentation.
return tostring(s, encoding='utf-8')

@ -4,7 +4,6 @@ import logging
import re
import sys
from lxml.etree import tostring
from lxml.etree import tounicode
from lxml.html import document_fromstring
from lxml.html import fragment_fromstring
@ -15,7 +14,7 @@ from .htmls import build_doc
from .htmls import get_body
from .htmls import get_title
from .htmls import shorten_title
from .compat import str_, bytes_
from .compat import str_, bytes_, tostring_
from .debug import describe, text_content
@ -464,7 +463,7 @@ class Document:
# This results in incorrect results in case there is an <img>
# buried within an <a> for example
if not REGEXES["divToPElementsRe"].search(
str_(b"".join(map(lambda it: tostring(it, encoding="utf-8"), list(elem))))
str_(b"".join(map(tostring_, list(elem))))
):
# log.debug("Altering %s to p" % (describe(elem)))
elem.tag = "p"

Loading…
Cancel
Save