Minor fix in encoding guessing. Claiming it v0.3.0.1

pull/44/head
Yuri Baburov 11 years ago
parent 08658d1d31
commit 318f25c577

@ -11,20 +11,10 @@ def build_doc(page):
if isinstance(page, unicode):
page_unicode = page
else:
enc = get_encoding(page)
if enc:
page_unicode = page.decode(enc, 'replace')
encoding = enc
else:
try:
#try utf-8
page_unicode = page.decode('utf-8', 'strict')
encoding = 'utf-8'
except UnicodeDecodeError:
page_unicode = page.decode('utf-8', 'replace')
encoding = 'utf-8'
enc = get_encoding(page) or 'utf-8'
page_unicode = page.decode(enc, 'replace')
doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
return doc, encoding
return doc, enc
def js_re(src, pattern, flags, repl):
    """Emulate JavaScript's ``src.replace(/pattern/flags, repl)``.

    Arguments:
        src: the text to search and rewrite.
        pattern: regular expression source, compiled with ``re.compile``.
        flags: ``re`` module flags passed to ``re.compile`` (0 for none).
        repl: replacement template using JavaScript-style ``$1`` group
            references.

    Returns ``src`` with every match of ``pattern`` replaced.
    """
    # Every '$' becomes '\' so that JS-style "$1" reads as Python's "\1".
    py_repl = repl.replace('$', '\\')
    # Pattern.sub takes (repl, string): the replacement template comes first
    # and the text being searched second. The earlier code had them swapped,
    # so it searched the template instead of ``src``.
    return re.compile(pattern, flags).sub(py_repl, src)

@ -9,7 +9,7 @@ else:
setup(
name="readability-lxml",
version="0.3",
version="0.3.0.1",
author="Yuri Baburov",
author_email="burchik@gmail.com",
description="fast python port of arc90's readability tool",

Loading…
Cancel
Save