Sorted out unicode issues, thanks to Lee Semel.

pull/9/head
Yuri Baburov 13 years ago
parent 45781a600f
commit c2ec1d1c38

@@ -3,17 +3,15 @@ import chardet
def get_encoding(page):
text = re.sub('</?[^>]*>\s*', ' ', page)
enc = 'utf-8'
if not text.strip() or len(text) < 10:
return 'ascii'
return enc # can't guess
try:
enc = 'utf-8'
diff = text.decode(enc, 'ignore').encode(enc)
sizes = len(diff), len(text)
if abs(len(text) - len(diff)) < max(sizes) * 0.01:
#print '->', enc, '100%'
if abs(len(text) - len(diff)) < max(sizes) * 0.01: # 99% of utf-8
return enc
except UnicodeDecodeError:
#import traceback;traceback.print_exc()
pass
res = chardet.detect(text)
enc = res['encoding']

@@ -10,12 +10,12 @@ logging.getLogger().setLevel(logging.DEBUG)
utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
def build_doc(page):
if type(page) != unicode:
enc = get_encoding(page)
page_enc = page.decode(enc, 'replace')
if isinstance(page, unicode):
page_unicode = page
else:
page_enc = page
doc = lxml.html.document_fromstring(page_enc.encode('utf-8'), parser=utf8_parser)
enc = get_encoding(page)
page_unicode = page.decode(enc, 'replace')
doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
return doc
def js_re(src, pattern, flags, repl):

@@ -496,8 +496,9 @@ def main():
file = urllib.urlopen(options.url)
else:
file = open(args[0])
enc = sys.stdout.encoding or 'utf-8'
try:
print Document(file.read(), debug=options.verbose).summary().encode('ascii','ignore')
print Document(file.read(), debug=options.verbose).summary().encode(enc, 'replace')
finally:
file.close()

Loading…
Cancel
Save