parent
2fab5ffa6b
commit
ae1f1adfff
@ -1,25 +1,62 @@
|
||||
def save_to_file(text, filename):
    """Save *text* to *filename* as a UTF-8 encoded HTML file.

    A ``<meta>`` charset declaration is written first so that browsers
    decode the file correctly.

    :param text: unicode HTML document body
    :param filename: path of the file to (over)write
    """
    # Open in binary mode: we write already-encoded UTF-8 bytes ourselves.
    # (The original used text mode 'wt' while writing encoded bytes, and
    # never closed the file on error.)
    with open(filename, 'wb') as f:
        f.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />'.encode('utf-8'))
        f.write(text.encode('utf-8'))
|
||||
import re
|
||||
|
||||
# Registry mapping an element -> small sequential integer uid.  Module-level
# so the numbering assigned by describe_node() stays stable across calls.
uids = {}
|
||||
# Collapses any run of whitespace into a single space (unicode-aware);
# used by text_content().
# NOTE: a stale, truncated duplicate of describe() and of the module-level
# `uids` dict (old-version diff residue) has been removed here -- the real
# definitions live elsewhere in this file.
RE_COLLAPSE_WHITESPACES = re.compile(r'\s+', re.U)
||||
def open_in_browser(html):
    """
    Open the HTML document in a web browser, saving it to a temporary
    file to open it. Note that this does not delete the file after
    use. This is mainly meant for debugging.

    :param html: unicode HTML document to display
    :returns: the ``file://`` URL of the temporary file
    """
    import os
    import webbrowser
    import tempfile
    handle, fn = tempfile.mkstemp(suffix='.html')
    # The file is opened in binary mode, so everything written must be
    # bytes -- encode the charset declaration too (the original wrote it
    # as a plain string, which fails on a binary file in Python 3).
    f = os.fdopen(handle, 'wb')
    try:
        f.write("<meta charset='UTF-8' />".encode('utf-8'))
        f.write(html.encode('utf-8'))
    finally:
        # we leak the file itself here, but we should at least close it
        f.close()
    url = 'file://' + fn.replace(os.path.sep, '/')
    webbrowser.open(url)
    return url
|
||||
|
||||
|
||||
def describe_node(node):
    """Return a short CSS-selector-like description of *node*.

    Examples: ``p``, ``foo#main``, ``bar.content`` -- a leading ``div``
    is dropped when an id/class follows (``#main`` instead of ``div#main``).
    Bare generic tags (tr/td/div/p) additionally get a per-document
    sequential uid such as ``p{01}`` so repeated nodes can be told apart.

    NOTE: two stale old-version branches (a duplicated id/class line and a
    uid block that referenced an undefined ``depth`` and would have raised
    NameError) were diff residue and are removed.

    :param node: an lxml element (or None, or any non-element object)
    :returns: description string; '' for None
    """
    if node is None:
        return ''
    if not hasattr(node, 'tag'):
        # Not an element (comment, PI, arbitrary object): show its type.
        return "[%s]" % type(node)
    name = node.tag
    if node.get('id', ''):
        name += '#' + node.get('id')
    if node.get('class', ''):
        name += '.' + node.get('class').replace(' ', '.')
    if name[:4] in ['div#', 'div.']:
        # 'div' is the default container tag; the id/class alone is clearer.
        name = name[3:]
    if name in ['tr', 'td', 'div', 'p']:
        # Bare generic tags are ambiguous; append a stable sequential uid.
        uid = uids.get(node)
        if uid is None:
            uid = uids[node] = len(uids) + 1
        name += "{%02d}" % uid
    return name
|
||||
|
||||
|
||||
def describe(node, depth=2):
    """Describe *node* as ``ancestor/.../node``, walking up at most
    *depth* ancestors via ``getparent()``.

    :param node: an lxml element
    :param depth: how many ancestor levels to include (0 = none)
    :returns: slash-separated description string
    """
    prefix = ''
    if depth:
        ancestor = node.getparent()
        if ancestor is not None:
            prefix = describe(ancestor, depth=depth - 1)
    return prefix + '/' + describe_node(node)
|
||||
|
||||
|
||||
def text_content(elem, length=40):
    """Whitespace-collapsed text content of *elem*, truncated to *length*
    characters (a '...' suffix marks truncation).

    :param elem: an lxml element providing ``text_content()``
    :param length: maximum number of characters to keep
    :returns: collapsed (and possibly truncated) text
    """
    raw = elem.text_content().replace('\r', '')
    collapsed = RE_COLLAPSE_WHITESPACES.sub(' ', raw)
    if len(collapsed) >= length:
        return collapsed[:length] + '...'
    return collapsed
|
||||
|
@ -1,48 +1,58 @@
|
||||
import re
|
||||
import chardet
|
||||
import logging
|
||||
|
||||
# Module logger.  NOTE(review): the name misspells 'readability' -- left
# unchanged because external logging configuration may key on this exact
# string; confirm downstream usage before renaming.
log = logging.getLogger('readbility.encoding')
|
||||
|
||||
|
||||
# HTML5-style declaration: <meta charset="..."> (case-insensitive).
RE_CHARSET = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', re.I)
|
||||
# HTTP-equiv pragma: <meta ... content="...;charset=..."> (case-insensitive).
RE_PRAGMA = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', re.I)
|
||||
# XML prolog: <?xml ... encoding="..."?> (anchored at document start).
RE_XML = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
|
||||
|
||||
# Too-narrow charsets mapped to a compatible superset (used by fix_charset).
# Keys MUST be lowercase: fix_charset lowercases its input before lookup --
# the original 'MacCyrillic' key was therefore unreachable.
CHARSETS = {
    'big5': 'big5hkscs',
    'gb2312': 'gb18030',
    'ascii': 'utf-8',
    'maccyrillic': 'cp1251',  # was 'MacCyrillic': never matched after .lower()
}
|
||||
|
||||
|
||||
def fix_charset(encoding):
    """Overrides encoding when charset declaration
    or charset determination is a subset of a larger
    charset. Created because of issues with Chinese websites

    :param encoding: declared/detected encoding name (any case)
    :returns: the replacement encoding, or the lowercased input unchanged
    """
    normalized = encoding.lower()
    return CHARSETS.get(normalized, normalized)
|
||||
|
||||
|
||||
def get_encoding(page):
    """Guess the character encoding of *page* (a raw byte string).

    Encodings declared in the document (meta charset, http-equiv pragma,
    XML prolog) are tried first; the first one that decodes the page
    cleanly wins.  Otherwise chardet is run on the tag-stripped text.

    NOTE: this reconstructs the new version of the function; the stale
    old-version residue (duplicate local regexes and declared-encoding
    loop, a Python 2 ``print`` statement, ``custom_decode`` calls, an
    unreachable early ``return`` in the short-text branch, and a
    duplicate ``enc = res['encoding']`` assignment) is removed.

    :param page: byte string of the document
    :returns: an encoding name; defaults to 'utf-8' when there is too
        little text to guess from
    """
    declared_encodings = (RE_CHARSET.findall(page) +
                          RE_PRAGMA.findall(page) +
                          RE_XML.findall(page))

    log.debug("Document has the following encodings: %s" % declared_encodings)

    # Try declared encodings, if any
    for declared_encoding in declared_encodings:
        encoding = fix_charset(declared_encoding)
        try:
            page.decode(encoding)
            log.info('Using encoding "%s"' % encoding)
            return encoding
        except UnicodeDecodeError:
            log.info('Encoding "%s", specified in the document as "%s" '
                     'didn\'t work' % (encoding, declared_encoding))

    # Fallback to chardet if declared encodings fail.  Strip markup first
    # so the detector only sees real text, not ASCII-heavy tags.
    text = re.sub(r'</?[^>]*>\s*', ' ', page)
    enc = 'utf-8'
    if not text.strip() or len(text) < 10:
        log.debug("Can't guess encoding because text is too short")
        return enc
    res = chardet.detect(text)
    enc = fix_charset(res['encoding'])
    log.info('Trying encoding "%s" guessed '
             'with confidence %.2f' % (enc, res['confidence']))
    return enc
|
||||
|
||||
def custom_decode(encoding):
    """Overrides encoding when charset declaration
    or charset determination is a subset of a larger
    charset. Created because of issues with Chinese websites

    :param encoding: declared/detected encoding name (any case)
    :returns: the replacement encoding, or the lowercased input unchanged
    """
    encoding = encoding.lower()
    # Keys must be lowercase to match the lowercased input; the original
    # 'MacCyrillic' key could never match and cp1251 was never applied.
    alternates = {
        'big5': 'big5hkscs',
        'gb2312': 'gb18030',
        'ascii': 'utf-8',
        'maccyrillic': 'cp1251',
    }
    return alternates.get(encoding, encoding)
|
Loading…
Reference in New Issue