pull/50/merge
commit 28923e8a73 by palkeo, 10 years ago

readability/encoding.py
@@ -1,48 +0,0 @@
-import re
-import chardet
-
-
-def get_encoding(page):
-    # Regex for XML and HTML Meta charset declaration
-    charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
-    pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
-    xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
-    declared_encodings = (charset_re.findall(page) +
-                          pragma_re.findall(page) +
-                          xml_re.findall(page))
-
-    # Try any declared encodings
-    if len(declared_encodings) > 0:
-        for declared_encoding in declared_encodings:
-            try:
-                page.decode(custom_decode(declared_encoding))
-                return custom_decode(declared_encoding)
-            except UnicodeDecodeError:
-                pass
-
-    # Fallback to chardet if declared encodings fail
-    text = re.sub('</?[^>]*>\s*', ' ', page)
-    enc = 'utf-8'
-    if not text.strip() or len(text) < 10:
-        return enc  # can't guess
-    res = chardet.detect(text)
-    enc = res['encoding']
-    #print '->', enc, "%.2f" % res['confidence']
-    enc = custom_decode(enc)
-    return enc
-
-
-def custom_decode(encoding):
-    """Overrides encoding when charset declaration
-       or charset determination is a subset of a larger
-       charset. Created because of issues with Chinese websites"""
-    encoding = encoding.lower()
-    alternates = {
-        'big5': 'big5hkscs',
-        'gb2312': 'gb18030',
-        'ascii': 'utf-8',
-        'MacCyrillic': 'cp1251',
-    }
-    if encoding in alternates:
-        return alternates[encoding]
-    else:
-        return encoding
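
Note that the deleted helper only works on byte strings (it calls page.decode(...)), which is why it could not survive the Python 3 port unchanged. A minimal sketch of the same two-step strategy (declared charset first, chardet fallback) written for Python 3, under the hypothetical name guess_encoding:

    import re
    import chardet

    def guess_encoding(page: bytes) -> str:
        # Try charsets declared in the markup first, as the old helper did.
        declared = re.findall(rb'<meta.*?charset=["\']*(.+?)["\'>]', page, flags=re.I)
        for raw in declared:
            try:
                name = raw.decode('ascii').lower()
                page.decode(name)  # cheap validation: does the page decode?
                return name
            except (UnicodeDecodeError, LookupError):
                continue
        # Fall back to statistical detection; chardet may return None.
        return chardet.detect(page)['encoding'] or 'utf-8'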

readability/htmls.py
@@ -1,5 +1,4 @@
-from cleaners import normalize_spaces, clean_attributes
-from encoding import get_encoding
+from .cleaners import normalize_spaces, clean_attributes
 from lxml.html import tostring
 import logging
 import lxml.html
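
Python 3 removed implicit relative imports (PEP 328), so the bare form from cleaners import ... fails once the module lives in a package; the dotted form is mandatory. For code that must still run on Python 2, the same behaviour can be forced there as well:

    # Makes Python 2 resolve bare imports absolutely, matching Python 3,
    # so the explicit relative form below works identically on both.
    from __future__ import absolute_import

    from .cleaners import normalize_spaces, clean_attributes
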
@@ -8,14 +7,8 @@ import re, sys

 utf8_parser = lxml.html.HTMLParser(encoding='utf-8')

 def build_doc(page):
-    if isinstance(page, unicode):
-        enc = None
-        page_unicode = page
-    else:
-        enc = get_encoding(page) or 'utf-8'
-        page_unicode = page.decode(enc, 'replace')
-    doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
-    return doc, enc
+    doc = lxml.html.document_fromstring(page, parser=utf8_parser)
+    return doc

 def js_re(src, pattern, flags, repl):
     return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
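
After this change build_doc no longer sniffs the encoding or returns it: the input goes straight to a parser configured for UTF-8, so callers are now responsible for decoding raw bytes themselves. A sketch of a caller-side wrapper (hypothetical name load_page, encoding assumed known or detected separately):

    def load_page(raw, encoding='utf-8'):
        # Decode explicitly, then re-encode as UTF-8 so the input matches
        # the utf8_parser that build_doc now uses unconditionally.
        text = raw.decode(encoding, 'replace')
        return build_doc(text.encode('utf-8'))
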
@@ -104,7 +97,7 @@ def shorten_title(doc):

 def get_body(doc):
     [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
-    raw_html = unicode(tostring(doc.body or doc))
+    raw_html = tostring(doc.body or doc)
     cleaned = clean_attributes(raw_html)
     try:
         #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
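
One subtlety here: on Python 3 tostring returns bytes by default, whereas the removed unicode(...) wrapper guaranteed text, and clean_attributes applies regular expressions to its argument. Asking lxml for a string result directly keeps the types consistent:

    # encoding='unicode' (the literal string) tells lxml to return str,
    # not bytes, on both Python 2 and 3.
    raw_html = tostring(doc.body or doc, encoding='unicode')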

readability/readability.py
@@ -4,17 +4,16 @@ import re
 import sys

 from collections import defaultdict
-from lxml.etree import tostring
-from lxml.etree import tounicode
+from lxml.etree import tostring, tounicode
 from lxml.html import document_fromstring
 from lxml.html import fragment_fromstring

-from cleaners import clean_attributes
-from cleaners import html_cleaner
-from htmls import build_doc
-from htmls import get_body
-from htmls import get_title
-from htmls import shorten_title
+from .cleaners import clean_attributes
+from .cleaners import html_cleaner
+from .htmls import build_doc
+from .htmls import get_body
+from .htmls import get_title
+from .htmls import shorten_title

 logging.basicConfig(level=logging.INFO)
@@ -110,7 +109,6 @@ class Document:
         self.input = input
         self.options = options
         self.html = None
-        self.encoding = None
         self.positive_keywords = compile_pattern(positive_keywords)
         self.negative_keywords = compile_pattern(negative_keywords)

@@ -120,7 +118,7 @@ class Document:
         return self.html

     def _parse(self, input):
-        doc, self.encoding = build_doc(input)
+        doc = build_doc(input)
         doc = html_cleaner.clean_html(doc)
         base_href = self.options.get('url', None)
         if base_href:
@@ -194,9 +192,9 @@ class Document:
                         continue
                     else:
                         return cleaned_article
-        except StandardError, e:
+        except Exception as e:
            log.exception('error getting summary: ')
-            raise Unparseable(str(e)), None, sys.exc_info()[2]
+            raise Unparseable(str(e))

     def get_article(self, candidates, best_candidate, html_partial=False):
         # Now that we have the top candidate, look through its siblings for
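
except StandardError, e: is Python 2 only twice over: StandardError is gone in Python 3 and the comma form was replaced by as. The three-argument raise it pairs with re-raised Unparseable with the original traceback, which the plain raise Unparseable(str(e)) above silently drops. A sketch of the same block inside Document.summary() using Python 3 exception chaining:

    except Exception as e:
        log.exception('error getting summary: ')
        # 'from e' stores the original exception as __cause__, so its
        # traceback still appears in the error report, much like the old
        # three-argument raise did.
        raise Unparseable(str(e)) from e

raise Unparseable(str(e)).with_traceback(sys.exc_info()[2]) would reproduce the old behaviour even more literally.
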
@@ -387,7 +385,7 @@ class Document:
                 # This results in incorrect results in case there is an <img>
                 # buried within an <a> for example
                 if not REGEXES['divToPElementsRe'].search(
-                        unicode(''.join(map(tostring, list(elem))))):
+                        ''.join(map(tounicode, list(elem)))):
                     #self.debug("Altering %s to p" % (describe(elem)))
                     elem.tag = "p"
                     #print "Fixed element "+describe(elem)
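
tounicode(el) is shorthand for tostring(el, encoding='unicode'): it returns text on both Python 2 and 3, so the joined result stays a string the regex can search. Plain tostring yields bytes on Python 3, and joining bytes with '' (a str) raises TypeError:

    from lxml.html import fragment_fromstring
    from lxml.etree import tostring, tounicode

    el = fragment_fromstring('<div><img src="x.png"/></div>')
    tostring(el)   # bytes on Python 3: b'<div>...</div>'
    tounicode(el)  # text on both:      '<div>...</div>'
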
@@ -599,20 +597,18 @@ def main():
         parser.print_help()
         sys.exit(1)

-    file = None
     if options.url:
-        import urllib
-        file = urllib.urlopen(options.url)
+        import requests
+        data = requests.get(options.url).raw_text
     else:
-        file = open(args[0], 'rt')
-    enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING
+        data = open(args[0], 'rt').read()
     try:
-        print Document(file.read(),
+        print(Document(data,
             debug=options.verbose,
             url=options.url,
             positive_keywords = options.positive_keywords,
             negative_keywords = options.negative_keywords,
-        ).summary().encode(enc, 'replace')
+        ).summary())
     finally:
         file.close()

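
Two rough edges survive in the new branch: requests.Response exposes .text, not .raw_text, so that line raises AttributeError as written, and the finally: file.close() kept as context refers to a variable the new code never assigns. A sketch of the same logic without either problem, assuming requests is installed:

    if options.url:
        import requests
        data = requests.get(options.url).text
    else:
        with open(args[0], 'rt') as f:
            data = f.read()

    print(Document(data,
                   debug=options.verbose,
                   url=options.url,
                   positive_keywords=options.positive_keywords,
                   negative_keywords=options.negative_keywords,
                   ).summary())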

setup.py
@@ -7,7 +7,7 @@ if sys.platform == 'darwin':
     import platform
     mac_ver = platform.mac_ver()[0]
     if mac_ver < '10.9':
-        print "Using lxml<2.4"
+        print("Using lxml<2.4")
         lxml_requirement = "lxml<2.4"

setup(
setup(
