pull/50/merge
commit 28923e8a73 by palkeo, 10 years ago

readability/encoding.py
@@ -1,48 +0,0 @@
-import re
-import chardet
-
-
-def get_encoding(page):
-    # Regex for XML and HTML Meta charset declaration
-    charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
-    pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
-    xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
-    declared_encodings = (charset_re.findall(page) +
-                          pragma_re.findall(page) +
-                          xml_re.findall(page))
-
-    # Try any declared encodings
-    if len(declared_encodings) > 0:
-        for declared_encoding in declared_encodings:
-            try:
-                page.decode(custom_decode(declared_encoding))
-                return custom_decode(declared_encoding)
-            except UnicodeDecodeError:
-                pass
-
-    # Fallback to chardet if declared encodings fail
-    text = re.sub('</?[^>]*>\s*', ' ', page)
-    enc = 'utf-8'
-    if not text.strip() or len(text) < 10:
-        return enc  # can't guess
-    res = chardet.detect(text)
-    enc = res['encoding']
-    #print '->', enc, "%.2f" % res['confidence']
-    enc = custom_decode(enc)
-    return enc
-
-
-def custom_decode(encoding):
-    """Overrides encoding when charset declaration
-       or charset determination is a subset of a larger
-       charset. Created because of issues with Chinese websites"""
-    encoding = encoding.lower()
-    alternates = {
-        'big5': 'big5hkscs',
-        'gb2312': 'gb18030',
-        'ascii': 'utf-8',
-        'MacCyrillic': 'cp1251',
-    }
-    if encoding in alternates:
-        return alternates[encoding]
-    else:
-        return encoding
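
Note that the deleted helper only works on byte strings (it calls page.decode(...)), which is why it could not survive the Python 3 port unchanged. A minimal sketch of the same two-step strategy (declared charset first, chardet fallback) written for Python 3, under the hypothetical name guess_encoding:

    import re
    import chardet

    def guess_encoding(page: bytes) -> str:
        # Try charsets declared in the markup first, as the old helper did.
        declared = re.findall(rb'<meta.*?charset=["\']*(.+?)["\'>]', page, flags=re.I)
        for raw in declared:
            try:
                name = raw.decode('ascii').lower()
                page.decode(name)  # cheap validation: does the page decode?
                return name
            except (UnicodeDecodeError, LookupError):
                continue
        # Fall back to statistical detection; chardet may return None.
        return chardet.detect(page)['encoding'] or 'utf-8'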

readability/htmls.py
@@ -1,5 +1,4 @@
-from cleaners import normalize_spaces, clean_attributes
-from encoding import get_encoding
+from .cleaners import normalize_spaces, clean_attributes
 from lxml.html import tostring
 import logging
 import lxml.html
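
Python 3 removed implicit relative imports (PEP 328), so the bare form from cleaners import ... fails once the module lives in a package; the dotted form is mandatory. For code that must still run on Python 2, the same behaviour can be forced there as well:

    # Makes Python 2 resolve bare imports absolutely, matching Python 3,
    # so the explicit relative form below works identically on both.
    from __future__ import absolute_import

    from .cleaners import normalize_spaces, clean_attributes
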
@@ -8,14 +7,8 @@ import re, sys

 utf8_parser = lxml.html.HTMLParser(encoding='utf-8')

 def build_doc(page):
-    if isinstance(page, unicode):
-        enc = None
-        page_unicode = page
-    else:
-        enc = get_encoding(page) or 'utf-8'
-        page_unicode = page.decode(enc, 'replace')
-    doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
-    return doc, enc
+    doc = lxml.html.document_fromstring(page, parser=utf8_parser)
+    return doc

 def js_re(src, pattern, flags, repl):
     return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
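
After this change build_doc no longer sniffs the encoding or returns it: the input goes straight to a parser configured for UTF-8, so callers are now responsible for decoding raw bytes themselves. A sketch of a caller-side wrapper (hypothetical name load_page, encoding assumed known or detected separately):

    def load_page(raw, encoding='utf-8'):
        # Decode explicitly, then re-encode as UTF-8 so the input matches
        # the utf8_parser that build_doc now uses unconditionally.
        text = raw.decode(encoding, 'replace')
        return build_doc(text.encode('utf-8'))
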
@@ -104,7 +97,7 @@ def shorten_title(doc):

 def get_body(doc):
     [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
-    raw_html = unicode(tostring(doc.body or doc))
+    raw_html = tostring(doc.body or doc)
     cleaned = clean_attributes(raw_html)
     try:
         #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
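
One subtlety here: on Python 3 tostring returns bytes by default, whereas the removed unicode(...) wrapper guaranteed text, and clean_attributes applies regular expressions to its argument. Asking lxml for a string result directly keeps the types consistent:

    # encoding='unicode' (the literal string) tells lxml to return str,
    # not bytes, on both Python 2 and 3.
    raw_html = tostring(doc.body or doc, encoding='unicode')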

readability/readability.py
@@ -4,17 +4,16 @@ import re
 import sys

 from collections import defaultdict
-from lxml.etree import tostring
-from lxml.etree import tounicode
+from lxml.etree import tostring, tounicode
 from lxml.html import document_fromstring
 from lxml.html import fragment_fromstring

-from cleaners import clean_attributes
-from cleaners import html_cleaner
-from htmls import build_doc
-from htmls import get_body
-from htmls import get_title
-from htmls import shorten_title
+from .cleaners import clean_attributes
+from .cleaners import html_cleaner
+from .htmls import build_doc
+from .htmls import get_body
+from .htmls import get_title
+from .htmls import shorten_title

 logging.basicConfig(level=logging.INFO)
@@ -110,7 +109,6 @@ class Document:
         self.input = input
         self.options = options
         self.html = None
-        self.encoding = None
         self.positive_keywords = compile_pattern(positive_keywords)
         self.negative_keywords = compile_pattern(negative_keywords)

@@ -120,7 +118,7 @@ class Document:
         return self.html

     def _parse(self, input):
-        doc, self.encoding = build_doc(input)
+        doc = build_doc(input)
         doc = html_cleaner.clean_html(doc)
         base_href = self.options.get('url', None)
         if base_href:
@@ -194,9 +192,9 @@ class Document:
                         continue
                     else:
                         return cleaned_article
-        except StandardError, e:
+        except Exception as e:
            log.exception('error getting summary: ')
-            raise Unparseable(str(e)), None, sys.exc_info()[2]
+            raise Unparseable(str(e))

     def get_article(self, candidates, best_candidate, html_partial=False):
         # Now that we have the top candidate, look through its siblings for
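
except StandardError, e: is Python 2 only twice over: StandardError is gone in Python 3 and the comma form was replaced by as. The three-argument raise it pairs with re-raised Unparseable with the original traceback, which the plain raise Unparseable(str(e)) above silently drops. A sketch of the same block inside Document.summary() using Python 3 exception chaining:

    except Exception as e:
        log.exception('error getting summary: ')
        # 'from e' stores the original exception as __cause__, so its
        # traceback still appears in the error report, much like the old
        # three-argument raise did.
        raise Unparseable(str(e)) from e

raise Unparseable(str(e)).with_traceback(sys.exc_info()[2]) would reproduce the old behaviour even more literally.
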
@@ -387,7 +385,7 @@ class Document:
                 # This results in incorrect results in case there is an <img>
                 # buried within an <a> for example
                 if not REGEXES['divToPElementsRe'].search(
-                        unicode(''.join(map(tostring, list(elem))))):
+                        ''.join(map(tounicode, list(elem)))):
                     #self.debug("Altering %s to p" % (describe(elem)))
                     elem.tag = "p"
                     #print "Fixed element "+describe(elem)
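
tounicode(el) is shorthand for tostring(el, encoding='unicode'): it returns text on both Python 2 and 3, so the joined result stays a string the regex can search. Plain tostring yields bytes on Python 3, and joining bytes with '' (a str) raises TypeError:

    from lxml.html import fragment_fromstring
    from lxml.etree import tostring, tounicode

    el = fragment_fromstring('<div><img src="x.png"/></div>')
    tostring(el)   # bytes on Python 3: b'<div>...</div>'
    tounicode(el)  # text on both:      '<div>...</div>'
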
@@ -599,20 +597,18 @@ def main():
         parser.print_help()
         sys.exit(1)

-    file = None
     if options.url:
-        import urllib
-        file = urllib.urlopen(options.url)
+        import requests
+        data = requests.get(options.url).raw_text
     else:
-        file = open(args[0], 'rt')
-    enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING
+        data = open(args[0], 'rt').read()
     try:
-        print Document(file.read(),
+        print(Document(data,
             debug=options.verbose,
             url=options.url,
             positive_keywords = options.positive_keywords,
             negative_keywords = options.negative_keywords,
-        ).summary().encode(enc, 'replace')
+        ).summary())
     finally:
         file.close()

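
Two rough edges survive in the new branch: requests.Response exposes .text, not .raw_text, so that line raises AttributeError as written, and the finally: file.close() kept as context refers to a variable the new code never assigns. A sketch of the same logic without either problem, assuming requests is installed:

    if options.url:
        import requests
        data = requests.get(options.url).text
    else:
        with open(args[0], 'rt') as f:
            data = f.read()

    print(Document(data,
                   debug=options.verbose,
                   url=options.url,
                   positive_keywords=options.positive_keywords,
                   negative_keywords=options.negative_keywords,
                   ).summary())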

setup.py
@@ -7,7 +7,7 @@ if sys.platform == 'darwin':
     import platform
     mac_ver = platform.mac_ver()[0]
     if mac_ver < '10.9':
-        print "Using lxml<2.4"
+        print("Using lxml<2.4")
         lxml_requirement = "lxml<2.4"

setup(
setup(
