WIP: update to support python2 and python3

9 years ago · 8048160d66
parent 71294f094f
commit 8048160d66
4 changed files with 86 additions and 55 deletions
--- a/readability/cleaners.py
+++ b/readability/cleaners.py
@ -1,15 +1,20 @@
-# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
+# -*- encoding: utf-8 -*-
+
+# strip out a set of nuisance html attributes that can mess up rendering
+# in RSS feeds
+
 import re
 from lxml.html.clean import Cleaner

-bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on*']
+bad_attrs = ['width', 'height', 'style',
+             '[-a-z]*color', 'background[-a-z]*', 'on*']
 single_quoted = "'[^']+'"
 double_quoted = '"[^"]+"'
 non_space = '[^ "\'>]+'
-htmlstrip = re.compile("<" # open
-    "([^>]+) " # prefix
-    "(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
-    '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
+htmlstrip = re.compile("<"  # open
+    "([^>]+) "  # prefix
+    "(?:%s) *" % ('|'.join(bad_attrs),) +  # undesirable attributes
+    '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) +  # value
    "([^>]*)"  # postfix
    ">"        # end
 , re.I)
@ -20,13 +25,15 @@ def clean_attributes(html):
    return html

 def normalize_spaces(s):
-    if not s: return ''
+    if not s:
+        return ''
    """replace any sequence of whitespace
    characters with a single space"""
    return ' '.join(s.split())

 html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
                  style=True, links=True, meta=False, add_nofollow=False,
-                  page_structure=False, processing_instructions=True, embedded=False,
-                  frames=False, forms=False, annoying_tags=False, remove_tags=None,
+                  page_structure=False, processing_instructions=True,
+                  embedded=False, frames=False, forms=False,
+                  annoying_tags=False, remove_tags=None,
                  remove_unknown_tags=False, safe_attrs_only=False)
--- a/readability/encoding.py
+++ b/readability/encoding.py
@ -2,7 +2,7 @@ import re
 import chardet
 import logging

-log = logging.getLogger('readbility.encoding')
+log = logging.getLogger(__name__)


 RE_CHARSET = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', re.I)
--- a/readability/htmls.py
+++ b/readability/htmls.py
@ -5,7 +5,7 @@ import logging
 import lxml.html
 import re

-log = logging.getLogger('readability.htmls')
+log = logging.getLogger(__name__)

 utf8_parser = lxml.html.HTMLParser(encoding='utf-8')

--- a/readability/readability.py
+++ b/readability/readability.py
@ -8,16 +8,17 @@ from lxml.etree import tounicode
 from lxml.html import document_fromstring
 from lxml.html import fragment_fromstring

-from cleaners import clean_attributes
-from cleaners import html_cleaner
-from htmls import build_doc
-from htmls import get_body
-from htmls import get_title
-from htmls import shorten_title
+from .cleaners import clean_attributes
+from .cleaners import html_cleaner
+from .htmls import build_doc
+from .htmls import get_body
+from .htmls import get_title
+from .htmls import shorten_title
 from encoding import get_encoding
 from debug import describe, text_content, open_in_browser

 log = logging.getLogger('readbility.readability')
+# NOTE: StandardError = Exception in python3

 REGEXES = {
    'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
@ -68,7 +69,8 @@ def compile_pattern(elements):
        return None
    if isinstance(elements, regexp_type):
        return elements
-    if isinstance(elements, basestring):
+
+    if isinstance(elements, _basestring):
        elements = elements.split(',')
    return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)

@ -78,7 +80,8 @@ class Document:
    TEXT_LENGTH_THRESHOLD = 25
    RETRY_LENGTH = 250

-    def __init__(self, input, positive_keywords=None, negative_keywords=None, **options):
+    def __init__(self, input, positive_keywords=None, negative_keywords=None,
+                 **options):
        """Generate the document

        :param input: string of the html content.
@ -88,8 +91,11 @@ class Document:
            - min_text_length:
            - retry_length:
            - url: will allow adjusting links to be absolute
-            - positive_keywords: the list of positive search patterns in classes and ids, for example: ["news-item", "block"]
-            - negative_keywords: the list of negative search patterns in classes and ids, for example: ["mysidebar", "related", "ads"]
+            - positive_keywords: the list of positive search patterns in
+                classes and ids, for example: ["news-item", "block"]
+            - negative_keywords: the list of negative
+                search patterns in classes
+                and ids, for example: ["mysidebar", "related", "ads"]
            Also positive_keywords and negative_keywords could be a regexp.
        """
        self.input = input
@ -184,7 +190,7 @@ class Document:
                    continue
                else:
                    return cleaned_article
-        except StandardError, e:
+        except StandardError as e:
            log.exception('error getting summary: ')
            raise Unparseable(str(e)), None, sys.exc_info()[2]

@ -208,7 +214,9 @@ class Document:
            if sibling is best_elem:
                append = True
            sibling_key = sibling  # HashableElement(sibling)
-            if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
+            if sibling_key in candidates and \
+                    candidates[sibling_key]['content_score'] >= \
+                    sibling_score_threshold:
                append = True

            if sibling.tag == "p":
@ -218,30 +226,37 @@ class Document:

                if node_length > 80 and link_density < 0.25:
                    append = True
-                elif node_length <= 80 and link_density == 0 and re.search('\.( |$)', node_content):
+                elif node_length <= 80 \
+                        and link_density == 0 \
+                        and re.search('\.( |$)', node_content):
                    append = True

            if append:
-                # We don't want to append directly to output, but to the div
+                # We don't want to append directly to output, but to the div
                # in html->body->div
                if html_partial:
                    output.append(sibling)
                else:
                    output.getchildren()[0].getchildren()[0].append(sibling)
-        #if output is not None:
-        #    output.append(best_elem)
+        # if output is not None:
+        # output.append(best_elem)
        return output

    def select_best_candidate(self, candidates):
        if not candidates:
            return None

-        sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
+        sorted_candidates = sorted(
+            candidates.values(),
+            key=lambda x: x['content_score'],
+            reverse=True
+        )
+
        for candidate in sorted_candidates[:5]:
            elem = candidate['elem']
-            log.info("Top 5 : %6.3f %s: %s" % (
+            log.info("Top 5 : %6.3f %s" % (
                candidate['content_score'],
-                describe(elem), text_content(elem)))
+                describe(elem)))

        best_candidate = sorted_candidates[0]
        return best_candidate
@ -279,7 +294,8 @@ class Document:
                candidates[parent_node] = self.score_node(parent_node)
                ordered.append(parent_node)

-            if grand_parent_node is not None and grand_parent_node not in candidates:
+            if grand_parent_node is not None and \
+                grand_parent_node not in candidates:
                candidates[grand_parent_node] = self.score_node(
                    grand_parent_node)
                ordered.append(grand_parent_node)
@ -318,16 +334,20 @@ class Document:
                if REGEXES['positiveRe'].search(feature):
                    weight += 25

-                if self.positive_keywords and self.positive_keywords.search(feature):
+                if self.positive_keywords and self.positive_keywords.search(
+                        feature):
                    weight += 25

-                if self.negative_keywords and self.negative_keywords.search(feature):
+                if self.negative_keywords and self.negative_keywords.search(
+                        feature):
                    weight -= 25

-        if self.positive_keywords and self.positive_keywords.match('tag-' + e.tag):
+        if self.positive_keywords and self.positive_keywords.match(
+                'tag-' + e.tag):
            weight += 25

-        if self.negative_keywords and self.negative_keywords.match('tag-' + e.tag):
+        if self.negative_keywords and self.negative_keywords.match(
+                'tag-' + e.tag):
            weight -= 25

        return weight
@ -365,15 +385,15 @@ class Document:
        for elem in self.tags(self.html, 'div'):
            # transform <div>s that do not contain other block elements into
            # <p>s
-            #FIXME: The current implementation ignores all descendants that
+            # FIXME: The current implementation ignores all descendants that
            # are not direct children of elem
            # This results in incorrect results in case there is an <img>
            # buried within an <a> for example
            if not REGEXES['divToPElementsRe'].search(
                    unicode(''.join(map(tostring, list(elem))))):
-                #self.debug("Altering %s to p" % describe(elem))
+                # self.debug("Altering %s to p" % describe(elem))
                elem.tag = "p"
-                #self.debug("Fixed element "+describe(elem))
+                # self.debug("Fixed element "+describe(elem))

        for elem in self.tags(self.html, 'div'):
            if elem.text and elem.text.strip():
@ -381,7 +401,7 @@ class Document:
                p.text = elem.text
                elem.text = None
                elem.insert(0, p)
-                #print "Appended "+tounicode(p)+" to "+describe(elem)
+                # print "Appended "+tounicode(p)+" to "+describe(elem)

            for pos, child in reversed(list(enumerate(elem))):
                if child.tail and child.tail.strip():
@ -389,9 +409,9 @@ class Document:
                    p.text = child.tail
                    child.tail = None
                    elem.insert(pos + 1, p)
-                    #print "Inserted "+tounicode(p)+" to "+describe(elem)
+                    # print "Inserted "+tounicode(p)+" to "+describe(elem)
                if child.tag == 'br':
-                    #print 'Dropped <br> at '+describe(elem)
+                    # print 'Dropped <br> at '+describe(elem)
                    child.drop_tree()

    def tags(self, node, *tag_names):
@ -407,7 +427,8 @@ class Document:
    def sanitize(self, node, candidates):
        MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
        for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
-            if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
+            if self.class_weight(header) < 0 or \
+                    self.get_link_density(header) > 0.33:
                header.drop_tree()

        for elem in self.tags(node, "form", "iframe", "textarea"):
@ -421,7 +442,7 @@ class Document:
            weight = self.class_weight(el)
            if el in candidates:
                content_score = candidates[el]['content_score']
-                #print '!',el, '-> %6.3f' % content_score
+                # print '!',el, '-> %6.3f' % content_score
            else:
                content_score = 0
            tag = el.tag
@ -443,24 +464,26 @@ class Document:
                parent_node = el.getparent()
                if parent_node is not None:
                    if parent_node in candidates:
-                        content_score = candidates[parent_node]['content_score']
+                        content_score = candidates[
+                            parent_node]['content_score']
                    else:
                        content_score = 0
-                #if parent_node is not None:
-                    #pweight = self.class_weight(parent_node) + content_score
-                    #pname = describe(parent_node)
-                #else:
-                    #pweight = 0
-                    #pname = "no parent"
+                # if parent_node is not None:
+                    # pweight = self.class_weight(parent_node) + content_score
+                    # pname = describe(parent_node)
+                # else:
+                    # pweight = 0
+                    # pname = "no parent"
                to_remove = False
                reason = ""

-                #if el.tag == 'div' and counts["img"] >= 1:
-                #    continue
+                # if el.tag == 'div' and counts["img"] >= 1:
+                # continue
                if content_length and counts["img"] * 100 >= content_length:
                    reason = "too many images (%s) for text " % counts["img"]
                    to_remove = True
-                elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
+                elif counts["li"] > counts["p"] \
+                        and tag != "ul" and tag != "ol":
                    reason = "more <li>s than <p>s"
                    to_remove = True
                elif counts["input"] > (counts["p"] / 3):
@ -544,7 +567,7 @@ class Document:

        for el in ([node] + [n for n in node.iter()]):
            if not self.options.get('attributes', None):
-                #el.attrib = {} #FIXME:Checkout the effects of disabling this
+                # el.attrib = {} #FIXME:Checkout the effects of disabling this
                pass

        self.html = node
@ -612,7 +635,8 @@ def main():
        file = urllib.urlopen(options.url)
    else:
        file = open(args[0], 'rt')
-    output_encoding = sys.__stdout__.encoding or 'utf-8'  # XXX: a hack, better set PYTHONIOENCODING explicitly
+    output_encoding = sys.__stdout__.encoding or 'utf-8'
+    # XXX: a hack, better set PYTHONIOENCODING explicitly
    html = file.read()  # bytes object
    encoding = get_encoding(html)
    html = html.decode(encoding)