refactoring

9 years ago · 950366ac71
parent 6f1b969f21
commit 950366ac71
4 changed files with 35 additions and 34 deletions
--- a/readability/cleaners.py
+++ b/readability/cleaners.py
@@ -19,10 +19,10 @@ htmlstrip = re.compile("<"  # open
                       "(?:%s) *" % ('|'.join(bad_attrs),) +
                       # value
                       '= *(?:%s|%s|%s)' % (
-                       non_space, single_quoted, double_quoted) +
+                           non_space, single_quoted, double_quoted) +
                       "([^>]*)"  # postfix
-                       ">"  # end
-                       , re.I)
+                       ">",  # end
+                       re.I)


 def clean_attributes(html):
--- a/readability/debug.py
+++ b/readability/debug.py
@@ -4,7 +4,8 @@
 def save_to_file(text, filename):
    f = open(filename, 'wt')
    f.write(
-        '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
+        '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />'
+    )
    f.write(text.encode('utf-8'))
    f.close()

@@ -23,7 +24,7 @@ def describe(node, depth=2):
    if name[:4] in ['div#', 'div.']:
        name = name[3:]
    if name in ['tr', 'td', 'div', 'p']:
-        if not node in uids:
+        if node not in uids:
            uid = uids[node] = len(uids) + 1
        else:
            uid = uids.get(node)
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -124,5 +124,4 @@ def get_body(doc):
        # BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
        return cleaned
    except Exception:  # FIXME find the equivalent lxml error
-        # logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
        return raw_html
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -21,17 +21,31 @@ from htmls import shorten_title
 logging.basicConfig(level=logging.INFO)
 log = logging.getLogger()

+unlikelyCandidatesReList = ['combx', 'comment', 'community', 'disqus', 'extra',
+                            'foot', 'header', 'menu',
+                            'remark', 'rss', 'shoutbox', 'sidebar', 'sponsor',
+                            'ad-break', 'agegate',
+                            'pagination', 'pager', 'popup', 'tweet', 'twitter']
+
+positiveReList = ['article', 'body', 'content', 'entry', 'hentry', 'main',
+                  'page', 'pagination', 'post', 'text', 'blog', 'story']
+
+negativeReList = [
+    'combx', 'comment', 'com-', 'contact', 'foot', 'footer', 'footnote',
+    'masthead', 'media', 'meta', 'outbrain', 'promo', 'related', 'scroll',
+    'shoutbox', 'sidebar', 'sponsor', 'shopping', 'tags', 'tool', 'widget', ]
+
 REGEXES = {
    'unlikelyCandidatesRe': re.compile(
-        'combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter',
+        '|'.join(unlikelyCandidatesReList),
        re.I),
    'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow',
                                         re.I),
    'positiveRe': re.compile(
-        'article|body|content|entry|hentry|main|page|pagination|post|text|blog|story',
+        '|'.join(positiveReList),
        re.I),
    'negativeRe': re.compile(
-        'combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget',
+        '|'.join(negativeReList),
        re.I),
    'divToPElementsRe': re.compile(
        '<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
@@ -116,8 +130,10 @@ class Document:
            - min_text_length:
            - retry_length:
            - url: will allow adjusting links to be absolute
-            - positive_keywords: the list of positive search patterns in classes and ids, for example: ["news-item", "block"]
-            - negative_keywords: the list of negative search patterns in classes and ids, for example: ["mysidebar", "related", "ads"]
+            - positive_keywords: the list of positive search patterns in classes
+                and ids, for example: ["news-item", "block"]
+            - negative_keywords: the list of negative search patterns in classes
+                and ids, for example: ["mysidebar", "related", "ads"]
            Also positive_keywords and negative_keywords could be a regexp.
        """
        self.input = input
@@ -307,7 +323,8 @@ class Document:
                candidates[parent_node] = self.score_node(parent_node)
                ordered.append(parent_node)

-            if grand_parent_node is not None and grand_parent_node not in candidates:
+            if grand_parent_node is not None and \
+                            grand_parent_node not in candidates:
                candidates[grand_parent_node] = self.score_node(
                    grand_parent_node)
                ordered.append(grand_parent_node)
@@ -488,7 +505,8 @@ class Document:
                    else:
                        content_score = 0
                        # if parent_node is not None:
-                        # pweight = self.class_weight(parent_node) + content_score
+                        # pweight = self.class_weight(parent_node) +
+                        # + content_score
                        # pname = describe(parent_node)
                        # else:
                        # pweight = 0
@@ -509,7 +527,8 @@ class Document:
                    to_remove = True
                elif content_length < (MIN_LEN) and (
                                counts["img"] == 0 or counts["img"] > 2):
-                    reason = "too short content length %s without a single image" % content_length
+                    reason = "too short content length %s " \
+                             "without a single image" % content_length
                    to_remove = True
                elif weight < 25 and link_density > 0.2:
                    reason = "too many links %.3f for its weight %s" % (
@@ -521,27 +540,9 @@ class Document:
                    to_remove = True
                elif (counts["embed"] == 1 and content_length < 75) or counts[
                    "embed"] > 1:
-                    reason = "<embed>s with too short content length, or too many <embed>s"
+                    reason = "<embed>s with too short" \
+                             " content length, or too many <embed>s"
                    to_remove = True
-                    # if el.tag == 'div' and counts['img'] >= 1 and to_remove:
-                    # imgs = el.findall('.//img')
-                    #                    valid_img = False
-                    #                    self.debug(tounicode(el))
-                    #                    for img in imgs:
-                    #
-                    #                        height = img.get('height')
-                    #                        text_length = img.get('text_length')
-                    #                        self.debug ("height %s text_length %s" %(repr(height), repr(text_length)))
-                    #                        if to_int(height) >= 100 or to_int(text_length) >= 100:
-                    #                            valid_img = True
-                    #                            self.debug("valid image" + tounicode(img))
-                    #                            break
-                    #                    if valid_img:
-                    #                        to_remove = False
-                    #                        self.debug("Allowing %s" %el.text_content())
-                    #                        for desnode in self.tags(el, "table", "ul", "div"):
-                    #                            allowed[desnode] = True
-
                    # find x non empty preceding and succeeding siblings
                    i, j = 0, 0
                    x = 1