PEP8 again ...

12 years ago · 99efa5c10b
parent a012fd2362
commit 99efa5c10b
1 changed files with 41 additions and 19 deletions
--- a/src/readability_lxml/readability.py
+++ b/src/readability_lxml/readability.py
@@ -108,6 +108,7 @@ def tags(node, *tag_names):
        for e in node.findall('.//%s' % tag_name):
            yield e

+
 def class_weight(e):
    weight = 0
    if e.get('class', None):
@@ -126,6 +127,7 @@ def class_weight(e):

    return weight

+
 def score_node(elem):
    content_score = class_weight(elem)
    name = elem.tag.lower()
@@ -146,7 +148,8 @@ def score_node(elem):
 def transform_misused_divs_into_paragraphs(doc):
    for elem in tags(doc, 'div'):
        # transform <div>s that do not contain other block elements into <p>s
-        if not REGEXES['divToPElementsRe'].search(unicode(''.join(map(tostring, list(elem))))):
+        if not REGEXES['divToPElementsRe'].search(
+            unicode(''.join(map(tostring, list(elem))))):
            logging.debug("Altering %s to p" % (describe(elem)))
            elem.tag = "p"
            #print "Fixed element "+describe(elem)
@@ -166,12 +169,15 @@ def transform_misused_divs_into_paragraphs(doc):
                p.text = child.tail
                child.tail = None
                elem.insert(pos + 1, p)
-                logging.debug("Inserted %s to %s" % (tounicode(p), describe(elem)))
+                logging.debug("Inserted %s to %s" % (
+                    tounicode(p),
+                    describe(elem)))
                #print "Inserted "+tounicode(p)+" to "+describe(elem)
            if child.tag == 'br':
                #print 'Dropped <br> at '+describe(elem)
                child.drop_tree()

+
 def remove_unlikely_candidates(doc):
    for elem in doc.iter():
        s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
@@ -184,6 +190,7 @@ def remove_unlikely_candidates(doc):
            logging.debug("Removing unlikely candidate - %s" % describe(elem))
            elem.drop_tree()

+
 def get_link_density(elem):
    link_length = 0
    for i in elem.findall(".//a"):
@@ -232,17 +239,23 @@ def score_paragraphs(doc, min_text_len):
        if grand_parent_node is not None:
            candidates[grand_parent_node]['content_score'] += content_score / 2.0

-    # Scale the final candidates score based on link density. Good content should have a
-    # relatively small link density (5% or less) and be mostly unaffected by this operation.
+    # Scale the final candidates score based on link density. Good content
+    # should have a relatively small link density (5% or less) and be mostly
+    # unaffected by this operation.
    for elem in ordered:
        candidate = candidates[elem]
        ld = get_link_density(elem)
        score = candidate['content_score']
-        logging.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (score, describe(elem), ld, score*(1-ld)))
+        logging.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (
+            score,
+            describe(elem),
+            ld,
+            score * (1 - ld)))
        candidate['content_score'] *= (1 - ld)

    return candidates

+
 def select_best_candidate(candidates):
    sorted_candidates = sorted(candidates.values(),
        key=lambda x: x['content_score'],
@@ -266,6 +279,7 @@ def reverse_tags(node, *tag_names):
        for e in reversed(node.findall('.//%s' % tag_name)):
            yield e

+
 def sanitize(node, candidates, min_text_len):
    for header in tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
        if class_weight(header) < 0 or get_link_density(header) > 0.33:
@@ -293,10 +307,11 @@ def sanitize(node, candidates, min_text_len):
        elif el.text_content().count(",") < 10:
            counts = {}
            for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
-                counts[kind] = len(el.findall('.//%s' %kind))
+                counts[kind] = len(el.findall('.//%s' % kind))
            counts["li"] -= 100

-            content_length = text_length(el) # Count the text length excluding any surrounding whitespace
+            # Count the text length excluding any surrounding whitespace
+            content_length = text_length(el)
            link_density = get_link_density(el)
            parent_node = el.getparent()
            if parent_node is not None:
@@ -347,13 +362,13 @@ def sanitize(node, candidates, min_text_len):

                #find x non empty preceding and succeeding siblings
                i, j = 0, 0
-                x  = 1
+                x = 1
                siblings = []
                for sib in el.itersiblings():
                    #logging.debug(sib.text_content())
                    sib_content_length = text_length(sib)
                    if sib_content_length:
-                        i =+ 1
+                        i += 1
                        siblings.append(sib_content_length)
                        if i == x:
                            break
@@ -361,12 +376,12 @@ def sanitize(node, candidates, min_text_len):
                    #logging.debug(sib.text_content())
                    sib_content_length = text_length(sib)
                    if sib_content_length:
-                        j =+ 1
+                        j += 1
                        siblings.append(sib_content_length)
                        if j == x:
                            break
                #logging.debug(str(siblings))
-                if siblings and sum(siblings) > 1000 :
+                if siblings and sum(siblings) > 1000:
                    to_remove = False
                    logging.debug("Allowing %s" % describe(el))
                    for desnode in tags(el, "table", "ul", "div"):
@@ -388,9 +403,9 @@ def sanitize(node, candidates, min_text_len):


 def get_raw_article(candidates, best_candidate, enclose_with_html_tag=True):
-    # Now that we have the top candidate, look through its siblings for content that might also be related.
-    # Things like preambles, content split by ads that we removed, etc.
-
+    # Now that we have the top candidate, look through its siblings for
+    # content that might also be related. Things like preambles, content
+    # split by ads that we removed, etc.
    sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
    if enclose_with_html_tag:
        output = document_fromstring('<div/>')
@@ -398,11 +413,12 @@ def get_raw_article(candidates, best_candidate, enclose_with_html_tag=True):
        output = fragment_fromstring('<div/>')
    best_elem = best_candidate['elem']
    for sibling in best_elem.getparent().getchildren():
-        #if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text
+        #if isinstance(sibling, NavigableString): continue#in lxml there no
+        # concept of simple text
        append = False
        if sibling is best_elem:
            append = True
-        sibling_key = sibling #HashableElement(sibling)
+        sibling_key = sibling  # HashableElement(sibling)

        # Print out sibling information for debugging.
        if sibling_key in candidates:
@@ -476,7 +492,7 @@ def get_article(doc, min_text_len, retry_len, enclose_with_html_tag=True):
            of_acceptable_length = len(cleaned_article or '') >= retry_len
            if ruthless and not of_acceptable_length:
                ruthless = False
-                continue # try again
+                continue  # try again
            else:
                return Summary(confidence=confidence,
                    html=cleaned_article,
@@ -484,8 +500,7 @@ def get_article(doc, min_text_len, retry_len, enclose_with_html_tag=True):
                    title=get_title(doc))

    except StandardError as e:
-        #logging.exception('error getting summary: ' + str(traceback.format_exception(*sys.exc_info())))
-        logging.exception('error getting summary: ' )
+        logging.exception('error getting summary: ')
        raise Unparseable(str(e)), None, sys.exc_info()[2]


@@ -533,6 +548,7 @@ def clean_segment_number(segments, index, segment):
    else:
        return segment

+
 def clean_segment_index(segments, index, segment):
    if index == (len(segments) - 1) and segment.lower() == 'index':
        return None
@@ -555,6 +571,7 @@ def clean_segment_short(segments, index, segment):
    else:
        return segment

+
 def clean_segment(segments, index, segment):
    """
    Cleans a single segment of a URL to find the base URL.  The base URL is as
@@ -613,6 +630,7 @@ class CandidatePage():
        self.href = href
        self.score = 0

+
 def same_domain(lhs, rhs):
    split_lhs = urlparse.urlsplit(lhs)
    split_rhs = urlparse.urlsplit(rhs)
@@ -625,6 +643,7 @@ def same_domain(lhs, rhs):
 def strip_trailing_slash(s):
    return re.sub(r'/$', '', s)

+
 def eval_href(parsed_urls, url, base_url, link):
    raw_href = link.get('href')
    if raw_href is None:
@@ -644,6 +663,7 @@ def eval_href(parsed_urls, url, base_url, link):

    return raw_href, href, True

+
 def eval_link_text(link):
    link_text = clean(link.text_content() or '')
    if REGEXES['extraneous'].search(link_text) or len(link_text) > 25:
@@ -651,6 +671,7 @@ def eval_link_text(link):
    else:
        return link_text, True

+
 def find_or_create_page(candidates, href, link_text):
    '''
    Finds or creates a candidate page object for a next-page href.  If one
@@ -666,6 +687,7 @@ def find_or_create_page(candidates, href, link_text):
        candidates[href] = candidate
        return candidate, True

+
 def eval_possible_next_page_link(
            parsed_urls, url, base_url, candidates, link):