Merge pull request #135 from adbar/master

Unnecessary imports removed; blank lines added for conformity and readability; parts of the code linted.
4 years ago · 07f6861ece
parent 17ffad5a26 bd8293eb63
commit 07f6861ece
4 changed files with 22 additions and 14 deletions
--- a/readability/cleaners.py
+++ b/readability/cleaners.py
@ -14,11 +14,13 @@ htmlstrip = re.compile("<" # open
    ">"        # end
 , re.I)

+
 def clean_attributes(html):
    """Return *html* with every ``htmlstrip`` match rewritten as ``<group1group2>``.

    The substitution is repeated until the module-level ``htmlstrip``
    pattern no longer matches anywhere in the string, because a single
    pass may leave further matches behind.
    """
    cleaned = html
    while True:
        if htmlstrip.search(cleaned) is None:
            # Nothing left to rewrite.
            return cleaned
        cleaned = htmlstrip.sub('<\\1\\2>', cleaned)

+
 def normalize_spaces(s):
    if not s:
        return ''
@ -26,6 +28,7 @@ def normalize_spaces(s):
    characters with a single space"""
    return ' '.join(s.split())

+
 html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
                  style=True, links=True, meta=False, add_nofollow=False,
                  page_structure=False, processing_instructions=True, embedded=False,
--- a/readability/debug.py
+++ b/readability/debug.py
@ -49,5 +49,3 @@ def text_content(elem, length=40):
    if len(content) < length:
        return content
    return content[:length] + '...'
-
-
--- a/readability/htmls.py
+++ b/readability/htmls.py
@ -1,7 +1,6 @@
 from lxml.html import tostring
-import logging
 import lxml.html
-import re, sys
+import re

 from .cleaners import normalize_spaces, clean_attributes
 from .encoding import get_encoding
@ -9,6 +8,7 @@ from .compat import str_

 utf8_parser = lxml.html.HTMLParser(encoding='utf-8')

+
 def build_doc(page):
    if isinstance(page, str_):
        encoding = None
@ -21,9 +21,11 @@ def build_doc(page):
    doc = lxml.html.document_fromstring(decoded_page.encode('utf-8', 'replace'), parser=utf8_parser)
    return doc, encoding

+
 def js_re(src, pattern, flags, repl):
    """Apply a JavaScript-style regex replacement to *src*.

    Mirrors the shape of ``src.replace(/pattern/flags, repl)``.

    :param src: the string to search and rewrite.
    :param pattern: regular expression source, compiled with ``re.compile``.
    :param flags: flags value handed to ``re.compile`` unchanged.
    :param repl: replacement text; every ``$`` is turned into a backslash
        so that ``$1``-style group references become Python's form.
    :return: *src* with every match of *pattern* replaced.
    """
    # Translate ``$1`` group references into the ``\1`` form Python expects.
    python_repl = repl.replace('$', '\\')
    # Pattern.sub takes (replacement, string). The previous call passed them
    # the other way round, so *src* was used as the replacement text and the
    # replacement was the string being searched.
    return re.compile(pattern, flags).sub(python_repl, src)

+
 def normalize_entities(cur_title):
    entities = {
        u'\u2014':'-',
@ -41,9 +43,11 @@ def normalize_entities(cur_title):

    return cur_title

+
 def norm_title(title):
    """Normalize *title*: whitespace first, then entities.

    Runs ``normalize_spaces`` on the input and passes the result through
    ``normalize_entities``.
    """
    respaced = normalize_spaces(title)
    return normalize_entities(respaced)

+
 def get_title(doc):
    title = doc.find('.//title')
    if title is None or title.text is None or len(title.text) == 0:
@ -51,16 +55,19 @@ def get_title(doc):

    return norm_title(title.text)

+
 def add_match(collection, text, orig):
    """Add the normalized *text* to *collection* if it qualifies.

    The text is normalized with ``norm_title`` and is added only when it
    has at least two whitespace-separated words, is at least 15 characters
    long, and occurs inside *orig* (double quotes are ignored on both sides
    of that containment check). Nothing is returned.
    """
    candidate = norm_title(text)

    # Too short to be a meaningful match.
    if len(candidate.split()) < 2 or len(candidate) < 15:
        return

    # Compare with double quotes stripped from both strings.
    unquoted_candidate = candidate.replace('"', '')
    unquoted_orig = orig.replace('"', '')
    if unquoted_candidate not in unquoted_orig:
        return

    collection.add(candidate)

+
 # CSS id (``#``) and class (``.``) selectors; judging by the name these are
 # presumably tried when looking for a page's title element -- the consumer
 # is not visible in this excerpt.
 TITLE_CSS_HEURISTICS = [
    '#title',
    '#head',
    '#heading',
    '.pageTitle',
    '.news_title',
    '.title',
    '.head',
    '.heading',
    '.contentheading',
    '.small_header_red',
 ]

+
 def shorten_title(doc):
    title = doc.find('.//title')
    if title is None or title.text is None or len(title.text) == 0:
@ -109,6 +116,8 @@ def shorten_title(doc):

    return title

+
+# Is this function necessary? The lxml Cleaner is already initialized correctly in cleaners.py.
 def get_body(doc):
    for elem in doc.xpath('.//script | .//link | .//style'):
        elem.drop_tree()
--- a/readability/readability.py
+++ b/readability/readability.py
@ -4,7 +4,6 @@ import logging
 import re
 import sys

-from collections import defaultdict
 from lxml.etree import tostring
 from lxml.etree import tounicode
 from lxml.html import document_fromstring
@ -56,7 +55,6 @@ def to_int(x):
 def clean(text):
    # Many spaces make the following regexes run forever
    text = re.sub(r'\s{255,}', ' ' * 255, text)
-
    text = re.sub(r'\s*\n\s*', '\n', text)
    text = re.sub(r'\t|[ \t]{2,}', ' ', text)
    return text.strip()
@ -65,12 +63,11 @@ def clean(text):
 def text_length(i):
    """Return the length of the text content of *i* after ``clean``.

    A falsy ``text_content()`` result is treated as the empty string.
    """
    raw_text = i.text_content()
    if not raw_text:
        raw_text = ""
    return len(clean(raw_text))

-regexp_type = type(re.compile('hello, world'))

 def compile_pattern(elements):
    if not elements:
        return None
-    elif isinstance(elements, regexp_type):
+    elif isinstance(elements, re._pattern_type):
        return elements
    elif isinstance(elements, (str_, bytes_)):
        if isinstance(elements, bytes_):
@ -82,6 +79,7 @@ def compile_pattern(elements):
        raise Exception("Unknown type for the pattern: {}".format(type(elements)))
        # assume string or string like object

+
 class Document:
    """Class to build a etree document out of html."""

@ -517,10 +515,10 @@ class Document:

                #if el.tag == 'div' and counts["img"] >= 1:
                #    continue
-                if counts["p"] and counts["img"] > 1+counts["p"]*1.3:
+                if counts["p"] and counts["img"] > 1 + counts["p"]*1.3:
                    reason = "too many images (%s)" % counts["img"]
                    to_remove = True
-                elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
+                elif counts["li"] > counts["p"] and tag not in ("ol", "ul"):
                    reason = "more <li>s than <p>s"
                    to_remove = True
                elif counts["input"] > (counts["p"] / 3):