diff --git a/readability/cleaners.py b/readability/cleaners.py
index 5cbab47..2a99b7a 100644
--- a/readability/cleaners.py
+++ b/readability/cleaners.py
@@ -14,11 +14,13 @@ htmlstrip = re.compile("<"  # open
                        ">"  # end
                        , re.I)
 
+
 def clean_attributes(html):
     while htmlstrip.search(html):
         html = htmlstrip.sub('<\\1\\2>', html)
     return html
 
+
 def normalize_spaces(s):
     if not s:
         return ''
@@ -26,6 +28,7 @@ def normalize_spaces(s):
         characters with a single space"""
     return ' '.join(s.split())
 
+
 html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
                        style=True, links=True, meta=False, add_nofollow=False,
                        page_structure=False, processing_instructions=True, embedded=False,
diff --git a/readability/debug.py b/readability/debug.py
index 061014d..b0ffe9a 100644
--- a/readability/debug.py
+++ b/readability/debug.py
@@ -49,5 +49,3 @@ def text_content(elem, length=40):
     if len(content) < length:
         return content
     return content[:length] + '...'
-
-
diff --git a/readability/htmls.py b/readability/htmls.py
index 843f0c0..8088ac2 100644
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -1,7 +1,6 @@
 from lxml.html import tostring
-import logging
 import lxml.html
-import re, sys
+import re
 
 from .cleaners import normalize_spaces, clean_attributes
 from .encoding import get_encoding
@@ -9,6 +8,7 @@ from .compat import str_
 
 utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
 
+
 def build_doc(page):
     if isinstance(page, str_):
         encoding = None
@@ -16,14 +16,16 @@ def build_doc(page):
     else:
         encoding = get_encoding(page) or 'utf-8'
         decoded_page = page.decode(encoding, 'replace')
-    
+
     # XXX: we have to do .decode and .encode even for utf-8 pages to remove bad characters
     doc = lxml.html.document_fromstring(decoded_page.encode('utf-8', 'replace'), parser=utf8_parser)
     return doc, encoding
 
+
 def js_re(src, pattern, flags, repl):
     return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
 
+
 def normalize_entities(cur_title):
     entities = {
         u'\u2014':'-',
@@ -41,9 +43,11 @@ def normalize_entities(cur_title):
 
     return cur_title
 
+
 def norm_title(title):
     return normalize_entities(normalize_spaces(title))
 
+
 def get_title(doc):
     title = doc.find('.//title')
     if title is None or title.text is None or len(title.text) == 0:
@@ -51,16 +55,19 @@ def get_title(doc):
 
     return norm_title(title.text)
 
+
 def add_match(collection, text, orig):
     text = norm_title(text)
     if len(text.split()) >= 2 and len(text) >= 15:
         if text.replace('"', '') in orig.replace('"', ''):
             collection.add(text)
 
+
 TITLE_CSS_HEURISTICS = ['#title', '#head', '#heading', '.pageTitle',
                         '.news_title', '.title', '.head', '.heading',
                         '.contentheading', '.small_header_red']
 
+
 def shorten_title(doc):
     title = doc.find('.//title')
     if title is None or title.text is None or len(title.text) == 0:
@@ -109,6 +116,8 @@ def shorten_title(doc):
 
     return title
 
+
+# is it necessary? Cleaner from LXML is initialized correctly in cleaners.py
 def get_body(doc):
     for elem in doc.xpath('.//script | .//link | .//style'):
         elem.drop_tree()
diff --git a/readability/readability.py b/readability/readability.py
index b0323b3..7116c0e 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -4,7 +4,6 @@
 import logging
 import re
 import sys
-from collections import defaultdict
 from lxml.etree import tostring
 from lxml.etree import tounicode
 from lxml.html import document_fromstring
@@ -56,7 +55,6 @@ def to_int(x):
 
 def clean(text):
     # Many spaces make the following regexes run forever
     text = re.sub(r'\s{255,}', ' ' * 255, text)
-    text = re.sub(r'\s*\n\s*', '\n', text)
     text = re.sub(r'\t|[ \t]{2,}', ' ', text)
     return text.strip()
@@ -65,12 +63,11 @@ def clean(text):
 def text_length(i):
     return len(clean(i.text_content() or ""))
 
-regexp_type = type(re.compile('hello, world'))
 
 def compile_pattern(elements):
     if not elements:
         return None
-    elif isinstance(elements, regexp_type):
+    elif isinstance(elements, re._pattern_type):
         return elements
     elif isinstance(elements, (str_, bytes_)):
         if isinstance(elements, bytes_):
@@ -82,6 +79,7 @@ def compile_pattern(elements):
         raise Exception("Unknown type for the pattern: {}".format(type(elements)))
     # assume string or string like object
 
+
 class Document:
     """Class to build a etree document out of html."""
 
@@ -98,9 +96,9 @@ class Document:
         :param xpath: If set to True, adds x="..." attribute to each HTML node,
             containing xpath path pointing to original document path (allows to
             reconstruct selected summary in original document).
-        :param handle_failures: Parameter passed to `lxml` for handling failure during exception. 
+        :param handle_failures: Parameter passed to `lxml` for handling failure during exception.
             Support options = ["discard", "ignore", None]
-        
+
         Examples:
             positive_keywords=["news-item", "block"]
             positive_keywords=["news-item, block"]
@@ -290,7 +288,7 @@
             return None
 
         sorted_candidates = sorted(
-            candidates.values(), 
+            candidates.values(),
            key=lambda x: x['content_score'],
             reverse=True
         )
@@ -517,10 +515,10 @@
 
                 #if el.tag == 'div' and counts["img"] >= 1:
                 #    continue
-                if counts["p"] and counts["img"] > 1+counts["p"]*1.3:
+                if counts["p"] and counts["img"] > 1 + counts["p"]*1.3:
                     reason = "too many images (%s)" % counts["img"]
                     to_remove = True
-                elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
+                elif counts["li"] > counts["p"] and tag not in ("ol", "ul"):
                     reason = "more <li>s than <p>s"
                     to_remove = True
                 elif counts["input"] > (counts["p"] / 3):