code linting

pull/135/head
Adrien Barbaresi 4 years ago
parent 615ce803c6
commit bd8293eb63

@ -14,11 +14,13 @@ htmlstrip = re.compile("<" # open
">" # end
, re.I)
def clean_attributes(html):
    """Strip attribute junk from tags, rewriting them as bare ``<tag>`` pairs.

    Repeatedly applies the module-level ``htmlstrip`` pattern until no match
    remains, so nested/overlapping matches are all reduced.  Returns the
    cleaned markup string.
    """
    cleaned = html
    while htmlstrip.search(cleaned):
        cleaned = htmlstrip.sub('<\\1\\2>', cleaned)
    return cleaned
def normalize_spaces(s):
if not s:
return ''
@ -26,6 +28,7 @@ def normalize_spaces(s):
characters with a single space"""
return ' '.join(s.split())
html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
style=True, links=True, meta=False, add_nofollow=False,
page_structure=False, processing_instructions=True, embedded=False,

@ -49,5 +49,3 @@ def text_content(elem, length=40):
if len(content) < length:
return content
return content[:length] + '...'

@ -1,7 +1,6 @@
from lxml.html import tostring
import logging
import lxml.html
import re, sys
import re
from .cleaners import normalize_spaces, clean_attributes
from .encoding import get_encoding
@ -9,6 +8,7 @@ from .compat import str_
utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
def build_doc(page):
if isinstance(page, str_):
encoding = None
@ -16,14 +16,16 @@ def build_doc(page):
else:
encoding = get_encoding(page) or 'utf-8'
decoded_page = page.decode(encoding, 'replace')
# XXX: we have to do .decode and .encode even for utf-8 pages to remove bad characters
doc = lxml.html.document_fromstring(decoded_page.encode('utf-8', 'replace'), parser=utf8_parser)
return doc, encoding
def js_re(src, pattern, flags, repl):
    """Apply a JavaScript-style regex substitution to *src*.

    :param src: the text the substitution is applied to.
    :param pattern: regex pattern (string) to search for.
    :param flags: flags passed to ``re.compile``.
    :param repl: replacement template using JS ``$1`` group references,
        which are translated to Python ``\\1`` backreferences.
    :returns: *src* with all matches of *pattern* replaced.

    Bug fix: the original called ``.sub(src, repl...)``, i.e. it used the
    source text as the replacement and substituted into the replacement
    template.  ``Pattern.sub(repl, string)`` takes the replacement first.
    """
    return re.compile(pattern, flags).sub(repl.replace('$', '\\'), src)
def normalize_entities(cur_title):
entities = {
u'\u2014':'-',
@ -41,9 +43,11 @@ def normalize_entities(cur_title):
return cur_title
def norm_title(title):
    """Return *title* with whitespace runs collapsed and unicode
    punctuation entities replaced (see ``normalize_spaces`` /
    ``normalize_entities``)."""
    collapsed = normalize_spaces(title)
    return normalize_entities(collapsed)
def get_title(doc):
title = doc.find('.//title')
if title is None or title.text is None or len(title.text) == 0:
@ -51,16 +55,19 @@ def get_title(doc):
return norm_title(title.text)
def add_match(collection, text, orig):
    """Add the normalized *text* to *collection* when it is a plausible
    title candidate: at least two words, at least 15 characters, and
    present (ignoring double quotes) inside *orig*."""
    candidate = norm_title(text)
    plausible = len(candidate.split()) >= 2 and len(candidate) >= 15
    if plausible and candidate.replace('"', '') in orig.replace('"', ''):
        collection.add(candidate)
# Candidate CSS selectors commonly used by sites to mark the page heading;
# presumably consulted by shorten_title() when <title> alone is not a clean
# article title — TODO confirm against the full shorten_title body.
TITLE_CSS_HEURISTICS = ['#title', '#head', '#heading', '.pageTitle',
                        '.news_title', '.title', '.head', '.heading',
                        '.contentheading', '.small_header_red']
def shorten_title(doc):
title = doc.find('.//title')
if title is None or title.text is None or len(title.text) == 0:
@ -109,6 +116,8 @@ def shorten_title(doc):
return title
# is it necessary? Cleaner from LXML is initialized correctly in cleaners.py
def get_body(doc):
for elem in doc.xpath('.//script | .//link | .//style'):
elem.drop_tree()

@ -4,7 +4,6 @@ import logging
import re
import sys
from collections import defaultdict
from lxml.etree import tostring
from lxml.etree import tounicode
from lxml.html import document_fromstring
@ -56,7 +55,6 @@ def to_int(x):
def clean(text):
    """Normalize whitespace in *text* and strip surrounding blanks.

    Runs of 255+ whitespace characters are capped first, because such
    pathological inputs make the following regexes run effectively forever.
    Whitespace around newlines is then collapsed to a single newline, and
    tabs / multi-space runs to a single space.
    """
    for pattern, replacement in (
        (r'\s{255,}', ' ' * 255),   # cap huge whitespace runs (perf guard)
        (r'\s*\n\s*', '\n'),        # tighten whitespace around line breaks
        (r'\t|[ \t]{2,}', ' '),     # tabs and space runs -> single space
    ):
        text = re.sub(pattern, replacement, text)
    return text.strip()
@ -65,12 +63,11 @@ def clean(text):
def text_length(i):
    """Length of the element's whitespace-normalized text content.

    ``text_content()`` may return a falsy value for empty elements, in
    which case the length is 0.
    """
    content = i.text_content() or ""
    return len(clean(content))
regexp_type = type(re.compile('hello, world'))
def compile_pattern(elements):
if not elements:
return None
elif isinstance(elements, regexp_type):
elif isinstance(elements, re._pattern_type):
return elements
elif isinstance(elements, (str_, bytes_)):
if isinstance(elements, bytes_):
@ -82,6 +79,7 @@ def compile_pattern(elements):
raise Exception("Unknown type for the pattern: {}".format(type(elements)))
# assume string or string like object
class Document:
"""Class to build a etree document out of html."""
@ -98,9 +96,9 @@ class Document:
:param xpath: If set to True, adds x="..." attribute to each HTML node,
containing xpath path pointing to original document path (allows to
reconstruct selected summary in original document).
:param handle_failures: Parameter passed to `lxml` for handling failure during exception.
:param handle_failures: Parameter passed to `lxml` for handling failure during exception.
Support options = ["discard", "ignore", None]
Examples:
positive_keywords=["news-item", "block"]
positive_keywords=["news-item, block"]
@ -290,7 +288,7 @@ class Document:
return None
sorted_candidates = sorted(
candidates.values(),
candidates.values(),
key=lambda x: x['content_score'],
reverse=True
)
@ -517,10 +515,10 @@ class Document:
#if el.tag == 'div' and counts["img"] >= 1:
# continue
if counts["p"] and counts["img"] > 1+counts["p"]*1.3:
if counts["p"] and counts["img"] > 1 + counts["p"]*1.3:
reason = "too many images (%s)" % counts["img"]
to_remove = True
elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
elif counts["li"] > counts["p"] and tag not in ("ol", "ul"):
reason = "more <li>s than <p>s"
to_remove = True
elif counts["input"] > (counts["p"] / 3):

Loading…
Cancel
Save