code linting

pull/135/head
Adrien Barbaresi 4 years ago
parent 615ce803c6
commit bd8293eb63

@ -14,11 +14,13 @@ htmlstrip = re.compile("<" # open
">" # end ">" # end
, re.I) , re.I)
def clean_attributes(html): def clean_attributes(html):
while htmlstrip.search(html): while htmlstrip.search(html):
html = htmlstrip.sub('<\\1\\2>', html) html = htmlstrip.sub('<\\1\\2>', html)
return html return html
def normalize_spaces(s): def normalize_spaces(s):
if not s: if not s:
return '' return ''
@ -26,6 +28,7 @@ def normalize_spaces(s):
characters with a single space""" characters with a single space"""
return ' '.join(s.split()) return ' '.join(s.split())
html_cleaner = Cleaner(scripts=True, javascript=True, comments=True, html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
style=True, links=True, meta=False, add_nofollow=False, style=True, links=True, meta=False, add_nofollow=False,
page_structure=False, processing_instructions=True, embedded=False, page_structure=False, processing_instructions=True, embedded=False,

@ -49,5 +49,3 @@ def text_content(elem, length=40):
if len(content) < length: if len(content) < length:
return content return content
return content[:length] + '...' return content[:length] + '...'

@ -1,7 +1,6 @@
from lxml.html import tostring from lxml.html import tostring
import logging
import lxml.html import lxml.html
import re, sys import re
from .cleaners import normalize_spaces, clean_attributes from .cleaners import normalize_spaces, clean_attributes
from .encoding import get_encoding from .encoding import get_encoding
@ -9,6 +8,7 @@ from .compat import str_
utf8_parser = lxml.html.HTMLParser(encoding='utf-8') utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
def build_doc(page): def build_doc(page):
if isinstance(page, str_): if isinstance(page, str_):
encoding = None encoding = None
@ -16,14 +16,16 @@ def build_doc(page):
else: else:
encoding = get_encoding(page) or 'utf-8' encoding = get_encoding(page) or 'utf-8'
decoded_page = page.decode(encoding, 'replace') decoded_page = page.decode(encoding, 'replace')
# XXX: we have to do .decode and .encode even for utf-8 pages to remove bad characters # XXX: we have to do .decode and .encode even for utf-8 pages to remove bad characters
doc = lxml.html.document_fromstring(decoded_page.encode('utf-8', 'replace'), parser=utf8_parser) doc = lxml.html.document_fromstring(decoded_page.encode('utf-8', 'replace'), parser=utf8_parser)
return doc, encoding return doc, encoding
def js_re(src, pattern, flags, repl): def js_re(src, pattern, flags, repl):
return re.compile(pattern, flags).sub(src, repl.replace('$', '\\')) return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
def normalize_entities(cur_title): def normalize_entities(cur_title):
entities = { entities = {
u'\u2014':'-', u'\u2014':'-',
@ -41,9 +43,11 @@ def normalize_entities(cur_title):
return cur_title return cur_title
def norm_title(title): def norm_title(title):
return normalize_entities(normalize_spaces(title)) return normalize_entities(normalize_spaces(title))
def get_title(doc): def get_title(doc):
title = doc.find('.//title') title = doc.find('.//title')
if title is None or title.text is None or len(title.text) == 0: if title is None or title.text is None or len(title.text) == 0:
@ -51,16 +55,19 @@ def get_title(doc):
return norm_title(title.text) return norm_title(title.text)
def add_match(collection, text, orig): def add_match(collection, text, orig):
text = norm_title(text) text = norm_title(text)
if len(text.split()) >= 2 and len(text) >= 15: if len(text.split()) >= 2 and len(text) >= 15:
if text.replace('"', '') in orig.replace('"', ''): if text.replace('"', '') in orig.replace('"', ''):
collection.add(text) collection.add(text)
TITLE_CSS_HEURISTICS = ['#title', '#head', '#heading', '.pageTitle', TITLE_CSS_HEURISTICS = ['#title', '#head', '#heading', '.pageTitle',
'.news_title', '.title', '.head', '.heading', '.news_title', '.title', '.head', '.heading',
'.contentheading', '.small_header_red'] '.contentheading', '.small_header_red']
def shorten_title(doc): def shorten_title(doc):
title = doc.find('.//title') title = doc.find('.//title')
if title is None or title.text is None or len(title.text) == 0: if title is None or title.text is None or len(title.text) == 0:
@ -109,6 +116,8 @@ def shorten_title(doc):
return title return title
# is it necessary? Cleaner from LXML is initialized correctly in cleaners.py
def get_body(doc): def get_body(doc):
for elem in doc.xpath('.//script | .//link | .//style'): for elem in doc.xpath('.//script | .//link | .//style'):
elem.drop_tree() elem.drop_tree()

@ -4,7 +4,6 @@ import logging
import re import re
import sys import sys
from collections import defaultdict
from lxml.etree import tostring from lxml.etree import tostring
from lxml.etree import tounicode from lxml.etree import tounicode
from lxml.html import document_fromstring from lxml.html import document_fromstring
@ -56,7 +55,6 @@ def to_int(x):
def clean(text): def clean(text):
# Many spaces make the following regexes run forever # Many spaces make the following regexes run forever
text = re.sub(r'\s{255,}', ' ' * 255, text) text = re.sub(r'\s{255,}', ' ' * 255, text)
text = re.sub(r'\s*\n\s*', '\n', text) text = re.sub(r'\s*\n\s*', '\n', text)
text = re.sub(r'\t|[ \t]{2,}', ' ', text) text = re.sub(r'\t|[ \t]{2,}', ' ', text)
return text.strip() return text.strip()
@ -65,12 +63,11 @@ def clean(text):
def text_length(i): def text_length(i):
return len(clean(i.text_content() or "")) return len(clean(i.text_content() or ""))
regexp_type = type(re.compile('hello, world'))
def compile_pattern(elements): def compile_pattern(elements):
if not elements: if not elements:
return None return None
elif isinstance(elements, regexp_type): elif isinstance(elements, re._pattern_type):
return elements return elements
elif isinstance(elements, (str_, bytes_)): elif isinstance(elements, (str_, bytes_)):
if isinstance(elements, bytes_): if isinstance(elements, bytes_):
@ -82,6 +79,7 @@ def compile_pattern(elements):
raise Exception("Unknown type for the pattern: {}".format(type(elements))) raise Exception("Unknown type for the pattern: {}".format(type(elements)))
# assume string or string like object # assume string or string like object
class Document: class Document:
"""Class to build a etree document out of html.""" """Class to build a etree document out of html."""
@ -98,9 +96,9 @@ class Document:
:param xpath: If set to True, adds x="..." attribute to each HTML node, :param xpath: If set to True, adds x="..." attribute to each HTML node,
containing xpath path pointing to original document path (allows to containing xpath path pointing to original document path (allows to
reconstruct selected summary in original document). reconstruct selected summary in original document).
:param handle_failures: Parameter passed to `lxml` for handling failure during exception. :param handle_failures: Parameter passed to `lxml` for handling failure during exception.
Support options = ["discard", "ignore", None] Support options = ["discard", "ignore", None]
Examples: Examples:
positive_keywords=["news-item", "block"] positive_keywords=["news-item", "block"]
positive_keywords=["news-item, block"] positive_keywords=["news-item, block"]
@ -290,7 +288,7 @@ class Document:
return None return None
sorted_candidates = sorted( sorted_candidates = sorted(
candidates.values(), candidates.values(),
key=lambda x: x['content_score'], key=lambda x: x['content_score'],
reverse=True reverse=True
) )
@ -517,10 +515,10 @@ class Document:
#if el.tag == 'div' and counts["img"] >= 1: #if el.tag == 'div' and counts["img"] >= 1:
# continue # continue
if counts["p"] and counts["img"] > 1+counts["p"]*1.3: if counts["p"] and counts["img"] > 1 + counts["p"]*1.3:
reason = "too many images (%s)" % counts["img"] reason = "too many images (%s)" % counts["img"]
to_remove = True to_remove = True
elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol": elif counts["li"] > counts["p"] and tag not in ("ol", "ul"):
reason = "more <li>s than <p>s" reason = "more <li>s than <p>s"
to_remove = True to_remove = True
elif counts["input"] > (counts["p"] / 3): elif counts["input"] > (counts["p"] / 3):

Loading…
Cancel
Save