Merge pull request #131 from azmeuk/black

Used black to format the code
Yuri Baburov, 4 years ago (committed via GitHub)
commit 4b864d6306
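
Black only rewrites formatting (string quotes, line wrapping, blank lines); behaviour is unchanged, and the hunks below show each touched region as it reads after formatting. The exact command used for this PR is not recorded here; a typical invocation over this repository would be something like:

    pip install black
    black readability tests setup.py           # rewrite files in place
    black --check readability tests setup.py   # verify only, e.g. in CI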

readability/browser.py

@@ -7,14 +7,15 @@ def open_in_browser(html):
    import os
    import webbrowser
    import tempfile

    handle, fn = tempfile.mkstemp(suffix=".html")
    f = os.fdopen(handle, "wb")
    try:
        f.write(b"<meta charset='UTF-8' />")
        f.write(html.encode("utf-8"))
    finally:
        # we leak the file itself here, but we should at least close it
        f.close()
    url = "file://" + fn.replace(os.path.sep, "/")
    webbrowser.open(url)
    return url

readability/cleaners.py

@@ -2,35 +2,51 @@
import re
from lxml.html.clean import Cleaner

bad_attrs = ["width", "height", "style", "[-a-z]*color", "background[-a-z]*", "on*"]
single_quoted = "'[^']+'"
double_quoted = '"[^"]+"'
non_space = "[^ \"'>]+"
htmlstrip = re.compile(
    "<"  # open
    "([^>]+) "  # prefix
    "(?:%s) *" % ("|".join(bad_attrs),)
    + "= *(?:%s|%s|%s)"  # undesirable attributes
    % (non_space, single_quoted, double_quoted)
    + "([^>]*)"  # value  # postfix
    ">",  # end
    re.I,
)


def clean_attributes(html):
    while htmlstrip.search(html):
        html = htmlstrip.sub("<\\1\\2>", html)
    return html


def normalize_spaces(s):
    if not s:
        return ""
    """replace any sequence of whitespace
    characters with a single space"""
    return " ".join(s.split())


html_cleaner = Cleaner(
    scripts=True,
    javascript=True,
    comments=True,
    style=True,
    links=True,
    meta=False,
    add_nofollow=False,
    page_structure=False,
    processing_instructions=True,
    embedded=False,
    frames=False,
    forms=False,
    annoying_tags=False,
    remove_tags=None,
    remove_unknown_tags=False,
    safe_attrs_only=False,
)
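
Both helpers above are plain string utilities. A minimal usage sketch (the example markup and the exact cleaned output are illustrative only):

    from readability.cleaners import clean_attributes, normalize_spaces

    html = '<p style="color: red" width="10">Hello   world</p>'
    print(clean_attributes(html))             # style/width match bad_attrs and are stripped
    print(normalize_spaces("Hello   world"))  # "Hello world"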

readability/compat/__init__.py

@@ -5,10 +5,11 @@ It mainly exists because their are certain incompatibilities in the Python
syntax that can only be solved by conditionally importing different functions.
"""
import sys

if sys.version_info[0] == 2:
    bytes_ = str
    str_ = unicode
elif sys.version_info[0] == 3:
    bytes_ = bytes
    str_ = str
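
The rest of the package consumes this shim roughly the way compile_pattern in readability.py does; a small sketch (to_text is a hypothetical helper, not part of the package):

    from readability.compat import bytes_, str_

    def to_text(value):
        # accept bytes or text on both Python 2 and 3
        if isinstance(value, bytes_):
            return str_(value, "utf-8")
        return value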

readability/debug.py

@@ -1,7 +1,7 @@
import re

# FIXME: use with caution, can leak memory
uids = {}
uids_document = None

@@ -9,17 +9,17 @@ uids_document = None
def describe_node(node):
    global uids
    if node is None:
        return ""
    if not hasattr(node, "tag"):
        return "[%s]" % type(node)
    name = node.tag
    if node.get("id", ""):
        name += "#" + node.get("id")
    if node.get("class", "").strip():
        name += "." + ".".join(node.get("class").split())
    if name[:4] in ["div#", "div."]:
        name = name[3:]
    if name in ["tr", "td", "div", "p"]:
        uid = uids.get(node)
        if uid is None:
            uid = uids[node] = len(uids) + 1

@@ -34,18 +34,18 @@ def describe(node, depth=1):
        uids = {}
        uids_document = doc
    # return repr(NodeRepr(node))
    parent = ""
    if depth and node.getparent() is not None:
        parent = describe(node.getparent(), depth=depth - 1) + ">"
    return parent + describe_node(node)


RE_COLLAPSE_WHITESPACES = re.compile(r"\s+", re.U)


def text_content(elem, length=40):
    content = RE_COLLAPSE_WHITESPACES.sub(" ", elem.text_content().replace("\r", ""))
    if len(content) < length:
        return content
    return content[:length] + "..."

readability/encoding.py

@@ -8,15 +8,16 @@ RE_PRAGMA = re.compile(br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re
RE_XML = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]')

CHARSETS = {
    "big5": "big5hkscs",
    "gb2312": "gb18030",
    "ascii": "utf-8",
    "maccyrillic": "cp1251",
    "win1251": "cp1251",
    "win-1251": "cp1251",
    "windows-1251": "cp1251",
}


def fix_charset(encoding):
    """Overrides encoding when charset declaration
    or charset determination is a subset of a larger

@@ -27,9 +28,9 @@ def fix_charset(encoding):
def get_encoding(page):
    # Regex for XML and HTML Meta charset declaration
    declared_encodings = (
        RE_CHARSET.findall(page) + RE_PRAGMA.findall(page) + RE_XML.findall(page)
    )

    # Try any declared encodings
    for declared_encoding in declared_encodings:

@@ -38,7 +39,7 @@ def get_encoding(page):
            # declared_encoding will actually be bytes but .decode() only
            # accepts `str` type. Decode blindly with ascii because no one should
            # ever use non-ascii characters in the name of an encoding.
            declared_encoding = declared_encoding.decode("ascii", "replace")

        encoding = fix_charset(declared_encoding)

@@ -51,12 +52,12 @@ def get_encoding(page):
    # Fallback to chardet if declared encodings fail
    # Remove all HTML tags, and leave only text for chardet
    text = re.sub(br"(\s*</?[^>]*>)+\s*", b" ", page).strip()
    enc = "utf-8"
    if len(text) < 10:
        return enc  # can't guess
    res = chardet.detect(text)
    enc = res["encoding"] or "utf-8"
    # print '->', enc, "%.2f" % res['confidence']
    enc = fix_charset(enc)
    return enc
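
get_encoding expects raw bytes (the charset regexes above are byte patterns). A short usage sketch, assuming the page was read in binary mode and the module path readability.encoding:

    from readability.encoding import get_encoding

    with open("page.html", "rb") as f:
        raw = f.read()

    enc = get_encoding(raw)           # e.g. "utf-8" or "cp1251"
    text = raw.decode(enc, "replace")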

readability/htmls.py

@@ -6,7 +6,7 @@ from .cleaners import normalize_spaces, clean_attributes
from .encoding import get_encoding
from .compat import str_

utf8_parser = lxml.html.HTMLParser(encoding="utf-8")


def build_doc(page):

@@ -14,28 +14,30 @@ def build_doc(page):
        encoding = None
        decoded_page = page
    else:
        encoding = get_encoding(page) or "utf-8"
        decoded_page = page.decode(encoding, "replace")

    # XXX: we have to do .decode and .encode even for utf-8 pages to remove bad characters
    doc = lxml.html.document_fromstring(
        decoded_page.encode("utf-8", "replace"), parser=utf8_parser
    )
    return doc, encoding


def js_re(src, pattern, flags, repl):
    return re.compile(pattern, flags).sub(src, repl.replace("$", "\\"))


def normalize_entities(cur_title):
    entities = {
        u"\u2014": "-",
        u"\u2013": "-",
        u"&mdash;": "-",
        u"&ndash;": "-",
        u"\u00A0": " ",
        u"\u00AB": '"',
        u"\u00BB": '"',
        u"&quot;": '"',
    }
    for c, r in entities.items():
        if c in cur_title:

@@ -49,9 +51,9 @@ def norm_title(title):
def get_title(doc):
    title = doc.find(".//title")
    if title is None or title.text is None or len(title.text) == 0:
        return "[no-title]"

    return norm_title(title.text)

@@ -59,25 +61,34 @@ def get_title(doc):
def add_match(collection, text, orig):
    text = norm_title(text)
    if len(text.split()) >= 2 and len(text) >= 15:
        if text.replace('"', "") in orig.replace('"', ""):
            collection.add(text)


TITLE_CSS_HEURISTICS = [
    "#title",
    "#head",
    "#heading",
    ".pageTitle",
    ".news_title",
    ".title",
    ".head",
    ".heading",
    ".contentheading",
    ".small_header_red",
]


def shorten_title(doc):
    title = doc.find(".//title")
    if title is None or title.text is None or len(title.text) == 0:
        return ""

    title = orig = norm_title(title.text)

    candidates = set()

    for item in [".//h1", ".//h2", ".//h3"]:
        for e in list(doc.iterfind(item)):
            if e.text:
                add_match(candidates, e.text, orig)

@@ -94,7 +105,7 @@ def shorten_title(doc):
    if candidates:
        title = sorted(candidates, key=len)[-1]
    else:
        for delimiter in [" | ", " - ", " :: ", " / "]:
            if delimiter in title:
                parts = orig.split(delimiter)
                if len(parts[0].split()) >= 4:

@@ -104,12 +115,12 @@ def shorten_title(doc):
                    title = parts[-1]
                    break
        else:
            if ": " in title:
                parts = orig.split(": ")
                if len(parts[-1].split()) >= 4:
                    title = parts[-1]
                else:
                    title = orig.split(": ", 1)[1]

    if not 15 < len(title) < 150:
        return orig

@@ -119,15 +130,15 @@ def shorten_title(doc):
# is it necessary? Cleaner from LXML is initialized correctly in cleaners.py
def get_body(doc):
    for elem in doc.xpath(".//script | .//link | .//style"):
        elem.drop_tree()
    # tostring() always return utf-8 encoded string
    # FIXME: isn't better to use tounicode?
    raw_html = str_(tostring(doc.body or doc))
    cleaned = clean_attributes(raw_html)
    try:
        # BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
        return cleaned
    except Exception:  # FIXME find the equivalent lxml error
        # logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
        return raw_html
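
A sketch of the title helpers above used directly (build_doc accepts text or bytes, per the branch at the top of the hunk; the module path readability.htmls is assumed):

    from readability.htmls import build_doc, get_title, shorten_title

    doc, encoding = build_doc(open("page.html", "rb").read())
    print(get_title(doc))        # raw <title> text, or "[no-title]"
    print(shorten_title(doc))    # shorter, heuristic variant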

readability/readability.py

@@ -22,18 +22,29 @@ from .debug import describe, text_content
log = logging.getLogger("readability.readability")

REGEXES = {
    "unlikelyCandidatesRe": re.compile(
        r"combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter",
        re.I,
    ),
    "okMaybeItsACandidateRe": re.compile(r"and|article|body|column|main|shadow", re.I),
    "positiveRe": re.compile(
        r"article|body|content|entry|hentry|main|page|pagination|post|text|blog|story",
        re.I,
    ),
    "negativeRe": re.compile(
        r"combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget",
        re.I,
    ),
    "divToPElementsRe": re.compile(
        r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)", re.I
    ),
    #'replaceBrsRe': re.compile(r'(<br[^>]*>[ \n\r\t]*){2,}',re.I),
    #'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I),
    #'trimRe': re.compile(r'^\s+|\s+$/'),
    #'normalizeRe': re.compile(r'\s{2,}/'),
    #'killBreaksRe': re.compile(r'(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
    "videoRe": re.compile(r"https?:\/\/(www\.)?(youtube|vimeo)\.com", re.I),
    # skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
}
@@ -45,18 +56,18 @@ def to_int(x):
    if not x:
        return None
    x = x.strip()
    if x.endswith("px"):
        return int(x[:-2])
    if x.endswith("em"):
        return int(x[:-2]) * 12
    return int(x)


def clean(text):
    # Many spaces make the following regexes run forever
    text = re.sub(r"\s{255,}", " " * 255, text)
    text = re.sub(r"\s*\n\s*", "\n", text)
    text = re.sub(r"\t|[ \t]{2,}", " ", text)
    return text.strip()

@@ -71,10 +82,10 @@ def compile_pattern(elements):
        return elements
    elif isinstance(elements, (str_, bytes_)):
        if isinstance(elements, bytes_):
            elements = str_(elements, "utf-8")
        elements = elements.split(u",")
    if isinstance(elements, (list, tuple)):
        return re.compile(u"|".join([re.escape(x.strip()) for x in elements]), re.U)
    else:
        raise Exception("Unknown type for the pattern: {}".format(type(elements)))
    # assume string or string like object
@@ -83,9 +94,17 @@ def compile_pattern(elements):
class Document:
    """Class to build a etree document out of html."""

    def __init__(
        self,
        input,
        positive_keywords=None,
        negative_keywords=None,
        url=None,
        min_text_length=25,
        retry_length=250,
        xpath=False,
        handle_failures="discard",
    ):
        """Generate the document

        :param input: string of the html content.

@@ -131,8 +150,8 @@ class Document:
        if self.xpath:
            root = self.html.getroottree()
            for i in self.html.getiterator():
                # print root.getpath(i)
                i.attrib["x"] = root.getpath(i)
        return self.html

    def _parse(self, input):
@@ -143,11 +162,19 @@ class Document:
            # trying to guard against bad links like <a href="http://[http://...">
            try:
                # such support is added in lxml 3.3.0
                doc.make_links_absolute(
                    base_href,
                    resolve_base_href=True,
                    handle_failures=self.handle_failures,
                )
            except TypeError:  # make_links_absolute() got an unexpected keyword argument 'handle_failures'
                # then we have lxml < 3.3.0
                # please upgrade to lxml >= 3.3.0 if you're failing here!
                doc.make_links_absolute(
                    base_href,
                    resolve_base_href=True,
                    handle_failures=self.handle_failures,
                )
        else:
            doc.resolve_base_href(handle_failures=self.handle_failures)
        return doc

@@ -169,7 +196,7 @@ class Document:
        An internal method, which can be overridden in subclasses, for example,
        to disable or to improve DOM-to-text conversion in .summary() method
        """
        return clean_attributes(tounicode(self.html, method="html"))

    def summary(self, html_partial=False):
        """
@@ -185,10 +212,10 @@ class Document:
            ruthless = True
            while True:
                self._html(True)
                for i in self.tags(self.html, "script", "style"):
                    i.drop_tree()
                for i in self.tags(self.html, "body"):
                    i.set("id", "readabilityBody")
                if ruthless:
                    self.remove_unlikely_candidates()
                self.transform_misused_divs_into_paragraphs()

@@ -197,27 +224,34 @@ class Document:
                best_candidate = self.select_best_candidate(candidates)

                if best_candidate:
                    article = self.get_article(
                        candidates, best_candidate, html_partial=html_partial
                    )
                else:
                    if ruthless:
                        log.info("ruthless removal did not work. ")
                        ruthless = False
                        log.debug(
                            (
                                "ended up stripping too much - "
                                "going for a safer _parse"
                            )
                        )
                        # try again
                        continue
                    else:
                        log.debug(
                            (
                                "Ruthless and lenient parsing did not work. "
                                "Returning raw html"
                            )
                        )
                        article = self.html.find("body")
                        if article is None:
                            article = self.html
                cleaned_article = self.sanitize(article, candidates)

                article_length = len(cleaned_article or "")
                retry_length = self.retry_length
                of_acceptable_length = article_length >= retry_length
                if ruthless and not of_acceptable_length:
@@ -227,7 +261,7 @@ class Document:
                else:
                    return cleaned_article
        except Exception as e:
            log.exception("error getting summary: ")
            if sys.version_info[0] == 2:
                from .compat.two import raise_with_traceback
            else:

@@ -238,15 +272,13 @@ class Document:
        # Now that we have the top candidate, look through its siblings for
        # content that might also be related.
        # Things like preambles, content split by ads that we removed, etc.
        sibling_score_threshold = max([10, best_candidate["content_score"] * 0.2])
        # create a new html document with a html->body->div
        if html_partial:
            output = fragment_fromstring("<div/>")
        else:
            output = document_fromstring("<div/>")
        best_elem = best_candidate["elem"]
        parent = best_elem.getparent()
        siblings = parent.getchildren() if parent is not None else [best_elem]
        for sibling in siblings:
@@ -256,8 +288,10 @@ class Document:
            if sibling is best_elem:
                append = True
            sibling_key = sibling  # HashableElement(sibling)
            if (
                sibling_key in candidates
                and candidates[sibling_key]["content_score"] >= sibling_score_threshold
            ):
                append = True

            if sibling.tag == "p":

@@ -267,9 +301,11 @@ class Document:
                if node_length > 80 and link_density < 0.25:
                    append = True
                elif (
                    node_length <= 80
                    and link_density == 0
                    and re.search(r"\.( |$)", node_content)
                ):
                    append = True

            if append:

@@ -279,7 +315,7 @@ class Document:
                    output.append(sibling)
                else:
                    output.getchildren()[0].getchildren()[0].append(sibling)
        # if output is not None:
        #    output.append(best_elem)
        return output
@@ -288,15 +324,11 @@ class Document:
            return None

        sorted_candidates = sorted(
            candidates.values(), key=lambda x: x["content_score"], reverse=True
        )
        for candidate in sorted_candidates[:5]:
            elem = candidate["elem"]
            log.info("Top 5 : %6.3f %s" % (candidate["content_score"], describe(elem)))

        best_candidate = sorted_candidates[0]
        return best_candidate

@@ -305,7 +337,7 @@ class Document:
        link_length = 0
        for i in elem.findall(".//a"):
            link_length += text_length(i)
        # if len(elem.findall(".//div") or elem.findall(".//p")):
        #     link_length = link_length
        total_length = text_length(elem)
        return float(link_length) / max(total_length, 1)
@@ -333,20 +365,19 @@ class Document:
                ordered.append(parent_node)

            if grand_parent_node is not None and grand_parent_node not in candidates:
                candidates[grand_parent_node] = self.score_node(grand_parent_node)
                ordered.append(grand_parent_node)

            content_score = 1
            content_score += len(inner_text.split(","))
            content_score += min((inner_text_len / 100), 3)
            # if elem not in candidates:
            #    candidates[elem] = self.score_node(elem)
            # WTF? candidates[elem]['content_score'] += content_score
            candidates[parent_node]["content_score"] += content_score
            if grand_parent_node is not None:
                candidates[grand_parent_node]["content_score"] += content_score / 2.0

        # Scale the final candidates score based on link density. Good content
        # should have a relatively small link density (5% or less) and be

@@ -354,24 +385,23 @@ class Document:
        for elem in ordered:
            candidate = candidates[elem]
            ld = self.get_link_density(elem)
            score = candidate["content_score"]
            log.debug(
                "Branch %6.3f %s link density %.3f -> %6.3f"
                % (score, describe(elem), ld, score * (1 - ld))
            )
            candidate["content_score"] *= 1 - ld

        return candidates

    def class_weight(self, e):
        weight = 0
        for feature in [e.get("class", None), e.get("id", None)]:
            if feature:
                if REGEXES["negativeRe"].search(feature):
                    weight -= 25

                if REGEXES["positiveRe"].search(feature):
                    weight += 25

                if self.positive_keywords and self.positive_keywords.search(feature):

@@ -380,10 +410,10 @@ class Document:
                if self.negative_keywords and self.negative_keywords.search(feature):
                    weight -= 25

        if self.positive_keywords and self.positive_keywords.match("tag-" + e.tag):
            weight += 25

        if self.negative_keywords and self.negative_keywords.match("tag-" + e.tag):
            weight -= 25

        return weight
@@ -397,63 +427,76 @@ class Document:
            content_score += 3
        elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form", "aside"]:
            content_score -= 3
        elif name in [
            "h1",
            "h2",
            "h3",
            "h4",
            "h5",
            "h6",
            "th",
            "header",
            "footer",
            "nav",
        ]:
            content_score -= 5
        return {"content_score": content_score, "elem": elem}

    def remove_unlikely_candidates(self):
        for elem in self.html.findall(".//*"):
            s = "%s %s" % (elem.get("class", ""), elem.get("id", ""))
            if len(s) < 2:
                continue
            if (
                REGEXES["unlikelyCandidatesRe"].search(s)
                and (not REGEXES["okMaybeItsACandidateRe"].search(s))
                and elem.tag not in ["html", "body"]
            ):
                log.debug("Removing unlikely candidate - %s" % describe(elem))
                elem.drop_tree()

    def transform_misused_divs_into_paragraphs(self):
        for elem in self.tags(self.html, "div"):
            # transform <div>s that do not contain other block elements into
            # <p>s
            # FIXME: The current implementation ignores all descendants that
            # are not direct children of elem
            # This results in incorrect results in case there is an <img>
            # buried within an <a> for example
            if not REGEXES["divToPElementsRe"].search(
                str_(b"".join(map(tostring, list(elem))))
            ):
                # log.debug("Altering %s to p" % (describe(elem)))
                elem.tag = "p"
                # print "Fixed element "+describe(elem)

        for elem in self.tags(self.html, "div"):
            if elem.text and elem.text.strip():
                p = fragment_fromstring("<p/>")
                p.text = elem.text
                elem.text = None
                elem.insert(0, p)
                # print "Appended "+tounicode(p)+" to "+describe(elem)

            for pos, child in reversed(list(enumerate(elem))):
                if child.tail and child.tail.strip():
                    p = fragment_fromstring("<p/>")
                    p.text = child.tail
                    child.tail = None
                    elem.insert(pos + 1, p)
                    # print "Inserted "+tounicode(p)+" to "+describe(elem)
                if child.tag == "br":
                    # print 'Dropped <br> at '+describe(elem)
                    child.drop_tree()

    def tags(self, node, *tag_names):
        for tag_name in tag_names:
            for e in node.findall(".//%s" % tag_name):
                yield e

    def reverse_tags(self, node, *tag_names):
        for tag_name in tag_names:
            for e in reversed(node.findall(".//%s" % tag_name)):
                yield e

    def sanitize(self, node, candidates):
@@ -467,31 +510,35 @@ class Document:
        for elem in self.tags(node, "iframe"):
            if "src" in elem.attrib and REGEXES["videoRe"].search(elem.attrib["src"]):
                elem.text = "VIDEO"  # ADD content to iframe text node to force <iframe></iframe> proper output
            else:
                elem.drop_tree()

        allowed = {}
        # Conditionally clean <table>s, <ul>s, and <div>s
        for el in self.reverse_tags(
            node, "table", "ul", "div", "aside", "header", "footer", "section"
        ):
            if el in allowed:
                continue
            weight = self.class_weight(el)
            if el in candidates:
                content_score = candidates[el]["content_score"]
                # print '!',el, '-> %6.3f' % content_score
            else:
                content_score = 0
            tag = el.tag

            if weight + content_score < 0:
                log.debug(
                    "Removed %s with score %6.3f and weight %-3s"
                    % (describe(el), content_score, weight,)
                )
                el.drop_tree()
            elif el.text_content().count(",") < 10:
                counts = {}
                for kind in ["p", "img", "li", "a", "embed", "input"]:
                    counts[kind] = len(el.findall(".//%s" % kind))
                counts["li"] -= 100
                counts["input"] -= len(el.findall('.//input[@type="hidden"]'))

@@ -501,21 +548,21 @@ class Document:
                parent_node = el.getparent()
                if parent_node is not None:
                    if parent_node in candidates:
                        content_score = candidates[parent_node]["content_score"]
                    else:
                        content_score = 0
                # if parent_node is not None:
                # pweight = self.class_weight(parent_node) + content_score
                # pname = describe(parent_node)
                # else:
                # pweight = 0
                # pname = "no parent"
                to_remove = False
                reason = ""

                # if el.tag == 'div' and counts["img"] >= 1:
                #    continue
                if counts["p"] and counts["img"] > 1 + counts["p"] * 1.3:
                    reason = "too many images (%s)" % counts["img"]
                    to_remove = True
                elif counts["li"] > counts["p"] and tag not in ("ol", "ul"):
@@ -525,65 +572,79 @@ class Document:
                    reason = "less than 3x <p>s than <input>s"
                    to_remove = True
                elif content_length < MIN_LEN and counts["img"] == 0:
                    reason = (
                        "too short content length %s without a single image"
                        % content_length
                    )
                    to_remove = True
                elif content_length < MIN_LEN and counts["img"] > 2:
                    reason = (
                        "too short content length %s and too many images"
                        % content_length
                    )
                    to_remove = True
                elif weight < 25 and link_density > 0.2:
                    reason = "too many links %.3f for its weight %s" % (
                        link_density,
                        weight,
                    )
                    to_remove = True
                elif weight >= 25 and link_density > 0.5:
                    reason = "too many links %.3f for its weight %s" % (
                        link_density,
                        weight,
                    )
                    to_remove = True
                elif (counts["embed"] == 1 and content_length < 75) or counts[
                    "embed"
                ] > 1:
                    reason = (
                        "<embed>s with too short content length, or too many <embed>s"
                    )
                    to_remove = True
                elif not content_length:
                    reason = "no content"
                    to_remove = True

                # if el.tag == 'div' and counts['img'] >= 1 and to_remove:
                #     imgs = el.findall('.//img')
                #     valid_img = False
                #     log.debug(tounicode(el))
                #     for img in imgs:
                #
                #         height = img.get('height')
                #         text_length = img.get('text_length')
                #         log.debug ("height %s text_length %s" %(repr(height), repr(text_length)))
                #         if to_int(height) >= 100 or to_int(text_length) >= 100:
                #             valid_img = True
                #             log.debug("valid image" + tounicode(img))
                #             break
                #     if valid_img:
                #         to_remove = False
                #         log.debug("Allowing %s" %el.text_content())
                #         for desnode in self.tags(el, "table", "ul", "div"):
                #             allowed[desnode] = True

                # find x non empty preceding and succeeding siblings
                i, j = 0, 0
                x = 1
                siblings = []
                for sib in el.itersiblings():
                    # log.debug(sib.text_content())
                    sib_content_length = text_length(sib)
                    if sib_content_length:
                        i = +1
                        siblings.append(sib_content_length)
                        if i == x:
                            break
                for sib in el.itersiblings(preceding=True):
                    # log.debug(sib.text_content())
                    sib_content_length = text_length(sib)
                    if sib_content_length:
                        j = +1
                        siblings.append(sib_content_length)
                        if j == x:
                            break
                # log.debug(str_(siblings))
                if siblings and sum(siblings) > 1000:
                    to_remove = False
                    log.debug("Allowing %s" % describe(el))
@@ -591,40 +652,62 @@ class Document:
                        allowed[desnode] = True

                if to_remove:
                    log.debug(
                        "Removed %6.3f %s with weight %s cause it has %s."
                        % (content_score, describe(el), weight, reason)
                    )
                    # print tounicode(el)
                    # log.debug("pname %s pweight %.3f" %(pname, pweight))
                    el.drop_tree()
                else:
                    log.debug(
                        "Not removing %s of length %s: %s"
                        % (describe(el), content_length, text_content(el))
                    )

        self.html = node
        return self.get_clean_html()


def main():
    VERBOSITY = {1: logging.WARNING, 2: logging.INFO, 3: logging.DEBUG}

    from optparse import OptionParser

    parser = OptionParser(usage="%prog: [options] [file]")
    parser.add_option("-v", "--verbose", action="count", default=0)
    parser.add_option(
        "-b", "--browser", default=None, action="store_true", help="open in browser"
    )
    parser.add_option(
        "-l", "--log", default=None, help="save logs into file (appended)"
    )
    parser.add_option(
        "-u", "--url", default=None, help="use URL instead of a local file"
    )
    parser.add_option("-x", "--xpath", default=None, help="add original xpath")
    parser.add_option(
        "-p",
        "--positive-keywords",
        default=None,
        help="positive keywords (comma-separated)",
        action="store",
    )
    parser.add_option(
        "-n",
        "--negative-keywords",
        default=None,
        help="negative keywords (comma-separated)",
        action="store",
    )
    (options, args) = parser.parse_args()

    if options.verbose:
        logging.basicConfig(
            level=VERBOSITY[options.verbose],
            filename=options.log,
            format="%(asctime)s: %(levelname)s: %(message)s (at %(filename)s: %(lineno)d)",
        )

    if not (len(args) == 1 or options.url):
        parser.print_help()
@@ -632,36 +715,43 @@ def main():
    file = None
    if options.url:
        headers = {"User-Agent": "Mozilla/5.0"}
        if sys.version_info[0] == 3:
            import urllib.request, urllib.parse, urllib.error

            request = urllib.request.Request(options.url, None, headers)
            file = urllib.request.urlopen(request)
        else:
            import urllib2

            request = urllib2.Request(options.url, None, headers)
            file = urllib2.urlopen(request)
    else:
        file = open(args[0], "rt")
    try:
        doc = Document(
            file.read(),
            url=options.url,
            positive_keywords=options.positive_keywords,
            negative_keywords=options.negative_keywords,
        )
        if options.browser:
            from .browser import open_in_browser

            result = "<h2>" + doc.short_title() + "</h2><br/>" + doc.summary()
            open_in_browser(result)
        else:
            enc = (
                sys.__stdout__.encoding or "utf-8"
            )  # XXX: this hack could not always work, better to set PYTHONIOENCODING
            result = "Title:" + doc.short_title() + "\n" + doc.summary()
            if sys.version_info[0] == 3:
                print(result)
            else:
                print(result.encode(enc, "replace"))
    finally:
        file.close()


if __name__ == "__main__":
    main()
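
For reference, the public API that main() and the tests exercise boils down to the Document class; a minimal sketch:

    from readability import Document

    with open("page.html") as f:
        doc = Document(f.read(), url="http://example.com/page.html")

    print(doc.short_title())
    print(doc.summary(html_partial=True))   # only the extracted fragment, no <html>/<body> wrapper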

setup.py

@@ -8,21 +8,22 @@ from setuptools import setup
import sys

lxml_requirement = "lxml"
if sys.platform == "darwin":
    import platform

    mac_ver = platform.mac_ver()[0]
    mac_ver_no = int(mac_ver.split(".")[1])
    if mac_ver_no < 9:
        print("Using lxml<2.4")
        lxml_requirement = "lxml<2.4"

test_deps = [
    # Test timeouts
    "timeout_decorator",
]
extras = {
    "test": test_deps,
}

# Adapted from https://github.com/pypa/pip/blob/master/setup.py
@@ -31,34 +32,29 @@ def find_version(*file_paths):
    # Intentionally *not* adding an encoding option to open, See:
    # https://github.com/pypa/virtualenv/issues/201#issuecomment-3145690
    with codecs.open(os.path.join(here, *file_paths), "r") as fp:
        version_file = fp.read()
    version_match = re.search(
        r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M,
    )
    if version_match:
        return version_match.group(1)
    raise RuntimeError("Unable to find version string.")


setup(
    name="readability-lxml",
    version=find_version("readability", "__init__.py"),
    author="Yuri Baburov",
    author_email="burchik@gmail.com",
    description="fast html to text parser (article readability tool) with python3 support",
    test_suite="tests.test_article_only",
    long_description=open("README.rst").read(),
    license="Apache License 2.0",
    url="http://github.com/buriy/python-readability",
    packages=["readability", "readability.compat"],
    install_requires=["chardet", lxml_requirement, "cssselect"],
    tests_require=test_deps,
    extras_require=extras,
    classifiers=[
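
Given the metadata above, installation is the usual pip invocation; the test extra pulls in timeout_decorator:

    pip install readability-lxml
    pip install "readability-lxml[test]"   # with the test dependencies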

tests/test_article_only.py

@@ -5,7 +5,7 @@ from readability import Document
import timeout_decorator

SAMPLES = os.path.join(os.path.dirname(__file__), "samples")


def load_sample(filename):

@@ -26,30 +26,34 @@ class TestArticleOnly(unittest.TestCase):
    def test_si_sample(self):
        """Using the si sample, load article with only opening body element"""
        sample = load_sample("si-game.sample.html")
        doc = Document(
            sample,
            url="http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html",
        )
        res = doc.summary()
        self.assertEqual("<html><body><div><div class", res[0:27])

    def test_si_sample_html_partial(self):
        """Using the si sample, make sure we can get the article alone."""
        sample = load_sample("si-game.sample.html")
        doc = Document(
            sample,
            url="http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html",
        )
        res = doc.summary(html_partial=True)
        self.assertEqual('<div><div class="', res[0:17])

    def test_too_many_images_sample_html_partial(self):
        """Using the too-many-images sample, make sure we still get the article."""
        sample = load_sample("too-many-images.sample.html")
        doc = Document(sample)
        res = doc.summary(html_partial=True)
        self.assertEqual('<div><div class="post-body', res[0:26])

    def test_wrong_link_issue_49(self):
        """We shouldn't break on bad HTML."""
        sample = load_sample("the-hurricane-rubin-carter-denzel-washington.html")
        doc = Document(sample)
        res = doc.summary(html_partial=True)
        self.assertEqual('<div><div class="content__article-body ', res[0:39])
@@ -57,10 +61,10 @@ class TestArticleOnly(unittest.TestCase):
    def test_best_elem_is_root_and_passing(self):
        sample = (
            '<html class="article" id="body">'
            "  <body>"
            "    <p>1234567890123456789012345</p>"
            "  </body>"
            "</html>"
        )
        doc = Document(sample)
        doc.summary()

@@ -91,23 +95,26 @@ class TestArticleOnly(unittest.TestCase):
        """
        doc = Document(sample)
        s = doc.summary()
        # print(s)
        assert "punctuation" in s
        assert not "comment" in s
        assert not "aside" in s

    # Many spaces make some regexes run forever
    @timeout_decorator.timeout(seconds=3, use_signals=False)
    def test_many_repeated_spaces(self):
        long_space = " " * 1000000
        sample = "<html><body><p>foo" + long_space + "</p></body></html>"
        doc = Document(sample)
        s = doc.summary()
        assert "foo" in s

    def test_not_self_closing(self):
        sample = '<h2><a href="#"></a>foobar</h2>'
        doc = Document(sample)
        assert (
            '<body id="readabilityBody"><h2><a href="#"></a>foobar</h2></body>'
            == doc.summary()
        )
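
The suite is plain unittest (plus timeout_decorator), so from a checkout it can be run as, for example:

    pip install timeout_decorator
    python -m unittest tests.test_article_only -v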
