# Helpers for extracting and normalizing document titles and body HTML.
from lxml.html import tostring
import lxml.html
import re

from .cleaners import normalize_spaces, clean_attributes
from .compat import str_
from .encoding import get_encoding

# Shared parser instance: build_doc() re-encodes every page to UTF-8
# before parsing, so one fixed-encoding parser is safe to reuse.
utf8_parser = lxml.html.HTMLParser(encoding="utf-8")
def build_doc(page):
    """Parse *page* (text or bytes) into an lxml HTML document.

    Returns a ``(document, encoding)`` pair.  ``encoding`` is ``None``
    when the input was already a text string; otherwise it is the
    charset detected for the byte string (falling back to utf-8).
    """
    if isinstance(page, str_):
        charset = None
        text = page
    else:
        charset = get_encoding(page) or "utf-8"
        text = page.decode(charset, "replace")

    # XXX: round-trip through .encode even for utf-8 pages so that bad
    # characters are replaced before the parser sees them.
    document = lxml.html.document_fromstring(
        text.encode("utf-8", "replace"), parser=utf8_parser
    )
    return document, charset
def js_re(src, pattern, flags, repl):
    """Apply a JavaScript-style regex replacement to *src*.

    ``repl`` may use JavaScript ``$1``-style backreferences; they are
    rewritten to Python ``\\1`` syntax before substitution.
    """
    # re.Pattern.sub(repl, string) takes the replacement FIRST.  The
    # original call had the arguments swapped, substituting src into
    # the replacement template instead of applying repl to src.
    return re.compile(pattern, flags).sub(repl.replace("$", "\\"), src)
def normalize_entities(cur_title):
    """Replace typographic punctuation in *cur_title* with plain ASCII.

    Keys are listed both as raw Unicode characters and as literal HTML
    entity strings, since titles may arrive either decoded or not.
    """
    # NOTE: the scraped copy of this file had the entity keys
    # HTML-decoded, which turned u"&quot;" into the syntax error u""",
    # and made the &mdash;/&ndash; keys duplicate the \u2014/\u2013
    # ones.  The literal entity strings are restored here.
    entities = {
        u"\u2014": "-",  # em dash
        u"\u2013": "-",  # en dash
        u"&mdash;": "-",
        u"&ndash;": "-",
        u"\u00A0": " ",  # no-break space
        u"\u00AB": '"',  # left guillemet
        u"\u00BB": '"',  # right guillemet
        u"&quot;": '"',
    }
    for c, r in entities.items():
        if c in cur_title:
            cur_title = cur_title.replace(c, r)

    return cur_title
def norm_title(title):
    """Collapse runs of whitespace in *title*, then normalize
    typographic punctuation/entities to plain ASCII."""
    collapsed = normalize_spaces(title)
    return normalize_entities(collapsed)
def get_title(doc):
    """Return the normalized <title> text of *doc*.

    Falls back to the placeholder "[no-title]" when the document has
    no <title> element or its text is empty.
    """
    node = doc.find(".//title")
    if node is None:
        return "[no-title]"

    text = node.text
    if not text:  # covers both None and the empty string
        return "[no-title]"

    return norm_title(text)
def add_match(collection, text, orig):
    """Add *text* to *collection* when it looks like a plausible title:
    at least two words, at least 15 characters, and contained (ignoring
    double quotes) in the original title *orig*."""
    candidate = norm_title(text)
    # Guard clauses: reject anything too short to be a real title.
    if len(candidate) < 15 or len(candidate.split()) < 2:
        return
    if candidate.replace('"', "") in orig.replace('"', ""):
        collection.add(candidate)
# CSS selectors that commonly mark the on-page headline; consulted by
# shorten_title() to collect candidate title elements.
TITLE_CSS_HEURISTICS = [
    "#title", "#head", "#heading",
    ".pageTitle", ".news_title", ".title",
    ".head", ".heading", ".contentheading",
    ".small_header_red",
]
def shorten_title(doc):
    """Return a shortened, cleaned-up version of the document's title.

    Prefers the longest heading / CSS-heuristic element whose text
    matches the <title>; otherwise tries to strip site-name decoration
    around common delimiters.  Returns "" when the document has no
    usable <title>, and falls back to the full normalized title when
    the shortened result looks implausible.
    """
    title = doc.find(".//title")
    if title is None or title.text is None or len(title.text) == 0:
        return ""

    title = orig = norm_title(title.text)

    # Candidate titles harvested from headings and known title selectors.
    candidates = set()

    for item in [".//h1", ".//h2", ".//h3"]:
        for e in list(doc.iterfind(item)):
            # Check both the element's own text and its full text
            # content (which includes text of child elements).
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

    for item in TITLE_CSS_HEURISTICS:
        for e in doc.cssselect(item):
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

    if candidates:
        # Longest matching on-page heading wins.
        title = sorted(candidates, key=len)[-1]
    else:
        # No heading matched: try splitting on a site-name delimiter and
        # keep whichever side still has at least four words.
        for delimiter in [" | ", " - ", " :: ", " / "]:
            if delimiter in title:
                parts = orig.split(delimiter)
                if len(parts[0].split()) >= 4:
                    title = parts[0]
                    break
                elif len(parts[-1].split()) >= 4:
                    title = parts[-1]
                    break
        else:
            # for/else: runs only when no delimiter matched above.
            if ": " in title:
                parts = orig.split(": ")
                if len(parts[-1].split()) >= 4:
                    title = parts[-1]
                else:
                    # Last segment is short: drop only the first
                    # "Site: " prefix and keep the rest intact.
                    title = orig.split(": ", 1)[1]

    # Implausibly short or long results are likely wrong; keep original.
    if not 15 < len(title) < 150:
        return orig

    return title
# is it necessary? Cleaner from LXML is initialized correctly in cleaners.py
def get_body(doc):
    """Serialize the <body> of *doc* (or the whole document when there
    is no body) to an HTML string, with <script>, <link>, and <style>
    elements removed and attributes cleaned.

    Falls back to the uncleaned serialization when attribute cleaning
    fails on this document.
    """
    for elem in doc.xpath(".//script | .//link | .//style"):
        elem.drop_tree()
    # tostring() always returns a utf-8 encoded string
    # FIXME: isn't it better to use tounicode?
    raw_html = str_(tostring(doc.body or doc))
    try:
        # clean_attributes() now runs inside the try: previously it ran
        # before the try (whose body was just "return cleaned", which
        # cannot raise), so the raw_html fallback was unreachable and
        # any cleaning failure propagated to the caller.
        return clean_attributes(raw_html)
    except Exception:  # FIXME find the equivalent lxml error
        # logging.error("cleansing broke html content:\n%s", raw_html)
        return raw_html