|
|
|
@ -1,7 +1,6 @@
|
|
|
|
|
from lxml.html import tostring
|
|
|
|
|
import logging
|
|
|
|
|
import lxml.html
|
|
|
|
|
import re, sys
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
from .cleaners import normalize_spaces, clean_attributes
|
|
|
|
|
from .encoding import get_encoding
|
|
|
|
@ -9,6 +8,7 @@ from .compat import str_
|
|
|
|
|
|
|
|
|
|
# Module-level lxml HTML parser configured with encoding='utf-8'; build_doc
# hands it to lxml.html.document_fromstring together with UTF-8 encoded bytes.
utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def build_doc(page):
|
|
|
|
|
if isinstance(page, str_):
|
|
|
|
|
encoding = None
|
|
|
|
@ -21,9 +21,11 @@ def build_doc(page):
|
|
|
|
|
doc = lxml.html.document_fromstring(decoded_page.encode('utf-8', 'replace'), parser=utf8_parser)
|
|
|
|
|
return doc, encoding
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def js_re(src, pattern, flags, repl):
    """Apply a JavaScript-style regex replacement to *src*.

    Args:
        src: the text to search in.
        pattern: regular expression source, compiled with ``re.compile``.
        flags: ``re`` flag bits passed to ``re.compile`` (0 for none).
        repl: replacement template using ``$`` for group references
            (e.g. ``$1``); every ``$`` is rewritten to a backslash so the
            template becomes a Python one (``\\1``).

    Returns:
        *src* with every match of *pattern* replaced by the converted template.
    """
    # Pattern.sub takes (repl, string); the previous code passed them in the
    # opposite order, so the template was searched and src was inserted.
    python_repl = repl.replace('$', '\\')
    return re.compile(pattern, flags).sub(python_repl, src)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def normalize_entities(cur_title):
|
|
|
|
|
entities = {
|
|
|
|
|
u'\u2014':'-',
|
|
|
|
@ -41,9 +43,11 @@ def normalize_entities(cur_title):
|
|
|
|
|
|
|
|
|
|
return cur_title
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def norm_title(title):
    """Return *title* passed through normalize_spaces, then normalize_entities."""
    spaced = normalize_spaces(title)
    return normalize_entities(spaced)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_title(doc):
|
|
|
|
|
title = doc.find('.//title')
|
|
|
|
|
if title is None or title.text is None or len(title.text) == 0:
|
|
|
|
@ -51,16 +55,19 @@ def get_title(doc):
|
|
|
|
|
|
|
|
|
|
return norm_title(title.text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def add_match(collection, text, orig):
    """Add the normalized form of *text* to *collection* if it qualifies.

    The text is first run through norm_title. It is added only when it has at
    least two whitespace-separated words, is at least 15 characters long, and
    occurs inside *orig* once double quotes are stripped from both strings.
    Nothing is returned; *collection* is mutated in place via ``add``.
    """
    candidate = norm_title(text)

    # Too short to be a plausible title: fewer than two words or 15 chars.
    if len(candidate.split()) < 2 or len(candidate) < 15:
        return

    # Compare with double quotes removed so quoting differences don't matter.
    unquoted_candidate = candidate.replace('"', '')
    unquoted_orig = orig.replace('"', '')
    if unquoted_candidate in unquoted_orig:
        collection.add(candidate)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Id ('#...') and class ('....') selectors, listed in the original order.
# NOTE(review): presumably probed when looking for an in-page title element;
# confirm against the code that consumes this list.
TITLE_CSS_HEURISTICS = [
    '#title',
    '#head',
    '#heading',
    '.pageTitle',
    '.news_title',
    '.title',
    '.head',
    '.heading',
    '.contentheading',
    '.small_header_red',
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def shorten_title(doc):
|
|
|
|
|
title = doc.find('.//title')
|
|
|
|
|
if title is None or title.text is None or len(title.text) == 0:
|
|
|
|
@ -109,6 +116,8 @@ def shorten_title(doc):
|
|
|
|
|
|
|
|
|
|
return title
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Is this necessary? The lxml Cleaner is already initialized correctly in cleaners.py.
|
|
|
|
|
def get_body(doc):
|
|
|
|
|
for elem in doc.xpath('.//script | .//link | .//style'):
|
|
|
|
|
elem.drop_tree()
|
|
|
|
|