Improved title shortener method, and added it to the Document class.

pull/1/head
Yuri Baburov 13 years ago
parent f925e3ef05
commit 96f476181c

@ -1,96 +1,112 @@
from cleaners import normalize_spaces, clean_attributes from cleaners import normalize_spaces, clean_attributes
from encodings import get_encoding from encodings import get_encoding
from lxml.html import tostring from lxml.html import tostring
import logging import logging
import lxml.html import lxml.html
import re import re
logging.getLogger().setLevel(logging.DEBUG) logging.getLogger().setLevel(logging.DEBUG)
utf8_parser = lxml.html.HTMLParser(encoding='utf-8') utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
def build_doc(page):
    """Parse raw *page* bytes into an lxml document.

    The page is first transcoded to UTF-8 (undecodable bytes replaced) so a
    single UTF-8 parser can be reused for every document.
    """
    encoding = get_encoding(page)
    utf8_page = page.decode(encoding, 'replace').encode('utf-8')
    return lxml.html.document_fromstring(utf8_page, parser=utf8_parser)
def js_re(src, pattern, flags, repl):
    """Apply a JavaScript-style regex replacement to *src*.

    ``repl`` may use JS ``$1``-style group references; they are translated to
    Python ``\\1`` backreferences before substituting.

    Bug fix: the original called ``.sub(src, repl…)``, i.e. it substituted
    into the *replacement* string using ``src`` as the replacement —
    ``re.sub`` takes ``(repl, string)``, so the arguments were swapped.
    """
    return re.compile(pattern, flags).sub(repl.replace('$', '\\'), src)
def normalize_entities(cur_title):
    """Replace typographic punctuation in *cur_title* with ASCII equivalents.

    Em/en dashes become ``-``, non-breaking spaces become plain spaces, and
    guillemets become double quotes.
    """
    # NOTE(review): the literal-character keys below may originally have been
    # HTML-entity strings (e.g. '&mdash;', '&quot;') that the diff viewer
    # decoded — confirm against the repository before relying on them.
    entities = {
        u'\u2014': '-',
        u'\u2013': '-',
        u'—': '-',
        u'–': '-',
        u'\u00A0': ' ',
        u'\u00AB': '"',
        u'\u00BB': '"',
        u'"': '"',
    }
    # .items() instead of the Python-2-only .iteritems(): works on both 2 and 3.
    for char, replacement in entities.items():
        if char in cur_title:
            cur_title = cur_title.replace(char, replacement)
    return cur_title
def norm_title(title):
    """Collapse whitespace runs and normalise typographic entities in *title*."""
    spaced = normalize_spaces(title)
    return normalize_entities(spaced)
def get_title(doc):
    """Return the normalised text of the document's <title>.

    Returns the placeholder ``'[no-title]'`` when the element is missing or
    empty.

    Bug fix: ``doc.find('.//title')`` returns ``None`` for documents without
    a <title> element, so dereferencing ``.text`` directly raised
    AttributeError.
    """
    title_el = doc.find('.//title')
    if title_el is None or not title_el.text:
        return '[no-title]'
    return norm_title(title_el.text)
def add_match(collection, text, orig):
    """Add *text* to *collection* if it looks like a plausible short title.

    A candidate must normalise to at least two words and 15 characters, and
    must occur inside the original title *orig* (double quotes ignored when
    comparing).
    """
    normalized = norm_title(text)
    long_enough = len(normalized) >= 15 and len(normalized.split()) >= 2
    if long_enough and normalized.replace('"', '') in orig.replace('"', ''):
        collection.add(normalized)
def shorten_title(doc):
    """Return a shortened, site-name-free version of the document title.

    Strategy: collect heading elements (h1–h3) and common title-ish
    id/class selectors that also appear inside the <title> text; pick the
    longest such candidate. Failing that, split the title on common site
    delimiters (' | ', ' - ', ' :: ', ' / ') or a ': ' prefix and keep the
    more substantial part. Falls back to the full title when the shortened
    form is implausibly short (<=15 chars) or long (>=150 chars).

    Returns '' when the document has no usable <title>.

    Bug fix: guard against ``doc.find('.//title')`` returning ``None``
    (document without a <title>), which previously raised AttributeError.
    """
    title_el = doc.find('.//title')
    if title_el is None or not title_el.text:
        return ''

    title = orig = norm_title(title_el.text)

    candidates = set()

    # Headings are the most reliable source of a concise article title.
    for path in ['.//h1', './/h2', './/h3']:
        for e in list(doc.iterfind(path)):
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

    # Common id/class names that sites use for the page heading.
    for selector in ['#title', '#head', '#heading', '.pageTitle',
                     '.news_title', '.title', '.head', '.heading',
                     '.contentheading', '.small_header_red']:
        for e in doc.cssselect(selector):
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

    if candidates:
        # Prefer the longest in-title candidate.
        title = sorted(candidates, key=len)[-1]
    else:
        for delimiter in [' | ', ' - ', ' :: ', ' / ']:
            if delimiter in title:
                parts = orig.split(delimiter)
                # Keep whichever side carries at least four words.
                if len(parts[0].split()) >= 4:
                    title = parts[0]
                    break
                elif len(parts[-1].split()) >= 4:
                    title = parts[-1]
                    break
        else:
            # No delimiter matched: try a 'Site: article title' pattern.
            if ': ' in title:
                parts = orig.split(': ')
                if len(parts[-1].split()) >= 4:
                    title = parts[-1]
                else:
                    title = orig.split(': ', 1)[1]

    if not 15 < len(title) < 150:
        return orig

    return title
def get_body(doc):
    """Drop scripts/links/styles from *doc* and return its cleaned body HTML.

    Falls back to the un-cleaned serialisation if attribute cleaning breaks
    the markup (validation itself is still a FIXME).
    """
    # Plain loop instead of a list comprehension abused for its side effects.
    for elem in doc.xpath('.//script | .//link | .//style'):
        elem.drop_tree()
    raw_html = unicode(tostring(doc.body or doc))
    cleaned = clean_attributes(raw_html)
    try:
        #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
        return cleaned
    except Exception: #FIXME find the equivalent lxml error
        logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
        return raw_html

@ -1,7 +1,7 @@
#!/usr/bin/env python #!/usr/bin/env python
from collections import defaultdict from collections import defaultdict
from cleaners import html_cleaner, clean_attributes from cleaners import html_cleaner, clean_attributes
from htmls import build_doc, get_body, get_title from htmls import build_doc, get_body, get_title, shorten_title
from lxml.etree import tostring, tounicode from lxml.etree import tostring, tounicode
import logging import logging
import re import re
@ -15,12 +15,12 @@ REGEXES = {
'positiveRe': re.compile('caption|article|body|content|entry|hentry|page|pagination|post|text',re.I), 'positiveRe': re.compile('caption|article|body|content|entry|hentry|page|pagination|post|text',re.I),
'negativeRe': re.compile('adwrapper|ad_wrapper|share|bookmark|nav|combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget',re.I), 'negativeRe': re.compile('adwrapper|ad_wrapper|share|bookmark|nav|combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget',re.I),
'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)',re.I), 'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)',re.I),
'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I), #'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I), #'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
'trimRe': re.compile('^\s+|\s+$/'), #'trimRe': re.compile('^\s+|\s+$/'),
'normalizeRe': re.compile('\s{2,}/'), #'normalizeRe': re.compile('\s{2,}/'),
'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}/'), #'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I), #'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
} }
def describe(node): def describe(node):
@ -37,6 +37,15 @@ def log_candidates(candidates, print_format=""):
#def _text(node): #def _text(node):
# return " ".join(node.findall(text=True)) # return " ".join(node.findall(text=True))
def to_int(x):
    """Parse a CSS-ish dimension string ('42', '42px', '2em') into pixels.

    Returns None for None/empty input. An em is approximated as 12 pixels.
    Raises ValueError for non-integer numerals (e.g. '12.5px'), as before.

    Bug fix: a whitespace-only string passed the ``not x`` check, then
    ``int('')`` raised ValueError after stripping; it now returns None.
    """
    if not x:
        return None
    x = x.strip()
    if not x:  # input was whitespace only
        return None
    if x.endswith('px'):
        return int(x[:-2])
    if x.endswith('em'):
        return int(x[:-2]) * 12
    return int(x)
class Unparseable(ValueError):
    """Raised when a document cannot be reduced to a readable article."""
    pass
@ -72,6 +81,9 @@ class Document:
def title(self): def title(self):
return get_title(self._html(True)) return get_title(self._html(True))
def short_title(self):
return shorten_title(self._html(True))
def summary(self): def summary(self):
try: try:
ruthless = True ruthless = True
@ -263,9 +275,10 @@ class Document:
def sanitize(self, node, candidates): def sanitize(self, node, candidates):
for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"): for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33: header.drop_tree() if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
header.drop_tree()
for elem in self.tags(node, "form", "iframe"): for elem in self.tags(node, "form", "iframe", "textarea"):
elem.drop_tree() elem.drop_tree()
allowed = {} allowed = {}
# Conditionally clean <table>s, <ul>s, and <div>s # Conditionally clean <table>s, <ul>s, and <div>s
@ -338,7 +351,7 @@ class Document:
height = img.get('height') height = img.get('height')
width = img.get('width') width = img.get('width')
self.debug ("height %s width %s" %(repr(height), repr(width))) self.debug ("height %s width %s" %(repr(height), repr(width)))
if (height and int(height) >= 50) or (width and int(width) >= 50): if to_int(height) >= 100 or to_int(width) >= 100:
valid_img = True valid_img = True
self.debug("valid image" + tounicode(img)) self.debug("valid image" + tounicode(img))
break break

Loading…
Cancel
Save