diff --git a/readability/htmls.py b/readability/htmls.py
index 55d7516..2d30abb 100644
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -1,96 +1,112 @@
-from cleaners import normalize_spaces, clean_attributes
-from encodings import get_encoding
-from lxml.html import tostring
-import logging
-import lxml.html
-import re
-
-logging.getLogger().setLevel(logging.DEBUG)
-
-utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
-
-def build_doc(page):
-    enc = get_encoding(page)
-    page_enc = page.decode(enc, 'replace').encode('utf-8')
-    doc = lxml.html.document_fromstring(page_enc, parser=utf8_parser)
-    return doc
-
-def js_re(src, pattern, flags, repl):
-    return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
-
-
-def normalize_entities(cur_title):
-    entities = {
-        u'\u2014':'-',
-        u'\u2013':'-',
-        u'&mdash;': '-',
-        u'&ndash;': '-',
-        u'\u00A0': ' ',
-        u'\u00AB': '"',
-        u'\u00BB': '"',
-        u'&quot;': '"',
-    }
-    for c, r in entities.iteritems():
-        if c in cur_title:
-            cur_title = cur_title.replace(c, r)
-
-    return cur_title
-
-def norm_title(title):
-    return normalize_entities(normalize_spaces(title))
-
-def get_title(doc):
-    title = doc.find('.//title').text
-    if not title:
-        return '[no-title]'
-
-    return norm_title(title)
-
-def shortify_title(doc):
-    title = doc.find('.//title').text
-    if not title:
-        return '[no-title]'
-
-    title = orig = norm_title(title)
-
-    for delimiter in [' | ', ' - ', ' :: ', ' / ']:
-        if delimiter in title:
-            parts = orig.split(delimiter)
-            if len(parts[0].split()) >= 4:
-                title = parts[0]
-                break
-            elif len(parts[-1].split()) >= 4:
-                title = parts[-1]
-                break
-    else:
-        if ': ' in title:
-            parts = orig.split(': ')
-            if len(parts[-1].split()) >= 4:
-                title = parts[-1]
-            else:
-                title = orig.split(': ', 1)[1]
-
-    if len(title.split()) <= 4:
-        h1 = list(doc.iterfind('.//h1'))
-        if len(h1) == 1:
-            title = norm_title(h1[0].text)
-        elif len(h1) == 0:
-            h2 = list(doc.iterfind('.//h2'))
-            if len(h1) == 1:
-                title = norm_title(h2[1].text)
-
-    if not 15 < len(title) < 150:
-        return orig
-
-    return title
-
-def get_body(doc):
-    [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
-    raw_html = unicode(tostring(doc.body or doc))
-    cleaned = clean_attributes(raw_html)
-    try:
-        #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
-        return cleaned
-    except Exception: #FIXME find the equivalent lxml error
-        logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
-        return raw_html
+from cleaners import normalize_spaces, clean_attributes
+from encodings import get_encoding
+from lxml.html import tostring
+import logging
+import lxml.html
+import re
+
+logging.getLogger().setLevel(logging.DEBUG)
+
+utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
+
+def build_doc(page):
+    enc = get_encoding(page)
+    page_enc = page.decode(enc, 'replace').encode('utf-8')
+    doc = lxml.html.document_fromstring(page_enc, parser=utf8_parser)
+    return doc
+
+def js_re(src, pattern, flags, repl):
+    return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
+
+
+def normalize_entities(cur_title):
+    entities = {
+        u'\u2014':'-',
+        u'\u2013':'-',
+        u'&mdash;': '-',
+        u'&ndash;': '-',
+        u'\u00A0': ' ',
+        u'\u00AB': '"',
+        u'\u00BB': '"',
+        u'&quot;': '"',
+    }
+    for c, r in entities.iteritems():
+        if c in cur_title:
+            cur_title = cur_title.replace(c, r)
+
+    return cur_title
+
+def norm_title(title):
+    return normalize_entities(normalize_spaces(title))
+
+def get_title(doc):
+    title = doc.find('.//title').text
+    if not title:
+        return '[no-title]'
+
+    return norm_title(title)
+
+def add_match(collection, text, orig):
+    text = norm_title(text)
+    if len(text.split()) >= 2 and len(text) >= 15:
+        if text.replace('"', '') in orig.replace('"', ''):
+            collection.add(text)
+
+def shorten_title(doc):
+    title = doc.find('.//title').text
+    if not title:
+        return ''
+
+    title = orig = norm_title(title)
+
+    candidates = set()
+
+    for item in ['.//h1', './/h2', './/h3']:
+        for e in list(doc.iterfind(item)):
+            if e.text:
+                add_match(candidates, e.text, orig)
+            if e.text_content():
+                add_match(candidates, e.text_content(), orig)
+
+    for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
+        for e in doc.cssselect(item):
+            if e.text:
+                add_match(candidates, e.text, orig)
+            if e.text_content():
+                add_match(candidates, e.text_content(), orig)
+
+    if candidates:
+        title = sorted(candidates, key=len)[-1]
+    else:
+        for delimiter in [' | ', ' - ', ' :: ', ' / ']:
+            if delimiter in title:
+                parts = orig.split(delimiter)
+                if len(parts[0].split()) >= 4:
+                    title = parts[0]
+                    break
+                elif len(parts[-1].split()) >= 4:
+                    title = parts[-1]
+                    break
+        else:
+            if ': ' in title:
+                parts = orig.split(': ')
+                if len(parts[-1].split()) >= 4:
+                    title = parts[-1]
+                else:
+                    title = orig.split(': ', 1)[1]
+
+    if not 15 < len(title) < 150:
+        return orig
+
+    return title
+
+def get_body(doc):
+    [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
+    raw_html = unicode(tostring(doc.body or doc))
+    cleaned = clean_attributes(raw_html)
+    try:
+        #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
+        return cleaned
+    except Exception: #FIXME find the equivalent lxml error
+        logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
+        return raw_html
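Reviewer note: the new `add_match`/`shorten_title` pair collects heading and CSS-selector matches that also occur inside the `<title>`, then keeps the longest one. Below is a minimal standalone sketch of that heuristic for anyone trying the patch outside the package; the `pick_short_title` name and the sample HTML are hypothetical, not part of this change:

```python
import lxml.html

def pick_short_title(html):
    # Collect <h1> texts that also occur inside the <title>, then keep
    # the longest candidate -- the same idea as add_match() plus the
    # sorted(candidates, key=len)[-1] step in shorten_title() above.
    doc = lxml.html.document_fromstring(html)
    orig = doc.find('.//title').text or ''
    candidates = set()
    for e in doc.iterfind('.//h1'):
        text = e.text_content().strip()
        # add_match's filters: at least two words, at least 15 characters,
        # and the text must be a substring of the full title
        if len(text.split()) >= 2 and len(text) >= 15 and text in orig:
            candidates.add(text)
    return sorted(candidates, key=len)[-1] if candidates else orig

html = ('<html><head><title>A Fairly Long Article Headline | Example Site'
        '</title></head><body><h1>A Fairly Long Article Headline</h1>'
        '</body></html>')
print(pick_short_title(html))  # -> A Fairly Long Article Headline
```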
diff --git a/readability/readability.py b/readability/readability.py
index 11f8da0..0802c6b 100644
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 from collections import defaultdict
 from cleaners import html_cleaner, clean_attributes
-from htmls import build_doc, get_body, get_title
+from htmls import build_doc, get_body, get_title, shorten_title
 from lxml.etree import tostring, tounicode
 import logging
 import re
@@ -15,12 +15,12 @@ REGEXES = {
     'positiveRe': re.compile('caption|article|body|content|entry|hentry|page|pagination|post|text',re.I),
     'negativeRe': re.compile('adwrapper|ad_wrapper|share|bookmark|nav|combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget',re.I),
     'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)',re.I),
-    'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
-    'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
-    'trimRe': re.compile('^\s+|\s+$/'),
-    'normalizeRe': re.compile('\s{2,}/'),
-    'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
-    'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
+    #'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
+    #'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
+    #'trimRe': re.compile('^\s+|\s+$/'),
+    #'normalizeRe': re.compile('\s{2,}/'),
+    #'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
+    #'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
 }
 
 def describe(node):
@@ -37,6 +37,15 @@ def log_candidates(candidates, print_format=""):
 #def _text(node):
 #    return " ".join(node.findall(text=True))
 
+def to_int(x):
+    if not x: return None
+    x = x.strip()
+    if x.endswith('px'):
+        return int(x[:-2])
+    if x.endswith('em'):
+        return int(x[:-2]) * 12
+    return int(x)
+
 class Unparseable(ValueError):
     pass
 
@@ -72,6 +81,9 @@ class Document:
     def title(self):
         return get_title(self._html(True))
 
+    def short_title(self):
+        return shorten_title(self._html(True))
+
     def summary(self):
         try:
             ruthless = True
@@ -263,9 +275,10 @@ class Document:
 
     def sanitize(self, node, candidates):
         for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
-            if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33: header.drop_tree()
+            if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
+                header.drop_tree()
 
-        for elem in self.tags(node, "form", "iframe"):
+        for elem in self.tags(node, "form", "iframe", "textarea"):
             elem.drop_tree()
         allowed = {}
         # Conditionally clean <table>s, <ul>s, and <div>s
@@ -338,7 +351,7 @@ class Document:
                     height = img.get('height')
                     width = img.get('width')
                     self.debug ("height %s width %s" %(repr(height), repr(width)))
-                    if (height and int(height) >= 50) or (width and int(width) >= 50):
+                    if to_int(height) >= 100 or to_int(width) >= 100:
                         valid_img = True
                         self.debug("valid image" + tounicode(img))
                         break
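Reviewer note: `to_int` is what lets the image-size gate accept attribute values like `height="80px"` or `height="10em"`; the `em` branch bakes in the patch's own 1em ≈ 12px assumption. A quick check of its behaviour (the helper is copied verbatim from the hunk above so the snippet runs standalone):

```python
def to_int(x):
    # Copied from the patch: tolerate missing values and px/em suffixes.
    if not x: return None
    x = x.strip()
    if x.endswith('px'):
        return int(x[:-2])
    if x.endswith('em'):
        return int(x[:-2]) * 12  # the patch assumes 1em ~ 12px
    return int(x)

for raw in (None, '', '120', ' 80px ', '10em'):
    print("%r -> %r" % (raw, to_int(raw)))
# None -> None, '' -> None, '120' -> 120, ' 80px ' -> 80, '10em' -> 120
```

Note that `if to_int(height) >= 100` leans on Python 2 comparison rules, where `None >= 100` is simply False; on Python 3 the same expression would raise a TypeError whenever the attribute is missing.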
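And the new public entry point, as this patch wires it up (`html_string` here is a placeholder for whatever page you feed in):

```python
from readability.readability import Document

doc = Document(html_string)
print(doc.title())        # full, normalized <title> text
print(doc.short_title())  # shortened via the new heading/selector heuristic
```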