Improved title shortener method, and added it to the Document class.

13 years ago · 96f476181c
parent f925e3ef05
commit 96f476181c
2 changed files with 135 additions and 106 deletions
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -1,96 +1,112 @@
-from cleaners import normalize_spaces, clean_attributes
-from encodings import get_encoding
-from lxml.html import tostring
-import logging
-import lxml.html
-import re
-
-logging.getLogger().setLevel(logging.DEBUG)
-
-utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
-
-def build_doc(page):
-    enc = get_encoding(page)
-    page_enc = page.decode(enc, 'replace').encode('utf-8')
-    doc = lxml.html.document_fromstring(page_enc, parser=utf8_parser)
-    return doc
-
-def js_re(src, pattern, flags, repl):
-    return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
-
-
-def normalize_entities(cur_title):
-    entities = {
-        u'\u2014':'-',
-        u'\u2013':'-',
-        u'&mdash;': '-',
-        u'&ndash;': '-',
-        u'\u00A0': ' ',
-        u'\u00AB': '"',
-        u'\u00BB': '"',
-        u'&quot;': '"',
-    }
-    for c, r in entities.iteritems():
-        if c in cur_title:
-            cur_title = cur_title.replace(c, r)
-
-    return cur_title
-
-def norm_title(title):
-    return normalize_entities(normalize_spaces(title))
-
-def get_title(doc):
-    title = doc.find('.//title').text
-    if not title:
-        return '[no-title]'
-    
-    return norm_title(title)
-
-def shortify_title(doc):
-    title = doc.find('.//title').text
-    if not title:
-        return '[no-title]'
-    
-    title = orig = norm_title(title)
-    
-    for delimiter in [' | ', ' - ', ' :: ', ' / ']:
-        if delimiter in title:
-            parts = orig.split(delimiter)
-            if len(parts[0].split()) >= 4:
-                title = parts[0]
-                break
-            elif len(parts[-1].split()) >= 4:
-                title = parts[-1]
-                break
-    else:
-        if ': ' in title:
-            parts = orig.split(': ')
-            if len(parts[-1].split()) >= 4:
-                title = parts[-1]
-            else:
-                title = orig.split(': ', 1)[1]
-
-    if len(title.split()) <= 4:
-        h1 = list(doc.iterfind('.//h1'))
-        if len(h1) == 1:
-            title = norm_title(h1[0].text)
-        elif len(h1) == 0:
-            h2 = list(doc.iterfind('.//h2'))
-            if len(h1) == 1:
-                title = norm_title(h2[1].text)
-
-    if not 15 < len(title) < 150:
-        return orig
-
-    return title
-
-def get_body(doc):
-    [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
-    raw_html = unicode(tostring(doc.body or doc))
-    cleaned = clean_attributes(raw_html)
-    try:
-        #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
-        return cleaned
-    except Exception: #FIXME find the equivalent lxml error
-        logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
-        return raw_html
+from cleaners import normalize_spaces, clean_attributes
+from encodings import get_encoding
+from lxml.html import tostring
+import logging
+import lxml.html
+import re
+
+logging.getLogger().setLevel(logging.DEBUG)
+
+utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
+
+def build_doc(page):
+    enc = get_encoding(page)
+    page_enc = page.decode(enc, 'replace').encode('utf-8')
+    doc = lxml.html.document_fromstring(page_enc, parser=utf8_parser)
+    return doc
+
+def js_re(src, pattern, flags, repl):
+    return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
+
+
+def normalize_entities(cur_title):
+    entities = {
+        u'\u2014':'-',
+        u'\u2013':'-',
+        u'&mdash;': '-',
+        u'&ndash;': '-',
+        u'\u00A0': ' ',
+        u'\u00AB': '"',
+        u'\u00BB': '"',
+        u'&quot;': '"',
+    }
+    for c, r in entities.iteritems():
+        if c in cur_title:
+            cur_title = cur_title.replace(c, r)
+
+    return cur_title
+
+def norm_title(title):
+    return normalize_entities(normalize_spaces(title))
+
+def get_title(doc):
+    title = doc.find('.//title').text
+    if not title:
+        return '[no-title]'
+    
+    return norm_title(title)
+
+def add_match(collection, text, orig):
+    text = norm_title(text)
+    if len(text.split()) >= 2 and len(text) >= 15:
+        if text.replace('"', '') in orig.replace('"', ''):
+            collection.add(text)
+
+def shorten_title(doc):
+    title = doc.find('.//title').text
+    if not title:
+        return ''
+    
+    title = orig = norm_title(title)
+
+    candidates = set()
+
+    for item in ['.//h1', './/h2', './/h3']:
+        for e in list(doc.iterfind(item)):
+            if e.text:
+                add_match(candidates, e.text, orig)
+            if e.text_content():
+                add_match(candidates, e.text_content(), orig)
+
+    for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
+        for e in doc.cssselect(item):
+            if e.text:
+                add_match(candidates, e.text, orig)
+            if e.text_content():
+                add_match(candidates, e.text_content(), orig)
+                
+    if candidates:
+        title = sorted(candidates, key=len)[-1]
+    else:
+        for delimiter in [' | ', ' - ', ' :: ', ' / ']:
+            if delimiter in title:
+                parts = orig.split(delimiter)
+                if len(parts[0].split()) >= 4:
+                    title = parts[0]
+                    break
+                elif len(parts[-1].split()) >= 4:
+                    title = parts[-1]
+                    break
+        else:
+            if ': ' in title:
+                parts = orig.split(': ')
+                if len(parts[-1].split()) >= 4:
+                    title = parts[-1]
+                else:
+                    title = orig.split(': ', 1)[1]
+
+    if not 15 < len(title) < 150:
+        return orig
+
+    return title
+
+def get_body(doc):
+    [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
+    raw_html = unicode(tostring(doc.body or doc))
+    cleaned = clean_attributes(raw_html)
+    try:
+        #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
+        return cleaned
+    except Exception: #FIXME find the equivalent lxml error
+        logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
+        return raw_html
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 from collections import defaultdict
 from cleaners import html_cleaner, clean_attributes
-from htmls import build_doc, get_body, get_title
+from htmls import build_doc, get_body, get_title, shorten_title
 from lxml.etree import tostring, tounicode
 import logging
 import re
@@ -15,12 +15,12 @@ REGEXES = {
 	'positiveRe': re.compile('caption|article|body|content|entry|hentry|page|pagination|post|text',re.I),
 	'negativeRe': re.compile('adwrapper|ad_wrapper|share|bookmark|nav|combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget',re.I),
 	'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)',re.I),
-	'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
-	'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
-	'trimRe': re.compile('^\s+|\s+$/'),
-	'normalizeRe': re.compile('\s{2,}/'),
-	'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
-	'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
+	#'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
+	#'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
+	#'trimRe': re.compile('^\s+|\s+$/'),
+	#'normalizeRe': re.compile('\s{2,}/'),
+	#'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
+	#'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
 }

 def describe(node):
@@ -37,6 +37,15 @@ def log_candidates(candidates, print_format=""):
 #def _text(node):
 #	return " ".join(node.findall(text=True))

+def to_int(x):
+	if not x: return None
+	x = x.strip()
+	if x.endswith('px'):
+		return int(x[:-2]) 
+	if x.endswith('em'):
+		return int(x[:-2]) * 12 
+	return int(x)
+
 class Unparseable(ValueError):
 	pass

@@ -72,6 +81,9 @@ class Document:
 	def title(self):
 		return get_title(self._html(True))

+	def short_title(self):
+		return shorten_title(self._html(True))
+
 	def summary(self):
 		try:
 			ruthless = True
@@ -263,9 +275,10 @@ class Document:

 	def sanitize(self, node, candidates):
 		for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
-			if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33: header.drop_tree()
+			if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33: 
+				header.drop_tree()

-		for elem in self.tags(node, "form", "iframe"):
+		for elem in self.tags(node, "form", "iframe", "textarea"):
 			elem.drop_tree()
 		allowed = {}
 		# Conditionally clean <table>s, <ul>s, and <div>s
@@ -338,7 +351,7 @@ class Document:
 						height = img.get('height')
 						width = img.get('width')
 						self.debug ("height %s width %s" %(repr(height), repr(width)))
-						if (height and int(height) >= 50) or (width and int(width) >= 50):
+						if to_int(height) >= 100 or to_int(width) >= 100:
 							valid_img = True
 							self.debug("valid image" + tounicode(img))
 							break