Investigate why we end up with an empty <body/> tag

- The empty <body/> tag seems to appear because the sanitizer ends up with
two nodes instead of one: the first is an empty body, the second is the
article div.
- Fix up the tabs so we can work with the file. It still needs lots of PEP 8 love.
- Implement an initial hack that at least gets it working for now.
- Start adding test cases, sample HTML files we can test against, etc.
Richard Harding 12 years ago
parent ab783b25b7
commit edccec5d3b

@ -11,502 +11,508 @@ import sys
'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter',re.I),
'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow',re.I),
'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story',re.I),
'negativeRe': re.compile('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget',re.I),
'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)',re.I),
#'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
#'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
#'trimRe': re.compile('^\s+|\s+$/'),
#'normalizeRe': re.compile('\s{2,}/'),
#'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
#'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
#skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
class Unparseable(ValueError):
class Document:
def __init__(self, input, **options):
    """Store the raw input and the keyword options for later parsing.

    :param input: the raw document passed to ``_parse`` when it is needed.
    :param options: arbitrary keyword settings; they are copied into a
        ``defaultdict`` so that looking up an unset option yields ``None``
        instead of raising ``KeyError``.
    """
    self.input = input
    # Nothing is parsed yet; ``html`` is filled in lazily.
    self.html = None
    settings = defaultdict(lambda: None)
    settings.update(options)
    self.options = settings
def _html(self, force=False):
    """Return the parsed document, parsing the stored input when needed.

    :param force: when true, re-run ``_parse`` even if a parsed document
        is already cached on ``self.html``.
    :returns: the value cached on ``self.html``.
    """
    if not force and self.html is not None:
        # A cached parse exists and the caller did not ask for a fresh one.
        return self.html
    self.html = self._parse(self.input)
    return self.html
def _parse(self, input):
    """Build a document from ``input``, clean it, and absolutize its links.

    The input goes through ``build_doc`` and then
    ``html_cleaner.clean_html``. If the ``url`` option is set, links in the
    cleaned document are made absolute against it.

    :param input: raw document to parse.
    :returns: the cleaned document.
    """
    # NOTE(review): assumes build_doc/clean_html yield an object exposing
    # make_links_absolute -- confirm against their definitions.
    tree = html_cleaner.clean_html(build_doc(input))
    url = self.options['url']
    if url:
        tree.make_links_absolute(url, resolve_base_href=True)
    return tree
def content(self):
    """Return ``get_body`` applied to a freshly parsed document.

    ``_html(True)`` forces a new parse of the stored input on every call.
    """
    tree = self._html(True)
    return get_body(tree)
def title(self):
    """Return ``get_title`` applied to a freshly parsed document.

    ``_html(True)`` forces a new parse of the stored input on every call.
    """
    tree = self._html(True)
    return get_title(tree)
def short_title(self):
    """Return ``shorten_title`` applied to a freshly parsed document.

    ``_html(True)`` forces a new parse of the stored input on every call.
    """
    tree = self._html(True)
    return shorten_title(tree)
def summary(self):
ruthless = True
while True:
for i in self.tags(self.html, 'script', 'style'):
for i in self.tags(self.html, 'body'):
i.set('id', 'readabilityBody')
if ruthless:
candidates = self.score_paragraphs()
best_candidate = self.select_best_candidate(candidates)
if best_candidate:
article = self.get_article(candidates, best_candidate)
if ruthless:
logging.debug("ruthless removal did not work. ")
ruthless = False
self.debug("ended up stripping too much - going for a safer _parse")
# try again
logging.debug("Ruthless and lenient parsing did not work. Returning raw html")
article = self.html.find('body')
if article is None:
article = self.html
cleaned_article = self.sanitize(article, candidates)
of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH)
if ruthless and not of_acceptable_length:
ruthless = False
continue # try again
return cleaned_article
except StandardError, e:
#logging.exception('error getting summary: ' + str(traceback.format_exception(*sys.exc_info())))
logging.exception('error getting summary: ' )
raise Unparseable(str(e)), None, sys.exc_info()[2]
def get_article(self, candidates, best_candidate):
# Now that we have the top candidate, look through its siblings for content that might also be related.
# Things like preambles, content split by ads that we removed, etc.
sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
output = document_fromstring('<div/>')
best_elem = best_candidate['elem']
for sibling in best_elem.getparent().getchildren():
#if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text
append = False
if sibling is best_elem:
append = True
sibling_key = sibling #HashableElement(sibling)
if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
append = True
if sibling.tag == "p":
link_density = self.get_link_density(sibling)
node_content = sibling.text or ""
node_length = len(node_content)
if node_length > 80 and link_density < 0.25:
append = True
elif node_length <= 80 and link_density == 0 and'\.( |$)', node_content):
append = True
if append:
#if output is not None:
# output.append(best_elem)
return output
def select_best_candidate(self, candidates):
    """Pick the candidate entry with the highest ``content_score``.

    :param candidates: mapping whose values are dicts holding at least
        ``'content_score'`` and ``'elem'``.
    :returns: the top-scoring value, or ``None`` when there are none.
    """
    ranked = sorted(candidates.values(), key=lambda entry: entry['content_score'], reverse=True)
    # Log the five strongest candidates to help debug scoring.
    for entry in ranked[:5]:
        self.debug("Top 5 : %6.3f %s" % (entry['content_score'], describe(entry['elem'])))
    return ranked[0] if ranked else None
def get_link_density(self, elem):
    """Return the ratio of link text length to total text length in ``elem``.

    Link text is the summed ``text_length`` of every ``<a>`` descendant.
    The denominator is clamped to at least 1 to avoid dividing by zero.
    """
    anchors = elem.findall(".//a")
    linked_chars = sum(text_length(anchor) for anchor in anchors)
    all_chars = text_length(elem)
    return float(linked_chars) / max(all_chars, 1)
def score_paragraphs(self, ):
MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
candidates = {}
#self.debug(str([describe(node) for node in self.tags(self.html, "div")]))
ordered = []
for elem in self.tags(self._html(), "p", "pre", "td"):
parent_node = elem.getparent()
if parent_node is None:
grand_parent_node = parent_node.getparent()
inner_text = clean(elem.text_content() or "")
inner_text_len = len(inner_text)
# If this paragraph is less than 25 characters, don't even count it.
if inner_text_len < MIN_LEN:
if parent_node not in candidates:
candidates[parent_node] = self.score_node(parent_node)
if grand_parent_node is not None and grand_parent_node not in candidates:
candidates[grand_parent_node] = self.score_node(grand_parent_node)
content_score = 1
content_score += len(inner_text.split(','))
content_score += min((inner_text_len / 100), 3)
#if elem not in candidates:
# candidates[elem] = self.score_node(elem)
#WTF? candidates[elem]['content_score'] += content_score
candidates[parent_node]['content_score'] += content_score
if grand_parent_node is not None:
candidates[grand_parent_node]['content_score'] += content_score / 2.0
# Scale the final candidates score based on link density. Good content should have a
# relatively small link density (5% or less) and be mostly unaffected by this operation.
for elem in ordered:
candidate = candidates[elem]
ld = self.get_link_density(elem)
score = candidate['content_score']
self.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (score, describe(elem), ld, score*(1-ld)))
candidate['content_score'] *= (1 - ld)
return candidates
def class_weight(self, e):
    """Score an element from its ``class`` and ``id`` attribute values.

    For each of the two attributes that is present and non-empty, a match
    against ``REGEXES['negativeRe']`` subtracts 25 and a match against
    ``REGEXES['positiveRe']`` adds 25. Both can apply to the same value.

    :param e: object exposing ``get(name, default)`` for attributes.
    :returns: the accumulated integer weight (0 when neither is set).
    """
    weight = 0
    for attr in ('class', 'id'):
        value = e.get(attr, None)
        if not value:
            continue
        if REGEXES['negativeRe'].search(value):
            weight -= 25
        if REGEXES['positiveRe'].search(value):
            weight += 25
    return weight
def score_node(self, elem):
content_score = self.class_weight(elem)
name = elem.tag.lower()
if name == "div":
content_score += 5
elif name in ["pre", "td", "blockquote"]:
content_score += 3
elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form"]:
content_score -= 3
elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th"]:
content_score -= 5
return {
'content_score': content_score,
'elem': elem
def debug(self, *a):
#if self.options['debug']:
def remove_unlikely_candidates(self):
for elem in self.html.iter():
s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
if len(s) < 2:
if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag != 'body':
self.debug("Removing unlikely candidate - %s" % describe(elem))
def transform_misused_divs_into_paragraphs(self):
for elem in self.tags(self.html, 'div'):
# transform <div>s that do not contain other block elements into <p>s
#FIXME: The current implementation ignores all descendants that are not direct children of elem
# This results in incorrect results in case there is an <img> buried within an <a> for example
if not REGEXES['divToPElementsRe'].search(unicode(''.join(map(tostring, list(elem))))):
#self.debug("Altering %s to p" % (describe(elem)))
elem.tag = "p"
#print "Fixed element "+describe(elem)
for elem in self.tags(self.html, 'div'):
if elem.text and elem.text.strip():
p = fragment_fromstring('<p/>')
p.text = elem.text
elem.text = None
elem.insert(0, p)
#print "Appended "+tounicode(p)+" to "+describe(elem)
for pos, child in reversed(list(enumerate(elem))):
if child.tail and child.tail.strip():
p = fragment_fromstring('<p/>')
p.text = child.tail
child.tail = None
elem.insert(pos + 1, p)
#print "Inserted "+tounicode(p)+" to "+describe(elem)
if child.tag == 'br':
#print 'Dropped <br> at '+describe(elem)
def tags(self, node, *tag_names):
    """Lazily yield descendants of ``node`` matching each given tag name.

    Results are grouped by tag name in the order the names were passed;
    within one name they follow the order returned by ``node.findall``.
    """
    return (
        match
        for name in tag_names
        for match in node.findall('.//%s' % name)
    )
def reverse_tags(self, node, *tag_names):
    """Lazily yield matching descendants of ``node``, last match first.

    Tag names are still processed in the order given; only the matches
    for each individual name are reversed.
    """
    return (
        match
        for name in tag_names
        for match in reversed(node.findall('.//%s' % name))
    )
def sanitize(self, node, candidates):
MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
for elem in self.tags(node, "form", "iframe", "textarea"):
allowed = {}
# Conditionally clean <table>s, <ul>s, and <div>s
for el in self.reverse_tags(node, "table", "ul", "div"):
if el in allowed:
weight = self.class_weight(el)
if el in candidates:
content_score = candidates[el]['content_score']
#print '!',el, '-> %6.3f' % content_score
content_score = 0
tag = el.tag
if weight + content_score < 0:
self.debug("Cleaned %s with score %6.3f and weight %-3s" %
(describe(el), content_score, weight, ))
elif el.text_content().count(",") < 10:
counts = {}
for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
counts[kind] = len(el.findall('.//%s' %kind))
counts["li"] -= 100
content_length = text_length(el) # Count the text length excluding any surrounding whitespace
link_density = self.get_link_density(el)
parent_node = el.getparent()
if parent_node is not None:
if parent_node in candidates:
content_score = candidates[parent_node]['content_score']
content_score = 0
#if parent_node is not None:
#pweight = self.class_weight(parent_node) + content_score
#pname = describe(parent_node)
#pweight = 0
#pname = "no parent"
to_remove = False
reason = ""
#if el.tag == 'div' and counts["img"] >= 1:
# continue
if counts["p"] and counts["img"] > counts["p"]:
reason = "too many images (%s)" % counts["img"]
to_remove = True
elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
reason = "more <li>s than <p>s"
to_remove = True
elif counts["input"] > (counts["p"] / 3):
reason = "less than 3x <p>s than <input>s"
to_remove = True
elif content_length < (MIN_LEN) and (counts["img"] == 0 or counts["img"] > 2):
reason = "too short content length %s without a single image" % content_length
to_remove = True
elif weight < 25 and link_density > 0.2:
reason = "too many links %.3f for its weight %s" % (link_density, weight)
to_remove = True
elif weight >= 25 and link_density > 0.5:
reason = "too many links %.3f for its weight %s" % (link_density, weight)
to_remove = True
elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
reason = "<embed>s with too short content length, or too many <embed>s"
to_remove = True
# if el.tag == 'div' and counts['img'] >= 1 and to_remove:
# imgs = el.findall('.//img')
# valid_img = False
# self.debug(tounicode(el))
# for img in imgs:
#self.debug("pname %s pweight %.3f" %(pname, pweight))
for el in ([node] + [n for n in node.iter()]):
if not (self.options['attributes']):
#el.attrib = {} #FIXME:Checkout the effects of disabling this
return clean_attributes(tounicode(node))
class HashableElement(object):
    """Wrap a tree node so it hashes and compares by its ancestry path.

    The path is a tuple with one ``(tag, attributes, text)`` entry per
    node, starting at the wrapped node and following ``getparent()`` up
    to the root. Attributes not found on the wrapper are forwarded to the
    wrapped node.
    """

    def __init__(self, node):
        self.node = node
        # Computed lazily by _get_path on first access.
        self._path = None

    def _get_path(self):
        """Build (once) and return the node-to-root identity tuple."""
        if self._path is None:
            reverse_path = []
            node = self.node
            while node is not None:
                node_id = (node.tag, tuple(node.attrib.items()), node.text)
                # Record every level. Without this the path is always
                # empty and all wrappers hash and compare as equal.
                reverse_path.append(node_id)
                node = node.getparent()
            self._path = tuple(reverse_path)
        return self._path
    path = property(_get_path)

    def __hash__(self):
        return hash(self.path)

    def __eq__(self, other):
        # Let Python fall back to its default handling for foreign types
        # instead of raising AttributeError on ``other.path``.
        if not isinstance(other, HashableElement):
            return NotImplemented
        return self.path == other.path

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so define it explicitly.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __getattr__(self, tag):
        # Only called when normal lookup fails. If ``node`` itself is
        # missing (instance created without __init__), forwarding would
        # recurse forever, so fail cleanly instead.
        if tag == 'node':
            raise AttributeError(tag)
        return getattr(self.node, tag)
def main():
from optparse import OptionParser
parser = OptionParser(usage="%prog: [options] [file]")
parser.add_option('-v', '--verbose', action='store_true')
parser.add_option('-u', '--url', help="use URL instead of a local file")
(options, args) = parser.parse_args()
if not (len(args) == 1 or options.url):
file = None
if options.url:
import urllib
file = urllib.urlopen(options.url)
file = open(args[0], 'rt')
enc = sys.__stdout__.encoding or 'utf-8'
print Document(, debug=options.verbose).summary().encode(enc, 'replace')
if __name__ == '__main__':

@ -0,0 +1,762 @@
<title>Detroit Tigers vs. Kansas City Royals - Preview - April 16, 2012</title>
<!-- end personalize -->
<!-- start searchbar -->
<div class="cnnBanner">
<div><a href="/"><img src="" alt=" Home" title=" Home"/></a>
<div class="cnnBannerSection">
<div class="cnnLeft"><a href="/baseball/mlb/"><img src=""/></a></div>
<div class="cnn_header"><a href="/baseball/mlb/">MLB GAMEFLASH</a></div>
<div class="cnn_more" style="font-size:9px;"><a href="/baseball/mlb/scoreboards/today/">Scores</a> | <a href="/baseball/mlb/teams/">Teams</a> | <a href="/baseball/mlb/players/">Players</a> | <a href="/fantasy/player_news/mlb/">Player News</a> | <a href="/baseball/mlb/standings/">Standings</a> | <a href="/baseball/mlb/probables/today/">Probables</a> | <a href="/baseball/mlb/schedules/weekly/today/">Schedules</a> | <a href="/baseball/mlb/stats/">Stats</a> | <a href="/baseball/mlb/transactions/">Transactions</a> | <a href="/baseball/mlb/injuries/">Injuries</a> | <a href=" " target="_blank" rel="nofollow">Tickets</a> | <a href="" target="_blank" rel="nofollow">MLB.TV</a>
<div class="cnnClear"></div>
<!-- end banner -->
<!-- start scoreboard -->
<div class="cnngScoreboardNoLastPlay">
<div class="cnngScoreboard">
<div class="cnnLeft">
<table border="0" cellpadding="0" cellspacing="0">
<tr class="cnnRow0">
<td class="cnnCol0">&nbsp;</td>
<td class="cnnCol1">1</td>
<td class="cnnCol2">2</td>
<td class="cnnCol3">3</td>
<td class="cnnCol4">4</td>
<td class="cnnCol5">5</td>
<td class="cnnCol6">6</td>
<td class="cnnCol7">7</td>
<td class="cnnCol8">8</td>
<td class="cnnCol9">9</td>
<td class="cnnColR">R</td>
<td class="cnnColH">H</td>
<td class="cnnColE">E</td>
<tr class="cnnRow1">
<td class="cnnCol0"><a href="/baseball/mlb/teams/tigers/">TIGERS</a></td>
<td class="cnnCol1">&nbsp;</td>
<td class="cnnCol2">&nbsp;</td>
<td class="cnnCol3">&nbsp;</td>
<td class="cnnCol4">&nbsp;</td>
<td class="cnnCol5">&nbsp;</td>
<td class="cnnCol6">&nbsp;</td>
<td class="cnnCol7">&nbsp;</td>
<td class="cnnCol8">&nbsp;</td>
<td class="cnnCol9">&nbsp;</td>
<td class="cnnColR">&nbsp;</td>
<td class="cnnColH">&nbsp;</td>
<td class="cnnColE">&nbsp;</td>
<tr class="cnnRow2">
<td class="cnnCol0"><a href="/baseball/mlb/teams/royals/">ROYALS</a></td>
<td class="cnnCol1">&nbsp;</td>
<td class="cnnCol2">&nbsp;</td>
<td class="cnnCol3">&nbsp;</td>
<td class="cnnCol4">&nbsp;</td>
<td class="cnnCol5">&nbsp;</td>
<td class="cnnCol6">&nbsp;</td>
<td class="cnnCol7">&nbsp;</td>
<td class="cnnCol8">&nbsp;</td>
<td class="cnnCol9">&nbsp;</td>
<td class="cnnColR">&nbsp;</td>
<td class="cnnColH">&nbsp;</td>
<td class="cnnColE">&nbsp;</td>
<div class="cnnRight">
<li class="cnnItem4">8:10 PM ET
<li class="cnnItem0"><strong>Tigers</strong><a href="/baseball/mlb/players/7590/"><img src="" border="0" width="50" height="76" alt="Verlander" title="Verlander"></a><a href="/baseball/mlb/players/7590/">
<div class="cnnLine0">Verlander</div>
<div class="cnnLine4">0-1</div>
<div class="cnnLine5">2.2&nbsp;ERA</div>
<div class="cnnLine6">&nbsp;</div>
<div class="cnnLine7">&nbsp;</div></a></li>
<li class="cnnItem1"><strong>Royals</strong><a href="/baseball/mlb/players/8932/"><img src="" border="0" width="50" height="76" alt="Duffy" title="Duffy"></a><a href="/baseball/mlb/players/8932//">
<div class="cnnLine0">Duffy</div>
<div class="cnnLine4">1-0</div>
<div class="cnnLine5">0&nbsp;ERA</div>
<div class="cnnLine6">&nbsp;</div>
<div class="cnnLine7">&nbsp;</div></a></li>
<!-- end scoreboard -->
<!-- start navbar -->
<div class="cnngNavbar">
<table border="0" cellpadding="0" cellspacing="0">
<td class="cnnCol0"><span>PREVIEW</span></td>
<td class="cnnCol0"><a href="40630_matchup.html">MATCHUP</a></td></li>
<td class="cnnCol3"><a href="40630_fancomment.html">FAN COMMENTS</a></td>
<!-- end navbar -->
<!-- start content -->
<div class="cnngContent">
<div class="cnngPreview">
<div class="cnnLeft">
<!-- REAPFINDREPLACE:20120515:/.element/ssi/story/4.1/wires/ap/expired_story.html:/baseball/mlb/gameflash/2012/04/16/40630_preview.html-->
<h1>Tigers-Royals Preview</h1>
<span class="cnnDataLinked"><a href="/baseball/mlb/players/7590/index.html">Justin Verlander</a></span>
has pitched well in each of his first two starts, though he doesn't have a win to show for those efforts.
He hasn't had much trouble earning victories against the
<span class="cnnDataLinked"><a href="/baseball/mlb/teams/royals/index.html">Kansas City Royals</a></span>
Verlander looks to continue his mastery of the Royals when the
<span class="cnnDataLinked"><a href="/baseball/mlb/teams/tigers/index.html">Detroit Tigers</a></span>
visit Kauffman Stadium in the opener of a three-game series Monday night.
The reigning AL
<span class="cnnDataLinked"><a href="/baseball/mlb/players/49534/index.html">Cy Young</a></span>
winner and MVP had a 2-0 lead through eight innings in both of his outings, but the Tigers weren't able to hold the lead.
<p>Verlander (0-1, 2.20 ERA) allowed two hits before running into trouble in the ninth against Tampa Bay on Wednesday, getting
charged with four runs in 8 1-3 innings of a 4-2 defeat.
</p>"Once a couple guys got on, really the first time I've cranked it up like that - and lost a little bit of my consistency that
I'd had all day," Verlander said. "It's inexcusable. This loss rests solely on my shoulders."
<p>The right-hander did his part in his opening-day start against Boston on April 5, allowing two hits before the bullpen faltered.
Detroit ended up winning 3-2 with a run in the bottom of the ninth, though Verlander didn't earn a decision.
<p>That hasn't been the case in his last four starts against the Royals, winning each with a 1.82 ERA. Verlander is 13-2 with
a 2.40 ERA in 19 career starts versus Kansas City, and another win will give him more victories than he has against any other
team. He's also beaten Cleveland 13 times.
<p>Verlander is 8-2 with a 1.82 ERA lifetime at Kauffman Stadium, where the Royals (3-6) were swept in a three-game series against
the Indians with Sunday's 13-7 loss.
<span class="cnnDataLinked"><a href="/baseball/mlb/players/7634/index.html">Billy Butler</a></span>
, who is 14 for 39 (.359) with two homers off Verlander, had an RBI single and is hitting .364 with four doubles and a homer
during a five-game hitting streak.
Royals pitchers allowed seven home runs, 17 extra-base hits and 32 runs in the series, and manager
<span class="cnnDataLinked"><a href="/baseball/mlb/players/1716/index.html">Ned Yost</a></span>
turned to outfielder
<span class="cnnDataLinked"><a href="/baseball/mlb/players/7899/index.html">Mitch Maier</a></span>
in the ninth to pitched a scoreless inning Sunday.
</p>"Let's hope it doesn't happen again," Maier said. "I don't like to be put in that situation, but we needed an inning."
Kansas City will look to bounce back with the help of another solid outing from
<span class="cnnDataLinked"><a href="/baseball/mlb/players/8932/index.html">Danny Duffy</a></span>
(1-0, 0.00), who allowed one hit and struck out eight in six innings of a 3-0 win over Oakland on Tuesday.
<p>The left-hander will be seeking his first win against Detroit after going 0-2 with a 5.63 ERA in three starts versus the Tigers
as a rookie.
<span class="cnnDataLinked"><a href="/baseball/mlb/players/7129/index.html">Gerald Laird</a></span>
was a triple short of the cycle and helped the Tigers (6-3) salvage the finale of a three-game series with a 5-2 victory over
Chicago on Sunday.
<span class="cnnDataLinked"><a href="/baseball/mlb/players/8419/index.html">Rick Porcello</a></span>
allowed one run in 7 2-3 innings to give Detroit's starting rotation its first victory.
</p>"All the other starters have pitched well," Porcello said. "It's just the way it's happened so far."
<p>Verlander allowed three runs in seven innings of a 4-3 win over the Royals on Aug. 6, beating Duffy, who gave up three runs
over five.
<!-- /REAPFINDREPLACE:20120515:/.element/ssi/story/4.1/wires/ap/expired_story.html:/baseball/mlb/gameflash/2012/04/16/40630_preview.html-->
<p class="cnnLast">
<a href="" target="new">&#169; 2011 STATS LLC <img src="" align="absmiddle" alt="STATS, Inc"></a>
<div class="cnnRight">
<div class="cnngCommentsBox" id="cnngCommentsBox">
<div class="cnngComments">
<div class="cnnHolder">
<div id="fanComments">
<iframe src="" width="397" height="390" marginwidth="0" scrolling="no" frameborder="0"></iframe>
<div class="cnn_footer">
<div class="cnngToggleOn"><a href="javascript:hidediv();">TURN COMMENTS <span>OFF</span></a></div>
<div class="cnngToggleOff"><a href="javascript:showdiv();">TURN COMMENTS <span>ON</span></a></div>
<!-- end content -->
<!-- start contentFooter -->
@ -0,0 +1,39 @@
import os
import unittest
from readability import Document
SAMPLES = os.path.join(os.path.dirname(__file__), 'samples')
def load_sample(filename):
    """Helper to get the content out of the sample files.

    :param filename: name of a file inside the ``SAMPLES`` directory.
    :returns: the full contents of that file as read by ``read()``.
    """
    sample_path = os.path.join(SAMPLES, filename)
    # Close the handle deterministically; the bare open().read() left it
    # open until garbage collection.
    with open(sample_path) as sample_file:
        return sample_file.read()
class TestArticleOnly(unittest.TestCase):
"""The option to not get back a full html doc should work
Given a full html document, the call can request just divs of processed
content. In this way the developer can then wrap the article however they
want in their own view or application.
def setUp(self):
def tearDown(self):
def test_si_sample(self):
"""Using the si sample, make sure we can get the article alone."""
sample = load_sample('si-game.sample.html')
doc = Document(sample)
res = doc.summary(document_only=True)
self.assertEqual('<div class="', res[0:12])