From edccec5d3b4cecee3fdccff7667dd81bb3ed6258 Mon Sep 17 00:00:00 2001
From: Richard Harding <rharding@mitechie.com>
Date: Mon, 16 Apr 2012 17:13:24 -0400
Subject: [PATCH 1/4] Work on why we have an empty <body/> tag

- The empty <body/> tag seems to appear because the sanitizer ends up with
two nodes, not one: the first is an empty body, the second is the article div.
- Convert the tab indentation to spaces so we can work with the file. It still needs a lot of PEP 8 cleanup.
- Implement an initial hack that at least gets it working for now.
- Start to add test cases, sample html files we can test against, etc.
---
 readability/readability.py        | 964 +++++++++++++++---------------
 tests/__init__.py                 |   0
 tests/samples/si-game.sample.html | 762 +++++++++++++++++++++++
 tests/test_article_only.py        |  39 ++
 4 files changed, 1286 insertions(+), 479 deletions(-)
 create mode 100644 tests/__init__.py
 create mode 100644 tests/samples/si-game.sample.html
 create mode 100644 tests/test_article_only.py
diff --git a/readability/readability.py b/readability/readability.py
index 9029a2f..b409c59 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -11,502 +11,508 @@ import sys
 logging.basicConfig(level=logging.INFO)
 
 REGEXES = {
-	'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter',re.I),
-	'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow',re.I),
-	'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story',re.I),
-	'negativeRe': re.compile('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget',re.I),
-	'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)',re.I),
-	#'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
-	#'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
-	#'trimRe': re.compile('^\s+|\s+$/'),
-	#'normalizeRe': re.compile('\s{2,}/'),
-	#'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
-	#'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
-	#skipFootnoteLink:	  /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
+    'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter',re.I),
+    'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow',re.I),
+    'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story',re.I),
+    'negativeRe': re.compile('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget',re.I),
+    'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)',re.I),
+    #'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
+    #'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
+    #'trimRe': re.compile('^\s+|\s+$/'),
+    #'normalizeRe': re.compile('\s{2,}/'),
+    #'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
+    #'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
+    #skipFootnoteLink:      /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
 }
 
 def describe(node, depth=1):
-	if not hasattr(node, 'tag'):
-		return "[%s]" % type(node)
-	name = node.tag
-	if node.get('id', ''): name += '#'+node.get('id') 
-	if node.get('class', ''): 
-		name += '.' + node.get('class').replace(' ','.')
-	if name[:4] in ['div#', 'div.']:
-		name = name[3:]
-	if depth and node.getparent() is not None:
-		return name+' - '+describe(node.getparent(), depth-1)
-	return name
+    if not hasattr(node, 'tag'):
+        return "[%s]" % type(node)
+    name = node.tag
+    if node.get('id', ''): name += '#'+node.get('id')
+    if node.get('class', ''):
+        name += '.' + node.get('class').replace(' ','.')
+    if name[:4] in ['div#', 'div.']:
+        name = name[3:]
+    if depth and node.getparent() is not None:
+        return name+' - '+describe(node.getparent(), depth-1)
+    return name
 
 def to_int(x):
-	if not x: return None
-	x = x.strip()
-	if x.endswith('px'):
-		return int(x[:-2]) 
-	if x.endswith('em'):
-		return int(x[:-2]) * 12 
-	return int(x)
+    if not x: return None
+    x = x.strip()
+    if x.endswith('px'):
+        return int(x[:-2])
+    if x.endswith('em'):
+        return int(x[:-2]) * 12
+    return int(x)
 
 def clean(text):
-	text = re.sub('\s*\n\s*', '\n', text)
-	text = re.sub('[ \t]{2,}', ' ', text)
-	return text.strip()
+    text = re.sub('\s*\n\s*', '\n', text)
+    text = re.sub('[ \t]{2,}', ' ', text)
+    return text.strip()
 
 def text_length(i):
-	return len(clean(i.text_content() or ""))
+    return len(clean(i.text_content() or ""))
 
 class Unparseable(ValueError):
-	pass
+    pass
 
 class Document:
-	TEXT_LENGTH_THRESHOLD = 25
-	RETRY_LENGTH = 250
-
-	def __init__(self, input, **options):
-		self.input = input
-		self.options = defaultdict(lambda: None)
-		for k, v in options.items():
-			self.options[k] = v
-		self.html = None
-
-	def _html(self, force=False):
-		if force or self.html is None:
-			self.html = self._parse(self.input)
-		return self.html
-	
-	def _parse(self, input):
-		doc = build_doc(input)
-		doc = html_cleaner.clean_html(doc)
-		base_href = self.options['url']
-		if base_href:
-			doc.make_links_absolute(base_href, resolve_base_href=True)
-		else:
-			doc.resolve_base_href()
-		return doc
-	
-	def content(self):
-		return get_body(self._html(True))
-	
-	def title(self):
-		return get_title(self._html(True))
-
-	def short_title(self):
-		return shorten_title(self._html(True))
-
-	def summary(self):
-		try:
-			ruthless = True
-			while True:
-				self._html(True)
-				
-				for i in self.tags(self.html, 'script', 'style'):
-					i.drop_tree()
-				for i in self.tags(self.html, 'body'):
-					i.set('id', 'readabilityBody')
-				if ruthless: 
-					self.remove_unlikely_candidates()
-				self.transform_misused_divs_into_paragraphs()
-				candidates = self.score_paragraphs()
-				
-				best_candidate = self.select_best_candidate(candidates)
-				if best_candidate:
-					article = self.get_article(candidates, best_candidate)
-				else:
-					if ruthless:
-						logging.debug("ruthless removal did not work. ")
-						ruthless = False
-						self.debug("ended up stripping too much - going for a safer _parse")
-						# try again
-						continue
-					else:
-						logging.debug("Ruthless and lenient parsing did not work. Returning raw html")
-						article = self.html.find('body')
-						if article is None:
-							article = self.html
-
-				cleaned_article = self.sanitize(article, candidates)
-				of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH)
-				if ruthless and not of_acceptable_length:
-					ruthless = False
-					continue # try again
-				else:
-					return cleaned_article
-		except StandardError, e:
-			#logging.exception('error getting summary: ' + str(traceback.format_exception(*sys.exc_info())))
-			logging.exception('error getting summary: ' )
-			raise Unparseable(str(e)), None, sys.exc_info()[2]
-
-	def get_article(self, candidates, best_candidate):
-		# Now that we have the top candidate, look through its siblings for content that might also be related.
-		# Things like preambles, content split by ads that we removed, etc.
-
-		sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
-		output = document_fromstring('<div/>')
-		best_elem = best_candidate['elem']
-		for sibling in best_elem.getparent().getchildren():
-			#if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text 
-			append = False 
-			if sibling is best_elem:
-				append = True
-			sibling_key = sibling #HashableElement(sibling)
-			if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
-				append = True
-
-			if sibling.tag == "p":
-				link_density = self.get_link_density(sibling)
-				node_content = sibling.text or ""
-				node_length = len(node_content)
-
-				if node_length > 80 and link_density < 0.25:
-					append = True
-				elif node_length <= 80 and link_density == 0 and re.search('\.( |$)', node_content):
-					append = True
-
-			if append:
-				output.append(sibling)
-		#if output is not None: 
-		#	output.append(best_elem)
-		return output
-
-	def select_best_candidate(self, candidates):
-		sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
-		for candidate in sorted_candidates[:5]:
-			elem = candidate['elem']
-			self.debug("Top 5 : %6.3f %s" % (candidate['content_score'], describe(elem)))
-
-		if len(sorted_candidates) == 0:
-			return None
-
-		best_candidate = sorted_candidates[0]
-		return best_candidate
-
-
-	def get_link_density(self, elem):
-		link_length = 0
-		for i in elem.findall(".//a"):
-			link_length += text_length(i)
-		#if len(elem.findall(".//div") or elem.findall(".//p")):
-		#	link_length = link_length
-		total_length = text_length(elem)
-		return float(link_length) / max(total_length, 1)
-
-	def score_paragraphs(self, ):
-		MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
-		candidates = {}
-		#self.debug(str([describe(node) for node in self.tags(self.html, "div")]))
-
-		ordered = []
-		for elem in self.tags(self._html(), "p", "pre", "td"):
-			parent_node = elem.getparent()
-			if parent_node is None:
-				continue 
-			grand_parent_node = parent_node.getparent()
-
-			inner_text = clean(elem.text_content() or "")
-			inner_text_len = len(inner_text)
-
-			# If this paragraph is less than 25 characters, don't even count it.
-			if inner_text_len < MIN_LEN:
-				continue
-
-			if parent_node not in candidates:
-				candidates[parent_node] = self.score_node(parent_node)
-				ordered.append(parent_node)
-				
-			if grand_parent_node is not None and grand_parent_node not in candidates:
-				candidates[grand_parent_node] = self.score_node(grand_parent_node)
-				ordered.append(grand_parent_node)
-
-			content_score = 1
-			content_score += len(inner_text.split(','))
-			content_score += min((inner_text_len / 100), 3)
-			#if elem not in candidates:
-			#	candidates[elem] = self.score_node(elem)
-				
-			#WTF? candidates[elem]['content_score'] += content_score
-			candidates[parent_node]['content_score'] += content_score
-			if grand_parent_node is not None:
-				candidates[grand_parent_node]['content_score'] += content_score / 2.0
-
-		# Scale the final candidates score based on link density. Good content should have a
-		# relatively small link density (5% or less) and be mostly unaffected by this operation.
-		for elem in ordered:
-			candidate = candidates[elem]
-			ld = self.get_link_density(elem)
-			score = candidate['content_score']
-			self.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (score, describe(elem), ld, score*(1-ld)))
-			candidate['content_score'] *= (1 - ld)
-
-		return candidates
-
-	def class_weight(self, e):
-		weight = 0
-		if e.get('class', None):
-			if REGEXES['negativeRe'].search(e.get('class')):
-				weight -= 25
-
-			if REGEXES['positiveRe'].search(e.get('class')):
-				weight += 25
-
-		if e.get('id', None):
-			if REGEXES['negativeRe'].search(e.get('id')):
-				weight -= 25
-
-			if REGEXES['positiveRe'].search(e.get('id')):
-				weight += 25
-
-		return weight
-
-	def score_node(self, elem):
-		content_score = self.class_weight(elem)
-		name = elem.tag.lower()
-		if name == "div":
-			content_score += 5
-		elif name in ["pre", "td", "blockquote"]:
-			content_score += 3
-		elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form"]:
-			content_score -= 3
-		elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th"]:
-			content_score -= 5
-		return { 
-			'content_score': content_score, 
-			'elem': elem
-		}
-
-	def debug(self, *a):
-		#if self.options['debug']:
-			logging.debug(*a)
-
-	def remove_unlikely_candidates(self):
-		for elem in self.html.iter():
-			s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
-			if len(s) < 2:
-				continue
-			#self.debug(s)
-			if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag != 'body':
-				self.debug("Removing unlikely candidate - %s" % describe(elem))
-				elem.drop_tree()
-
-	def transform_misused_divs_into_paragraphs(self):
-		for elem in self.tags(self.html, 'div'):
-			# transform <div>s that do not contain other block elements into <p>s
-			#FIXME: The current implementation ignores all descendants that are not direct children of elem
-			# This results in incorrect results in case there is an <img> buried within an <a> for example
-			if not REGEXES['divToPElementsRe'].search(unicode(''.join(map(tostring, list(elem))))):
-				#self.debug("Altering %s to p" % (describe(elem)))
-				elem.tag = "p"
-				#print "Fixed element "+describe(elem)
-				
-		for elem in self.tags(self.html, 'div'):
-			if elem.text and elem.text.strip():
-				p = fragment_fromstring('<p/>')
-				p.text = elem.text
-				elem.text = None
-				elem.insert(0, p)
-				#print "Appended "+tounicode(p)+" to "+describe(elem)
-			
-			for pos, child in reversed(list(enumerate(elem))):
-				if child.tail and child.tail.strip():
-					p = fragment_fromstring('<p/>')
-					p.text = child.tail
-					child.tail = None
-					elem.insert(pos + 1, p)
-					#print "Inserted "+tounicode(p)+" to "+describe(elem)
-				if child.tag == 'br':
-					#print 'Dropped <br> at '+describe(elem) 
-					child.drop_tree()
-
-	def tags(self, node, *tag_names):
-		for tag_name in tag_names:
-			for e in node.findall('.//%s' % tag_name):
-				yield e
-
-	def reverse_tags(self, node, *tag_names):
-		for tag_name in tag_names:
-			for e in reversed(node.findall('.//%s' % tag_name)):
-				yield e
-
-	def sanitize(self, node, candidates):
-		MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
-		for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
-			if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33: 
-				header.drop_tree()
-
-		for elem in self.tags(node, "form", "iframe", "textarea"):
-			elem.drop_tree()
-		allowed = {}
-		# Conditionally clean <table>s, <ul>s, and <div>s
-		for el in self.reverse_tags(node, "table", "ul", "div"):
-			if el in allowed:
-				continue
-			weight = self.class_weight(el)
-			if el in candidates:
-				content_score = candidates[el]['content_score']
-				#print '!',el, '-> %6.3f' % content_score
-			else:
-				content_score = 0
-			tag = el.tag
-
-			if weight + content_score < 0:
-				self.debug("Cleaned %s with score %6.3f and weight %-3s" %
-					(describe(el), content_score, weight, ))
-				el.drop_tree()
-			elif el.text_content().count(",") < 10:
-				counts = {}
-				for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
-					counts[kind] = len(el.findall('.//%s' %kind))
-				counts["li"] -= 100
-
-				content_length = text_length(el) # Count the text length excluding any surrounding whitespace
-				link_density = self.get_link_density(el)
-				parent_node = el.getparent()
-				if parent_node is not None:
-					if parent_node in candidates:
-						content_score = candidates[parent_node]['content_score']
-					else:
-						content_score = 0
-				#if parent_node is not None:
-					#pweight = self.class_weight(parent_node) + content_score
-					#pname = describe(parent_node)
-				#else:
-					#pweight = 0
-					#pname = "no parent"
-				to_remove = False
-				reason = ""
-
-				#if el.tag == 'div' and counts["img"] >= 1:
-				#	continue
-				if counts["p"] and counts["img"] > counts["p"]:
-					reason = "too many images (%s)" % counts["img"]
-					to_remove = True
-				elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
-					reason = "more <li>s than <p>s"
-					to_remove = True
-				elif counts["input"] > (counts["p"] / 3):
-					reason = "less than 3x <p>s than <input>s"
-					to_remove = True
-				elif content_length < (MIN_LEN) and (counts["img"] == 0 or counts["img"] > 2):
-					reason = "too short content length %s without a single image" % content_length
-					to_remove = True
-				elif weight < 25 and link_density > 0.2:
-						reason = "too many links %.3f for its weight %s" % (link_density, weight)
-						to_remove = True
-				elif weight >= 25 and link_density > 0.5:
-					reason = "too many links %.3f for its weight %s" % (link_density, weight)
-					to_remove = True
-				elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
-					reason = "<embed>s with too short content length, or too many <embed>s"
-					to_remove = True
-#				if el.tag == 'div' and counts['img'] >= 1 and to_remove:
-#					imgs = el.findall('.//img')
-#					valid_img = False
-#					self.debug(tounicode(el))
-#					for img in imgs:
+    TEXT_LENGTH_THRESHOLD = 25
+    RETRY_LENGTH = 250
+
+    def __init__(self, input, **options):
+        self.input = input
+        self.options = defaultdict(lambda: None)
+        for k, v in options.items():
+            self.options[k] = v
+        self.html = None
+
+    def _html(self, force=False):
+        if force or self.html is None:
+            self.html = self._parse(self.input)
+        return self.html
+
+    def _parse(self, input):
+        doc = build_doc(input)
+        doc = html_cleaner.clean_html(doc)
+        base_href = self.options['url']
+        if base_href:
+            doc.make_links_absolute(base_href, resolve_base_href=True)
+        else:
+            doc.resolve_base_href()
+        return doc
+
+    def content(self):
+        return get_body(self._html(True))
+
+    def title(self):
+        return get_title(self._html(True))
+
+    def short_title(self):
+        return shorten_title(self._html(True))
+
+    def summary(self, document_only=False):
+        try:
+            ruthless = True
+            while True:
+                self._html(True)
+
+                for i in self.tags(self.html, 'script', 'style'):
+                    i.drop_tree()
+                for i in self.tags(self.html, 'body'):
+                    i.set('id', 'readabilityBody')
+                if ruthless:
+                    self.remove_unlikely_candidates()
+                self.transform_misused_divs_into_paragraphs()
+                candidates = self.score_paragraphs()
+
+                best_candidate = self.select_best_candidate(candidates)
+
+                if best_candidate:
+                    article = self.get_article(candidates, best_candidate)
+                else:
+                    if ruthless:
+                        logging.debug("ruthless removal did not work. ")
+                        ruthless = False
+                        self.debug("ended up stripping too much - going for a safer _parse")
+                        # try again
+                        continue
+                    else:
+                        logging.debug("Ruthless and lenient parsing did not work. Returning raw html")
+                        article = self.html.find('body')
+                        if article is None:
+                            article = self.html
+                cleaned_article = self.sanitize(article, candidates)
+                of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH)
+                if ruthless and not of_acceptable_length:
+                    ruthless = False
+                    continue # try again
+                else:
+                    return cleaned_article
+        except StandardError, e:
+            #logging.exception('error getting summary: ' + str(traceback.format_exception(*sys.exc_info())))
+            logging.exception('error getting summary: ' )
+            raise Unparseable(str(e)), None, sys.exc_info()[2]
+
+    def get_article(self, candidates, best_candidate):
+        # Now that we have the top candidate, look through its siblings for content that might also be related.
+        # Things like preambles, content split by ads that we removed, etc.
+
+        sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
+        output = document_fromstring('<div/>')
+        best_elem = best_candidate['elem']
+        for sibling in best_elem.getparent().getchildren():
+            #if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text
+            append = False
+            if sibling is best_elem:
+                append = True
+            sibling_key = sibling #HashableElement(sibling)
+            if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
+                append = True
+
+            if sibling.tag == "p":
+                link_density = self.get_link_density(sibling)
+                node_content = sibling.text or ""
+                node_length = len(node_content)
+
+                if node_length > 80 and link_density < 0.25:
+                    append = True
+                elif node_length <= 80 and link_density == 0 and re.search('\.( |$)', node_content):
+                    append = True
+
+            if append:
+                output.append(sibling)
+        #if output is not None:
+        #    output.append(best_elem)
+        return output
+
+    def select_best_candidate(self, candidates):
+        sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
+        for candidate in sorted_candidates[:5]:
+            elem = candidate['elem']
+            self.debug("Top 5 : %6.3f %s" % (candidate['content_score'], describe(elem)))
+
+        if len(sorted_candidates) == 0:
+            return None
+
+        best_candidate = sorted_candidates[0]
+        return best_candidate
+
+
+    def get_link_density(self, elem):
+        link_length = 0
+        for i in elem.findall(".//a"):
+            link_length += text_length(i)
+        #if len(elem.findall(".//div") or elem.findall(".//p")):
+        #    link_length = link_length
+        total_length = text_length(elem)
+        return float(link_length) / max(total_length, 1)
+
+    def score_paragraphs(self, ):
+        MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
+        candidates = {}
+        #self.debug(str([describe(node) for node in self.tags(self.html, "div")]))
+
+        ordered = []
+        for elem in self.tags(self._html(), "p", "pre", "td"):
+            parent_node = elem.getparent()
+            if parent_node is None:
+                continue
+            grand_parent_node = parent_node.getparent()
+
+            inner_text = clean(elem.text_content() or "")
+            inner_text_len = len(inner_text)
+
+            # If this paragraph is less than 25 characters, don't even count it.
+            if inner_text_len < MIN_LEN:
+                continue
+
+            if parent_node not in candidates:
+                candidates[parent_node] = self.score_node(parent_node)
+                ordered.append(parent_node)
+
+            if grand_parent_node is not None and grand_parent_node not in candidates:
+                candidates[grand_parent_node] = self.score_node(grand_parent_node)
+                ordered.append(grand_parent_node)
+
+            content_score = 1
+            content_score += len(inner_text.split(','))
+            content_score += min((inner_text_len / 100), 3)
+            #if elem not in candidates:
+            #    candidates[elem] = self.score_node(elem)
+
+            #WTF? candidates[elem]['content_score'] += content_score
+            candidates[parent_node]['content_score'] += content_score
+            if grand_parent_node is not None:
+                candidates[grand_parent_node]['content_score'] += content_score / 2.0
+
+        # Scale the final candidates score based on link density. Good content should have a
+        # relatively small link density (5% or less) and be mostly unaffected by this operation.
+        for elem in ordered:
+            candidate = candidates[elem]
+            ld = self.get_link_density(elem)
+            score = candidate['content_score']
+            self.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (score, describe(elem), ld, score*(1-ld)))
+            candidate['content_score'] *= (1 - ld)
+
+        return candidates
+
+    def class_weight(self, e):
+        weight = 0
+        if e.get('class', None):
+            if REGEXES['negativeRe'].search(e.get('class')):
+                weight -= 25
+
+            if REGEXES['positiveRe'].search(e.get('class')):
+                weight += 25
+
+        if e.get('id', None):
+            if REGEXES['negativeRe'].search(e.get('id')):
+                weight -= 25
+
+            if REGEXES['positiveRe'].search(e.get('id')):
+                weight += 25
+
+        return weight
+
+    def score_node(self, elem):
+        content_score = self.class_weight(elem)
+        name = elem.tag.lower()
+        if name == "div":
+            content_score += 5
+        elif name in ["pre", "td", "blockquote"]:
+            content_score += 3
+        elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form"]:
+            content_score -= 3
+        elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th"]:
+            content_score -= 5
+        return {
+            'content_score': content_score,
+            'elem': elem
+        }
+
+    def debug(self, *a):
+        #if self.options['debug']:
+            logging.debug(*a)
+
+    def remove_unlikely_candidates(self):
+        for elem in self.html.iter():
+            s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
+            if len(s) < 2:
+                continue
+            #self.debug(s)
+            if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag != 'body':
+                self.debug("Removing unlikely candidate - %s" % describe(elem))
+                elem.drop_tree()
+
+    def transform_misused_divs_into_paragraphs(self):
+        for elem in self.tags(self.html, 'div'):
+            # transform <div>s that do not contain other block elements into <p>s
+            #FIXME: The current implementation ignores all descendants that are not direct children of elem
+            # This results in incorrect results in case there is an <img> buried within an <a> for example
+            if not REGEXES['divToPElementsRe'].search(unicode(''.join(map(tostring, list(elem))))):
+                #self.debug("Altering %s to p" % (describe(elem)))
+                elem.tag = "p"
+                #print "Fixed element "+describe(elem)
+
+        for elem in self.tags(self.html, 'div'):
+            if elem.text and elem.text.strip():
+                p = fragment_fromstring('<p/>')
+                p.text = elem.text
+                elem.text = None
+                elem.insert(0, p)
+                #print "Appended "+tounicode(p)+" to "+describe(elem)
+
+            for pos, child in reversed(list(enumerate(elem))):
+                if child.tail and child.tail.strip():
+                    p = fragment_fromstring('<p/>')
+                    p.text = child.tail
+                    child.tail = None
+                    elem.insert(pos + 1, p)
+                    #print "Inserted "+tounicode(p)+" to "+describe(elem)
+                if child.tag == 'br':
+                    #print 'Dropped <br> at '+describe(elem)
+                    child.drop_tree()
+
+    def tags(self, node, *tag_names):
+        for tag_name in tag_names:
+            for e in node.findall('.//%s' % tag_name):
+                yield e
+
+    def reverse_tags(self, node, *tag_names):
+        for tag_name in tag_names:
+            for e in reversed(node.findall('.//%s' % tag_name)):
+                yield e
+
+    def sanitize(self, node, candidates):
+        MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
+        for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
+            if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
+                header.drop_tree()
+
+        for elem in self.tags(node, "form", "iframe", "textarea"):
+            elem.drop_tree()
+        allowed = {}
+        # Conditionally clean <table>s, <ul>s, and <div>s
+        for el in self.reverse_tags(node, "table", "ul", "div"):
+            if el in allowed:
+                continue
+            weight = self.class_weight(el)
+            if el in candidates:
+                content_score = candidates[el]['content_score']
+                #print '!',el, '-> %6.3f' % content_score
+            else:
+                content_score = 0
+            tag = el.tag
+
+            if weight + content_score < 0:
+                self.debug("Cleaned %s with score %6.3f and weight %-3s" %
+                    (describe(el), content_score, weight, ))
+                el.drop_tree()
+            elif el.text_content().count(",") < 10:
+                counts = {}
+                for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
+                    counts[kind] = len(el.findall('.//%s' %kind))
+                counts["li"] -= 100
+
+                content_length = text_length(el) # Count the text length excluding any surrounding whitespace
+                link_density = self.get_link_density(el)
+                parent_node = el.getparent()
+                if parent_node is not None:
+                    if parent_node in candidates:
+                        content_score = candidates[parent_node]['content_score']
+                    else:
+                        content_score = 0
+                #if parent_node is not None:
+                    #pweight = self.class_weight(parent_node) + content_score
+                    #pname = describe(parent_node)
+                #else:
+                    #pweight = 0
+                    #pname = "no parent"
+                to_remove = False
+                reason = ""
+
+                #if el.tag == 'div' and counts["img"] >= 1:
+                #    continue
+                if counts["p"] and counts["img"] > counts["p"]:
+                    reason = "too many images (%s)" % counts["img"]
+                    to_remove = True
+                elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
+                    reason = "more <li>s than <p>s"
+                    to_remove = True
+                elif counts["input"] > (counts["p"] / 3):
+                    reason = "less than 3x <p>s than <input>s"
+                    to_remove = True
+                elif content_length < (MIN_LEN) and (counts["img"] == 0 or counts["img"] > 2):
+                    reason = "too short content length %s without a single image" % content_length
+                    to_remove = True
+                elif weight < 25 and link_density > 0.2:
+                        reason = "too many links %.3f for its weight %s" % (link_density, weight)
+                        to_remove = True
+                elif weight >= 25 and link_density > 0.5:
+                    reason = "too many links %.3f for its weight %s" % (link_density, weight)
+                    to_remove = True
+                elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
+                    reason = "<embed>s with too short content length, or too many <embed>s"
+                    to_remove = True
+#                if el.tag == 'div' and counts['img'] >= 1 and to_remove:
+#                    imgs = el.findall('.//img')
+#                    valid_img = False
+#                    self.debug(tounicode(el))
+#                    for img in imgs:
 #
-#						height = img.get('height')
-#						text_length = img.get('text_length')
-#						self.debug ("height %s text_length %s" %(repr(height), repr(text_length)))
-#						if to_int(height) >= 100 or to_int(text_length) >= 100:
-#							valid_img = True
-#							self.debug("valid image" + tounicode(img))
-#							break
-#					if valid_img:
-#						to_remove = False
-#						self.debug("Allowing %s" %el.text_content())
-#						for desnode in self.tags(el, "table", "ul", "div"):
-#							allowed[desnode] = True
-
-					#find x non empty preceding and succeeding siblings
-					i, j = 0, 0
-					x  = 1
-					siblings = []
-					for sib in el.itersiblings():
-						#self.debug(sib.text_content())
-						sib_content_length = text_length(sib)
-						if sib_content_length:
-							i =+ 1
-							siblings.append(sib_content_length)
-							if i == x:
-								break
-					for sib in el.itersiblings(preceding=True):
-						#self.debug(sib.text_content())
-						sib_content_length = text_length(sib)
-						if sib_content_length:
-							j =+ 1
-							siblings.append(sib_content_length)
-							if j == x:
-								break
-					#self.debug(str(siblings))
-					if siblings and sum(siblings) > 1000 :
-						to_remove = False
-						self.debug("Allowing %s" % describe(el))
-						for desnode in self.tags(el, "table", "ul", "div"):
-							allowed[desnode] = True
-
-				if to_remove:
-					self.debug("Cleaned %6.3f %s with weight %s cause it has %s." %
-						(content_score, describe(el), weight, reason))
-					#print tounicode(el)
-					#self.debug("pname %s pweight %.3f" %(pname, pweight))
-					el.drop_tree()
-
-		for el in ([node] + [n for n in node.iter()]):
-			if not (self.options['attributes']):
-				#el.attrib = {} #FIXME:Checkout the effects of disabling this
-				pass
-
-		return clean_attributes(tounicode(node))
-	
+#                        height = img.get('height')
+#                        text_length = img.get('text_length')
+#                        self.debug ("height %s text_length %s" %(repr(height), repr(text_length)))
+#                        if to_int(height) >= 100 or to_int(text_length) >= 100:
+#                            valid_img = True
+#                            self.debug("valid image" + tounicode(img))
+#                            break
+#                    if valid_img:
+#                        to_remove = False
+#                        self.debug("Allowing %s" %el.text_content())
+#                        for desnode in self.tags(el, "table", "ul", "div"):
+#                            allowed[desnode] = True
+
+                    #find x non empty preceding and succeeding siblings
+                    i, j = 0, 0
+                    x  = 1
+                    siblings = []
+                    for sib in el.itersiblings():
+                        #self.debug(sib.text_content())
+                        sib_content_length = text_length(sib)
+                        if sib_content_length:
+                            i =+ 1
+                            siblings.append(sib_content_length)
+                            if i == x:
+                                break
+                    for sib in el.itersiblings(preceding=True):
+                        #self.debug(sib.text_content())
+                        sib_content_length = text_length(sib)
+                        if sib_content_length:
+                            j =+ 1
+                            siblings.append(sib_content_length)
+                            if j == x:
+                                break
+                    #self.debug(str(siblings))
+                    if siblings and sum(siblings) > 1000 :
+                        to_remove = False
+                        self.debug("Allowing %s" % describe(el))
+                        for desnode in self.tags(el, "table", "ul", "div"):
+                            allowed[desnode] = True
+
+                if to_remove:
+                    self.debug("Cleaned %6.3f %s with weight %s cause it has %s." %
+                        (content_score, describe(el), weight, reason))
+                    #print tounicode(el)
+                    #self.debug("pname %s pweight %.3f" %(pname, pweight))
+                    el.drop_tree()
+
+        for el in ([node] + [n for n in node.iter()]):
+            if not (self.options['attributes']):
+                #el.attrib = {} #FIXME:Checkout the effects of disabling this
+                pass
+        # There can be two nodes here. We really want to tounicode only one of
+        # them.
+        # To start with let's hack it to get the longest tree as our document.
+        if len(node.getchildren()) > 1:
+            children = node.getchildren()
+            sorted_list = sorted(children, key=len, reverse=True)
+            node = sorted_list[0]
+        return clean_attributes(tounicode(node))
+
 
 class HashableElement():
-	def __init__(self, node):
-		self.node = node
-		self._path = None
-
-	def _get_path(self):
-		if self._path is None:
-			reverse_path = []
-			node = self.node
-			while node is not None:
-				node_id = (node.tag, tuple(node.attrib.items()), node.text)
-				reverse_path.append(node_id)
-				node = node.getparent()
-			self._path = tuple(reverse_path)
-		return self._path
-	path = property(_get_path)
-
-	def __hash__(self):
-		return hash(self.path)
-
-	def __eq__(self, other):
-		return self.path == other.path
-
-	def __getattr__(self, tag):
-		return getattr(self.node, tag)
+    def __init__(self, node):  # wrap `node`; its path key is built lazily
+        self.node = node
+        self._path = None  # cache for _get_path(); None means "not built yet"
+
+    def _get_path(self):  # build (once) the tuple used by __hash__ and __eq__
+        if self._path is None:
+            reverse_path = []
+            node = self.node
+            while node is not None:  # climb via getparent() until it returns None
+                node_id = (node.tag, tuple(node.attrib.items()), node.text)
+                reverse_path.append(node_id)
+                node = node.getparent()
+            self._path = tuple(reverse_path)  # this node first, topmost ancestor last
+        return self._path  # NOTE(review): never reset here, so later tree edits are not reflected
+    path = property(_get_path)  # read-only view of the cached tuple
+
+    def __hash__(self):  # hash of the path tuple, consistent with __eq__
+        return hash(self.path)
+
+    def __eq__(self, other):  # NOTE(review): position is not part of the path, so look-alike siblings compare equal
+        return self.path == other.path
+
+    def __getattr__(self, tag):  # attributes missing on the wrapper are looked up on the wrapped node
+        return getattr(self.node, tag)
 
 def main():
-	from optparse import OptionParser
-	parser = OptionParser(usage="%prog: [options] [file]")
-	parser.add_option('-v', '--verbose', action='store_true')
-	parser.add_option('-u', '--url', help="use URL instead of a local file")
-	(options, args) = parser.parse_args()
-	
-	if not (len(args) == 1 or options.url):
-		parser.print_help()
-		sys.exit(1)
-	logging.basicConfig(level=logging.INFO)
-
-	file = None
-	if options.url:
-		import urllib
-		file = urllib.urlopen(options.url)
-	else:
-		file = open(args[0], 'rt')
-	enc = sys.__stdout__.encoding or 'utf-8'
-	try:
-		print Document(file.read(), debug=options.verbose).summary().encode(enc, 'replace')
-	finally:
-		file.close()
+    from optparse import OptionParser  # imported here, inside the command-line entry point
+    parser = OptionParser(usage="%prog: [options] [file]")
+    parser.add_option('-v', '--verbose', action='store_true')
+    parser.add_option('-u', '--url', help="use URL instead of a local file")
+    (options, args) = parser.parse_args()
+
+    if not (len(args) == 1 or options.url):  # require exactly one file argument, or --url
+        parser.print_help()
+        sys.exit(1)
+    logging.basicConfig(level=logging.INFO)  # NOTE(review): also called at module import; this repeat looks redundant
+
+    file = None  # NOTE(review): shadows the Python 2 builtin `file`
+    if options.url:  # --url wins; any positional argument is ignored in that case
+        import urllib  # Python 2 API (urllib.urlopen)
+        file = urllib.urlopen(options.url)
+    else:
+        file = open(args[0], 'rt')
+    enc = sys.__stdout__.encoding or 'utf-8'  # fall back to UTF-8 when stdout reports no encoding
+    try:  # the finally below closes the input even if parsing or printing fails
+        print Document(file.read(), debug=options.verbose).summary().encode(enc, 'replace')
+    finally:
+        file.close()
 
 if __name__ == '__main__':
-	main()
+    main()
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/samples/si-game.sample.html b/tests/samples/si-game.sample.html
new file mode 100644
index 0000000..fab4f4f
--- /dev/null
+++ b/tests/samples/si-game.sample.html
@@ -0,0 +1,762 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+   <html>
+   <head>
+   <meta http-equiv="content-type" content="text/html; charset=iso-8859-1">
+   <a href="/baseball/mlb/teams/tigers/">
+      <title>Detroit Tigers vs. Kansas City Royals - Preview - April 16, 2012</title></a><meta name="description" content="Tigers-Royals preview for game played on April 16, 2012">
+   <meta name="keywords" content="Detroit Tigers, Kansas City Royals, preview, mlb, baseball, si.com">
+   <script type="text/javascript">
+   var SPORTID = "MLB";
+   var PATH = "/baseball/mlb/scoreboards/2012/04/16/";
+   var FEEDNAME = "scoreboard.dat";
+   isViewcast = true;
+   var searchString = document.location.href;
+   </script>
+   <link rel="stylesheet" type="text/css" href="http://i.cdn.turner.com/si/.e/css/pkg/global_41/129.css"/>
+<script type="text/javascript" language="JavaScript" src="http://i.cdn.turner.com/si/.e/js/4.1/global/lib/jquery-1.5.2.min.js"></script>
+<script language="JavaScript" type="text/javascript" src="http://i.cdn.turner.com/si/.e/js/pkg/global/593.js"></script>
+<script src="http://img.timeinc.net/shared/static/js/tii_ads.js"></script><script>var adConfig=new TiiAdConfig('3475.si2');adConfig.setRevSciTracking(true);</script>
+
+<!--[if IE 9]>
+<link rel="stylesheet" type="text/css" href="http://i.cdn.turner.com/si/.e/css/4.1/ie9.css" />
+<![endif]-->
+<link rel="stylesheet" type="text/css" href="http://i.cdn.turner.com/si/.element/css/4.1/gameflash.css"/>
+<link rel="stylesheet" type="text/css" href="http://i.cdn.turner.com/si/.element/css/4.1/miniscores.css"/>
+<script language="javascript" type="text/javascript">
+	function hidediv() {
+			if (document.getElementById) { // DOM3 = IE5, NS6
+					document.getElementById('cnngCommentsBox').className = 'cnngCommentsBoxOff';
+			}
+			else {
+					if (document.layers) { // Netscape 4
+							document.cnngCommentsBox.className = 'cnngCommentsBoxOff';
+					}
+					else { // IE 4
+							document.all.cnngCommentsBox.className = 'cnngCommentsBoxOff';
+					}
+			}
+	}
+	function showdiv() {
+			if (document.getElementById) { // DOM3 = IE5, NS6
+					document.getElementById('cnngCommentsBox').className = 'cnngCommentsBox';
+			}
+			else {
+					if (document.layers) { // Netscape 4
+							document.cnngCommentsBox.className = 'cnngCommentsBox';
+					}
+					else { // IE 4
+							document.all.cnngCommentsBox.className = 'cnngCommentsBox';
+					}
+			}
+	}
+function siVideoBegin(cvpInstance, videoId) {  }
+function siVideoPlay(cvpInstance, videoId) {
+	var cvpData = cvpInstance.getContentEntry(videoId);
+	var cvpObject = window.JSON.parse(cvpData);
+	jQuery('#cnnCVPRecapDetails').show();
+	jQuery('#cvpHeadline').html(cvpObject.headline);
+	jQuery('#cvpDescription').html(cvpObject.description);
+	jQuery('#cvpSource').html(cvpObject.source);
+}
+
+function siVideoPlayHead(cvpInstance, playheadTime, totalDuration) { }
+
+function siVideoAdStarted(cvpInstance, videoId) { }
+
+function siVideoTrackingAdCountdown(seconds) { }
+
+function siVideoComplete(cvpInstance, videoId) { }
+
+function siVideoPause(cvpInstance, videoId, paused) { }
+
+function siVideoSeek() { }
+</script>
+<script language="JavaScript" src="/.element/js/4.1/ads/sasd_ads.js"></script>
+<script src="http://i.cdn.turner.com/si/.element/js/4.1/global/lib/iframe_ad_factory.js"></script><script>iframeAdFactory.url = '/si_adspaces/4.0/iframe.html';
+window.setInterval(function(){ iframeAdFactory.refresh() }, 45000);</script>
+
+
+<script type="text/javascript">
+var adFactory = new TiiAdFactory(adConfig, "mlb/gameflashpage");
+iframeAdFactory.queryString = 'TiiAdConfig=3475.si2&adConfigPairs=' + '&TiiAdFactory=' + encodeURIComponent('mlb/gameflashpage') + '&adFactoryPairs=' + '&paramPairs=' + encodeURIComponent('sport=mlb');
+if (TiiAdsIsDebugMode()) { iframeAdFactory.queryString += '&debugads=y'; }
+</script>
+<link rel="stylesheet" type="text/css" href="http://z.cdn.turner.com/si/.element/css/4.1/gameflash_mlb.css"/>
+<script type="text/javascript" src="http://z.cdn.turner.com/si/.element/js/4.1/global/lib/jquery-1.4.2.min.js"></script>
+
+<link rel="stylesheet" type="text/css" href="http://z.cdn.turner.com/si/.element/ssi/scoreboards/4.2/css/scoreticker-master.css"/>
+<script type="text/javascript" src="http://z.cdn.turner.com/si/.element/ssi/gameflash/4.2/football/nfl/js/jquery.jsonp-2.1.4.min.js"></script>
+<script type="text/javascript" src="http://z.cdn.turner.com/si/.element/ssi/scoreboards/4.2/js/scoreticker-master.js"></script>
+<script type="text/javascript" src="http://z.cdn.turner.com/si/.element/ssi/scoreboards/4.2/js/scoreticker-mlb.js"></script>
+
+
+
+
+
+   </head>
+   <body>
+   <!--[if IE 6]><div class="ie"><div class="ie6"><![endif]--><!--[if IE 7]><div class="ie"><div class="ie7"><![endif]--><!--[if
+   IE 8]><div class="ie"><div class="ie8"><![endif]-->
+   <div class="cnnPage">
+   		
+   <!-- start contentHeader-->
+   <style>
+DIV.cnnSearch { padding:5px 0; }
+DIV.cnnSearch DIV.cnnRight { padding:4px 0; }
+DIV.cnnSearch DIV.cnnLeft { margin:0;padding:0; }
+DIV.cnnSearch DIV.cnnLeft LI { float:left;margin:0;padding:0 5px 0 0; }
+DIV.cnnSearch DIV.cnnLeft LI A { display:block;margin:0;padding:0; }
+DIV.cnnSearch DIV.cnnLeft LI IMG { vertical-align:bottom; }
+DIV.cnnSearch DIV.cnnLeft LI DL { margin:0;padding:0;position:relative;z-index:999999; }
+DIV.cnnSearch DIV.cnnLeft LI DT { margin:0;padding:0; }
+DIV.cnnSearch DIV.cnnLeft LI DD { left:-999em;margin:0;padding:0 3px 0 1px;position:absolute;top:23px; }
+DIV.cnnSearch DIV.cnnLeft LI DL.cnnOver DD,
+DIV.cnnSearch DIV.cnnLeft LI DL:hover DD { left:auto; }
+
+DIV.cnnBanner { height:auto; }
+DIV.cnnBannerSection DIV.cnnLeft { width:auto; }
+DIV.cnnBannerSection DIV.cnnLeft A { display:inline;height:auto;width:auto; }
+
+DIV.cnnBanner { background:transparent url('http://i.cdn.turner.com/si/.element/img/4.1/sect/global/topper.gif') no-repeat top right;position:relative;text-align:left;width:1000px; }
+.ie6 DIV.cnnBanner { width:1000px; }
+DIV.cnnBanner DIV IMG { display:block; }
+DIV.cnnBannerSection { height:99px;position:absolute;left:243px;top:0px;width:757px; }
+DIV.cnnBannerSection TD.col0 { display:none; }
+DIV.cnnBannerSection DIV.cnn_border { display:none; }
+DIV.cnnBannerSection IMG { display:inline;float:left; }
+DIV.cnnBannerSection DIV.cnnLeft { float:left; }
+DIV.cnnBannerSection DIV.cnnLeft IMG { float:none; }
+DIV.cnnBannerSection DIV.cnnRight { float:right;margin:8px 6px 0 0; }
+DIV.cnnBannerSection DIV.cnn_header { color:#000;font:bold 50px georgia;line-height:58px;padding:6px 10px 0 0; }
+DIV.cnnBannerSection DIV.cnn_header SPAN { font-size:10px;color:#ccc; }
+DIV.cnnBannerSection DIV.cnn_header A { color:#000; }
+DIV.cnnBannerSection DIV.cnn_header UL { color:#ccc;float:right;font-size:10px;line-height:12px;margin-top:36px; }
+.ie DIV.cnnBannerSection DIV.cnn_header UL { margin-top:-21px; }
+DIV.cnnBannerSection DIV.cnn_header UL LI { border-left:1px solid #ccc;float:left;padding:0 4px; }
+DIV.cnnBannerSection DIV.cnn_header UL LI#cnnItem0 { border:0; }
+DIV.cnnBannerSection DIV.cnn_header UL LI#cnnItem2 DIV.cnn_more { font:normal 9px arial; }
+DIV.cnnBannerSection DIV.cnn_header UL LI#cnnItem2 DIV.cnn_more A { font:normal 9px arial; }
+DIV.cnnBannerSection DIV.cnn_header UL LI DIV.cnn_rollover { background-image:url('http://i.cdn.turner.com/si/.e1d/img/4.0/global/pixels/blank_pixel.gif');display:none;padding:10px 0 9px 0;left:103px;position:absolute;width:654px; }
+.ie DIV.cnnBannerSection DIV.cnn_header UL LI DIV.cnn_rollover { top:55px; }
+DIV.cnnBannerSection DIV.cnn_header UL LI.cnnOver .cnn_rollover,
+DIV.cnnBannerSection DIV.cnn_header UL LI:hover .cnn_rollover { display:block; }
+DIV.cnnBannerSection DIV.cnn_more { color:#2e373c;font-size:10px;padding:2px 0 0 0; }
+DIV.cnnBannerSection DIV.cnn_more A { color:#fff;font-weight:bold; }
+DIV.cnnBannerSection DIV.cnn_more A:hover { color:#e7e7e7; }
+DIV.cnnBannerSection DIV.cnn_more DIV { display:none;color:#ccc;line-height:12px; }
+DIV.cnnBannerSection DIV.cnn_more DIV SPAN A { font:9px arial;font-weight:normal; }
+DIV.cnnBannerSection DIV.cnn_header DIV.cnn_more A { font-family:arial; }
+
+DIV.cnnGameScores { background:#6f7f8b;border-bottom:11px solid #384d5e; }
+</style>
+
+<!-- start personalize -->
+<div class="cnnPersonalize"><div><div><script>cnn_writePresonalizeBar();</script></div></div></div>
+<!-- end personalize -->
+
+<!-- start searchbar -->
+<div class="cnnSearch">
+	<div class="cnnLeft"><ul>
+	<li class="cnnItem0" id="cnnCM1"><dl><script type="text/javascript">
+/* script for 50/50 split */
+/*var min=1;
+var max=2;
+x = Math.floor(Math.random() * (max - min + 1)) + min;
+if(x/2 == 1) {
+  document.write('<dt><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1006340.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe0"><img src="http://i.cdn.turner.com/si/2011_images/cm/WS11_btn_champ_STL.png" alt="Get the Cardinals Championship Package" title="Get the Cardinals Championship Package"/></a></dt>');
+  document.write('<dd><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1006340.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe1"><img src="http://i.cdn.turner.com/si/2011_images/cm/WS11_dropdown_STL.png" alt="Get the Cardinals Championship Package" title="Get the Cardinals Championship Package"/></a></dd>');
+} else {
+  document.write('<dt><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1007180.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe0"><img src="http://i.cdn.turner.com/si/2011_images/cm/EA-N4S-TheRun-btn.png" alt="Get Need for Speed 12 FREE" title="Get Need for Speed 12 FREE"/></a></dt>');
+  document.write('<dd><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1007180.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe1"><img src="http://i.cdn.turner.com/si/2011_images/cm/EA-N4S-TheRun-SI-dropdown.jpg" alt="Get Need for Speed 12 FREE" title="Get Need for Speed 12 FREE"/></a></dd>');
+}
+*/
+</script>
+
+
+<!--Kentucky-->
+<dt><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1009459.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe0"><img src="http://i.cdn.turner.com/si/2012_images/cm/si-btn-champ-kentucky.png" alt="Get the Wildcats Championship Package" title="Get the Wildcats Championship Package"/></a></dt>
+<dd><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1009459.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe1"><img src="http://i.cdn.turner.com/si/2012_images/cm/si-dropdown-kentucky.png" alt="Get the Wildcats Championship Package" title="Get the Wildcats Championship Package"/></a></dd>
+
+<!--original generic sub buttons, changed on 10.26.11 for world series-->
+<!--<dt><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1005085.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe0"><img src="http://i.cdn.turner.com/si/2011_images/cm/si-btn-EA-MADDEN12.png" alt="Get EA Sports Madden NFL 12 Free!" title="Get EA Sports Madden NFL 12 Free!"/></a></dt>
+<dd><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1005085.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe1"><img src="http://i.cdn.turner.com/si/2011_images/cm/si-dropdown-EA-MADDEN12.jpg" alt="Get EA Sports Madden NFL 12 Free!" title="Get EA Sports Madden NFL 12 Free!"/></a></dd>
+-->
+
+<script><!--
+/*
+if (cnnPage.isHomepage) {
+	var button = $e('cnn_cm_subscribe0');
+	button.href = 'https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1004340.html';
+	button = $e('cnn_cm_subscribe1');
+	button.href = 'https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1004340.html';
+}
+*/
+//--></script>
+</dl></a></li>
+	<li class="cnnItem1"><dl><script type="text/javascript">
+var min=1;
+var max=2;
+x = Math.floor(Math.random() * (max - min + 1)) + min;
+/*turning off 50/50 for now*/
+/*if(x/2 == 1) {
+  document.write('<dt><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1002346.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe2"><img src="http://i.cdn.turner.com/si/.element/img/4.1/global/cm/button_subscribe_si_red.png" alt="Subscribe to SI" title="Subscribe to SI"/></a></dt>');
+  document.write('<dd><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1002346.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe4"><img src="http://i.cdn.turner.com/si/.element/img/4.1/global/cm/dropdown_subscribe_si_red.png" alt="Subscribe to SI" title="Subscribe to SI"/></a></dd>');
+} else {*/
+  document.write('<dt><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1002346.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe2"><img src="http://i.cdn.turner.com/si/.element/img/4.1/global/cm/button_subscribe_si_red.png" alt="Subscribe to SI" title="Subscribe to SI"/></a></dt>');
+  document.write('<dd style="margin-left:-79px"><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1002346.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe4"><img src="http://i.cdn.turner.com/si/2012_images/cm/bn_2osi16579_290x162_v1.png" alt="Subscribe to SI" title="Subscribe to SI"/></a></dd>');
+//}
+</script>
+</dl></li>
+	<li class="cnnItem2"><dl><!--Default ROS
+<a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1001406.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe3"><img src="http://i.cdn.turner.com/si/2012_images/cm/si-btn3_170x30_sigift.png" alt="Give the Gift of SI" title="Give the Gift of SI"/></a>
+-->
+
+<script type="text/javascript">
+/*var min=1;
+var max=2;
+x = Math.floor(Math.random() * (max - min + 1)) + min;
+if(x/2 == 1) {
+  document.write('<dt><a href="https://subscription.si.com/storefront/Give-the-Gift-of-Sports-Illustrated/site/si-donor0411jacket.html?xid=sirosheader&link=1001406" target="_blank" rel="nofollow" id="cnn_cm_subscribe0"><img src="http://i.cdn.turner.com/si/2011_images/cm/170x30.png" alt="Give the Gift of SI" title="Give the Gift of SI"/></a></dt>');
+  document.write('<dd><a href="https://subscription.si.com/storefront/Give-the-Gift-of-Sports-Illustrated/site/si-donor0411jacket.html?xid=sirosheader&link=1001406" target="_blank" rel="nofollow" id="cnn_cm_subscribe1"><img src="http://i.cdn.turner.com/si/2011_images/cm/170X110.jpg" alt="Give the Gift of SI" title="Give the Gift of SI"/></a></dd>');
+} else {
+  document.write('<dt><a href="http://www.si.com/swim2012" target="_blank" rel="nofollow" id="cnn_cm_subscribe0"><img src="http://i.cdn.turner.com/si/2012_images/cm/si-btn_swim.png" alt="Swimsuit 2012" title="Swimsuit 2012"/></a></dt>');
+  document.write('<dd><a href="http://www.si.com/swim2012" target="_blank" rel="nofollow" id="cnn_cm_subscribe1"><img src="http://i.cdn.turner.com/si/2012_images/cm/SWIM_2012_dropdown.png" alt="Swimsuit 2012" title="Swimsuit 2012"/></a></dd>');
+*/
+</script>
+
+<!--MLB2K 2012-->
+<dt><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1009469.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe0"><img src="http://i.cdn.turner.com/si/2012_images/cm/si-btn3_MLB2K12.png" alt="Get MLB 2K 12 FREE" title="Get MLB 2K 12 FREE"/></a></dt>
+<dd><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1009469.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe1"><img src="http://i.cdn.turner.com/si/2012_images/cm/si-MLB2K12-dropdown.jpg" alt="Get MLB 2K 12 FREE" title="Get MLB 2K 12 FREE"/></a></dd>
+
+<!--swimsuit 2012-->
+<!--
+<dt><a href="http://sportsillustrated.cnn.com/swim2012" target="_blank" rel="nofollow" id="cnn_cm_subscribe0"><img src="http://i.cdn.turner.com/si/2012_images/cm/si-btn_swim.png" alt="Swimsuit 2012" title="Swimsuit 2012"/></a></dt>
+<dd><a href="http://sportsillustrated.cnn.com/swim2012" target="_blank" rel="nofollow" id="cnn_cm_subscribe1"><img src="http://i.cdn.turner.com/si/2012_images/cm/SWIM_2012_dropdown.png" alt="Swimsuit 2012" title="Swimsuit 2012"/></a></dd>
+-->
+</dl></li>
+</ul>
+</div>
+	<div class="cnnRight"><form method="get" action="http://sportsillustrated.cnn.com/search/" name="cm_search"><input type="text" name="text" class="cnnLeft"/><input type="image" src="http://i.cdn.turner.com/si/.element/img/4.1/global/search.gif" alt="Search" title="Search" class="cnnRight"/></form></div>
+</div>
+<!-- end searchbar -->
+
+<!-- start banner -->
+<div class="cnnBanner">
+	<div><a href="/"><img src="http://i.cdn.turner.com/si/.element/img/4.1/sect/global/logo2.png" alt="SI.com Home" title="SI.com Home"/></a>
+</div>
+	<div class="cnnBannerSection">
+		<div class="cnnLeft"><a href="/baseball/mlb/"><img src="http://i.cdn.turner.com/si/.element/img/4.0/sect/baseball/mlb/icon.jpg"/></a></div>
+		<div class="cnn_header"><a href="/baseball/mlb/">MLB GAMEFLASH</a></div>
+		<div class="cnn_more" style="font-size:9px;"><a href="/baseball/mlb/scoreboards/today/">Scores</a> | <a href="/baseball/mlb/teams/">Teams</a> | <a href="/baseball/mlb/players/">Players</a> | <a href="/fantasy/player_news/mlb/">Player News</a> | <a href="/baseball/mlb/standings/">Standings</a> | <a href="/baseball/mlb/probables/today/">Probables</a> | <a href="/baseball/mlb/schedules/weekly/today/">Schedules</a> | <a href="/baseball/mlb/stats/">Stats</a> | <a href="/baseball/mlb/transactions/">Transactions</a> | <a href="/baseball/mlb/injuries/">Injuries</a> | <a href="http://www.ticketcity.com/mlb-tickets.html " target="_blank" rel="nofollow">Tickets</a> | <a href="http://mlb.mlb.com/mlb/subscriptions/index.jsp?product=si&vbID=simlbtv_test" target="_blank" rel="nofollow">MLB.TV</a>
+</div>
+	</div>
+</div>
+<div class="cnnClear"></div>
+<!-- end banner -->
+
+<style>
+/*
+DIV.cnnTopnav LI A { color:#000;display:block;padding:0 16px 0 16px!important; }
+DIV.cnnTopnav LI A { color:#000;display:block;padding:0 23px 0 22px!important; }
+*/
+DIV.cnnTopnav LI A { color:#000;display:block;padding:0 11px 0 11px!important; }
+DIV.cnnTopnav LI.cnnFirst { padding-left:0px; }
+</style>
+<div class="cnnTopnav">
+	<ul>
+		<li class="cnnFirst"><a href="/extramustard/?eref=sinav">EXTRA MUSTARD</a></li>
+		<li><a href="http://www.fannation.com/?eref=sinav">FANNATION</a></li>
+		<li><a href="/multimedia/photo_gallery/?eref=sinav">PHOTOS</a></li>
+		<li><a href="/swimsuit/?eref=sinav">SWIMSUIT</a></li>
+		<li><a href="/fantasy/?eref=sinav">FANTASY</a></li>
+		<li><a href="/magazine/sportsman/?eref=sinav">SPORTSMAN</a></li>
+		<li><a href="http://www.sportsillustratedeverywhere.com/">MAGAZINE</a></li>
+		<li><a href="/sifk/?eref=sinav">SI KIDS</a></li>
+		<li><a href="/highschool/?eref=sinav">HIGH SCHOOL</a></li>
+		<li><a href="/behindthemic/?eref=sinav">BEHIND THE MIC</a></li>
+		<li><a href="http://www.twackle.com/" target="_blank" rel="nofollow">TWACKLE</a></li>
+		<!--<li><a href="http://www.maxpreps.com/national/national.htm?eref=sinav" target="_blank" rel="nofollow">MAXPREPS</a></li>-->
+	</ul>
+</div>
+<!-- end topnav -->
+
+<style>
+	.ie6 #cnnBotnav LI#cnnBotnav0 { width:49px; } /* NFL */
+	.ie6 #cnnBotnav LI#cnnBotnav1 { width:150px; } /* COLLEGE FOOTBALL */
+	.ie6 #cnnBotnav LI#cnnBotnav2 { width:50px; } /* MLB */
+	.ie6 #cnnBotnav LI#cnnBotnav3 { width:51px; } /* NBA */
+	.ie6 #cnnBotnav LI#cnnBotnav4 { width:101px; } /* COLLEGE BB */
+	.ie6 #cnnBotnav LI#cnnBotnav5 { width:58px; } /* GOLF */
+	.ie6 #cnnBotnav LI#cnnBotnav6 { width:50px; } /* NHL */
+	.ie6 #cnnBotnav LI#cnnBotnav7 { width:74px; } /* RACING */
+	.ie6 #cnnBotnav LI#cnnBotnav8 { width:74px; } /* SOCCER */
+	.ie6 #cnnBotnav LI#cnnBotnav9 { width:121px; } /* MMA & BOXING */
+	.ie6 #cnnBotnav LI#cnnBotnav11 { width:73px; } /* TENNIS */
+	.ie6 #cnnBotnav LI#cnnBotnav12 { width:63px; } /* MORE */
+	.ie6 #cnnBotnav LI#cnnBotnav13 { width:74px; } /* VIDEO */
+	#cnnBotnav LI#cnnBotnav0 STRONG { width:49px; } /* NFL */
+	#cnnBotnav LI#cnnBotnav1 STRONG { width:150px; } /* COLLEGE FOOTBALL */
+	#cnnBotnav LI#cnnBotnav2 STRONG { width:50px; } /* MLB */
+	#cnnBotnav LI#cnnBotnav3 STRONG { width:51px; } /* NBA */
+	#cnnBotnav LI#cnnBotnav4 STRONG { width:101px; } /* COLLEGE BB */
+	#cnnBotnav LI#cnnBotnav5 STRONG { width:58px; } /* GOLF */
+	#cnnBotnav LI#cnnBotnav6 STRONG { width:50px; } /* NHL */
+	#cnnBotnav LI#cnnBotnav7 STRONG { width:74px; } /* RACING */
+	#cnnBotnav LI#cnnBotnav8 STRONG { width:74px; } /* SOCCER */
+	#cnnBotnav LI#cnnBotnav9 STRONG { width:121px; } /* MMA & BOXING */
+	#cnnBotnav LI#cnnBotnav11 STRONG { width:73px; } /* TENNIS */
+	#cnnBotnav LI#cnnBotnav12 STRONG { width:63px; } /* MORE */
+	#cnnBotnav LI#cnnBotnav13 STRONG { width:74px; } /* VIDEO */
+
+/* realignment */
+	#cnnBotnav LI#cnnBotnav11:hover UL,
+	#cnnBotnav LI#cnnBotnav11 LI.cnnOver UL { margin-left:0; } /* width of subnav minus width of TENNIS minus width of MORE minus 2 lines */
+	#cnnBotnav LI#cnnBotnav12:hover UL,
+	#cnnBotnav LI#cnnBotnav12 LI.cnnOver UL { margin-left:-41px; } /* width of subnav minus width of MORE minus 1 line */
+	#cnnBotnav LI#cnnBotnav13:hover UL,
+	#cnnBotnav LI#cnnBotnav13 LI.cnnOver UL { margin-left:-93px; width:168px; } /* width of subnav minus width of MORE minus 1 line */
+	#cnnBotnav LI#cnnBotnav13 UL LI { width:168px; }
+</style>
+<!-- start botnav -->
+<div class="cnnBotnav">
+	<div>
+		<ul id="cnnBotnav" style="height:29px;overflow:hidden;">
+			<li id="cnnBotnav0" nav="nfl">
+				<a href="/football/nfl/?eref=sinav"><strong>NFL</strong></a>
+			</li>
+			<li id="cnnBotnav1" nav="ncaaf">
+				<a href="/football/ncaa/?eref=sinav"><strong>COLLEGE FOOTBALL</strong></a>
+			</li>
+			<li id="cnnBotnav2" nav="mlb">
+				<a href="/baseball/mlb/?eref=sinav"><strong>MLB</strong></a>
+			</li>
+			<li id="cnnBotnav3" nav="nba">
+				<a href="/basketball/nba/?eref=sinav"><strong>NBA</strong></a>
+			</li>
+			<li id="cnnBotnav4" nav="ncaabb">
+				<a href="/basketball/ncaa/?eref=sinav"><strong>COLLEGE BB</strong></a>
+			</li>
+			<li id="cnnBotnav5" nav="golf">
+				<a href="http://www.golf.com/?eref=sinav"><strong>GOLF</strong></a>
+			</li>
+			<li id="cnnBotnav6" nav="nhl">
+				<a href="/hockey/nhl/?eref=sinav"><strong>NHL</strong></a>
+			</li>
+			<li id="cnnBotnav7" nav="racing">
+				<a href="/racing/?eref=sinav"><strong>RACING</strong></a>
+			</li>
+			<li id="cnnBotnav8" nav="soccer">
+				<a href="/soccer/?eref=sinav"><strong>SOCCER</strong></a>
+			</li>
+			<li id="cnnBotnav9" nav="boxmma">
+				<a href="/mma/?eref=sinav"><strong>MMA &amp; BOXING</strong></a>
+			</li>
+			<li id="cnnBotnav11" nav="tennis">
+				<a href="/tennis/?eref=sinav"><strong>TENNIS</strong></a>
+			</li>
+			<li id="cnnBotnav12" nav="more">
+				<a href="/more/?eref=sinav"><strong>MORE</strong></a>
+			</li>
+			<li id="cnnBotnav13" nav="video">
+				<a href="/video/?eref=sinav"><strong>VIDEO</strong></a>
+			</li>
+		</ul>
+	</div>
+</div>
+<!-- end botnav -->
+
+
+<div class="cnnViewerAd"><script type="text/javascript">iframeAdFactory.getAd('i_728x90', 728, 90, new Array('728x90','101x1'), true);</script></div>
+
+<!-- start scoreboard ticker -->
+<div id="scoreticker" class="stMLB">
+
+	<div id="stScrollWrap">
+		<a href="" class="stScrollControl left disabled"></a>
+		<a href="" class="stScrollControl right"></a>
+		<div id="stScroller"></div>
+	</div>
+	
+</div>
+<!-- end scoreboard ticker -->
+
+   <!-- end contentHeader-->
+   
+   <!-- start scoreboard -->
+   <div class="cnngScoreboardNoLastPlay">
+      <div class="cnngScoreboard">
+         <div class="cnnLeft">
+            <div>&nbsp;
+               						
+            </div>
+            <table border="0" cellpadding="0" cellspacing="0">
+               <tr class="cnnRow0">
+                  <td class="cnnCol0">&nbsp;</td>
+                  <td class="cnnCol1">1</td>
+                  <td class="cnnCol2">2</td>
+                  <td class="cnnCol3">3</td>
+                  <td class="cnnCol4">4</td>
+                  <td class="cnnCol5">5</td>
+                  <td class="cnnCol6">6</td>
+                  <td class="cnnCol7">7</td>
+                  <td class="cnnCol8">8</td>
+                  <td class="cnnCol9">9</td>
+                  <td class="cnnColR">R</td>
+                  <td class="cnnColH">H</td>
+                  <td class="cnnColE">E</td>
+               </tr>
+               <tr class="cnnRow1">
+                  <td class="cnnCol0"><a href="/baseball/mlb/teams/tigers/">TIGERS</a></td>
+                  <td class="cnnCol1">&nbsp;</td>
+                  <td class="cnnCol2">&nbsp;</td>
+                  <td class="cnnCol3">&nbsp;</td>
+                  <td class="cnnCol4">&nbsp;</td>
+                  <td class="cnnCol5">&nbsp;</td>
+                  <td class="cnnCol6">&nbsp;</td>
+                  <td class="cnnCol7">&nbsp;</td>
+                  <td class="cnnCol8">&nbsp;</td>
+                  <td class="cnnCol9">&nbsp;</td>
+                  <td class="cnnColR">&nbsp;</td>
+                  <td class="cnnColH">&nbsp;</td>
+                  <td class="cnnColE">&nbsp;</td>
+               </tr>
+               <tr class="cnnRow2">
+                  <td class="cnnCol0"><a href="/baseball/mlb/teams/royals/">ROYALS</a></td>
+                  <td class="cnnCol1">&nbsp;</td>
+                  <td class="cnnCol2">&nbsp;</td>
+                  <td class="cnnCol3">&nbsp;</td>
+                  <td class="cnnCol4">&nbsp;</td>
+                  <td class="cnnCol5">&nbsp;</td>
+                  <td class="cnnCol6">&nbsp;</td>
+                  <td class="cnnCol7">&nbsp;</td>
+                  <td class="cnnCol8">&nbsp;</td>
+                  <td class="cnnCol9">&nbsp;</td>
+                  <td class="cnnColR">&nbsp;</td>
+                  <td class="cnnColH">&nbsp;</td>
+                  <td class="cnnColE">&nbsp;</td>
+               </tr>
+            </table>
+         </div>
+         <div class="cnnRight">
+            <ol>
+               <li class="cnnItem4">8:10 PM ET
+                  						
+               </li>
+            </ol>
+            <ul>
+               <li class="cnnItem0"><strong>Tigers</strong><a href="/baseball/mlb/players/7590/"><img src="http://i.cdn.turner.com/si/.e1d/img/4.0/global/baseball/mlb/players/7590_small.jpg" border="0" width="50" height="76" alt="Verlander" title="Verlander"></a><a href="/baseball/mlb/players/7590/">
+                     <div class="cnnLine0">Verlander</div>
+                     <div class="cnnLine4">0-1</div>
+                     <div class="cnnLine5">2.2&nbsp;ERA</div>
+                     <div class="cnnLine6">&nbsp;</div>
+                     <div class="cnnLine7">&nbsp;</div></a></li>
+               <li class="cnnItem1"><strong>Royals</strong><a href="/baseball/mlb/players/8932/"><img src="http://i.cdn.turner.com/si/.e1d/img/4.0/global/baseball/mlb/players/8932_small.jpg" border="0" width="50" height="76" alt="Duffy" title="Duffy"></a><a href="/baseball/mlb/players/8932//">
+                     <div class="cnnLine0">Duffy</div>
+                     <div class="cnnLine4">1-0</div>
+                     <div class="cnnLine5">0&nbsp;ERA</div>
+                     <div class="cnnLine6">&nbsp;</div>
+                     <div class="cnnLine7">&nbsp;</div></a></li>
+            </ul>
+         </div>
+      </div>
+   </div>
+   <!-- end scoreboard -->
+   
+   <!-- start navbar -->
+   <div class="cnngNavbar">
+      <table border="0" cellpadding="0" cellspacing="0">
+         <tr>
+            <td class="cnnCol0"><span>PREVIEW</span></td>
+            <td class="cnnCol0"><a href="40630_matchup.html">MATCHUP</a></td></li>
+            <td class="cnnCol3"><a href="40630_fancomment.html">FAN COMMENTS</a></td>
+         </tr>
+      </table>
+   </div>
+   <!-- end navbar -->
+   
+   <!-- start content -->
+   <div class="cnngContent">
+   	<div class="cnngPreview">
+   		<div class="cnnLeft">
+   			
+   <!-- REAPFINDREPLACE:20120515:/.element/ssi/story/4.1/wires/ap/expired_story.html:/baseball/mlb/gameflash/2012/04/16/40630_preview.html-->
+   <h1>Tigers-Royals Preview</h1>
+   <p>
+      
+      <span class="cnnDataLinked"><a href="/baseball/mlb/players/7590/index.html">Justin Verlander</a></span>
+      has pitched well in each of his first two starts, though he doesn't have a win to show for those efforts.
+      
+   </p>
+   <p>
+      He hasn't had much trouble earning victories against the 
+      <span class="cnnDataLinked"><a href="/baseball/mlb/teams/royals/index.html">Kansas City Royals</a></span>
+      .
+      
+   </p>
+   <p>
+      Verlander looks to continue his mastery of the Royals when the 
+      <span class="cnnDataLinked"><a href="/baseball/mlb/teams/tigers/index.html">Detroit Tigers</a></span>
+      visit Kauffman Stadium in the opener of a three-game series Monday night.
+      
+   </p>
+   <p>
+      The reigning AL 
+      <span class="cnnDataLinked"><a href="/baseball/mlb/players/49534/index.html">Cy Young</a></span>
+      winner and MVP had a 2-0 lead through eight innings in both of his outings, but the Tigers weren't able to hold the lead.
+      
+   </p>
+   <p>Verlander (0-1, 2.20 ERA) allowed two hits before running into trouble in the ninth against Tampa Bay on Wednesday, getting
+      charged with four runs in 8 1-3 innings of a 4-2 defeat.
+   </p>"Once a couple guys got on, really the first time I've cranked it up like that - and lost a little bit of my consistency that
+   I'd had all day," Verlander said. "It's inexcusable. This loss rests solely on my shoulders." 
+   <p>The right-hander did his part in his opening-day start against Boston on April 5, allowing two hits before the bullpen faltered.
+      Detroit ended up winning 3-2 with a run in the bottom of the ninth, though Verlander didn't earn a decision.
+   </p>
+   <p>That hasn't been the case in his last four starts against the Royals, winning each with a 1.82 ERA. Verlander is 13-2 with
+      a 2.40 ERA in 19 career starts versus Kansas City, and another win will give him more victories than he has against any other
+      team. He's also beaten Cleveland 13 times.
+   </p>
+   <p>Verlander is 8-2 with a 1.82 ERA lifetime at Kauffman Stadium, where the Royals (3-6) were swept in a three-game series against
+      the Indians with Sunday's 13-7 loss.
+   </p>
+   <p>
+      
+      <span class="cnnDataLinked"><a href="/baseball/mlb/players/7634/index.html">Billy Butler</a></span>
+      , who is 14 for 39 (.359) with two homers off Verlander, had an RBI single and is hitting .364 with four doubles and a homer
+      during a five-game hitting streak.
+      
+   </p>
+   <p>
+      Royals pitchers allowed seven home runs, 17 extra-base hits and 32 runs in the series, and manager 
+      <span class="cnnDataLinked"><a href="/baseball/mlb/players/1716/index.html">Ned Yost</a></span>
+      turned to outfielder 
+      <span class="cnnDataLinked"><a href="/baseball/mlb/players/7899/index.html">Mitch Maier</a></span>
+      in the ninth to pitched a scoreless inning Sunday.
+      
+   </p>"Let's hope it doesn't happen again," Maier said. "I don't like to be put in that situation, but we needed an inning." 
+   <p>
+      Kansas City will look to bounce back with the help of another solid outing from 
+      <span class="cnnDataLinked"><a href="/baseball/mlb/players/8932/index.html">Danny Duffy</a></span>
+      (1-0, 0.00), who allowed one hit and struck out eight in six innings of a 3-0 win over Oakland on Tuesday.
+      
+   </p>
+   <p>The left-hander will be seeking his first win against Detroit after going 0-2 with a 5.63 ERA in three starts versus the Tigers
+      as a rookie.
+   </p>
+   <p>
+      
+      <span class="cnnDataLinked"><a href="/baseball/mlb/players/7129/index.html">Gerald Laird</a></span>
+      was a triple short of the cycle and helped the Tigers (6-3) salvage the finale of a three-game series with a 5-2 victory over
+      Chicago on Sunday.
+      
+   </p>
+   <p>
+      
+      <span class="cnnDataLinked"><a href="/baseball/mlb/players/8419/index.html">Rick Porcello</a></span>
+      allowed one run in 7 2-3 innings to give Detroit's starting rotation its first victory.
+      
+   </p>"All the other starters have pitched well," Porcello said. "It's just the way it's happened so far." 
+   <p>Verlander allowed three runs in seven innings of a 4-3 win over the Royals on Aug. 6, beating Duffy, who gave up three runs
+      over five.
+   </p>
+   <!-- /REAPFINDREPLACE:20120515:/.element/ssi/story/4.1/wires/ap/expired_story.html:/baseball/mlb/gameflash/2012/04/16/40630_preview.html-->
+   			<p class="cnnLast">
+   				<a href="http://biz.stats.com/" target="new">&#169; 2011 STATS LLC <img src="http://i.a.cnn.net/si/images/STATSlogo.gif" align="absmiddle" alt="STATS, Inc"></a>
+
+   			</p>
+   		</div>
+   		<div class="cnnRight">
+   			
+   			<div class="cnngCommentsBox" id="cnngCommentsBox">
+   				<div class="cnngComments">
+   					<div class="cnnHolder">						
+   						<div id="fanComments">
+   							<iframe src="http://www.fannation.com/gameday/gameflash_game_comments/320416107?sport_id=2" width="397" height="390" marginwidth="0" scrolling="no" frameborder="0"></iframe>
+   						</div>
+   					</div>
+   				</div>
+   				<div class="cnn_footer">
+   					<div class="cnngToggleOn"><a href="javascript:hidediv();">TURN COMMENTS <span>OFF</span></a></div>
+   					<div class="cnngToggleOff"><a href="javascript:showdiv();">TURN COMMENTS <span>ON</span></a></div>
+   				</div>
+   			</div>
+   		</div>
+   	</div>
+   </div>
+   <!-- end content -->
+   
+   <!-- start contentFooter -->
+   <div class="cnnWideSL"><script type="text/javascript">adsonar_placementId=1488671;adsonar_pid=769769;adsonar_ps=-1;adsonar_zw=978;adsonar_zh=150;</script><script>cnnad_createSL();</script></div>
+<!-- start footerbox -->
+<div class="cnnFooterBox">
+	<div class="cnnHolder">
+		<div class="cnnRight">
+			<dl>
+				<dt><a href="/"><img src="http://i.cdn.turner.com/si/.element/img/4.1/global/footer_logo.jpg" alt="SI.com" title="SI.com"/></a></dt>
+				<dd><span>Hot Topics:</span>   <a href="/2012/writers/peter_king/04/16/countdown/index.html" title="Peter King: MMQB"class="cnnFirst">Peter King: MMQB</a>   <a href="http://nhl-red-light.si.com/2012/04/16/mayhem-reigns-in-stanley-cup-playoffs/" title="NHL Playoffs" target="new" >NHL Playoffs</a>   <a href="/2012/writers/george_schroeder/04/16/arkansas-football-petrino/index.html" title="Bobby Petrino">Bobby Petrino</a>   <a href="/2012/baseball/mlb/04/16/valentine.youkilis.ap/index.html" title="Bobby Valentine">Bobby Valentine</a>   <a href="/2012/writers/michael_mccann/04/16/roger.clemens.trial.preview/index.html" title="Roger Clemens">Roger Clemens</a>   <a href="/2012/baseball/mlb/04/16/power.rankings/index.html" title="MLB Power Rankings">MLB Power Rankings</a>   <a href="/2012/writers/richard_rothschild/04/13/jackie.robinson/index.html" title="Jackie Robinson">Jackie Robinson</a> </dd>
+			</dl>
+			<div class="cnnClear"></div>
+			<ul>
+				<li><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1002348.html" target="_blank" rel="nofollow">SUBSCRIBE TO SI</a></li>
+				<li><a href="http://www.sportsillustratedeverywhere.com" target="_blank" rel="nofollow">DIGITAL EDITION</a></li>
+				<li><a href="/mobile/">SI MOBILE</a></li>
+				<li><a href="/2010/about_us/jobs/">JOBS</a></li>
+				<li><a href="/sitemap/">SITE MAP</a></li>
+				<li><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1003862.html" target="_blank" rel="nofollow">GIVE THE GIFT OF SI</a></li>
+				<li><a href="http://sipictures.com/" target="_blank" rel="nofollow">SI PICTURE SALES</a></li>
+				<li><a href="http://www.sportsillustratedsnapshot.com" target="_blank" rel="nofollow">PICTURES OF THE DAY</a></li>
+				<li><a href="/about_us/">ABOUT US</a></li>
+				<li><a href="http://simediakit.com" target="_blank" rel="nofollow">SI MEDIA KITS</a></li>
+				<li><a href="http://www.sicovers.com/default.aspx?utm_source=sicom&utm_medium=ftr&utm_campaign=icrefer&xid=siftr" target="_blank" rel="nofollow">SI COVER COLLECTION</a></li>
+				<li><a href="http://sicustomerservice.com/" target="_blank" rel="nofollow">SI CUSTOMER SERVICE</a></li>
+				<li><a href="/2008/magazine/si.books/">SI BOOKS</a></li>
+				<li><a href="/about_us/feedback/">CONTACT US</a></li>
+				<li><a href="/services/rss/">ADD RSS HEADLINE</a></li>
+			</ul>
+			<div class="cnnClear"></div>
+			<div class="cnnCopyright">
+				<style>
+				.cnnFooterBox .cnnHolder { overflow:hidden; }
+				.cnnFooterBox .cnnRight DIV.cnnCopyright { line-height:16px;padding-top:2px;text-align:left; }
+				.cnnFooterBox .cnnRight DIV.cnnCopyright IMG { float:left;margin:0 6px 14px 0; }
+				.cnnFooterBox .cnnRight DIV.cnnCopyright IMG#cnnFooterAdOpt { float:none;margin:0 0 0 6px;vertical-align:bottom; }
+				</style>
+				<img src="http://i.cdn.turner.com//si/.element/img/4.1/global/logo_footer_turner.png" alt="Turner - SI Digital"/> 
+				<script type="text/javascript">if( ( ( document.location.pathname ).indexOf( '/basketball/nba' ) >= 0 ) || ( ( document.location.pathname ).indexOf( '/video/nba' ) == 0 ) ) { document.write( 'TM & &#169; 2012 Turner Broadcasting System, Inc. A Time Warner Company. All Rights Reserved. SI.com is part of CNN Digital Network, which is part of the Turner Digital Network.' ); } else { document.write( 'TM & &#169; 2012 Turner Broadcasting System, Inc. A Time Warner Company. All Rights Reserved. SI.com is part of CNN Digital Network, which is part of the Turner Digital Network.' ); }</script><noscript>TM & &#169; 2012 Turner Broadcasting System, Inc. A Time Warner Company. All Rights Reserved. SI.com is part of CNN Digital Network, which is part of the Turner Digital Network.</noscript>
+				<br/> <a href="/interactive_legal.html" rel="nofollow">Terms</a> under which this service is provided to you. Read our <a href="/privacy/" rel="nofollow">privacy guidelines</a>, <a href="https://subscription.timeinc.com/storefront/privacy/si/generic_privacy_new.html?dnp-source=E#california" rel="nofollow">your California privacy rights</a>, and <a href="http://subscription-assets.timeinc.com/prod/assets/themes/magazines/default/template-resources/html/legal/ti-corp-behavioral.html">ad choices<img src="http://i.cdn.turner.com/si/.element/img/4.1/global/logo_adchoices.gif" id="cnnFooterAdOpt"/></a>.
+			</div>
+		</div>
+		<div class="cnnLeft"><a href="http://sportsillustrated.cnn.com/vault/cover/featured/11730/index.htm?xid=sivcoverhome"><img style="vertical-align:bottom;" title="SI Cover" alt="SI Cover" src="http://i.cdn.turner.com/si/si_online/covers/images/2012/0416_thumb.jpg"></a><a href="http://www.sportsillustratedeverywhere.com/?xid=sivcoverhome"><img style="vertical-align:bottom;" src="http://i.cdn.turner.com/sivault/.element/img/1.0/read_all_articles_96x12.gif" alt="Read All Articles" border="0" width="96" height="12"></a><a href="http://www.sicovers.com/ils.aspx?p=SPR20120416golf&utm_source=sivault&utm_medium=inet&utm_campain=icrefer &xid=sivcoverhome" target="_blank"><img style="vertical-align:bottom;" src="http://i.cdn.turner.com/sivault/.element/img/1.0/buy_cover_reprint.gif" alt="Buy Cover Reprint" border="0" width="96" height="12"></a>
+</div>
+	</div>
+</div>
+
+<!-- end footerbox -->
+
+<!-- start searchbar -->
+<div class="cnnSearchFooter">
+	<div class="cnnCenter"><form method="get" action="http://sportsillustrated.cnn.com/search/" name="footer_search"><input id="searchInputFooter" type="text" name="text" class="cnnLeft"/><input type="image" src="http://i.cdn.turner.com/si/.element/img/4.1/global/search.gif" alt="Search" title="Search" class="cnnRight"/></form></div>
+</div>
+
+<!-- end searchbar -->
+
+<!--START OF PAGELINKS.JS-->
+<script language="Javascript">// Post Processing code to update links with tracking references
+
+var url = window.location.href.toString();
+url = url.replace(/http:\/\/[^\/]*/, '');
+url = url.replace(/\?.*$/, '');
+
+// All links on page
+var links = document.getElementsByTagName('a');
+
+for (var i=0; i < links.length; i++) {
+	var link = links[i];
+	if (link.href); else continue;
+	if (link.href.indexOf('.html/')>0) { siLog.debug('Fix trail slash - ',link.href); link.href = link.href.replace(/\.html\//,'.html'); }
+	if (!cnnPage.isHomepage) {
+		// Loop through links, add erefs where expected
+		if (link.href.indexOf('http://www.fannation.com/') == 0) {
+			cnnAddQ( link, 'eref=fromSI' );
+		}
+		if (url != '/' && link.href.indexOf('/vault') > 0) {
+			cnnAddQ( link, 'eref=sisf' );
+		} 
+		if (url.indexOf('/danpatrick') != 0 && link.href.indexOf('/danpatrick') > 0 && link.href.indexOf('.mp3') < 0) {
+			cnnAddQ( link, 'eref=fromSI' );
+		}
+	}
+	if (link.innerHTML == link.getAttribute('title')) {
+		link.setAttribute('title','');
+	}
+}
+
+function cnnAddQ (link, add) {
+	if (link.href.toLowerCase().indexOf('javascript') == -1) {
+		if (link.href.indexOf('?') > 0) link.href = link.href + '&' + add;
+		else link.href = link.href + '?' + add;
+	}
+}
+
+// Add whitespace to cnnClear
+var breaks = $c('cnnClear','div');
+
+/* Homepage */
+if (cnnPage.isHomepage) {
+	cnnTagHPLinks(); 
+	/* iPad */
+	if(navigator.userAgent.indexOf('iPad')>-1) {
+		$e('cnnShareRow_mobile').href='http://ax.itunes.apple.com/WebObjects/MZStore.'
+		+'woa/wa/browserRedirect?url=itms%253A%252F%252Fax.itunes.apple.com%252FWebObj'
+		+'ects%252FMZStore.woa%252Fwa%252FviewSoftware%253Fid%253D329510739%2526mt%253D8';
+	}
+	/* Poll frame height issue */
+	if ($e('cnnPollFrame')) { $e('cnnPollFrame').setAttribute('height','169'); }
+}</script>
+<!--END OF PAGELINKS.JS-->
+
+</div>
+<div><!-- move tracking out of cnnpage -->
+<!-- ADBP/JSMD -->
+<!-- ADBP Meta Data -->
+<script type="text/javascript" src="http://i.cdn.turner.com/si/.e/js/4.1/global/jsmd/metadata.js"></script>
+<!-- /ADBP Meta Data -->	
+
+<!-- JSMD Code --> 
+<script language="JavaScript" type="text/javascript" src="http://i.cdn.turner.com/si/.element/js/4.1/global/jsmd/jsmd.js"></script> 
+<script language="JavaScript"> 
+<!-- $pathname is defined in metadata.js
+if($pathname.indexOf("/.element/ssi/ads.iframes/") == -1 && $pathname.indexOf("/doubleclick/dartiframe.html") == -1) {
+	var jsmd=_jsmd.init();
+	if(document.referrer !== window.location.href){
+		jsmd.send();
+	}
+}
+//-->
+</script> 
+<!-- / End JSMD Code -->
+<!-- /ADBP/JSMD -->
+</div>
+
+<div style="font-size:1px;line-height:1px;">
+<div><img src="/cookie.crumb" width="1" height="1"></div>
+</div>
+
+<img src="http://i.cdn.turner.com/si/.e/img/4.0/global/pixels/blank_pixel.gif" alt="" id="TargetImageDE" name="TargetImageDE" onload="cnnad_getDEAdHeadCookie(this)" height="1" width="1">
+
+<script language="JavaScript">
+	siTracking.init();
+</script>
+<script language="JavaScript">
+	//ADM
+	cnnad_sendADMData();
+	cnnad_ugsync();
+</script>
+
+<!-- TIIAD -->
+<script type="text/javascript">
+function siQuantcast()
+{
+	var lb = "Time Inc News Business and Sports,Sports Illustrated";
+	var lb_ch = (jsmd.get("m:page.section[0]") ? jsmd.get("m:page.section[0]") : "");
+	lb+=(lb_ch != null && typeof(lb_ch) == "string" && lb_ch.length > 0) ? "." + lb_ch:"";
+	return lb;
+}
+_qoptions={
+	qacct:"p-5dyPa639IrgIw",
+	labels:siQuantcast()
+};
+</script>
+<script type="text/javascript" src="http://edge.quantserve.com/quant.js"></script>
+<noscript><img src="http://pixel.quantserve.com/pixel/p-5dyPa639IrgIw.gif?labels=Time Inc News Business and Sports,Sports Illustrated" style="display: none;" border="0" height="1" width="1" alt="Quantcast"/></noscript> 
+<script src="http://js.revsci.net/gateway/gw.js?csid=H07710&auto=t" type="text/javascript"></script>
+<!-- /TIIAD -->
+
+<script src="http://i.cdn.turner.com/si/.e1d/js/4.1/global/pagelinks.js" type="text/javascript"></script>
+<script src="http://i.cdn.turner.com/si/.e1d/js/4.1/global/subnav.js" type="text/javascript"></script>
+
+   <!-- end contentFooter -->
+   
+   <!--[if IE 6]></div></div><![endif]--><!--[if IE 7]></div></div><![endif]--><!--[if IE 8]></div></div><![endif]-->
+   </body>
+   </html>
\ No newline at end of file
diff --git a/tests/test_article_only.py b/tests/test_article_only.py
new file mode 100644
index 0000000..41bfd85
--- /dev/null
+++ b/tests/test_article_only.py
@@ -0,0 +1,39 @@
+import os
+import unittest
+
+from readability import Document
+
+
+SAMPLES = os.path.join(os.path.dirname(__file__), 'samples')
+
+
+def load_sample(filename):
+    """Helper to get the content out of the sample files"""
+    return open(os.path.join(SAMPLES, filename)).read()
+
+
+class TestArticleOnly(unittest.TestCase):
+    """The option to not get back a full html doc should work
+
+    Given a full html document, the call can request just divs of processed
+    content. In this way the developer can then wrap the article however they
+    want in their own view or application.
+
+    """
+
+    def setUp(self):
+        """"""
+        pass
+
+    def tearDown(self):
+        """"""
+        pass
+
+    def test_si_sample(self):
+        """Using the si sample, make sure we can get the article alone."""
+        sample = load_sample('si-game.sample.html')
+        doc = Document(sample)
+        res = doc.summary(document_only=True)
+
+        self.assertEqual('<div class="', res[0:12])
+

From 5a98e2c1b85b16699fda4e2e91905d5837ef8314 Mon Sep 17 00:00:00 2001
From: Richard Harding <rharding@mitechie.com>
Date: Mon, 16 Apr 2012 20:55:13 -0400
Subject: [PATCH 2/4] Correct appending and allow for document only

- Fix the appending of siblings so they go into the nested
html->body->div element instead of the document root.
- Add a document_only flag so that you can get a DOM tree you can nest
yourself without html/body tags.
---
 README                     |  5 +++++
 readability/readability.py | 28 +++++++++++++++-------------
 tests/test_article_only.py | 22 +++++++++++-----------
 3 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/README b/README
index 36bb28c..acc96fa 100644
--- a/README
+++ b/README
@@ -33,3 +33,8 @@ Usage::
 Command-line usage::
 
     python -m readability.readability -u http://pypi.python.org/pypi/readability-lxml
+
+
+Document() kwarg options:
+    url=xxx  base URL of the page; when given, make_links_absolute() is run against it
+
diff --git a/readability/readability.py b/readability/readability.py
index b409c59..ae760c5 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -98,7 +98,6 @@ class Document:
             ruthless = True
             while True:
                 self._html(True)
-
                 for i in self.tags(self.html, 'script', 'style'):
                     i.drop_tree()
                 for i in self.tags(self.html, 'body'):
@@ -111,7 +110,8 @@ class Document:
                 best_candidate = self.select_best_candidate(candidates)
 
                 if best_candidate:
-                    article = self.get_article(candidates, best_candidate)
+                    article = self.get_article(candidates, best_candidate,
+                            document_only=document_only)
                 else:
                     if ruthless:
                         logging.debug("ruthless removal did not work. ")
@@ -136,12 +136,15 @@ class Document:
             logging.exception('error getting summary: ' )
             raise Unparseable(str(e)), None, sys.exc_info()[2]
 
-    def get_article(self, candidates, best_candidate):
+    def get_article(self, candidates, best_candidate, document_only=False):
         # Now that we have the top candidate, look through its siblings for content that might also be related.
         # Things like preambles, content split by ads that we removed, etc.
-
         sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
-        output = document_fromstring('<div/>')
+        # create a new html document with a html->body->div
+        if document_only:
+            output = fragment_fromstring('<div/>')
+        else:
+            output = document_fromstring('<div/>')
         best_elem = best_candidate['elem']
         for sibling in best_elem.getparent().getchildren():
             #if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text
@@ -163,7 +166,12 @@ class Document:
                     append = True
 
             if append:
-                output.append(sibling)
+                # We don't want to append directly to output, but the div
+                # in html->body->div
+                if document_only:
+                    output.append(sibling)
+                else:
+                    output.getchildren()[0].getchildren()[0].append(sibling)
         #if output is not None:
         #    output.append(best_elem)
         return output
@@ -454,13 +462,7 @@ class Document:
             if not (self.options['attributes']):
                 #el.attrib = {} #FIXME:Checkout the effects of disabling this
                 pass
-        # There can be two nodes here. We really want to tounicode only one of
-        # them.
-        # To start with let's hack it to get the longest tree as our document.
-        if len(node.getchildren()) > 1:
-            children = node.getchildren()
-            sorted_list = sorted(children, key=len, reverse=True)
-            node = sorted_list[0]
+
         return clean_attributes(tounicode(node))
 
 
diff --git a/tests/test_article_only.py b/tests/test_article_only.py
index 41bfd85..28240bd 100644
--- a/tests/test_article_only.py
+++ b/tests/test_article_only.py
@@ -21,19 +21,19 @@ class TestArticleOnly(unittest.TestCase):
 
     """
 
-    def setUp(self):
-        """"""
-        pass
-
-    def tearDown(self):
-        """"""
-        pass
-
     def test_si_sample(self):
+        """Using the si sample, the summary is wrapped in html/body tags."""
+        sample = load_sample('si-game.sample.html')
+        doc = Document(
+            sample,
+            url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
+        res = doc.summary()
+        self.assertEqual('<html><body><div><div class', res[0:27])
+
+    def test_si_sample_doc_only(self):
         """Using the si sample, make sure we can get the article alone."""
         sample = load_sample('si-game.sample.html')
-        doc = Document(sample)
+        doc = Document(sample, url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
         res = doc.summary(document_only=True)
-
-        self.assertEqual('<div class="', res[0:12])
+        self.assertEqual('<div><div class="', res[0:17])
 

From a46dc14251f088501941e59bd527aaeb67f92082 Mon Sep 17 00:00:00 2001
From: Richard Harding <rharding@mitechie.com>
Date: Mon, 16 Apr 2012 21:23:19 -0400
Subject: [PATCH 3/4] Try to pep8 all the things but give up when I got close.

---
 readability/readability.py | 189 +++++++++++++++++++++++++------------
 1 file changed, 129 insertions(+), 60 deletions(-)

diff --git a/readability/readability.py b/readability/readability.py
index ae760c5..0a40198 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -1,21 +1,32 @@
 #!/usr/bin/env python
-from cleaners import html_cleaner, clean_attributes
-from collections import defaultdict
-from htmls import build_doc, get_body, get_title, shorten_title
-from lxml.etree import tostring, tounicode
-from lxml.html import fragment_fromstring, document_fromstring
 import logging
 import re
 import sys
 
+from collections import defaultdict
+from lxml.etree import tostring
+from lxml.etree import tounicode
+from lxml.html import document_fromstring
+from lxml.html import fragment_fromstring
+
+from cleaners import clean_attributes
+from cleaners import html_cleaner
+from htmls import build_doc
+from htmls import get_body
+from htmls import get_title
+from htmls import shorten_title
+
+
 logging.basicConfig(level=logging.INFO)
+log = logging.getLogger()
+
 
 REGEXES = {
-    'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter',re.I),
-    'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow',re.I),
-    'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story',re.I),
-    'negativeRe': re.compile('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget',re.I),
-    'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)',re.I),
+    'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
+    'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow', re.I),
+    'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story', re.I),
+    'negativeRe': re.compile('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget', re.I),
+    'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
     #'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
     #'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
     #'trimRe': re.compile('^\s+|\s+$/'),
@@ -25,21 +36,29 @@ REGEXES = {
     #skipFootnoteLink:      /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
 }
 
+
+class Unparseable(ValueError):
+    pass
+
+
 def describe(node, depth=1):
     if not hasattr(node, 'tag'):
         return "[%s]" % type(node)
     name = node.tag
-    if node.get('id', ''): name += '#'+node.get('id')
+    if node.get('id', ''):
+        name += '#' + node.get('id')
     if node.get('class', ''):
-        name += '.' + node.get('class').replace(' ','.')
+        name += '.' + node.get('class').replace(' ', '.')
     if name[:4] in ['div#', 'div.']:
         name = name[3:]
     if depth and node.getparent() is not None:
-        return name+' - '+describe(node.getparent(), depth-1)
+        return name + ' - ' + describe(node.getparent(), depth - 1)
     return name
 
+
 def to_int(x):
-    if not x: return None
+    if not x:
+        return None
     x = x.strip()
     if x.endswith('px'):
         return int(x[:-2])
@@ -47,26 +66,37 @@ def to_int(x):
         return int(x[:-2]) * 12
     return int(x)
 
+
 def clean(text):
     text = re.sub('\s*\n\s*', '\n', text)
     text = re.sub('[ \t]{2,}', ' ', text)
     return text.strip()
 
+
 def text_length(i):
     return len(clean(i.text_content() or ""))
 
-class Unparseable(ValueError):
-    pass
 
 class Document:
+    """Class to build an etree document out of html."""
     TEXT_LENGTH_THRESHOLD = 25
     RETRY_LENGTH = 250
 
     def __init__(self, input, **options):
+        """Generate the document
+
+        :param input: string of the html content.
+
+        kwargs:
+            - attributes:
+            - debug: output debug messages
+            - min_text_length:
+            - retry_length:
+            - url: will allow adjusting links to be absolute
+
+        """
         self.input = input
-        self.options = defaultdict(lambda: None)
-        for k, v in options.items():
-            self.options[k] = v
+        self.options = options
         self.html = None
 
     def _html(self, force=False):
@@ -77,7 +107,7 @@ class Document:
     def _parse(self, input):
         doc = build_doc(input)
         doc = html_cleaner.clean_html(doc)
-        base_href = self.options['url']
+        base_href = self.options.get('url', None)
         if base_href:
             doc.make_links_absolute(base_href, resolve_base_href=True)
         else:
@@ -94,6 +124,12 @@ class Document:
         return shorten_title(self._html(True))
 
     def summary(self, document_only=False):
+        """Generate the summary of the html document
+
+        :param document_only: return only the div of the document, don't wrap
+        in html and body tags.
+
+        """
         try:
             ruthless = True
             while True:
@@ -114,32 +150,43 @@ class Document:
                             document_only=document_only)
                 else:
                     if ruthless:
-                        logging.debug("ruthless removal did not work. ")
+                        log.debug("ruthless removal did not work. ")
                         ruthless = False
-                        self.debug("ended up stripping too much - going for a safer _parse")
+                        self.debug(
+                            ("ended up stripping too much - "
+                             "going for a safer _parse"))
                         # try again
                         continue
                     else:
-                        logging.debug("Ruthless and lenient parsing did not work. Returning raw html")
+                        log.debug(
+                            ("Ruthless and lenient parsing did not work. "
+                             "Returning raw html"))
                         article = self.html.find('body')
                         if article is None:
                             article = self.html
                 cleaned_article = self.sanitize(article, candidates)
-                of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH)
+                article_length = len(cleaned_article or '')
+                retry_length = self.options.get(
+                    'retry_length',
+                    self.RETRY_LENGTH)
+                of_acceptable_length = article_length >= retry_length
                 if ruthless and not of_acceptable_length:
                     ruthless = False
-                    continue # try again
+                    # Loop through and try again.
+                    continue
                 else:
                     return cleaned_article
         except StandardError, e:
-            #logging.exception('error getting summary: ' + str(traceback.format_exception(*sys.exc_info())))
-            logging.exception('error getting summary: ' )
+            log.exception('error getting summary: ')
             raise Unparseable(str(e)), None, sys.exc_info()[2]
 
     def get_article(self, candidates, best_candidate, document_only=False):
-        # Now that we have the top candidate, look through its siblings for content that might also be related.
+        # Now that we have the top candidate, look through its siblings for
+        # content that might also be related.
         # Things like preambles, content split by ads that we removed, etc.
-        sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
+        sibling_score_threshold = max([
+            10,
+            best_candidate['content_score'] * 0.2])
         # create a new html document with a html->body->div
         if document_only:
             output = fragment_fromstring('<div/>')
@@ -147,12 +194,14 @@ class Document:
             output = document_fromstring('<div/>')
         best_elem = best_candidate['elem']
         for sibling in best_elem.getparent().getchildren():
-            #if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text
+            # in lxml there is no concept of simple text
+            # if isinstance(sibling, NavigableString): continue
             append = False
             if sibling is best_elem:
                 append = True
-            sibling_key = sibling #HashableElement(sibling)
-            if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
+            sibling_key = sibling  # HashableElement(sibling)
+            if sibling_key in candidates and \
+                candidates[sibling_key]['content_score'] >= sibling_score_threshold:
                 append = True
 
             if sibling.tag == "p":
@@ -162,7 +211,9 @@ class Document:
 
                 if node_length > 80 and link_density < 0.25:
                     append = True
-                elif node_length <= 80 and link_density == 0 and re.search('\.( |$)', node_content):
+                elif node_length <= 80 \
+                    and link_density == 0 \
+                    and re.search('\.( |$)', node_content):
                     append = True
 
             if append:
@@ -180,7 +231,9 @@ class Document:
         sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
         for candidate in sorted_candidates[:5]:
             elem = candidate['elem']
-            self.debug("Top 5 : %6.3f %s" % (candidate['content_score'], describe(elem)))
+            self.debug("Top 5 : %6.3f %s" % (
+                candidate['content_score'],
+                describe(elem)))
 
         if len(sorted_candidates) == 0:
             return None
@@ -188,7 +241,6 @@ class Document:
         best_candidate = sorted_candidates[0]
         return best_candidate
 
-
     def get_link_density(self, elem):
         link_length = 0
         for i in elem.findall(".//a"):
@@ -199,10 +251,10 @@ class Document:
         return float(link_length) / max(total_length, 1)
 
     def score_paragraphs(self, ):
-        MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
+        MIN_LEN = self.options.get(
+            'min_text_length',
+            self.TEXT_LENGTH_THRESHOLD)
         candidates = {}
-        #self.debug(str([describe(node) for node in self.tags(self.html, "div")]))
-
         ordered = []
         for elem in self.tags(self._html(), "p", "pre", "td"):
             parent_node = elem.getparent()
@@ -213,7 +265,8 @@ class Document:
             inner_text = clean(elem.text_content() or "")
             inner_text_len = len(inner_text)
 
-            # If this paragraph is less than 25 characters, don't even count it.
+            # If this paragraph is less than 25 characters
+            # don't even count it.
             if inner_text_len < MIN_LEN:
                 continue
 
@@ -222,7 +275,8 @@ class Document:
                 ordered.append(parent_node)
 
             if grand_parent_node is not None and grand_parent_node not in candidates:
-                candidates[grand_parent_node] = self.score_node(grand_parent_node)
+                candidates[grand_parent_node] = self.score_node(
+                    grand_parent_node)
                 ordered.append(grand_parent_node)
 
             content_score = 1
@@ -236,13 +290,18 @@ class Document:
             if grand_parent_node is not None:
                 candidates[grand_parent_node]['content_score'] += content_score / 2.0
 
-        # Scale the final candidates score based on link density. Good content should have a
-        # relatively small link density (5% or less) and be mostly unaffected by this operation.
+        # Scale the final candidates score based on link density. Good content
+        # should have a relatively small link density (5% or less) and be
+        # mostly unaffected by this operation.
         for elem in ordered:
             candidate = candidates[elem]
             ld = self.get_link_density(elem)
             score = candidate['content_score']
-            self.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (score, describe(elem), ld, score*(1-ld)))
+            self.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (
+                score,
+                describe(elem),
+                ld,
+                score * (1 - ld)))
             candidate['content_score'] *= (1 - ld)
 
         return candidates
@@ -282,8 +341,8 @@ class Document:
         }
 
     def debug(self, *a):
-        #if self.options['debug']:
-            logging.debug(*a)
+        if self.options.get('debug', False):
+            log.debug(*a)
 
     def remove_unlikely_candidates(self):
         for elem in self.html.iter():
@@ -297,10 +356,14 @@ class Document:
 
     def transform_misused_divs_into_paragraphs(self):
         for elem in self.tags(self.html, 'div'):
-            # transform <div>s that do not contain other block elements into <p>s
-            #FIXME: The current implementation ignores all descendants that are not direct children of elem
-            # This results in incorrect results in case there is an <img> buried within an <a> for example
-            if not REGEXES['divToPElementsRe'].search(unicode(''.join(map(tostring, list(elem))))):
+            # transform <div>s that do not contain other block elements into
+            # <p>s
+            #FIXME: The current implementation ignores all descendants that
+            # are not direct children of elem
+            # This results in incorrect results in case there is an <img>
+            # buried within an <a> for example
+            if not REGEXES['divToPElementsRe'].search(
+                    unicode(''.join(map(tostring, list(elem))))):
                 #self.debug("Altering %s to p" % (describe(elem)))
                 elem.tag = "p"
                 #print "Fixed element "+describe(elem)
@@ -335,7 +398,8 @@ class Document:
                 yield e
 
     def sanitize(self, node, candidates):
-        MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
+        MIN_LEN = self.options.get('min_text_length',
+            self.TEXT_LENGTH_THRESHOLD)
         for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
             if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
                 header.drop_tree()
@@ -362,10 +426,11 @@ class Document:
             elif el.text_content().count(",") < 10:
                 counts = {}
                 for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
-                    counts[kind] = len(el.findall('.//%s' %kind))
+                    counts[kind] = len(el.findall('.//%s' % kind))
                 counts["li"] -= 100
 
-                content_length = text_length(el) # Count the text length excluding any surrounding whitespace
+                # Count the text length excluding any surrounding whitespace
+                content_length = text_length(el)
                 link_density = self.get_link_density(el)
                 parent_node = el.getparent()
                 if parent_node is not None:
@@ -397,10 +462,12 @@ class Document:
                     reason = "too short content length %s without a single image" % content_length
                     to_remove = True
                 elif weight < 25 and link_density > 0.2:
-                        reason = "too many links %.3f for its weight %s" % (link_density, weight)
+                        reason = "too many links %.3f for its weight %s" % (
+                            link_density, weight)
                         to_remove = True
                 elif weight >= 25 and link_density > 0.5:
-                    reason = "too many links %.3f for its weight %s" % (link_density, weight)
+                    reason = "too many links %.3f for its weight %s" % (
+                        link_density, weight)
                     to_remove = True
                 elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
                     reason = "<embed>s with too short content length, or too many <embed>s"
@@ -426,7 +493,7 @@ class Document:
 
                     #find x non empty preceding and succeeding siblings
                     i, j = 0, 0
-                    x  = 1
+                    x = 1
                     siblings = []
                     for sib in el.itersiblings():
                         #self.debug(sib.text_content())
@@ -445,7 +512,7 @@ class Document:
                             if j == x:
                                 break
                     #self.debug(str(siblings))
-                    if siblings and sum(siblings) > 1000 :
+                    if siblings and sum(siblings) > 1000:
                         to_remove = False
                         self.debug("Allowing %s" % describe(el))
                         for desnode in self.tags(el, "table", "ul", "div"):
@@ -459,7 +526,7 @@ class Document:
                     el.drop_tree()
 
         for el in ([node] + [n for n in node.iter()]):
-            if not (self.options['attributes']):
+            if not self.options.get('attributes', None):
                 #el.attrib = {} #FIXME:Checkout the effects of disabling this
                 pass
 
@@ -492,17 +559,17 @@ class HashableElement():
     def __getattr__(self, tag):
         return getattr(self.node, tag)
 
+
 def main():
     from optparse import OptionParser
     parser = OptionParser(usage="%prog: [options] [file]")
     parser.add_option('-v', '--verbose', action='store_true')
-    parser.add_option('-u', '--url', help="use URL instead of a local file")
+    parser.add_option('-u', '--url', default=None, help="use URL instead of a local file")
     (options, args) = parser.parse_args()
 
     if not (len(args) == 1 or options.url):
         parser.print_help()
         sys.exit(1)
-    logging.basicConfig(level=logging.INFO)
 
     file = None
     if options.url:
@@ -512,7 +579,9 @@ def main():
         file = open(args[0], 'rt')
     enc = sys.__stdout__.encoding or 'utf-8'
     try:
-        print Document(file.read(), debug=options.verbose).summary().encode(enc, 'replace')
+        print Document(file.read(),
+            debug=options.verbose,
+            url=options.url).summary().encode(enc, 'replace')
     finally:
         file.close()
 

From 8d3e39f04ed0c6e9401fcfa72f89b40273e66df2 Mon Sep 17 00:00:00 2001
From: Richard Harding <rharding@mitechie.com>
Date: Mon, 16 Apr 2012 21:24:33 -0400
Subject: [PATCH 4/4] Update readme

---
 README | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/README b/README
index acc96fa..c7087b0 100644
--- a/README
+++ b/README
@@ -36,5 +36,9 @@ Command-line usage::
 
 
 Document() kwarg options:
-    url=xxx will run make_links_absolute()
 
+ - attributes:
+ - debug: output debug messages
+ - min_text_length: paragraphs shorter than this are not scored
+ - retry_length: minimum cleaned-article length before a lenient retry
+ - url: will allow adjusting links to be absolute