Work on why we have an empty <body/> tag

- This seems to happen because the sanitizer ends up with two nodes rather than
  one: the first is an empty body, and the second is the article div.
- Fix up the tabs so we can work with the file. Needs lots of pep8 love.
- Implement an initial hack that at least gets it working atm.
- Start to add test cases, sample html files we can test against, etc.
pull/15/head
Richard Harding 12 years ago
parent ab783b25b7
commit edccec5d3b

@ -11,502 +11,508 @@ import sys
logging.basicConfig(level=logging.INFO)

# Case-insensitive patterns matched against element class/id strings (and, for
# 'divToPElementsRe', against serialized child markup) by the Document class.
REGEXES = {
    # class/id hints that an element is page chrome rather than content
    'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
    # overrides 'unlikelyCandidatesRe' when it also matches
    'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow', re.I),
    # class/id hints that raise an element's weight
    'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story', re.I),
    # class/id hints that lower an element's weight
    'negativeRe': re.compile('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget', re.I),
    # opening tags whose presence keeps a <div> from being turned into a <p>
    'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
    # Patterns below are currently disabled.
    #'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
    #'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
    #'trimRe': re.compile('^\s+|\s+$/'),
    #'normalizeRe': re.compile('\s{2,}/'),
    #'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
    #'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
    #skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
}
def describe(node, depth=1):
    """Return a short selector-like label for *node*, for use in debug logs.

    The label is the tag name followed by ``#id`` and ``.class`` parts when
    those attributes are set; a leading ``div`` is dropped when an id or class
    follows it.  While *depth* is non-zero and the node has a parent, the
    parent's label is appended after ``' - '``.

    Objects without a ``tag`` attribute are described as ``[<type>]``.
    """
    if not hasattr(node, 'tag'):
        return "[%s]" % type(node)
    name = node.tag
    if node.get('id', ''):
        name += '#' + node.get('id')
    if node.get('class', ''):
        name += '.' + node.get('class').replace(' ', '.')
    if name[:4] in ['div#', 'div.']:
        # "div#main" reads better as "#main"
        name = name[3:]
    if depth and node.getparent() is not None:
        return name + ' - ' + describe(node.getparent(), depth - 1)
    return name
def to_int(x):
    """Convert a size string such as ``"120"``, ``"120px"`` or ``"2em"`` to an int.

    Surrounding whitespace is ignored.  ``px`` values are returned as-is and
    ``em`` values are multiplied by 12.  Returns None for empty, None or
    whitespace-only input.  Raises ValueError when the numeric part is not an
    integer literal.
    """
    if not x:
        return None
    x = x.strip()
    if not x:
        # whitespace-only input: treat like the empty string instead of
        # letting int('') raise
        return None
    if x.endswith('px'):
        return int(x[:-2])
    if x.endswith('em'):
        return int(x[:-2]) * 12
    return int(x)
def clean(text):
    """Normalize whitespace in *text*.

    Whitespace around each newline collapses to a single newline, runs of two
    or more spaces/tabs collapse to one space, and the result is stripped.
    """
    text = re.sub(r'\s*\n\s*', '\n', text)
    text = re.sub(r'[ \t]{2,}', ' ', text)
    return text.strip()
def text_length(i):
    """Return the length of element *i*'s text content after `clean()`."""
    return len(clean(i.text_content() or ""))
class Unparseable(ValueError):
    """Raised by `Document.summary` when a summary cannot be produced."""
    pass
class Document:
TEXT_LENGTH_THRESHOLD = 25
RETRY_LENGTH = 250
def __init__(self, input, **options):
self.input = input
self.options = defaultdict(lambda: None)
for k, v in options.items():
self.options[k] = v
self.html = None
def _html(self, force=False):
if force or self.html is None:
self.html = self._parse(self.input)
return self.html
def _parse(self, input):
doc = build_doc(input)
doc = html_cleaner.clean_html(doc)
base_href = self.options['url']
if base_href:
doc.make_links_absolute(base_href, resolve_base_href=True)
else:
doc.resolve_base_href()
return doc
def content(self):
return get_body(self._html(True))
def title(self):
return get_title(self._html(True))
def short_title(self):
return shorten_title(self._html(True))
def summary(self):
try:
ruthless = True
while True:
self._html(True)
for i in self.tags(self.html, 'script', 'style'):
i.drop_tree()
for i in self.tags(self.html, 'body'):
i.set('id', 'readabilityBody')
if ruthless:
self.remove_unlikely_candidates()
self.transform_misused_divs_into_paragraphs()
candidates = self.score_paragraphs()
best_candidate = self.select_best_candidate(candidates)
if best_candidate:
article = self.get_article(candidates, best_candidate)
else:
if ruthless:
logging.debug("ruthless removal did not work. ")
ruthless = False
self.debug("ended up stripping too much - going for a safer _parse")
# try again
continue
else:
logging.debug("Ruthless and lenient parsing did not work. Returning raw html")
article = self.html.find('body')
if article is None:
article = self.html
cleaned_article = self.sanitize(article, candidates)
of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH)
if ruthless and not of_acceptable_length:
ruthless = False
continue # try again
else:
return cleaned_article
except StandardError, e:
#logging.exception('error getting summary: ' + str(traceback.format_exception(*sys.exc_info())))
logging.exception('error getting summary: ' )
raise Unparseable(str(e)), None, sys.exc_info()[2]
def get_article(self, candidates, best_candidate):
# Now that we have the top candidate, look through its siblings for content that might also be related.
# Things like preambles, content split by ads that we removed, etc.
sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
output = document_fromstring('<div/>')
best_elem = best_candidate['elem']
for sibling in best_elem.getparent().getchildren():
#if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text
append = False
if sibling is best_elem:
append = True
sibling_key = sibling #HashableElement(sibling)
if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
append = True
if sibling.tag == "p":
link_density = self.get_link_density(sibling)
node_content = sibling.text or ""
node_length = len(node_content)
if node_length > 80 and link_density < 0.25:
append = True
elif node_length <= 80 and link_density == 0 and re.search('\.( |$)', node_content):
append = True
if append:
output.append(sibling)
#if output is not None:
# output.append(best_elem)
return output
def select_best_candidate(self, candidates):
sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
for candidate in sorted_candidates[:5]:
elem = candidate['elem']
self.debug("Top 5 : %6.3f %s" % (candidate['content_score'], describe(elem)))
if len(sorted_candidates) == 0:
return None
best_candidate = sorted_candidates[0]
return best_candidate
def get_link_density(self, elem):
link_length = 0
for i in elem.findall(".//a"):
link_length += text_length(i)
#if len(elem.findall(".//div") or elem.findall(".//p")):
# link_length = link_length
total_length = text_length(elem)
return float(link_length) / max(total_length, 1)
def score_paragraphs(self, ):
MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
candidates = {}
#self.debug(str([describe(node) for node in self.tags(self.html, "div")]))
ordered = []
for elem in self.tags(self._html(), "p", "pre", "td"):
parent_node = elem.getparent()
if parent_node is None:
continue
grand_parent_node = parent_node.getparent()
inner_text = clean(elem.text_content() or "")
inner_text_len = len(inner_text)
# If this paragraph is less than 25 characters, don't even count it.
if inner_text_len < MIN_LEN:
continue
if parent_node not in candidates:
candidates[parent_node] = self.score_node(parent_node)
ordered.append(parent_node)
if grand_parent_node is not None and grand_parent_node not in candidates:
candidates[grand_parent_node] = self.score_node(grand_parent_node)
ordered.append(grand_parent_node)
content_score = 1
content_score += len(inner_text.split(','))
content_score += min((inner_text_len / 100), 3)
#if elem not in candidates:
# candidates[elem] = self.score_node(elem)
#WTF? candidates[elem]['content_score'] += content_score
candidates[parent_node]['content_score'] += content_score
if grand_parent_node is not None:
candidates[grand_parent_node]['content_score'] += content_score / 2.0
# Scale the final candidates score based on link density. Good content should have a
# relatively small link density (5% or less) and be mostly unaffected by this operation.
for elem in ordered:
candidate = candidates[elem]
ld = self.get_link_density(elem)
score = candidate['content_score']
self.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (score, describe(elem), ld, score*(1-ld)))
candidate['content_score'] *= (1 - ld)
return candidates
def class_weight(self, e):
weight = 0
if e.get('class', None):
if REGEXES['negativeRe'].search(e.get('class')):
weight -= 25
if REGEXES['positiveRe'].search(e.get('class')):
weight += 25
if e.get('id', None):
if REGEXES['negativeRe'].search(e.get('id')):
weight -= 25
if REGEXES['positiveRe'].search(e.get('id')):
weight += 25
return weight
def score_node(self, elem):
content_score = self.class_weight(elem)
name = elem.tag.lower()
if name == "div":
content_score += 5
elif name in ["pre", "td", "blockquote"]:
content_score += 3
elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form"]:
content_score -= 3
elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th"]:
content_score -= 5
return {
'content_score': content_score,
'elem': elem
}
def debug(self, *a):
#if self.options['debug']:
logging.debug(*a)
def remove_unlikely_candidates(self):
for elem in self.html.iter():
s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
if len(s) < 2:
continue
#self.debug(s)
if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag != 'body':
self.debug("Removing unlikely candidate - %s" % describe(elem))
elem.drop_tree()
def transform_misused_divs_into_paragraphs(self):
for elem in self.tags(self.html, 'div'):
# transform <div>s that do not contain other block elements into <p>s
#FIXME: The current implementation ignores all descendants that are not direct children of elem
# This results in incorrect results in case there is an <img> buried within an <a> for example
if not REGEXES['divToPElementsRe'].search(unicode(''.join(map(tostring, list(elem))))):
#self.debug("Altering %s to p" % (describe(elem)))
elem.tag = "p"
#print "Fixed element "+describe(elem)
for elem in self.tags(self.html, 'div'):
if elem.text and elem.text.strip():
p = fragment_fromstring('<p/>')
p.text = elem.text
elem.text = None
elem.insert(0, p)
#print "Appended "+tounicode(p)+" to "+describe(elem)
for pos, child in reversed(list(enumerate(elem))):
if child.tail and child.tail.strip():
p = fragment_fromstring('<p/>')
p.text = child.tail
child.tail = None
elem.insert(pos + 1, p)
#print "Inserted "+tounicode(p)+" to "+describe(elem)
if child.tag == 'br':
#print 'Dropped <br> at '+describe(elem)
child.drop_tree()
def tags(self, node, *tag_names):
for tag_name in tag_names:
for e in node.findall('.//%s' % tag_name):
yield e
def reverse_tags(self, node, *tag_names):
for tag_name in tag_names:
for e in reversed(node.findall('.//%s' % tag_name)):
yield e
def sanitize(self, node, candidates):
MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
header.drop_tree()
for elem in self.tags(node, "form", "iframe", "textarea"):
elem.drop_tree()
allowed = {}
# Conditionally clean <table>s, <ul>s, and <div>s
for el in self.reverse_tags(node, "table", "ul", "div"):
if el in allowed:
continue
weight = self.class_weight(el)
if el in candidates:
content_score = candidates[el]['content_score']
#print '!',el, '-> %6.3f' % content_score
else:
content_score = 0
tag = el.tag
if weight + content_score < 0:
self.debug("Cleaned %s with score %6.3f and weight %-3s" %
(describe(el), content_score, weight, ))
el.drop_tree()
elif el.text_content().count(",") < 10:
counts = {}
for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
counts[kind] = len(el.findall('.//%s' %kind))
counts["li"] -= 100
content_length = text_length(el) # Count the text length excluding any surrounding whitespace
link_density = self.get_link_density(el)
parent_node = el.getparent()
if parent_node is not None:
if parent_node in candidates:
content_score = candidates[parent_node]['content_score']
else:
content_score = 0
#if parent_node is not None:
#pweight = self.class_weight(parent_node) + content_score
#pname = describe(parent_node)
#else:
#pweight = 0
#pname = "no parent"
to_remove = False
reason = ""
#if el.tag == 'div' and counts["img"] >= 1:
# continue
if counts["p"] and counts["img"] > counts["p"]:
reason = "too many images (%s)" % counts["img"]
to_remove = True
elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
reason = "more <li>s than <p>s"
to_remove = True
elif counts["input"] > (counts["p"] / 3):
reason = "less than 3x <p>s than <input>s"
to_remove = True
elif content_length < (MIN_LEN) and (counts["img"] == 0 or counts["img"] > 2):
reason = "too short content length %s without a single image" % content_length
to_remove = True
elif weight < 25 and link_density > 0.2:
reason = "too many links %.3f for its weight %s" % (link_density, weight)
to_remove = True
elif weight >= 25 and link_density > 0.5:
reason = "too many links %.3f for its weight %s" % (link_density, weight)
to_remove = True
elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
reason = "<embed>s with too short content length, or too many <embed>s"
to_remove = True
# if el.tag == 'div' and counts['img'] >= 1 and to_remove:
# imgs = el.findall('.//img')
# valid_img = False
# self.debug(tounicode(el))
# for img in imgs:
TEXT_LENGTH_THRESHOLD = 25
RETRY_LENGTH = 250
def __init__(self, input, **options):
self.input = input
self.options = defaultdict(lambda: None)
for k, v in options.items():
self.options[k] = v
self.html = None
def _html(self, force=False):
if force or self.html is None:
self.html = self._parse(self.input)
return self.html
def _parse(self, input):
doc = build_doc(input)
doc = html_cleaner.clean_html(doc)
base_href = self.options['url']
if base_href:
doc.make_links_absolute(base_href, resolve_base_href=True)
else:
doc.resolve_base_href()
return doc
def content(self):
return get_body(self._html(True))
def title(self):
return get_title(self._html(True))
def short_title(self):
return shorten_title(self._html(True))
def summary(self, document_only=False):
try:
ruthless = True
while True:
self._html(True)
for i in self.tags(self.html, 'script', 'style'):
i.drop_tree()
for i in self.tags(self.html, 'body'):
i.set('id', 'readabilityBody')
if ruthless:
self.remove_unlikely_candidates()
self.transform_misused_divs_into_paragraphs()
candidates = self.score_paragraphs()
best_candidate = self.select_best_candidate(candidates)
if best_candidate:
article = self.get_article(candidates, best_candidate)
else:
if ruthless:
logging.debug("ruthless removal did not work. ")
ruthless = False
self.debug("ended up stripping too much - going for a safer _parse")
# try again
continue
else:
logging.debug("Ruthless and lenient parsing did not work. Returning raw html")
article = self.html.find('body')
if article is None:
article = self.html
cleaned_article = self.sanitize(article, candidates)
of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH)
if ruthless and not of_acceptable_length:
ruthless = False
continue # try again
else:
return cleaned_article
except StandardError, e:
#logging.exception('error getting summary: ' + str(traceback.format_exception(*sys.exc_info())))
logging.exception('error getting summary: ' )
raise Unparseable(str(e)), None, sys.exc_info()[2]
def get_article(self, candidates, best_candidate):
# Now that we have the top candidate, look through its siblings for content that might also be related.
# Things like preambles, content split by ads that we removed, etc.
sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
output = document_fromstring('<div/>')
best_elem = best_candidate['elem']
for sibling in best_elem.getparent().getchildren():
#if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text
append = False
if sibling is best_elem:
append = True
sibling_key = sibling #HashableElement(sibling)
if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
append = True
if sibling.tag == "p":
link_density = self.get_link_density(sibling)
node_content = sibling.text or ""
node_length = len(node_content)
if node_length > 80 and link_density < 0.25:
append = True
elif node_length <= 80 and link_density == 0 and re.search('\.( |$)', node_content):
append = True
if append:
output.append(sibling)
#if output is not None:
# output.append(best_elem)
return output
def select_best_candidate(self, candidates):
sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
for candidate in sorted_candidates[:5]:
elem = candidate['elem']
self.debug("Top 5 : %6.3f %s" % (candidate['content_score'], describe(elem)))
if len(sorted_candidates) == 0:
return None
best_candidate = sorted_candidates[0]
return best_candidate
def get_link_density(self, elem):
link_length = 0
for i in elem.findall(".//a"):
link_length += text_length(i)
#if len(elem.findall(".//div") or elem.findall(".//p")):
# link_length = link_length
total_length = text_length(elem)
return float(link_length) / max(total_length, 1)
def score_paragraphs(self, ):
MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
candidates = {}
#self.debug(str([describe(node) for node in self.tags(self.html, "div")]))
ordered = []
for elem in self.tags(self._html(), "p", "pre", "td"):
parent_node = elem.getparent()
if parent_node is None:
continue
grand_parent_node = parent_node.getparent()
inner_text = clean(elem.text_content() or "")
inner_text_len = len(inner_text)
# If this paragraph is less than 25 characters, don't even count it.
if inner_text_len < MIN_LEN:
continue
if parent_node not in candidates:
candidates[parent_node] = self.score_node(parent_node)
ordered.append(parent_node)
if grand_parent_node is not None and grand_parent_node not in candidates:
candidates[grand_parent_node] = self.score_node(grand_parent_node)
ordered.append(grand_parent_node)
content_score = 1
content_score += len(inner_text.split(','))
content_score += min((inner_text_len / 100), 3)
#if elem not in candidates:
# candidates[elem] = self.score_node(elem)
#WTF? candidates[elem]['content_score'] += content_score
candidates[parent_node]['content_score'] += content_score
if grand_parent_node is not None:
candidates[grand_parent_node]['content_score'] += content_score / 2.0
# Scale the final candidates score based on link density. Good content should have a
# relatively small link density (5% or less) and be mostly unaffected by this operation.
for elem in ordered:
candidate = candidates[elem]
ld = self.get_link_density(elem)
score = candidate['content_score']
self.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (score, describe(elem), ld, score*(1-ld)))
candidate['content_score'] *= (1 - ld)
return candidates
def class_weight(self, e):
weight = 0
if e.get('class', None):
if REGEXES['negativeRe'].search(e.get('class')):
weight -= 25
if REGEXES['positiveRe'].search(e.get('class')):
weight += 25
if e.get('id', None):
if REGEXES['negativeRe'].search(e.get('id')):
weight -= 25
if REGEXES['positiveRe'].search(e.get('id')):
weight += 25
return weight
def score_node(self, elem):
content_score = self.class_weight(elem)
name = elem.tag.lower()
if name == "div":
content_score += 5
elif name in ["pre", "td", "blockquote"]:
content_score += 3
elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form"]:
content_score -= 3
elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th"]:
content_score -= 5
return {
'content_score': content_score,
'elem': elem
}
def debug(self, *a):
#if self.options['debug']:
logging.debug(*a)
def remove_unlikely_candidates(self):
for elem in self.html.iter():
s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
if len(s) < 2:
continue
#self.debug(s)
if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag != 'body':
self.debug("Removing unlikely candidate - %s" % describe(elem))
elem.drop_tree()
def transform_misused_divs_into_paragraphs(self):
for elem in self.tags(self.html, 'div'):
# transform <div>s that do not contain other block elements into <p>s
#FIXME: The current implementation ignores all descendants that are not direct children of elem
# This results in incorrect results in case there is an <img> buried within an <a> for example
if not REGEXES['divToPElementsRe'].search(unicode(''.join(map(tostring, list(elem))))):
#self.debug("Altering %s to p" % (describe(elem)))
elem.tag = "p"
#print "Fixed element "+describe(elem)
for elem in self.tags(self.html, 'div'):
if elem.text and elem.text.strip():
p = fragment_fromstring('<p/>')
p.text = elem.text
elem.text = None
elem.insert(0, p)
#print "Appended "+tounicode(p)+" to "+describe(elem)
for pos, child in reversed(list(enumerate(elem))):
if child.tail and child.tail.strip():
p = fragment_fromstring('<p/>')
p.text = child.tail
child.tail = None
elem.insert(pos + 1, p)
#print "Inserted "+tounicode(p)+" to "+describe(elem)
if child.tag == 'br':
#print 'Dropped <br> at '+describe(elem)
child.drop_tree()
def tags(self, node, *tag_names):
for tag_name in tag_names:
for e in node.findall('.//%s' % tag_name):
yield e
def reverse_tags(self, node, *tag_names):
for tag_name in tag_names:
for e in reversed(node.findall('.//%s' % tag_name)):
yield e
def sanitize(self, node, candidates):
MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
header.drop_tree()
for elem in self.tags(node, "form", "iframe", "textarea"):
elem.drop_tree()
allowed = {}
# Conditionally clean <table>s, <ul>s, and <div>s
for el in self.reverse_tags(node, "table", "ul", "div"):
if el in allowed:
continue
weight = self.class_weight(el)
if el in candidates:
content_score = candidates[el]['content_score']
#print '!',el, '-> %6.3f' % content_score
else:
content_score = 0
tag = el.tag
if weight + content_score < 0:
self.debug("Cleaned %s with score %6.3f and weight %-3s" %
(describe(el), content_score, weight, ))
el.drop_tree()
elif el.text_content().count(",") < 10:
counts = {}
for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
counts[kind] = len(el.findall('.//%s' %kind))
counts["li"] -= 100
content_length = text_length(el) # Count the text length excluding any surrounding whitespace
link_density = self.get_link_density(el)
parent_node = el.getparent()
if parent_node is not None:
if parent_node in candidates:
content_score = candidates[parent_node]['content_score']
else:
content_score = 0
#if parent_node is not None:
#pweight = self.class_weight(parent_node) + content_score
#pname = describe(parent_node)
#else:
#pweight = 0
#pname = "no parent"
to_remove = False
reason = ""
#if el.tag == 'div' and counts["img"] >= 1:
# continue
if counts["p"] and counts["img"] > counts["p"]:
reason = "too many images (%s)" % counts["img"]
to_remove = True
elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
reason = "more <li>s than <p>s"
to_remove = True
elif counts["input"] > (counts["p"] / 3):
reason = "less than 3x <p>s than <input>s"
to_remove = True
elif content_length < (MIN_LEN) and (counts["img"] == 0 or counts["img"] > 2):
reason = "too short content length %s without a single image" % content_length
to_remove = True
elif weight < 25 and link_density > 0.2:
reason = "too many links %.3f for its weight %s" % (link_density, weight)
to_remove = True
elif weight >= 25 and link_density > 0.5:
reason = "too many links %.3f for its weight %s" % (link_density, weight)
to_remove = True
elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
reason = "<embed>s with too short content length, or too many <embed>s"
to_remove = True
# if el.tag == 'div' and counts['img'] >= 1 and to_remove:
# imgs = el.findall('.//img')
# valid_img = False
# self.debug(tounicode(el))
# for img in imgs:
#
# height = img.get('height')
# text_length = img.get('text_length')
# self.debug ("height %s text_length %s" %(repr(height), repr(text_length)))
# if to_int(height) >= 100 or to_int(text_length) >= 100:
# valid_img = True
# self.debug("valid image" + tounicode(img))
# break
# if valid_img:
# to_remove = False
# self.debug("Allowing %s" %el.text_content())
# for desnode in self.tags(el, "table", "ul", "div"):
# allowed[desnode] = True
#find x non empty preceding and succeeding siblings
i, j = 0, 0
x = 1
siblings = []
for sib in el.itersiblings():
#self.debug(sib.text_content())
sib_content_length = text_length(sib)
if sib_content_length:
i =+ 1
siblings.append(sib_content_length)
if i == x:
break
for sib in el.itersiblings(preceding=True):
#self.debug(sib.text_content())
sib_content_length = text_length(sib)
if sib_content_length:
j =+ 1
siblings.append(sib_content_length)
if j == x:
break
#self.debug(str(siblings))
if siblings and sum(siblings) > 1000 :
to_remove = False
self.debug("Allowing %s" % describe(el))
for desnode in self.tags(el, "table", "ul", "div"):
allowed[desnode] = True
if to_remove:
self.debug("Cleaned %6.3f %s with weight %s cause it has %s." %
(content_score, describe(el), weight, reason))
#print tounicode(el)
#self.debug("pname %s pweight %.3f" %(pname, pweight))
el.drop_tree()
for el in ([node] + [n for n in node.iter()]):
if not (self.options['attributes']):
#el.attrib = {} #FIXME:Checkout the effects of disabling this
pass
return clean_attributes(tounicode(node))
# height = img.get('height')
# text_length = img.get('text_length')
# self.debug ("height %s text_length %s" %(repr(height), repr(text_length)))
# if to_int(height) >= 100 or to_int(text_length) >= 100:
# valid_img = True
# self.debug("valid image" + tounicode(img))
# break
# if valid_img:
# to_remove = False
# self.debug("Allowing %s" %el.text_content())
# for desnode in self.tags(el, "table", "ul", "div"):
# allowed[desnode] = True
#find x non empty preceding and succeeding siblings
i, j = 0, 0
x = 1
siblings = []
for sib in el.itersiblings():
#self.debug(sib.text_content())
sib_content_length = text_length(sib)
if sib_content_length:
i =+ 1
siblings.append(sib_content_length)
if i == x:
break
for sib in el.itersiblings(preceding=True):
#self.debug(sib.text_content())
sib_content_length = text_length(sib)
if sib_content_length:
j =+ 1
siblings.append(sib_content_length)
if j == x:
break
#self.debug(str(siblings))
if siblings and sum(siblings) > 1000 :
to_remove = False
self.debug("Allowing %s" % describe(el))
for desnode in self.tags(el, "table", "ul", "div"):
allowed[desnode] = True
if to_remove:
self.debug("Cleaned %6.3f %s with weight %s cause it has %s." %
(content_score, describe(el), weight, reason))
#print tounicode(el)
#self.debug("pname %s pweight %.3f" %(pname, pweight))
el.drop_tree()
for el in ([node] + [n for n in node.iter()]):
if not (self.options['attributes']):
#el.attrib = {} #FIXME:Checkout the effects of disabling this
pass
# There can be two nodes here. We really want to tounicode only one of
# them.
# To start with let's hack it to get the longest tree as our document.
if len(node.getchildren()) > 1:
children = node.getchildren()
sorted_list = sorted(children, key=len, reverse=True)
node = sorted_list[0]
return clean_attributes(tounicode(node))
class HashableElement():
    """Wrap a tree node so it hashes and compares by its path to the root.

    The path is a tuple of ``(tag, attributes, text)`` triples, one per node
    from the wrapped node up through its ancestors.  It is computed once and
    cached.  Any other attribute access is delegated to the wrapped node.
    """

    def __init__(self, node):
        self.node = node
        self._path = None

    def _get_path(self):
        """Return the cached path tuple, building it on first use."""
        if self._path is None:
            reverse_path = []
            node = self.node
            while node is not None:
                node_id = (node.tag, tuple(node.attrib.items()), node.text)
                reverse_path.append(node_id)
                node = node.getparent()
            self._path = tuple(reverse_path)
        return self._path
    path = property(_get_path)

    def __hash__(self):
        return hash(self.path)

    def __eq__(self, other):
        return self.path == other.path

    def __ne__(self, other):
        # Python 2 does not derive != from ==; keep the two consistent.
        return not self == other

    def __getattr__(self, tag):
        # Only reached for names not found on the wrapper itself.
        return getattr(self.node, tag)
def main():
    """Command-line entry point: print the summary of a local file or a URL.

    Usage: ``[options] [file]``.  ``-u/--url`` reads from a URL instead of a
    file and ``-v/--verbose`` is passed to `Document` as its ``debug``
    option.  Prints the help text and exits with status 1 when neither a
    single file argument nor a URL is given.
    """
    from optparse import OptionParser
    parser = OptionParser(usage="%prog: [options] [file]")
    parser.add_option('-v', '--verbose', action='store_true')
    parser.add_option('-u', '--url', help="use URL instead of a local file")
    (options, args) = parser.parse_args()

    if not (len(args) == 1 or options.url):
        parser.print_help()
        sys.exit(1)
    logging.basicConfig(level=logging.INFO)

    # 'source' rather than 'file' so the builtin is not shadowed
    if options.url:
        import urllib
        source = urllib.urlopen(options.url)
    else:
        source = open(args[0], 'rt')
    # fall back to UTF-8 when stdout reports no encoding (e.g. when piped)
    enc = sys.__stdout__.encoding or 'utf-8'
    try:
        summary = Document(source.read(), debug=options.verbose).summary()
        sys.stdout.write(summary.encode(enc, 'replace') + '\n')
    finally:
        source.close()
# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    main()

@ -0,0 +1,762 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=iso-8859-1">
<a href="/baseball/mlb/teams/tigers/">
<title>Detroit Tigers vs. Kansas City Royals - Preview - April 16, 2012</title></a><meta name="description" content="Tigers-Royals preview for game played on April 16, 2012">
<meta name="keywords" content="Detroit Tigers, Kansas City Royals, preview, mlb, baseball, si.com">
<script type="text/javascript">
var SPORTID = "MLB";
var PATH = "/baseball/mlb/scoreboards/2012/04/16/";
var FEEDNAME = "scoreboard.dat";
isViewcast = true;
var searchString = document.location.href;
</script>
<link rel="stylesheet" type="text/css" href="http://i.cdn.turner.com/si/.e/css/pkg/global_41/129.css"/>
<script type="text/javascript" language="JavaScript" src="http://i.cdn.turner.com/si/.e/js/4.1/global/lib/jquery-1.5.2.min.js"></script>
<script language="JavaScript" type="text/javascript" src="http://i.cdn.turner.com/si/.e/js/pkg/global/593.js"></script>
<script src="http://img.timeinc.net/shared/static/js/tii_ads.js"></script><script>var adConfig=new TiiAdConfig('3475.si2');adConfig.setRevSciTracking(true);</script>
<!--[if IE 9]>
<link rel="stylesheet" type="text/css" href="http://i.cdn.turner.com/si/.e/css/4.1/ie9.css" />
<![endif]-->
<link rel="stylesheet" type="text/css" href="http://i.cdn.turner.com/si/.element/css/4.1/gameflash.css"/>
<link rel="stylesheet" type="text/css" href="http://i.cdn.turner.com/si/.element/css/4.1/miniscores.css"/>
<script language="javascript" type="text/javascript">
function hidediv() {
if (document.getElementById) { // DOM3 = IE5, NS6
document.getElementById('cnngCommentsBox').className = 'cnngCommentsBoxOff';
}
else {
if (document.layers) { // Netscape 4
document.cnngCommentsBox.className = 'cnngCommentsBoxOff';
}
else { // IE 4
document.all.cnngCommentsBox.className = 'cnngCommentsBoxOff';
}
}
}
function showdiv() {
if (document.getElementById) { // DOM3 = IE5, NS6
document.getElementById('cnngCommentsBox').className = 'cnngCommentsBox';
}
else {
if (document.layers) { // Netscape 4
document.cnngCommentsBox.className = 'cnngCommentsBox';
}
else { // IE 4
document.all.cnngCommentsBox.className = 'cnngCommentsBox';
}
}
}
function siVideoBegin(cvpInstance, videoId) { }
function siVideoPlay(cvpInstance, videoId) {
var cvpData = cvpInstance.getContentEntry(videoId);
var cvpObject = window.JSON.parse(cvpData);
jQuery('#cnnCVPRecapDetails').show();
jQuery('#cvpHeadline').html(cvpObject.headline);
jQuery('#cvpDescription').html(cvpObject.description);
jQuery('#cvpSource').html(cvpObject.source);
}
function siVideoPlayHead(cvpInstance, playheadTime, totalDuration) { }
function siVideoAdStarted(cvpInstance, videoId) { }
function siVideoTrackingAdCountdown(seconds) { }
function siVideoComplete(cvpInstance, videoId) { }
function siVideoPause(cvpInstance, videoId, paused) { }
function siVideoSeek() { }
</script>
<script language="JavaScript" src="/.element/js/4.1/ads/sasd_ads.js"></script>
<script src="http://i.cdn.turner.com/si/.element/js/4.1/global/lib/iframe_ad_factory.js"></script><script>iframeAdFactory.url = '/si_adspaces/4.0/iframe.html';
window.setInterval(function(){ iframeAdFactory.refresh() }, 45000);</script>
<script type="text/javascript">
var adFactory = new TiiAdFactory(adConfig, "mlb/gameflashpage");
iframeAdFactory.queryString = 'TiiAdConfig=3475.si2&adConfigPairs=' + '&TiiAdFactory=' + encodeURIComponent('mlb/gameflashpage') + '&adFactoryPairs=' + '&paramPairs=' + encodeURIComponent('sport=mlb');
if (TiiAdsIsDebugMode()) { iframeAdFactory.queryString += '&debugads=y'; }
</script>
<link rel="stylesheet" type="text/css" href="http://z.cdn.turner.com/si/.element/css/4.1/gameflash_mlb.css"/>
<script type="text/javascript" src="http://z.cdn.turner.com/si/.element/js/4.1/global/lib/jquery-1.4.2.min.js"></script>
<link rel="stylesheet" type="text/css" href="http://z.cdn.turner.com/si/.element/ssi/scoreboards/4.2/css/scoreticker-master.css"/>
<script type="text/javascript" src="http://z.cdn.turner.com/si/.element/ssi/gameflash/4.2/football/nfl/js/jquery.jsonp-2.1.4.min.js"></script>
<script type="text/javascript" src="http://z.cdn.turner.com/si/.element/ssi/scoreboards/4.2/js/scoreticker-master.js"></script>
<script type="text/javascript" src="http://z.cdn.turner.com/si/.element/ssi/scoreboards/4.2/js/scoreticker-mlb.js"></script>
</head>
<body>
<!--[if IE 6]><div class="ie"><div class="ie6"><![endif]--><!--[if IE 7]><div class="ie"><div class="ie7"><![endif]--><!--[if
IE 8]><div class="ie"><div class="ie8"><![endif]-->
<div class="cnnPage">
<!-- start contentHeader-->
<style>
DIV.cnnSearch { padding:5px 0; }
DIV.cnnSearch DIV.cnnRight { padding:4px 0; }
DIV.cnnSearch DIV.cnnLeft { margin:0;padding:0; }
DIV.cnnSearch DIV.cnnLeft LI { float:left;margin:0;padding:0 5px 0 0; }
DIV.cnnSearch DIV.cnnLeft LI A { display:block;margin:0;padding:0; }
DIV.cnnSearch DIV.cnnLeft LI IMG { vertical-align:bottom; }
DIV.cnnSearch DIV.cnnLeft LI DL { margin:0;padding:0;position:relative;z-index:999999; }
DIV.cnnSearch DIV.cnnLeft LI DT { margin:0;padding:0; }
DIV.cnnSearch DIV.cnnLeft LI DD { left:-999em;margin:0;padding:0 3px 0 1px;position:absolute;top:23px; }
DIV.cnnSearch DIV.cnnLeft LI DL.cnnOver DD,
DIV.cnnSearch DIV.cnnLeft LI DL:hover DD { left:auto; }
DIV.cnnBanner { height:auto; }
DIV.cnnBannerSection DIV.cnnLeft { width:auto; }
DIV.cnnBannerSection DIV.cnnLeft A { display:inline;height:auto;width:auto; }
DIV.cnnBanner { background:transparent url('http://i.cdn.turner.com/si/.element/img/4.1/sect/global/topper.gif') no-repeat top right;position:relative;text-align:left;width:1000px; }
.ie6 DIV.cnnBanner { width:1000px; }
DIV.cnnBanner DIV IMG { display:block; }
DIV.cnnBannerSection { height:99px;position:absolute;left:243px;top:0px;width:757px; }
DIV.cnnBannerSection TD.col0 { display:none; }
DIV.cnnBannerSection DIV.cnn_border { display:none; }
DIV.cnnBannerSection IMG { display:inline;float:left; }
DIV.cnnBannerSection DIV.cnnLeft { float:left; }
DIV.cnnBannerSection DIV.cnnLeft IMG { float:none; }
DIV.cnnBannerSection DIV.cnnRight { float:right;margin:8px 6px 0 0; }
DIV.cnnBannerSection DIV.cnn_header { color:#000;font:bold 50px georgia;line-height:58px;padding:6px 10px 0 0; }
DIV.cnnBannerSection DIV.cnn_header SPAN { font-size:10px;color:#ccc; }
DIV.cnnBannerSection DIV.cnn_header A { color:#000; }
DIV.cnnBannerSection DIV.cnn_header UL { color:#ccc;float:right;font-size:10px;line-height:12px;margin-top:36px; }
.ie DIV.cnnBannerSection DIV.cnn_header UL { margin-top:-21px; }
DIV.cnnBannerSection DIV.cnn_header UL LI { border-left:1px solid #ccc;float:left;padding:0 4px; }
DIV.cnnBannerSection DIV.cnn_header UL LI#cnnItem0 { border:0; }
DIV.cnnBannerSection DIV.cnn_header UL LI#cnnItem2 DIV.cnn_more { font:normal 9px arial; }
DIV.cnnBannerSection DIV.cnn_header UL LI#cnnItem2 DIV.cnn_more A { font:normal 9px arial; }
DIV.cnnBannerSection DIV.cnn_header UL LI DIV.cnn_rollover { background-image:url('http://i.cdn.turner.com/si/.e1d/img/4.0/global/pixels/blank_pixel.gif');display:none;padding:10px 0 9px 0;left:103px;position:absolute;width:654px; }
.ie DIV.cnnBannerSection DIV.cnn_header UL LI DIV.cnn_rollover { top:55px; }
DIV.cnnBannerSection DIV.cnn_header UL LI.cnnOver .cnn_rollover,
DIV.cnnBannerSection DIV.cnn_header UL LI:hover .cnn_rollover { display:block; }
DIV.cnnBannerSection DIV.cnn_more { color:#2e373c;font-size:10px;padding:2px 0 0 0; }
DIV.cnnBannerSection DIV.cnn_more A { color:#fff;font-weight:bold; }
DIV.cnnBannerSection DIV.cnn_more A:hover { color:#e7e7e7; }
DIV.cnnBannerSection DIV.cnn_more DIV { display:none;color:#ccc;line-height:12px; }
DIV.cnnBannerSection DIV.cnn_more DIV SPAN A { font:9px arial;font-weight:normal; }
DIV.cnnBannerSection DIV.cnn_header DIV.cnn_more A { font-family:arial; }
DIV.cnnGameScores { background:#6f7f8b;border-bottom:11px solid #384d5e; }
</style>
<!-- start personalize -->
<div class="cnnPersonalize"><div><div><script>cnn_writePresonalizeBar();</script></div></div></div>
<!-- end personalize -->
<!-- start searchbar -->
<div class="cnnSearch">
<div class="cnnLeft"><ul>
<li class="cnnItem0" id="cnnCM1"><dl><script type="text/javascript">
/* script for 50/50 split */
/*var min=1;
var max=2;
x = Math.floor(Math.random() * (max - min + 1)) + min;
if(x/2 == 1) {
document.write('<dt><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1006340.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe0"><img src="http://i.cdn.turner.com/si/2011_images/cm/WS11_btn_champ_STL.png" alt="Get the Cardinals Championship Package" title="Get the Cardinals Championship Package"/></a></dt>');
document.write('<dd><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1006340.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe1"><img src="http://i.cdn.turner.com/si/2011_images/cm/WS11_dropdown_STL.png" alt="Get the Cardinals Championship Package" title="Get the Cardinals Championship Package"/></a></dd>');
} else {
document.write('<dt><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1007180.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe0"><img src="http://i.cdn.turner.com/si/2011_images/cm/EA-N4S-TheRun-btn.png" alt="Get Need for Speed 12 FREE" title="Get Need for Speed 12 FREE"/></a></dt>');
document.write('<dd><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1007180.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe1"><img src="http://i.cdn.turner.com/si/2011_images/cm/EA-N4S-TheRun-SI-dropdown.jpg" alt="Get Need for Speed 12 FREE" title="Get Need for Speed 12 FREE"/></a></dd>');
}
*/
</script>
<!--Kentucky-->
<dt><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1009459.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe0"><img src="http://i.cdn.turner.com/si/2012_images/cm/si-btn-champ-kentucky.png" alt="Get the Wildcats Championship Package" title="Get the Wildcats Championship Package"/></a></dt>
<dd><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1009459.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe1"><img src="http://i.cdn.turner.com/si/2012_images/cm/si-dropdown-kentucky.png" alt="Get the Wildcats Championship Package" title="Get the Wildcats Championship Package"/></a></dd>
<!--original generic sub buttons, changed on 10.26.11 for world series-->
<!--<dt><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1005085.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe0"><img src="http://i.cdn.turner.com/si/2011_images/cm/si-btn-EA-MADDEN12.png" alt="Get EA Sports Madden NFL 12 Free!" title="Get EA Sports Madden NFL 12 Free!"/></a></dt>
<dd><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1005085.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe1"><img src="http://i.cdn.turner.com/si/2011_images/cm/si-dropdown-EA-MADDEN12.jpg" alt="Get EA Sports Madden NFL 12 Free!" title="Get EA Sports Madden NFL 12 Free!"/></a></dd>
-->
<script><!--
/*
if (cnnPage.isHomepage) {
var button = $e('cnn_cm_subscribe0');
button.href = 'https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1004340.html';
button = $e('cnn_cm_subscribe1');
button.href = 'https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1004340.html';
}
*/
//--></script>
</dl></a></li>
<li class="cnnItem1"><dl><script type="text/javascript">
var min=1;
var max=2;
x = Math.floor(Math.random() * (max - min + 1)) + min;
/*turning off 50/50 for now*/
/*if(x/2 == 1) {
document.write('<dt><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1002346.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe2"><img src="http://i.cdn.turner.com/si/.element/img/4.1/global/cm/button_subscribe_si_red.png" alt="Subscribe to SI" title="Subscribe to SI"/></a></dt>');
document.write('<dd><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1002346.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe4"><img src="http://i.cdn.turner.com/si/.element/img/4.1/global/cm/dropdown_subscribe_si_red.png" alt="Subscribe to SI" title="Subscribe to SI"/></a></dd>');
} else {*/
document.write('<dt><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1002346.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe2"><img src="http://i.cdn.turner.com/si/.element/img/4.1/global/cm/button_subscribe_si_red.png" alt="Subscribe to SI" title="Subscribe to SI"/></a></dt>');
document.write('<dd style="margin-left:-79px"><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1002346.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe4"><img src="http://i.cdn.turner.com/si/2012_images/cm/bn_2osi16579_290x162_v1.png" alt="Subscribe to SI" title="Subscribe to SI"/></a></dd>');
//}
</script>
</dl></li>
<li class="cnnItem2"><dl><!--Default ROS
<a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1001406.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe3"><img src="http://i.cdn.turner.com/si/2012_images/cm/si-btn3_170x30_sigift.png" alt="Give the Gift of SI" title="Give the Gift of SI"/></a>
-->
<script type="text/javascript">
/*var min=1;
var max=2;
x = Math.floor(Math.random() * (max - min + 1)) + min;
if(x/2 == 1) {
document.write('<dt><a href="https://subscription.si.com/storefront/Give-the-Gift-of-Sports-Illustrated/site/si-donor0411jacket.html?xid=sirosheader&link=1001406" target="_blank" rel="nofollow" id="cnn_cm_subscribe0"><img src="http://i.cdn.turner.com/si/2011_images/cm/170x30.png" alt="Give the Gift of SI" title="Give the Gift of SI"/></a></dt>');
document.write('<dd><a href="https://subscription.si.com/storefront/Give-the-Gift-of-Sports-Illustrated/site/si-donor0411jacket.html?xid=sirosheader&link=1001406" target="_blank" rel="nofollow" id="cnn_cm_subscribe1"><img src="http://i.cdn.turner.com/si/2011_images/cm/170X110.jpg" alt="Give the Gift of SI" title="Give the Gift of SI"/></a></dd>');
} else {
document.write('<dt><a href="http://www.si.com/swim2012" target="_blank" rel="nofollow" id="cnn_cm_subscribe0"><img src="http://i.cdn.turner.com/si/2012_images/cm/si-btn_swim.png" alt="Swimsuit 2012" title="Swimsuit 2012"/></a></dt>');
document.write('<dd><a href="http://www.si.com/swim2012" target="_blank" rel="nofollow" id="cnn_cm_subscribe1"><img src="http://i.cdn.turner.com/si/2012_images/cm/SWIM_2012_dropdown.png" alt="Swimsuit 2012" title="Swimsuit 2012"/></a></dd>');
*/
</script>
<!--MLB2K 2012-->
<dt><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1009469.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe0"><img src="http://i.cdn.turner.com/si/2012_images/cm/si-btn3_MLB2K12.png" alt="Get MLB 2K 12 FREE" title="Get MLB 2K 12 FREE"/></a></dt>
<dd><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1009469.html" target="_blank" rel="nofollow" id="cnn_cm_subscribe1"><img src="http://i.cdn.turner.com/si/2012_images/cm/si-MLB2K12-dropdown.jpg" alt="Get MLB 2K 12 FREE" title="Get MLB 2K 12 FREE"/></a></dd>
<!--swimsuit 2012-->
<!--
<dt><a href="http://sportsillustrated.cnn.com/swim2012" target="_blank" rel="nofollow" id="cnn_cm_subscribe0"><img src="http://i.cdn.turner.com/si/2012_images/cm/si-btn_swim.png" alt="Swimsuit 2012" title="Swimsuit 2012"/></a></dt>
<dd><a href="http://sportsillustrated.cnn.com/swim2012" target="_blank" rel="nofollow" id="cnn_cm_subscribe1"><img src="http://i.cdn.turner.com/si/2012_images/cm/SWIM_2012_dropdown.png" alt="Swimsuit 2012" title="Swimsuit 2012"/></a></dd>
-->
</dl></li>
</ul>
</div>
<div class="cnnRight"><form method="get" action="http://sportsillustrated.cnn.com/search/" name="cm_search"><input type="text" name="text" class="cnnLeft"/><input type="image" src="http://i.cdn.turner.com/si/.element/img/4.1/global/search.gif" alt="Search" title="Search" class="cnnRight"/></form></div>
</div>
<!-- end searchbar -->
<!-- start banner -->
<div class="cnnBanner">
<div><a href="/"><img src="http://i.cdn.turner.com/si/.element/img/4.1/sect/global/logo2.png" alt="SI.com Home" title="SI.com Home"/></a>
</div>
<div class="cnnBannerSection">
<div class="cnnLeft"><a href="/baseball/mlb/"><img src="http://i.cdn.turner.com/si/.element/img/4.0/sect/baseball/mlb/icon.jpg"/></a></div>
<div class="cnn_header"><a href="/baseball/mlb/">MLB GAMEFLASH</a></div>
<div class="cnn_more" style="font-size:9px;"><a href="/baseball/mlb/scoreboards/today/">Scores</a> | <a href="/baseball/mlb/teams/">Teams</a> | <a href="/baseball/mlb/players/">Players</a> | <a href="/fantasy/player_news/mlb/">Player News</a> | <a href="/baseball/mlb/standings/">Standings</a> | <a href="/baseball/mlb/probables/today/">Probables</a> | <a href="/baseball/mlb/schedules/weekly/today/">Schedules</a> | <a href="/baseball/mlb/stats/">Stats</a> | <a href="/baseball/mlb/transactions/">Transactions</a> | <a href="/baseball/mlb/injuries/">Injuries</a> | <a href="http://www.ticketcity.com/mlb-tickets.html " target="_blank" rel="nofollow">Tickets</a> | <a href="http://mlb.mlb.com/mlb/subscriptions/index.jsp?product=si&vbID=simlbtv_test" target="_blank" rel="nofollow">MLB.TV</a>
</div>
</div>
</div>
<div class="cnnClear"></div>
<!-- end banner -->
<style>
/*
DIV.cnnTopnav LI A { color:#000;display:block;padding:0 16px 0 16px!important; }
DIV.cnnTopnav LI A { color:#000;display:block;padding:0 23px 0 22px!important; }
*/
DIV.cnnTopnav LI A { color:#000;display:block;padding:0 11px 0 11px!important; }
DIV.cnnTopnav LI.cnnFirst { padding-left:0px; }
</style>
<div class="cnnTopnav">
<ul>
<li class="cnnFirst"><a href="/extramustard/?eref=sinav">EXTRA MUSTARD</a></li>
<li><a href="http://www.fannation.com/?eref=sinav">FANNATION</a></li>
<li><a href="/multimedia/photo_gallery/?eref=sinav">PHOTOS</a></li>
<li><a href="/swimsuit/?eref=sinav">SWIMSUIT</a></li>
<li><a href="/fantasy/?eref=sinav">FANTASY</a></li>
<li><a href="/magazine/sportsman/?eref=sinav">SPORTSMAN</a></li>
<li><a href="http://www.sportsillustratedeverywhere.com/">MAGAZINE</a></li>
<li><a href="/sifk/?eref=sinav">SI KIDS</a></li>
<li><a href="/highschool/?eref=sinav">HIGH SCHOOL</a></li>
<li><a href="/behindthemic/?eref=sinav">BEHIND THE MIC</a></li>
<li><a href="http://www.twackle.com/" target="_blank" rel="nofollow">TWACKLE</a></li>
<!--<li><a href="http://www.maxpreps.com/national/national.htm?eref=sinav" target="_blank" rel="nofollow">MAXPREPS</a></li>-->
</ul>
</div>
<!-- end topnav -->
<style>
.ie6 #cnnBotnav LI#cnnBotnav0 { width:49px; } /* NFL */
.ie6 #cnnBotnav LI#cnnBotnav1 { width:150px; } /* COLLEGE FOOTBALL */
.ie6 #cnnBotnav LI#cnnBotnav2 { width:50px; } /* MLB */
.ie6 #cnnBotnav LI#cnnBotnav3 { width:51px; } /* NBA */
.ie6 #cnnBotnav LI#cnnBotnav4 { width:101px; } /* COLLEGE BB */
.ie6 #cnnBotnav LI#cnnBotnav5 { width:58px; } /* GOLF */
.ie6 #cnnBotnav LI#cnnBotnav6 { width:50px; } /* NHL */
.ie6 #cnnBotnav LI#cnnBotnav7 { width:74px; } /* RACING */
.ie6 #cnnBotnav LI#cnnBotnav8 { width:74px; } /* SOCCER */
.ie6 #cnnBotnav LI#cnnBotnav9 { width:121px; } /* MMA & BOXING */
.ie6 #cnnBotnav LI#cnnBotnav11 { width:73px; } /* TENNIS */
.ie6 #cnnBotnav LI#cnnBotnav12 { width:63px; } /* MORE */
.ie6 #cnnBotnav LI#cnnBotnav13 { width:74px; } /* VIDEO */
#cnnBotnav LI#cnnBotnav0 STRONG { width:49px; } /* NFL */
#cnnBotnav LI#cnnBotnav1 STRONG { width:150px; } /* COLLEGE FOOTBALL */
#cnnBotnav LI#cnnBotnav2 STRONG { width:50px; } /* MLB */
#cnnBotnav LI#cnnBotnav3 STRONG { width:51px; } /* NBA */
#cnnBotnav LI#cnnBotnav4 STRONG { width:101px; } /* COLLEGE BB */
#cnnBotnav LI#cnnBotnav5 STRONG { width:58px; } /* GOLF */
#cnnBotnav LI#cnnBotnav6 STRONG { width:50px; } /* NHL */
#cnnBotnav LI#cnnBotnav7 STRONG { width:74px; } /* RACING */
#cnnBotnav LI#cnnBotnav8 STRONG { width:74px; } /* SOCCER */
#cnnBotnav LI#cnnBotnav9 STRONG { width:121px; } /* MMA & BOXING */
#cnnBotnav LI#cnnBotnav11 STRONG { width:73px; } /* TENNIS */
#cnnBotnav LI#cnnBotnav12 STRONG { width:63px; } /* MORE */
#cnnBotnav LI#cnnBotnav13 STRONG { width:74px; } /* VIDEO */
/* realignment */
#cnnBotnav LI#cnnBotnav11:hover UL,
#cnnBotnav LI#cnnBotnav11 LI.cnnOver UL { margin-left:0; } /* width of subnav minus width of TENNIS minus width of MORE minus 2 lines */
#cnnBotnav LI#cnnBotnav12:hover UL,
#cnnBotnav LI#cnnBotnav12 LI.cnnOver UL { margin-left:-41px; } /* width of subnav minus width of MORE minus 1 line */
#cnnBotnav LI#cnnBotnav13:hover UL,
#cnnBotnav LI#cnnBotnav13 LI.cnnOver UL { margin-left:-93px; width:168px; } /* width of subnav minus width of MORE minus 1 line */
#cnnBotnav LI#cnnBotnav13 UL LI { width:168px; }
</style>
<!-- start botnav -->
<div class="cnnBotnav">
<div>
<ul id="cnnBotnav" style="height:29px;overflow:hidden;">
<li id="cnnBotnav0" nav="nfl">
<a href="/football/nfl/?eref=sinav"><strong>NFL</strong></a>
</li>
<li id="cnnBotnav1" nav="ncaaf">
<a href="/football/ncaa/?eref=sinav"><strong>COLLEGE FOOTBALL</strong></a>
</li>
<li id="cnnBotnav2" nav="mlb">
<a href="/baseball/mlb/?eref=sinav"><strong>MLB</strong></a>
</li>
<li id="cnnBotnav3" nav="nba">
<a href="/basketball/nba/?eref=sinav"><strong>NBA</strong></a>
</li>
<li id="cnnBotnav4" nav="ncaabb">
<a href="/basketball/ncaa/?eref=sinav"><strong>COLLEGE BB</strong></a>
</li>
<li id="cnnBotnav5" nav="golf">
<a href="http://www.golf.com/?eref=sinav"><strong>GOLF</strong></a>
</li>
<li id="cnnBotnav6" nav="nhl">
<a href="/hockey/nhl/?eref=sinav"><strong>NHL</strong></a>
</li>
<li id="cnnBotnav7" nav="racing">
<a href="/racing/?eref=sinav"><strong>RACING</strong></a>
</li>
<li id="cnnBotnav8" nav="soccer">
<a href="/soccer/?eref=sinav"><strong>SOCCER</strong></a>
</li>
<li id="cnnBotnav9" nav="boxmma">
<a href="/mma/?eref=sinav"><strong>MMA &amp; BOXING</strong></a>
</li>
<li id="cnnBotnav11" nav="tennis">
<a href="/tennis/?eref=sinav"><strong>TENNIS</strong></a>
</li>
<li id="cnnBotnav12" nav="more">
<a href="/more/?eref=sinav"><strong>MORE</strong></a>
</li>
<li id="cnnBotnav13" nav="video">
<a href="/video/?eref=sinav"><strong>VIDEO</strong></a>
</li>
</ul>
</div>
</div>
<!-- end botnav -->
<div class="cnnViewerAd"><script type="text/javascript">iframeAdFactory.getAd('i_728x90', 728, 90, new Array('728x90','101x1'), true);</script></div>
<!-- start scoreboard ticker -->
<div id="scoreticker" class="stMLB">
<div id="stScrollWrap">
<a href="" class="stScrollControl left disabled"></a>
<a href="" class="stScrollControl right"></a>
<div id="stScroller"></div>
</div>
</div>
<!-- end scoreboard ticker -->
<!-- end contentHeader-->
<!-- start scoreboard -->
<div class="cnngScoreboardNoLastPlay">
<div class="cnngScoreboard">
<div class="cnnLeft">
<div>&nbsp;
</div>
<table border="0" cellpadding="0" cellspacing="0">
<tr class="cnnRow0">
<td class="cnnCol0">&nbsp;</td>
<td class="cnnCol1">1</td>
<td class="cnnCol2">2</td>
<td class="cnnCol3">3</td>
<td class="cnnCol4">4</td>
<td class="cnnCol5">5</td>
<td class="cnnCol6">6</td>
<td class="cnnCol7">7</td>
<td class="cnnCol8">8</td>
<td class="cnnCol9">9</td>
<td class="cnnColR">R</td>
<td class="cnnColH">H</td>
<td class="cnnColE">E</td>
</tr>
<tr class="cnnRow1">
<td class="cnnCol0"><a href="/baseball/mlb/teams/tigers/">TIGERS</a></td>
<td class="cnnCol1">&nbsp;</td>
<td class="cnnCol2">&nbsp;</td>
<td class="cnnCol3">&nbsp;</td>
<td class="cnnCol4">&nbsp;</td>
<td class="cnnCol5">&nbsp;</td>
<td class="cnnCol6">&nbsp;</td>
<td class="cnnCol7">&nbsp;</td>
<td class="cnnCol8">&nbsp;</td>
<td class="cnnCol9">&nbsp;</td>
<td class="cnnColR">&nbsp;</td>
<td class="cnnColH">&nbsp;</td>
<td class="cnnColE">&nbsp;</td>
</tr>
<tr class="cnnRow2">
<td class="cnnCol0"><a href="/baseball/mlb/teams/royals/">ROYALS</a></td>
<td class="cnnCol1">&nbsp;</td>
<td class="cnnCol2">&nbsp;</td>
<td class="cnnCol3">&nbsp;</td>
<td class="cnnCol4">&nbsp;</td>
<td class="cnnCol5">&nbsp;</td>
<td class="cnnCol6">&nbsp;</td>
<td class="cnnCol7">&nbsp;</td>
<td class="cnnCol8">&nbsp;</td>
<td class="cnnCol9">&nbsp;</td>
<td class="cnnColR">&nbsp;</td>
<td class="cnnColH">&nbsp;</td>
<td class="cnnColE">&nbsp;</td>
</tr>
</table>
</div>
<div class="cnnRight">
<ol>
<li class="cnnItem4">8:10 PM ET
</li>
</ol>
<ul>
<li class="cnnItem0"><strong>Tigers</strong><a href="/baseball/mlb/players/7590/"><img src="http://i.cdn.turner.com/si/.e1d/img/4.0/global/baseball/mlb/players/7590_small.jpg" border="0" width="50" height="76" alt="Verlander" title="Verlander"></a><a href="/baseball/mlb/players/7590/">
<div class="cnnLine0">Verlander</div>
<div class="cnnLine4">0-1</div>
<div class="cnnLine5">2.2&nbsp;ERA</div>
<div class="cnnLine6">&nbsp;</div>
<div class="cnnLine7">&nbsp;</div></a></li>
<li class="cnnItem1"><strong>Royals</strong><a href="/baseball/mlb/players/8932/"><img src="http://i.cdn.turner.com/si/.e1d/img/4.0/global/baseball/mlb/players/8932_small.jpg" border="0" width="50" height="76" alt="Duffy" title="Duffy"></a><a href="/baseball/mlb/players/8932//">
<div class="cnnLine0">Duffy</div>
<div class="cnnLine4">1-0</div>
<div class="cnnLine5">0&nbsp;ERA</div>
<div class="cnnLine6">&nbsp;</div>
<div class="cnnLine7">&nbsp;</div></a></li>
</ul>
</div>
</div>
</div>
<!-- end scoreboard -->
<!-- start navbar -->
<div class="cnngNavbar">
<table border="0" cellpadding="0" cellspacing="0">
<tr>
<td class="cnnCol0"><span>PREVIEW</span></td>
<td class="cnnCol0"><a href="40630_matchup.html">MATCHUP</a></td></li>
<td class="cnnCol3"><a href="40630_fancomment.html">FAN COMMENTS</a></td>
</tr>
</table>
</div>
<!-- end navbar -->
<!-- start content -->
<div class="cnngContent">
<div class="cnngPreview">
<div class="cnnLeft">
<!-- REAPFINDREPLACE:20120515:/.element/ssi/story/4.1/wires/ap/expired_story.html:/baseball/mlb/gameflash/2012/04/16/40630_preview.html-->
<h1>Tigers-Royals Preview</h1>
<p>
<span class="cnnDataLinked"><a href="/baseball/mlb/players/7590/index.html">Justin Verlander</a></span>
has pitched well in each of his first two starts, though he doesn't have a win to show for those efforts.
</p>
<p>
He hasn't had much trouble earning victories against the
<span class="cnnDataLinked"><a href="/baseball/mlb/teams/royals/index.html">Kansas City Royals</a></span>
.
</p>
<p>
Verlander looks to continue his mastery of the Royals when the
<span class="cnnDataLinked"><a href="/baseball/mlb/teams/tigers/index.html">Detroit Tigers</a></span>
visit Kauffman Stadium in the opener of a three-game series Monday night.
</p>
<p>
The reigning AL
<span class="cnnDataLinked"><a href="/baseball/mlb/players/49534/index.html">Cy Young</a></span>
winner and MVP had a 2-0 lead through eight innings in both of his outings, but the Tigers weren't able to hold the lead.
</p>
<p>Verlander (0-1, 2.20 ERA) allowed two hits before running into trouble in the ninth against Tampa Bay on Wednesday, getting
charged with four runs in 8 1-3 innings of a 4-2 defeat.
</p>"Once a couple guys got on, really the first time I've cranked it up like that - and lost a little bit of my consistency that
I'd had all day," Verlander said. "It's inexcusable. This loss rests solely on my shoulders."
<p>The right-hander did his part in his opening-day start against Boston on April 5, allowing two hits before the bullpen faltered.
Detroit ended up winning 3-2 with a run in the bottom of the ninth, though Verlander didn't earn a decision.
</p>
<p>That hasn't been the case in his last four starts against the Royals, winning each with a 1.82 ERA. Verlander is 13-2 with
a 2.40 ERA in 19 career starts versus Kansas City, and another win will give him more victories than he has against any other
team. He's also beaten Cleveland 13 times.
</p>
<p>Verlander is 8-2 with a 1.82 ERA lifetime at Kauffman Stadium, where the Royals (3-6) were swept in a three-game series against
the Indians with Sunday's 13-7 loss.
</p>
<p>
<span class="cnnDataLinked"><a href="/baseball/mlb/players/7634/index.html">Billy Butler</a></span>
, who is 14 for 39 (.359) with two homers off Verlander, had an RBI single and is hitting .364 with four doubles and a homer
during a five-game hitting streak.
</p>
<p>
Royals pitchers allowed seven home runs, 17 extra-base hits and 32 runs in the series, and manager
<span class="cnnDataLinked"><a href="/baseball/mlb/players/1716/index.html">Ned Yost</a></span>
turned to outfielder
<span class="cnnDataLinked"><a href="/baseball/mlb/players/7899/index.html">Mitch Maier</a></span>
in the ninth to pitched a scoreless inning Sunday.
</p>"Let's hope it doesn't happen again," Maier said. "I don't like to be put in that situation, but we needed an inning."
<p>
Kansas City will look to bounce back with the help of another solid outing from
<span class="cnnDataLinked"><a href="/baseball/mlb/players/8932/index.html">Danny Duffy</a></span>
(1-0, 0.00), who allowed one hit and struck out eight in six innings of a 3-0 win over Oakland on Tuesday.
</p>
<p>The left-hander will be seeking his first win against Detroit after going 0-2 with a 5.63 ERA in three starts versus the Tigers
as a rookie.
</p>
<p>
<span class="cnnDataLinked"><a href="/baseball/mlb/players/7129/index.html">Gerald Laird</a></span>
was a triple short of the cycle and helped the Tigers (6-3) salvage the finale of a three-game series with a 5-2 victory over
Chicago on Sunday.
</p>
<p>
<span class="cnnDataLinked"><a href="/baseball/mlb/players/8419/index.html">Rick Porcello</a></span>
allowed one run in 7 2-3 innings to give Detroit's starting rotation its first victory.
</p>"All the other starters have pitched well," Porcello said. "It's just the way it's happened so far."
<p>Verlander allowed three runs in seven innings of a 4-3 win over the Royals on Aug. 6, beating Duffy, who gave up three runs
over five.
</p>
<!-- /REAPFINDREPLACE:20120515:/.element/ssi/story/4.1/wires/ap/expired_story.html:/baseball/mlb/gameflash/2012/04/16/40630_preview.html-->
<p class="cnnLast">
<a href="http://biz.stats.com/" target="new">&#169; 2011 STATS LLC <img src="http://i.a.cnn.net/si/images/STATSlogo.gif" align="absmiddle" alt="STATS, Inc"></a>
</p>
</div>
<div class="cnnRight">
<div class="cnngCommentsBox" id="cnngCommentsBox">
<div class="cnngComments">
<div class="cnnHolder">
<div id="fanComments">
<iframe src="http://www.fannation.com/gameday/gameflash_game_comments/320416107?sport_id=2" width="397" height="390" marginwidth="0" scrolling="no" frameborder="0"></iframe>
</div>
</div>
</div>
<div class="cnn_footer">
<div class="cnngToggleOn"><a href="javascript:hidediv();">TURN COMMENTS <span>OFF</span></a></div>
<div class="cnngToggleOff"><a href="javascript:showdiv();">TURN COMMENTS <span>ON</span></a></div>
</div>
</div>
</div>
</div>
</div>
<!-- end content -->
<!-- start contentFooter -->
<div class="cnnWideSL"><script type="text/javascript">adsonar_placementId=1488671;adsonar_pid=769769;adsonar_ps=-1;adsonar_zw=978;adsonar_zh=150;</script><script>cnnad_createSL();</script></div>
<!-- start footerbox -->
<div class="cnnFooterBox">
<div class="cnnHolder">
<div class="cnnRight">
<dl>
<dt><a href="/"><img src="http://i.cdn.turner.com/si/.element/img/4.1/global/footer_logo.jpg" alt="SI.com" title="SI.com"/></a></dt>
<dd><span>Hot Topics:</span> <a href="/2012/writers/peter_king/04/16/countdown/index.html" title="Peter King: MMQB"class="cnnFirst">Peter King: MMQB</a> <a href="http://nhl-red-light.si.com/2012/04/16/mayhem-reigns-in-stanley-cup-playoffs/" title="NHL Playoffs" target="new" >NHL Playoffs</a> <a href="/2012/writers/george_schroeder/04/16/arkansas-football-petrino/index.html" title="Bobby Petrino">Bobby Petrino</a> <a href="/2012/baseball/mlb/04/16/valentine.youkilis.ap/index.html" title="Bobby Valentine">Bobby Valentine</a> <a href="/2012/writers/michael_mccann/04/16/roger.clemens.trial.preview/index.html" title="Roger Clemens">Roger Clemens</a> <a href="/2012/baseball/mlb/04/16/power.rankings/index.html" title="MLB Power Rankings">MLB Power Rankings</a> <a href="/2012/writers/richard_rothschild/04/13/jackie.robinson/index.html" title="Jackie Robinson">Jackie Robinson</a> </dd>
</dl>
<div class="cnnClear"></div>
<ul>
<li><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1002348.html" target="_blank" rel="nofollow">SUBSCRIBE TO SI</a></li>
<li><a href="http://www.sportsillustratedeverywhere.com" target="_blank" rel="nofollow">DIGITAL EDITION</a></li>
<li><a href="/mobile/">SI MOBILE</a></li>
<li><a href="/2010/about_us/jobs/">JOBS</a></li>
<li><a href="/sitemap/">SITE MAP</a></li>
<li><a href="https://subscription.si.com/storefront/subscribe-to-sports-illustrated/link/1003862.html" target="_blank" rel="nofollow">GIVE THE GIFT OF SI</a></li>
<li><a href="http://sipictures.com/" target="_blank" rel="nofollow">SI PICTURE SALES</a></li>
<li><a href="http://www.sportsillustratedsnapshot.com" target="_blank" rel="nofollow">PICTURES OF THE DAY</a></li>
<li><a href="/about_us/">ABOUT US</a></li>
<li><a href="http://simediakit.com" target="_blank" rel="nofollow">SI MEDIA KITS</a></li>
<li><a href="http://www.sicovers.com/default.aspx?utm_source=sicom&utm_medium=ftr&utm_campaign=icrefer&xid=siftr" target="_blank" rel="nofollow">SI COVER COLLECTION</a></li>
<li><a href="http://sicustomerservice.com/" target="_blank" rel="nofollow">SI CUSTOMER SERVICE</a></li>
<li><a href="/2008/magazine/si.books/">SI BOOKS</a></li>
<li><a href="/about_us/feedback/">CONTACT US</a></li>
<li><a href="/services/rss/">ADD RSS HEADLINE</a></li>
</ul>
<div class="cnnClear"></div>
<div class="cnnCopyright">
<style>
.cnnFooterBox .cnnHolder { overflow:hidden; }
.cnnFooterBox .cnnRight DIV.cnnCopyright { line-height:16px;padding-top:2px;text-align:left; }
.cnnFooterBox .cnnRight DIV.cnnCopyright IMG { float:left;margin:0 6px 14px 0; }
.cnnFooterBox .cnnRight DIV.cnnCopyright IMG#cnnFooterAdOpt { float:none;margin:0 0 0 6px;vertical-align:bottom; }
</style>
<img src="http://i.cdn.turner.com//si/.element/img/4.1/global/logo_footer_turner.png" alt="Turner - SI Digital"/>
<script type="text/javascript">if( ( ( document.location.pathname ).indexOf( '/basketball/nba' ) >= 0 ) || ( ( document.location.pathname ).indexOf( '/video/nba' ) == 0 ) ) { document.write( 'TM & &#169; 2012 Turner Broadcasting System, Inc. A Time Warner Company. All Rights Reserved. SI.com is part of CNN Digital Network, which is part of the Turner Digital Network.' ); } else { document.write( 'TM & &#169; 2012 Turner Broadcasting System, Inc. A Time Warner Company. All Rights Reserved. SI.com is part of CNN Digital Network, which is part of the Turner Digital Network.' ); }</script><noscript>TM & &#169; 2012 Turner Broadcasting System, Inc. A Time Warner Company. All Rights Reserved. SI.com is part of CNN Digital Network, which is part of the Turner Digital Network.</noscript>
<br/> <a href="/interactive_legal.html" rel="nofollow">Terms</a> under which this service is provided to you. Read our <a href="/privacy/" rel="nofollow">privacy guidelines</a>, <a href="https://subscription.timeinc.com/storefront/privacy/si/generic_privacy_new.html?dnp-source=E#california" rel="nofollow">your California privacy rights</a>, and <a href="http://subscription-assets.timeinc.com/prod/assets/themes/magazines/default/template-resources/html/legal/ti-corp-behavioral.html">ad choices<img src="http://i.cdn.turner.com/si/.element/img/4.1/global/logo_adchoices.gif" id="cnnFooterAdOpt"/></a>.
</div>
</div>
<div class="cnnLeft"><a href="http://sportsillustrated.cnn.com/vault/cover/featured/11730/index.htm?xid=sivcoverhome"><img style="vertical-align:bottom;" title="SI Cover" alt="SI Cover" src="http://i.cdn.turner.com/si/si_online/covers/images/2012/0416_thumb.jpg"></a><a href="http://www.sportsillustratedeverywhere.com/?xid=sivcoverhome"><img style="vertical-align:bottom;" src="http://i.cdn.turner.com/sivault/.element/img/1.0/read_all_articles_96x12.gif" alt="Read All Articles" border="0" width="96" height="12"></a><a href="http://www.sicovers.com/ils.aspx?p=SPR20120416golf&utm_source=sivault&utm_medium=inet&utm_campain=icrefer &xid=sivcoverhome" target="_blank"><img style="vertical-align:bottom;" src="http://i.cdn.turner.com/sivault/.element/img/1.0/buy_cover_reprint.gif" alt="Buy Cover Reprint" border="0" width="96" height="12"></a>
</div>
</div>
</div>
<!-- end footerbox -->
<!-- start searchbar -->
<div class="cnnSearchFooter">
<div class="cnnCenter"><form method="get" action="http://sportsillustrated.cnn.com/search/" name="footer_search"><input id="searchInputFooter" type="text" name="text" class="cnnLeft"/><input type="image" src="http://i.cdn.turner.com/si/.element/img/4.1/global/search.gif" alt="Search" title="Search" class="cnnRight"/></form></div>
</div>
<!-- end searchbar -->
<!--START OF PAGELINKS.JS-->
<script language="Javascript">// Post Processing code to update links with tracking references
// Reduce the current location to a path-like string: strip the leading
// "http://host" part, then everything from the first "?" onward.
var url = window.location.href.toString();
url = url.replace(/http:\/\/[^\/]*/, '');
url = url.replace(/\?.*$/, '');
// All links on page
var links = document.getElementsByTagName('a');
for (var i=0; i < links.length; i++) {
var link = links[i];
// Anchors with a falsy href are skipped.
if (link.href); else continue;
// Rewrite the first ".html/" to ".html" when it occurs past position 0.
if (link.href.indexOf('.html/')>0) { siLog.debug('Fix trail slash - ',link.href); link.href = link.href.replace(/\.html\//,'.html'); }
if (!cnnPage.isHomepage) {
// Loop through links, add erefs where expected
// hrefs that start with the fannation.com URL
if (link.href.indexOf('http://www.fannation.com/') == 0) {
cnnAddQ( link, 'eref=fromSI' );
}
// hrefs containing "/vault" (past position 0), unless the current path is "/"
if (url != '/' && link.href.indexOf('/vault') > 0) {
cnnAddQ( link, 'eref=sisf' );
}
// hrefs containing "/danpatrick" but not ".mp3", unless the current path
// itself starts with "/danpatrick"
if (url.indexOf('/danpatrick') != 0 && link.href.indexOf('/danpatrick') > 0 && link.href.indexOf('.mp3') < 0) {
cnnAddQ( link, 'eref=fromSI' );
}
}
// Blank the title attribute when it is equal to the link's inner HTML.
if (link.innerHTML == link.getAttribute('title')) {
link.setAttribute('title','');
}
}
// Append the query fragment `add` to link.href. Links whose href contains
// "javascript" (compared after lower-casing) are left unchanged.
function cnnAddQ (link, add) {
if (link.href.toLowerCase().indexOf('javascript') == -1) {
// A "?" found past position 0 means the fragment is joined with "&";
// otherwise it is joined with "?".
if (link.href.indexOf('?') > 0) link.href = link.href + '&' + add;
else link.href = link.href + '?' + add;
}
}
// Add whitespace to cnnClear
var breaks = $c('cnnClear','div');
/* Homepage */
if (cnnPage.isHomepage) {
cnnTagHPLinks();
/* iPad */
if(navigator.userAgent.indexOf('iPad')>-1) {
$e('cnnShareRow_mobile').href='http://ax.itunes.apple.com/WebObjects/MZStore.'
+'woa/wa/browserRedirect?url=itms%253A%252F%252Fax.itunes.apple.com%252FWebObj'
+'ects%252FMZStore.woa%252Fwa%252FviewSoftware%253Fid%253D329510739%2526mt%253D8';
}
/* Poll frame height issue */
if ($e('cnnPollFrame')) { $e('cnnPollFrame').setAttribute('height','169'); }
}</script>
<!--END OF PAGELINKS.JS-->
</div>
<div><!-- move tracking out of cnnpage -->
<!-- ADBP/JSMD -->
<!-- ADBP Meta Data -->
<script type="text/javascript" src="http://i.cdn.turner.com/si/.e/js/4.1/global/jsmd/metadata.js"></script>
<!-- /ADBP Meta Data -->
<!-- JSMD Code -->
<script language="JavaScript" type="text/javascript" src="http://i.cdn.turner.com/si/.element/js/4.1/global/jsmd/jsmd.js"></script>
<script language="JavaScript">
<!-- $pathname is defined in metadata.js
if($pathname.indexOf("/.element/ssi/ads.iframes/") == -1 && $pathname.indexOf("/doubleclick/dartiframe.html") == -1) {
var jsmd=_jsmd.init();
if(document.referrer !== window.location.href){
jsmd.send();
}
}
//-->
</script>
<!-- / End JSMD Code -->
<!-- /ADBP/JSMD -->
</div>
<div style="font-size:1px;line-height:1px;">
<div><img src="/cookie.crumb" width="1" height="1"></div>
</div>
<img src="http://i.cdn.turner.com/si/.e/img/4.0/global/pixels/blank_pixel.gif" alt="" id="TargetImageDE" name="TargetImageDE" onload="cnnad_getDEAdHeadCookie(this)" height="1" width="1">
<script language="JavaScript">
siTracking.init();
</script>
<script language="JavaScript">
//ADM
cnnad_sendADMData();
cnnad_ugsync();
</script>
<!-- TIIAD -->
<script type="text/javascript">
// Build and return the label string: a fixed base label, followed by "." and
// the value of jsmd.get("m:page.section[0]") when that value is a non-empty
// string.
function siQuantcast()
{
var lb = "Time Inc News Business and Sports,Sports Illustrated";
// A falsy lookup result collapses to "" so that nothing is appended below.
var lb_ch = (jsmd.get("m:page.section[0]") ? jsmd.get("m:page.section[0]") : "");
lb+=(lb_ch != null && typeof(lb_ch) == "string" && lb_ch.length > 0) ? "." + lb_ch:"";
return lb;
}
_qoptions={
qacct:"p-5dyPa639IrgIw",
labels:siQuantcast()
};
</script>
<script type="text/javascript" src="http://edge.quantserve.com/quant.js"></script>
<noscript><img src="http://pixel.quantserve.com/pixel/p-5dyPa639IrgIw.gif?labels=Time Inc News Business and Sports,Sports Illustrated" style="display: none;" border="0" height="1" width="1" alt="Quantcast"/></noscript>
<script src="http://js.revsci.net/gateway/gw.js?csid=H07710&auto=t" type="text/javascript"></script>
<!-- /TIIAD -->
<script src="http://i.cdn.turner.com/si/.e1d/js/4.1/global/pagelinks.js" type="text/javascript"></script>
<script src="http://i.cdn.turner.com/si/.e1d/js/4.1/global/subnav.js" type="text/javascript"></script>
<!-- end contentFooter -->
<!--[if IE 6]></div></div><![endif]--><!--[if IE 7]></div></div><![endif]--><!--[if IE 8]></div></div><![endif]-->
</body>
</html>

@ -0,0 +1,39 @@
import os
import unittest
from readability import Document
SAMPLES = os.path.join(os.path.dirname(__file__), 'samples')
def load_sample(filename):
    """Return the full content of a sample file.

    ``filename`` is joined onto the ``SAMPLES`` directory. The file is read
    inside a ``with`` block so the handle is closed as soon as the content
    has been read, rather than being left open until garbage collection.
    """
    with open(os.path.join(SAMPLES, filename)) as sample_file:
        return sample_file.read()
class TestArticleOnly(unittest.TestCase):
    """Check that the processed article can be requested on its own.

    Instead of a complete html document, a caller may ask for only the divs
    of processed content, leaving them free to wrap the article in whatever
    view or application markup they like.
    """

    def setUp(self):
        """No fixtures are needed before each test."""

    def tearDown(self):
        """Nothing to clean up after each test."""

    def test_si_sample(self):
        """Using the si sample, make sure we can get the article alone."""
        html = load_sample('si-game.sample.html')
        summary = Document(html).summary(document_only=True)
        # The result is expected to open directly with a classed div.
        opening = summary[:12]
        self.assertEqual('<div class="', opening)
Loading…
Cancel
Save