Jerry Merge: First working find_next_page_link case

0.3.0.dev
Jerry Charumilind 13 years ago committed by Richard Harding
parent edc0e4d4c6
commit 2505c78e5b

@ -0,0 +1,307 @@
"""
This module provides a regression test for results of running the readability
algorithm on a variety of different real-world examples. For each page in the
test suite, a benchmark was captured that represents the current readability
results. Note that these are not necessarily ideal results, just the ones used
as a benchmark.
This allows you to tweak and change the readability algorithm and see how it
changes existing results, hopefully for the better.
"""
from lxml.html import builder as B
import lxml.html
import lxml.html.diff
import os
import os.path
import re
import readability
import sys
import unittest
import yaml
# Extension of the per-test specification files found in the test data
# directory; every file ending in it defines one test.
YAML_EXTENSION = '.yaml'
# Suffixes appended to a test's name to form the file names of, in order:
# the original page, the benchmark readable output, the freshly computed
# result, and the diff between benchmark and result.
ORIGINAL_SUFFIX = '-orig.html'
READABLE_SUFFIX = '-rdbl.html'
RESULT_SUFFIX = '-result.html'
DIFF_SUFFIX = '-diff.html'
# Directory listed for test specifications (relative path).
TEST_DATA_PATH = 'regression_test_data'
# Directory that receives the generated HTML output (relative path).
TEST_OUTPUT_PATH = 'regression_test_output'
# Location of the generated summary page, inside the output directory.
TEST_SUMMARY_PATH = os.path.join(TEST_OUTPUT_PATH, 'index.html')
# Stylesheet embedded in the <style> element of the summary page: a bordered
# results table, with rows of class "skipped" shown in gray.
SUMMARY_CSS = '''
table, th, td {
border: 1px solid black;
border-collapse: collapse;
font-family: Georgia, 'Times New Roman', serif;
}
table {
margin: auto;
}
.skipped {
color: gray;
}
td, th {
font-size: 1.2em;
border: 1px solid black;
padding: 3px 7px 2px 7px;
}
th {
font-size: 16px;
text-align: left;
padding-top: 5px;
padding-bottom: 4px;
}
'''
# Stylesheet embedded in every per-test output page: article typography plus
# green highlighting for <ins> and red highlighting for <del> diff markup.
READABILITY_CSS = '''
#article {
margin: 0 auto;
max-width: 705px;
min-width: 225px;
font-family: Georgia, 'Times New Roman', serif;
font-size: 19px;
line-height: 29px;
}
#article p {
font-size: 19px;
line-height: 29px;
margin: 19px 0px 19px 0px;
}
ins {
background-color: #C6F7C3;
text-decoration: none;
}
ins img {
border-width: 3px;
border-style: dotted;
border-color: #51B548;
}
del {
background-color: #F7C3C3;
text-decoration: none;
}
del img {
border-width: 3px;
border-style: dotted;
border-color: #D12626;
}
'''
class ReadabilityTest:
    """Static description of one regression test case.

    Attributes mirror the constructor arguments one-to-one:
    ``dir_path`` (directory holding the test's files), ``enabled``
    (whether the test should be run), ``name``, ``desc`` (description),
    ``notes``, ``orig_path`` (path of the original HTML page) and
    ``rdbl_path`` (path of the benchmark readable HTML).
    """

    def __init__(
            self, dir_path, enabled, name, desc, notes, orig_path, rdbl_path
    ):
        # Identification and bookkeeping.
        self.name, self.desc, self.notes = name, desc, notes
        self.enabled = enabled
        # File locations.
        self.dir_path = dir_path
        self.orig_path, self.rdbl_path = orig_path, rdbl_path
class ReadabilityTestData:
    """A test together with the contents of its two input files.

    ``test`` is the test description, ``orig_html`` the original page and
    ``rdbl_html`` the benchmark readable output.
    """

    def __init__(self, test, orig_html, rdbl_html):
        self.test = test
        self.orig_html, self.rdbl_html = orig_html, rdbl_html
class ReadabilityTestResult:
    """Outcome of running one test.

    ``test_data`` is the input the test ran on, ``result_html`` the HTML
    produced by the run, and ``diff_html`` the HTML diff between the
    benchmark and that result.
    """

    def __init__(self, test_data, result_html, diff_html):
        self.test_data = test_data
        self.result_html, self.diff_html = result_html, diff_html
def read_yaml(path):
    """Parse the YAML file at ``path`` and return the loaded object.

    ``yaml.safe_load`` is used rather than a bare ``yaml.load``: the latter
    can construct arbitrary Python objects from the input and, in recent
    PyYAML releases, cannot be called without an explicit ``Loader``.
    The test specifications only need plain mappings and scalars.
    """
    with open(path, 'r') as f:
        return yaml.safe_load(f)
def make_path(dir_path, name, suffix):
    """Return ``dir_path`` joined with the file name ``name`` + ``suffix``."""
    file_name = name + suffix
    return os.path.join(dir_path, file_name)
def make_readability_test(dir_path, name, spec_dict):
    """Build a ReadabilityTest from a parsed specification mapping.

    ``spec_dict`` must contain ``'test_description'``; ``'enabled'``
    (default ``True``) and ``'notes'`` (default ``''``) are optional.
    The original and benchmark file paths are derived from ``dir_path``
    and ``name`` using the module's file-name suffixes.

    Raises KeyError if ``'test_description'`` is missing.
    """
    is_enabled = spec_dict.get('enabled', True)
    notes_text = spec_dict.get('notes', '')
    description = spec_dict['test_description']
    original_path = make_path(dir_path, name, ORIGINAL_SUFFIX)
    benchmark_path = make_path(dir_path, name, READABLE_SUFFIX)
    return ReadabilityTest(
        dir_path,
        is_enabled,
        name,
        description,
        notes_text,
        original_path,
        benchmark_path
    )
def load_test_data(test):
    """Read the input files of ``test`` and wrap them in ReadabilityTestData.

    Returns ``None`` for a disabled test, without touching the file system.
    Files are opened with ``with`` so the handles are closed promptly
    instead of being left for the garbage collector.
    """
    if not test.enabled:
        return None
    with open(test.orig_path, 'r') as f:
        orig = f.read()
    with open(test.rdbl_path, 'r') as f:
        rdbl = f.read()
    return ReadabilityTestData(test, orig, rdbl)
def load_readability_tests(dir_path, files):
    """Create one ReadabilityTest per YAML specification in ``files``.

    ``files`` is a sequence of file names relative to ``dir_path``; names
    that do not end in ``YAML_EXTENSION`` are ignored. A test's name is
    its file name with that extension removed. Tests are returned in the
    order their files appear in ``files``.
    """
    tests = []
    for file_name in files:
        if not file_name.endswith(YAML_EXTENSION):
            continue
        # Strip the extension by length instead of a separate hard-coded
        # regex, so the filter and the name always agree with the constant.
        name = file_name[:-len(YAML_EXTENSION)]
        spec_dict = read_yaml(os.path.join(dir_path, file_name))
        tests.append(make_readability_test(dir_path, name, spec_dict))
    return tests
def execute_test(test_data):
    """Run readability on ``test_data`` and diff it against the benchmark.

    Returns ``None`` when ``test_data`` is ``None``; otherwise a
    ReadabilityTestResult holding the input data, the HTML of the new
    summary, and the HTML diff from the benchmark to that summary.
    """
    if test_data is None:
        return None

    summary = readability.Document(test_data.orig_html).summary()
    benchmark_html = test_data.rdbl_html
    diff_html = lxml.html.diff.htmldiff(benchmark_html, summary.html)
    return ReadabilityTestResult(test_data, summary.html, diff_html)
def element_string_lengths(elems):
    """Return, for each element, the length of its XPath string value."""
    lengths = []
    for elem in elems:
        text = elem.xpath('string()')
        lengths.append(len(text))
    return lengths
class ResultSummary():
    """Insertion and deletion statistics extracted from a result's diff.

    Attributes:
        insertions: total text length inside all ``<ins>`` elements.
        insertion_blocks: number of ``<ins>`` elements.
        deletions: total text length inside all ``<del>`` elements.
        deletion_blocks: number of ``<del>`` elements.
    """

    def __init__(self, result):
        fragment = lxml.html.fragment_fromstring(result.diff_html)

        inserted = fragment.xpath('//ins')
        self.insertions = sum(element_string_lengths(inserted))
        self.insertion_blocks = len(inserted)

        deleted = fragment.xpath('//del')
        self.deletions = sum(element_string_lengths(deleted))
        self.deletion_blocks = len(deleted)
def make_summary_row(test, result):
    """Build the summary-table row (a ``<tr>`` element) for one test.

    A disabled test yields a row of class ``skipped`` with ``N/A`` in the
    statistics and links cells. An enabled test yields its name, the
    insertion and deletion counts taken from ``result``'s diff, links to
    the original/benchmark/result/diff pages, and the test's notes.
    """
    if not test.enabled:
        return B.TR(
            B.CLASS('skipped'),
            B.TD('%s (SKIPPED)' % test.name),
            B.TD('N/A'),
            B.TD('N/A'),
            B.TD('N/A'),
            B.TD(test.notes)
        )

    stats = ResultSummary(result)
    # The original page lives in the data directory, one level up from the
    # summary page; the other three files sit next to the summary page.
    original_href = os.path.join(
        '..', TEST_DATA_PATH, test.name + ORIGINAL_SUFFIX
    )
    links_cell = B.TD(
        B.A('original', href=original_href),
        ' ',
        B.A('benchmark', href=test.name + READABLE_SUFFIX),
        ' ',
        B.A('result', href=test.name + RESULT_SUFFIX),
        ' ',
        B.A('diff', href=test.name + DIFF_SUFFIX)
    )
    return B.TR(
        B.TD(test.name),
        B.TD('%d (%d)' % (stats.insertions, stats.insertion_blocks)),
        B.TD('%d (%d)' % (stats.deletions, stats.deletion_blocks)),
        links_cell,
        B.TD(test.notes)
    )
def make_summary_doc(tests_w_results):
    """Build the summary page as an lxml ``<html>`` element.

    ``tests_w_results`` is an iterable of ``(test, result)`` pairs; each
    pair contributes one table row below the header row.
    """
    column_titles = (
        'Test Name',
        'Inserted (in # of blocks)',
        'Deleted (in # of blocks)',
        'Links',
        'Notes',
    )
    header_row = B.TR(*[B.TH(title) for title in column_titles])
    data_rows = [
        make_summary_row(test, result)
        for (test, result) in tests_w_results
    ]
    table = B.TABLE(B.TBODY(header_row, *data_rows))
    head = B.HEAD(
        B.TITLE('Readability Test Summary'),
        B.STYLE(SUMMARY_CSS, type='text/css')
    )
    return B.HTML(head, B.BODY(table))
def write_summary(path, tests_w_results):
    """Render the summary page for ``tests_w_results`` and write it to ``path``.

    ``tests_w_results`` is an iterable of ``(test, result)`` pairs.
    """
    doc = make_summary_doc(tests_w_results)
    # lxml.html.tostring() returns a byte string by default, so the file is
    # opened in binary mode; text mode rejects bytes on Python 3.
    with open(path, 'wb') as f:
        f.write(lxml.html.tostring(doc))
def add_css(doc):
    """Insert a ``<head>`` carrying the readability stylesheet into ``doc``.

    The new element becomes the first child of ``doc``, which is modified
    in place.
    """
    # NOTE(review): ``content`` is set as an attribute of <head> itself; it
    # looks like a Content-Type <meta> may have been intended -- confirm.
    doc.insert(
        0,
        B.HEAD(
            B.STYLE(READABILITY_CSS, type='text/css'),
            content='text/html; charset=utf-8'
        )
    )
def write_output_fragment(fragment, output_dir_path, test_name, suffix):
    """Write ``fragment`` as a styled HTML page in ``output_dir_path``.

    The HTML string ``fragment`` is parsed into a full document, given the
    readability stylesheet, serialized, and saved under the file name
    ``test_name`` + ``suffix``.
    """
    doc = lxml.html.document_fromstring(fragment)
    add_css(doc)
    html = lxml.html.tostring(doc)
    file_name = ''.join([test_name, suffix])
    path = os.path.join(output_dir_path, file_name)
    # lxml.html.tostring() returns a byte string by default, so the file is
    # opened in binary mode; text mode rejects bytes on Python 3.
    with open(path, 'wb') as f:
        f.write(html)
def write_result(output_dir_path, result):
    """Write the benchmark, diff and result pages of ``result``.

    Each page is written into ``output_dir_path`` under the test's name
    plus the suffix for that kind of page.
    """
    data = result.test_data
    pages = (
        (READABLE_SUFFIX, data.rdbl_html),
        (DIFF_SUFFIX, result.diff_html),
        (RESULT_SUFFIX, result.result_html),
    )
    for (suffix, html) in pages:
        write_output_fragment(html, output_dir_path, data.test.name, suffix)
def print_test_info(test):
    """Print one line with the test's name and description.

    The name is right-aligned in a 20-character column, and disabled
    tests are marked with a trailing `` (SKIPPED)``.
    """
    label = '%s' % test.name
    marker = '' if test.enabled else ' (SKIPPED)'
    print('%20s: %s%s' % (label, test.desc, marker))
def run_readability_tests():
    """Run every test in the data directory and write all output pages.

    For each test its info line is printed; tests that produced a result
    (i.e. enabled ones) get their benchmark, diff and result pages
    written. Finally the summary page covering all tests is written.
    """
    files = os.listdir(TEST_DATA_PATH)
    tests = load_readability_tests(TEST_DATA_PATH, files)
    test_datas = [load_test_data(t) for t in tests]
    results = [execute_test(t) for t in test_datas]
    # Create the output directory on first use; without it every write
    # below would fail with a missing-directory error.
    if not os.path.isdir(TEST_OUTPUT_PATH):
        os.makedirs(TEST_OUTPUT_PATH)
    for (test, result) in zip(tests, results):
        print_test_info(test)
        if result:
            write_result(TEST_OUTPUT_PATH, result)
    write_summary(TEST_SUMMARY_PATH, zip(tests, results))
def main():
    """Command-line entry point.

    When the first argument is ``unittest`` it is removed from
    ``sys.argv`` and control is handed to ``unittest.main()``; otherwise
    the readability regression tests are run.
    """
    wants_unittest = len(sys.argv) > 1 and sys.argv[1] == 'unittest'
    if not wants_unittest:
        run_readability_tests()
        return None
    # Drop the mode switch so unittest does not see it as one of its own
    # arguments.
    del sys.argv[1]
    return unittest.main()


if __name__ == '__main__':
    main()

@ -0,0 +1,664 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US">
<head>
<title>June Web browser stats: Rapid Release edition</title>
<!-- Begin CSS -->
<link rel="stylesheet" type="text/css" href="http://static.arstechnica.net//public/v6/styles/light/light.c.css?1309476728" media="screen" />
<link rel="stylesheet" type="text/css" href="http://static.arstechnica.net//public/v6/styles/print/print.css?1309476728" media="print" />
<!-- End CSS -->
<link rel="apple-touch-icon" href="http://static.arstechnica.net/apple-touch-icon.png" />
<link rel="canonical" href="http://arstechnica.com/web/news/2011/07/june-browser-stats-rapid-release-edition.ars" />
<link rel="shorturl" href="http://arst.ch/q4c" />
<link rel="shortlink" href="http://arst.ch/q4c" />
<link rev="canonical" href="http://arst.ch/q4c" />
<link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="Ars Technica" />
<link rel="shortcut icon" href="http://static.arstechnica.net/favicon.ico" />
<link rel="icon" type="image/x-icon" href="http://static.arstechnica.net/favicon.ico" />
<!-- Begin Feeds -->
<link rel="alternate" type="application/rssxml" title="The Web" href="http://feeds.arstechnica.com/arstechnica/web/" />
<link rel="alternate" type="application/rss+xml" title="All Articles " href="http://feeds.arstechnica.com/arstechnica/everything" />
<!-- End Feeds -->
<!-- C-razy IE9 stuff -->
<meta name="application-name" content="Ars Technica"/>
<meta name="msapplication-starturl" content="http://arstechnica.com/"/>
<meta name="msapplication-tooltip" content="Ars Technica: Serving the technologist for 1.2 decades"/>
<meta name="msapplication-task" content="name=News;action-uri=http://arstechnica.com/;icon-uri=http://arstechnica.com/favicon.ico"/>
<meta name="msapplication-task" content="name=Features;action-uri=http://arstechnica.com/features/;icon-uri=http://static.arstechnica.net/ie-jump-menu/jump-features.ico"/>
<meta name="msapplication-task" content="name=OpenForum;action-uri=http://arstechnica.com/civis/;icon-uri=http://static.arstechnica.net/ie-jump-menu/jump-forum.ico"/>
<meta name="msapplication-task" content="name=One Microsoft Way;action-uri=http://arstechnica.com/microsoft/;icon-uri=http://static.arstechnica.net/ie-jump-menu/jump-omw.ico"/>
<meta name="msapplication-task" content="name=Subscribe;action-uri=http://arstechnica.com/subscriptions/;icon-uri=http://static.arstechnica.net/ie-jump-menu/jump-subscribe.ico"/>
<!-- Begin Metadata -->
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="viewport" content="width=1000" />
<meta name="description" content="In our monthly look at the world of Web browser market share statistics, we take a look at the first impact of Mozilla's new Rapid Release policy for Firefox and also consider why some Chrome users aren't aboard Google's update bandwagon." />
<meta name="keywords" content="" />
<meta name="title" content="June Web browser stats: Rapid Release edition" />
<link rel="image_src" href="http://static.arstechnica.net/assets/2011/03/firefox-09-small-thumb-300x169-20442-f.jpg" />
<meta name="medium" content="news" />
<meta name="entry_id" content="51247" />
<meta property="og:title" content="June Web browser stats: Rapid Release edition"/>
<meta property="og:site_name" content="Ars Technica"/>
<meta property="og:image" content="http://static.arstechnica.net/assets/2011/03/firefox-09-small-thumb-300x169-20442-f.jpg"/>
<meta name="advertising" content="ask" />
<meta property="fb:admins" content="13703630" />
<!-- End Metadata -->
<!-- Entry - itbiz_general_computing -->
<style type="text/css" id="resource-styles"> </style>
<script type="text/javascript" src="/public/shared/scripts/da-1.5.js"></script>
<script type="text/javascript">
try {
cnp.ad.dart.setSite("ars.dart");
cnp.ad.dart.setZone('itbiz_general_computing');
//cnp.ad.dart.addParameterString('kw=june-browser-stats-rapid-release-edition;kw=07;kw=2011;kw=news;kw=web;');
cnp.ad.dart.addParameterString('mtfIFPath=/mt-static/plugins/ArsTheme/ad-campaigns/doubleclick/');
cnp.ad.emptyFrameSrc="/public/shared/scripts/empty.html";
cnp.ad.loaderFrameSrc="/public/shared/scripts/ad-loader-frame.html";
} catch(e) {}
</script>
<script type="text/javascript" charset="utf-8">
// In case someone on a desktop clicks a mobile #! link
var l = window.location;
if(l.hash.indexOf('#!') !== -1){
window.location = l.protocol + '//' + l.host + l.hash.slice(2);
}
</script>
</head>
<body class="individual">
<div id="page" class="">
<div id="masthead" class="">
<div id="logo"><a href="/"><img src="http://static.arstechnica.net//public/v6/styles/light/images/masthead/logo.png?1309476728" alt="Ars Technica: The Art of Technology" width="110" height="81" /></a></div>
<div id="ebc51ce07629d0e14d2fbc4236e44067" >
<script type="text/javascript">
var pbanner_start = new Date();
try {
var pbanner = cnp.ad.create(cnp.ad.refreshable, false);
//pbanner.addParameter({'dcopt':'ist'});
pbanner.addParameterString('kw=june-browser-stats-rapid-release-edition;kw=07;kw=2011;kw=news;kw=web;');
pbanner.addParameter({'sz': '728x90' });
} catch(e) {}
</script>
</div>
</div>
<div id="search-navigation">
<div id="search">
<a id="search-link" href="http://www.google.com/cse?cx=011835048811694782689:7zpko-isndo">Search</a>
<div class="form">
<span>Search:</span>
<form action="http://www.google.com/cse" id="search-form">
<div>
<input type="hidden" value="011835048811694782689:7zpko-isndo" name="cx"/>
<input type="hidden" value="UTF-8" name="ie"/>
<input type="text" id="search-form-text" value="" name="q"/>
</div>
</form>
</div>
</div>
<div id="navigation">
<ul id="primary-navigation">
<li class=""><a href="/">All</a></li>
<li class="apple"><a href="/apple/">Apple</a></li>
<li class="ask-ars"><a href="/ask-ars/">Ask Ars</a></li>
<li class="business"><a href="/business/">Business</a></li>
<li class="gadgets"><a href="/gadgets/">Gadgets</a></li>
<li class="gaming"><a href="/gaming/">Gaming</a></li>
<li class="microsoft"><a href="/microsoft/">Microsoft</a></li>
<li class="open-source"><a href="/open-source/">Open Source</a></li>
<li class="science"><a href="/science/">Science</a></li>
<li class="tech-policy"><a href="/tech-policy/">Tech Policy</a></li>
<li id="primary-navigation-more" style="display:none;">
More
<ul >
<li><a href="/hardware/">Hardware</a></li>
<li><a href="/media/">Media</a></li>
<li><a href="/security/">Security</a></li>
<li><a href="/software/">Software</a></li>
<li><a href="/staff/">Staff</a></li>
<li><a href="/telecom/">Telecom</a></li>
<li><a href="/web/">Web</a></li>
<li style="padding:0;"><span style="display:inline;background-color: #920404; padding: 3px; color:white; -webkit-border-radius: 4px;">New</span> <a style="display:inline;" href="/site/tv.ars" title="Ars Technica TV">Ars.TV</a></li>
</ul>
</li>
</ul>
<ul id="secondary-navigation" class="web">
<li class="news selected"><a href="/web/news/">News</a></li>
<li class="guides"><a href="/web/guides/">Guides</a></li>
<li class="reviews"><a href="/web/reviews/">Reviews</a></li>
</ul>
<ul id="auxiliary-navigation">
<li class="subscribe"><a href="/subscriptions/">Upgrade to a Premier Subscription</a>
</li>
<li class="customize" style="display:none;">
<a href="#">Customize ▾</a>
<ul>
<li>
<p>Site Theme:</p>
<label><input type="radio" checked="checked" value="light.css" class="site-style" name="site-style" /> White</label>
<label><input type="radio" value="dark.css" class="site-style" name="site-style" /> Black</label>
</li>
<li>
<p>Choose body font:</p>
<label><input type="radio" checked="checked" value="arial" class="body_font" name="body_font" /> Arial</label>
<label><input type="radio" value="helvetica" class="body_font" name="body_font" /> Helvetica</label>
</li>
<li>
<p>Layout (beta):</p>
<label><input type="radio" checked="checked" value="normal" class="fp_layout" name="fp_layout" /> Normal</label>
<label><input type="radio" value="compact" class="fp_layout" name="fp_layout" /> Compact</label>
</li>
</ul>
</li>
<li class="openforum"><a href="http://arstechnica.com/civis/">OpenForum</a></li>
<li class="login-join"><a href="/civis/ucp.php?mode=login&amp;return_to=http%3A%2F%2Farstechnica.com%2Fweb%2Fnews%2F2011%2F07%2Fjune-browser-stats-rapid-release-edition.ars">Login/Join</a></li>
</ul>
</div>
</div>
<div id="main">
<div id="silo-header" class="">
<h1 class="web"><a href="/web/" title="Go to The Web">The Web</a></h1>
</div>
<div id="content" class="normal"> <div id="content-inner">
<div id="story">
<h2 class="title">June Web browser stats: Rapid Release edition</h2>
<div class="byline"><span class="author">By <a rel="author" href="/author/peter-bright/">Peter Bright</a>
</span> | <span class="posted"><span class="published updated"><span class="name">Published </span> <abbr class="timeago datetime" title="2011-07-06T16:00:00Z">July 6, 2011 11:00 AM</abbr></span><span class="modified" style="display:none;"><span class="name">Last updated </span> <abbr class="timeago datetime" title="2011-07-06T16:33:33Z">July 6, 2011 11:33 AM</abbr></span></span></div>
<div class="story-image" style="width:300px;">
<img width="300" src="http://static.arstechnica.net/opensource/firefox-09-small.jpg" alt="" />
</div>
<div id="" class="body" style="">
<!--body--><p>June brought the first result of Mozilla's new Rapid Release strategy for Firefox. Firefox 4, just three months old, was superceded by the all-new but not-too-different <a href="http://arstechnica.com/open-source/news/2011/06/firefox-5-released-arrives-only-three-months-after-firefox-4.ars">Firefox 5</a>. Firefox's market growth was all but ended by the release of Chrome, and Mozilla is hoping that by adopting a similar release schedule to Google, it will be able to reignite the growth of its user base.</p><!--page 1-->
<p>Internet Explorer is down 0.59 points at 53.68 percent. Firefox is essentially unchanged, down 0.04 points to 21.67 percent. Chrome is up 0.59 points to 13.11 percent. Safari is also up, gaining 0.2 points to reach 7.48 percent. Opera dropped 0.3 points to 1.73 percent.</p>
<div style="width: 640px;" class="news-item-figure CenteredImage"><div class="news-item-figure-image" style=""><img src="http://static.arstechnica.com/browsers-june-2011/global-browser-share.png" /></div><div class="news-item-figure-caption"><div class="news-item-figure-caption-byline"><a href="http://netmarketshare.com/">Net Applications</a></div></div></div>
<p>The trends established over the last few months are continuing: Firefox is treading water, while Internet Explorer is losing users, which seem to be being picked up by Chrome. In the past two months, Opera has dropped 0.41 points&#8212;that's a loss representing 20% of its market share. Our own Ryan Paul <a href="http://arstechnica.com/software/reviews/2011/06/hands-on-opera-1150s-new-featherweight-interface-packs-a-punch.ars">liked Opera 11.50</a>, which was released just a couple of days ago, so perhaps this will help turn around a perilous slide.</p>
<p>Looking at individual versions, Internet Explorer 6, 7, and 8 are all down, by 0.18, 0.46, and 1.21 points respectively. Internet Explorer 9 made strong gains, of 1.44 points, but not enough to undo the losses. Internet Explorer 9's gains seem to be occurring at the expense of older versions&#8212;Internet Explorer 8 on Windows 7, versions 7 and 8 on Windows Vista&#8212;rather than making converts of the other browsers.</p>
<div style="width: 640px;" class="news-item-figure CenteredImage"><div class="news-item-figure-image" style=""><img src="http://static.arstechnica.com/browsers-june-2011/internet-explorer-transition.png" /></div><div class="news-item-figure-caption"><div class="news-item-figure-caption-byline"><a href="http://netmarketshare.com/">Net Applications</a></div></div></div>
<p>Internet Explorer 9 is of course at something of a disadvantage, as it won't run on Windows XP. While we <a href="http://arstechnica.com/microsoft/news/2010/04/why-microsoft-did-the-right-thing-in-ditching-xp-for-ie9.ars">agree with the decision to cut Windows XP off</a>, one consequence is that not a single Internet Explorer 6 user can upgrade to Internet Explorer 9. Nor can anyone using Internet Explorer 7 or 8 on Windows XP. If the focus is narrowed from all users to just those using Windows 7, the Internet Explorer 9 situation looks a little more promising. Though Internet Explorer 8, which ships with Windows 7, commands the highest market share, at 38.47 percent of Windows 7 users, Internet Explorer 9 takes second place, at 15.61 percent&#8212;putting it ahead of Firefox 4 and Chrome 12, at 13.74 and 11.60 percent, respectively.</p>
<p>Internet Explorer 9 seems, therefore, to be performing well among users of Microsoft's latest and greatest operating system; it's just that only 27 percent of the global audience is running that platform. Windows XP still commands a slim majority, with a global share of 51 percent. As Windows XP declines and Windows 7 grows, we can expect to see Internet Explorer 9 lifted by this transition.</p>
<div style="width: 640px;" class="news-item-figure CenteredImage"><div class="news-item-figure-image" style=""><img src="http://static.arstechnica.com/browsers-june-2011/firefox-transition.png" /></div><div class="news-item-figure-caption"><div class="news-item-figure-caption-byline"><a href="http://netmarketshare.com/">Net Applications</a></div></div></div>
<p>Firefox versions 3.5 and 3.6 both saw drops last month, by 2.06 and 0.28 points, respectively, and versions 4 and 5 rose by 0.38 and 2.05 points, respectively. This suggests that the transition from "old" Firefox (3.x) to "modern" Firefox (4 and 5) is slowing down; in May, the 3.x versions dropped by an aggregate of more than 4.5 points, with the then-current Firefox 4 picking up all of those users. This month, only around half as many users made the switch. Though "modern" Firefox versions are now used by a majority of Firefox users, it looks like a hard core of "old" users is going to stick around. Over the next few months, we can expect Firefox 3.5 to decline more heavily, as Mozilla intends to push out a patch that will upgrade users to the newest 3.6 version.</p>
<div style="width: 640px;" class="news-item-figure CenteredImage"><div class="news-item-figure-image" style=""><img src="http://static.arstechnica.com/browsers-june-2011/chrome-transition.png" /></div><div class="news-item-figure-caption"><div class="news-item-figure-caption-byline"><a href="http://netmarketshare.com/">Net Applications</a></div></div></div>
<p>Chrome as ever shows rapid migration between versions. Over the course of June, the browser's stable version went from 11 to 12, and the rapid cutover we've grown to expect occurred. However, that transition isn't complete. 1.39 percent of users are on Chrome 10 or older, and it looks like Google's generally seamless automatic upgrades aren't touching these users. The source of these users isn't clear, though there a few plausible explanations. Obviously, some individuals and corporate users may simply have opted to disable the updates. Automatic updating is the default, but it can be turned off. Though this gives these users and enterprises greater control over the browser version they're using, this comes at some risk; Google doesn't have security updates for old versions of Chrome, so these people are using browsers with known exploitable flaws.</p>
<p>Chrome's automatic updating is also dependent on a system service. Though the browser can be installed by non-administrators, installation of the service requires administrator privileges. Unlike Firefox, which checks for and performs updates within the browser itself, Chrome depends on its service to do this task. If the service doesn't exist, updates don't happen.</p>
<p>That's probably not enough to account for every legacy Chrome user, however. To do that, we probably have to look towards the East Asian market. A long-standing feature of various markets in the region, most notably China and South Korea, is the entrenchment of Internet Explorer, variously attributed to legal mandates (especially in South Korea, where until last year a specific ActiveX control was required for online banking) and widespread software piracy making users reluctant to use Windows Update (even though Internet Explorer upgrades are available to pirated copies of the operating system).</p>
<p>To support this market, a range of browsers based on Internet Explorer's rendering engine, but with substantially greater features, sprung up. The <a href="http://data.cnzz.com/main.php?s=brow">most popular</a> of these are <a href="http://se.360.cn/">360 Secure Browser</a> with about 19 percent share of the Chinese market, and <a href="http://ie.sogou.com/">Sogou high speed browser</a>, with a little under 6 percent. Though these browsers originally just used the Trident engine that powers Internet Explorer, recent versions extend this by also embedding Chrome. In so doing, they give their users a choice between a relatively modern Chrome browser engine, and the older Internet Explorer engine needed for compatibility. Conceptually, this is very similar to software like <a href="http://code.google.com/chrome/chromeframe/">Chrome Frame</a>, that allows Internet Explorer users to use Chrome for some browser tabs.</p>
<div style="width: 640px;" class="news-item-figure CenteredImage"><div class="news-item-figure-image" style=""><a href="http://static.arstechnica.com/browsers-june-2011/sogou-ie.png"><img src="http://static.arstechnica.com/browsers-june-2011/thumb-sogou-ie.png" /></a></div><div class="news-item-figure-caption"><div class="news-item-figure-caption-text">Sogou browser running as Internet Explorer</div><div class="news-item-figure-caption-byline">Thanks to Ars reader WJ</div></div></div>
<p>These dual-engine browsers tend to modify Chrome in several ways, one of which is that they exclude Google's automatic update service. They also tend to embed stale versions of Chrome; the current Sogou uses Chrome 6. The result is that users of these browsers, who may well prefer using Chrome for day-to-day browsing, will be stuck with obsolete versions of the browser. And because of the way they're using Chrome, they're out of reach of Google's update system.</p>
<div style="width: 640px;" class="news-item-figure CenteredImage"><div class="news-item-figure-image" style=""><a href="http://static.arstechnica.com/browsers-june-2011/sogou-chrome.png"><img src="http://static.arstechnica.com/browsers-june-2011/thumb-sogou-chrome.png" /></a></div><div class="news-item-figure-caption"><div class="news-item-figure-caption-text">Sogou browser using its embedded Chrome</div><div class="news-item-figure-caption-byline">Thanks to Ars reader WJ</div></div></div>
<p>The net result of these various usage scenarios is that Chrome's non-upgrading userbase is likely to grow ever larger, with ten percent of Chrome users, and climbing, sticking with versions of the browser that are no longer supported.</p>
<div style="width: 640px;" class="news-item-figure CenteredImage"><div class="news-item-figure-image" style=""><img src="http://static.arstechnica.com/browsers-june-2011/ars-browser-share.png" /></div><div class="news-item-figure-caption"><div class="news-item-figure-caption-byline">Ars Technica</div></div></div>
<p>Ars' audience continues to show marked differences from the Internet's norms. Firefox, Safari, Internet Explorer, and Opera all saw drops, of 0.94, 0.37, 0.04, and 0.10 points respectively; Chrome saw gains of 0.88 points, with the remainder of the difference picked up by "other."</p>
</div>
<!-- Article Pager -->
</div>
<noscript>
<img style="position: absolute; bottom: 0px; right: 0px; width: 1px; height: 1px;" src="http://arstechnica.com/dragons/brains.gif?id=51247&amp;1396906973" alt="" />
</noscript>
<script type="text/javascript">
document.write('<img style="position: absolute; bottom: 0px; right: 0px; width: 1px; height: 1px;" src="http://arstechnica.com/dragons/brains.gif?id=51247&amp;' + (parseInt(Math.random()*99999999, 10)).toString() + '" alt="" />');
</script>
<!--googleoff: all-->
<div id="comments-bar" class="with-bubble">
<h2>User comments</h2>
<div class="comments-link">
<a name="comments-bar" rel="nofollow" href="/web/news/2011/07/june-browser-stats-rapid-release-edition.ars?comments=1#comments-bar">Click here to view the 81 comments on this story</a>
</div>
</div>
<div id="hiddencomment"></div>
<!--<div id="alert"><p><img src="http://arstechnica.com/civis/images/smilies/flail.gif" /> We're making some updates to the commenting system. We should have the kinks worked out soon.</p></div>-->
<!--googleon: all-->
<div id="links-bar">
<ul>
<li class="facebook">
<iclint src="http://www.facebook.com/plugins/like.php?href=http%3A%2F%2Farstechnica.com%2Fweb%2Fnews%2F2011%2F07%2Fjune-browser-stats-rapid-release-edition.ars&amp;layout=button_count&amp;show_faces=false&amp;width=85&amp;action=like&amp;font=arial&amp;colorscheme=light&amp;height=21" scrolling="no" frameborder="0" style="border:none; overflow:hidden; width:85px; height:21px;" allowTransparency="true"></iclint>
</li>
<li><a href="http://twitter.com/share" class="twitter-share-button" data-url="http://arst.ch/q4c" data-counturl="http://arstechnica.com/web/news/2011/07/june-browser-stats-rapid-release-edition.ars" data-count="horizontal" data-via="arstechnica" data-related="drpizza:Peter Bright">Tweet</a></li>
<li class="reddit">
<iclint src="http://www.reddit.com/static/button/button1.html?width=120&url=http%3A%2F%2Farstechnica.com%2Fweb%2Fnews%2F2011%2F07%2Fjune-browser-stats-rapid-release-edition.ars&amp;title=June%20Web%20browser%20stats%3A%20Rapid%20Release%20edition&amp;bgcolor=fff&amp;bordercolor=eee" width="120" height="20" scrolling="no" frameborder="0"></iclint>
</li>
<li class="share">
<a class="a2a_dd" href="http://www.addtoany.com/share_save?linkname=June%20Web%20browser%20stats%3A%20Rapid%20Release%20edition&amp;linkurl=http%3A%2F%2Farstechnica.com%2Fweb%2Fnews%2F2011%2F07%2Fjune-browser-stats-rapid-release-edition.ars"><img src="http://static.addtoany.com/buttons/favicon.png" width="16" height="16" border="0" alt="Share/Bookmark" style="display:inline;vertical-align:middle;"/> Share/Email</a>
<script type="text/javascript">
var a2a_linkname="June Web browser stats: Rapid Release edition",
a2a_linkurl="http://arstechnica.com/web/news/2011/07/june-browser-stats-rapid-release-edition.ars",
a2a_onclick=1,
a2a_show_title=1,
a2a_hide_embeds=0,
a2a_num_services=8,
a2a_color_main="989EA3",
a2a_color_border="989EA3",
a2a_color_link_text="FF5B00",
a2a_color_link_text_hover="ffffff",
a2a_track_links='ga',
a2a_prioritize= [
"digg",
"yahoo_buzz",
"stumbleupon",
"instapaper",
"slashdot",
"linkedin",
"delicious",
"google_reader",
"tumblr",
"posterous"
];
var a2a_config = a2a_config || {};
a2a_config.no_3p = 1;
</script>
<style type="text/css">#a2apage_BROWSER { display:none !important; }</style>
</li>
<li class="copypasta copy-pasta-button">Make a correction</li>
</ul>
</div>
<!--googleoff: all-->
<div id="read-more-stories">
<h2>Read more stories</h2>
<div class="story-navigation">
<a href="/gadgets/news/2011/07/amazon-appstore-game-developer-pulls-app-highlights-problems.ars" title="Read the previously published article">&lt; Older Story</a>
|
<a href="/tech-policy/news/2011/07/copyright-troll-righthaven-now-starts-paying-those-it-sued.ars" title="Read the next newest article">Newer Story &gt;</a>
</div>
<!--googleoff: all-->
<script language='JavaScript'>
var OB_langJS = "http://static.arstechnica.net//public/v6/scripts/outbrain.lang_en_ars.js",OBITm = '1306449288604',OB_raterMode = 'singlethumb',OB_recMode = 'strip',OutbrainPermaLink='http://arstechnica.com/web/news/2011/07/june-browser-stats-rapid-release-edition.ars';
if (typeof(OB_Script)!='undefined' ){OutbrainStart();}else{var OB_Script = true,str = unescape("%3Cscript src=\'http://widgets.outbrain.com/OutbrainRater.js\' type=\'text/javascript\'%3E%3C/script%3E");document.write(str);}
</script>
<!--googleon: all-->
</div>
<!--googleon: all-->
</div>
</div>
<!--googleoff: all-->
<div id="sidebar">
<div id="article-links" class="with-divider" style="display:none;">
<ul>
<li class="enlarge-text"><a href="#">Increase text size</a></li>
<li class="shrink-text"><a href="#">Reduce text size</a></li>
<li class="print"><a href="#">Print this story</a></li>
<li class="comment"><a href="/web/news/2011/07/june-browser-stats-rapid-release-edition.ars?comments=1#comments-bar#comments-bar">Leave a comment (81)</a></li>
<li class="copy-pasta-button edit-suggestion" style="display: none;"><a href="#">Make a correction</a></li>
<li class="shorturl"><a rel="nofollow" href="http://arst.ch/q4c">http://arst.ch/q4c</a></li>
</ul>
</div>
<style type="text/css" media="screen">
#gwmdRBfSihEbZa {
height: 250px;
width: 300px;
min-height: 250px;
margin-bottom: 10px;
padding-bottom: 10px;
}
#gwmdRBfSihEbZa.tall {
height: 600px;
}
body.premium-adset #gwmdRBfSihEbZa {
/* height: 600px; */
}
</style>
<abbr></abbr>
<blah></blah>
<abbr></abbr>
<div id="gwmdRBfSihEbZa" class="">
<noscript>
<div id="help-by-subscribing">
<a href="/gadgets/news/2011/07/dual-core-motorola-droid-3-launches-july-14-for-199-on-verizon.ars/2"><img src="/gadgets/news/2011/07/dual-core-motorola-droid-3-launches-july-14-for-199-on-verizon.ars/4" alt="Please subscribe" /></a></div>
</noscript>
<script type="text/javascript">
try {
var ppanel = cnp.ad.create(cnp.ad.refreshable, false);
ppanel.addParameter({'sz':'300x250'});
ppanel.addParameterString('kw=top;kw=june-browser-stats-rapid-release-edition;kw=07;kw=2011;kw=news;kw=web;');
ppanel.load();
} catch(e) {}
</script>
</div>
<div id="journals-box" class="with-divider">
<h2 class="title">Latest Top Stories</h2>
<ul class="category">
<li class="all selected">
<span class="tab-inner">
<a href="/" title="All">All</a>
</span>
</li>
<li class="apple">
<span class="tab-inner">
<a href="/apple/" title="Apple">Apple</a>
</span>
</li>
<li class="gaming">
<span class="tab-inner">
<a href="/gaming/" title="Gaming">Gaming</a>
</span>
</li>
<li class="microsoft">
<span class="tab-inner">
<a href="/microsoft/" title="Microsoft">Microsoft</a>
</span>
</li>
<li class="gadgets">
<span class="tab-inner">
<a href="/gadgets/" title="Gadgets">Gadgets</a>
</span>
</li>
<li class="open-source">
<span class="tab-inner">
<a href="/open-source/" title="Open Source">Open Source</a>
</span>
</li>
<li class="business">
<span class="tab-inner">
<a href="/business/" title="Business">Business</a>
</span>
</li>
<li class="science">
<span class="tab-inner">
<a href="/science/" title="Science">Science</a>
</span>
</li>
<li class="tech-policy">
<span class="tab-inner">
<a href="/tech-policy/" title="Tech Policy">Tech Policy</a>
</span>
</li>
<li class="staff">
<span class="tab-inner">
<a href="/staff/" title="Staff">Staff</a>
</span>
</li>
</ul>
<ul class="stories">
<li id="journal-box-0" class="gadgets">
<a href="/gadgets/news/2011/07/dual-core-motorola-droid-3-launches-july-14-for-199-on-verizon.ars">Dual-core Motorola Droid 3 launches July 14 for $199 on Verizon</a>
</li>
<li id="journal-box-1" class="tech-policy">
<a href="/tech-policy/news/2011/07/major-isps-agree-to-six-strikes-copyright-enforcement-plan.ars">Major ISPs agree to "six strikes" copyright enforcement plan</a>
</li>
<li id="journal-box-2" class="gaming">
<a href="/gaming/news/2011/07/sony-to-include-mandatory-psn-pass-codes-in-first-party-games.ars">Sony to include one-time use "PSN Pass" code in its games</a>
</li>
<li id="journal-box-3" class="gaming">
<a href="/gaming/news/2011/07/journey-turns-strangers-to-friends-in-odd-desolate-landscape.ars"><em>Journey</em> turns strangers into friends in odd, desolate landscape</a>
</li>
<li id="journal-box-4" class="science">
<a href="/science/news/2011/07/is-science-getting-harder-first-define-easy.ars">Is scientific progress slowing? Depends how you measure it</a>
</li>
<li id="journal-box-5" class="tech-policy">
<a href="/tech-policy/news/2011/07/did-the-titanic-disaster-let-uncle-sam-take-over-the-airwaves.ars">How the <em>Titanic</em> disaster pushed Uncle Sam to "rule the air"</a>
</li>
<li id="journal-box-6" class="web">
<a href="/web/news/2011/07/facebook-video-chatting-handy-definitely-not-awesome.ars">Analysis: Facebook video chatting handy, definitely not "awesome"</a>
</li>
<li id="journal-box-7" class="tech-policy">
<a href="/tech-policy/news/2011/07/dozens-of-law-professors-protect-ip-act-is-unconstitutional.ars">Dozens of law professors: PROTECT IP Act is unconstitutional</a>
</li>
<li id="journal-box-8" class="tech-policy">
<a href="/tech-policy/news/2011/07/should-net-neutrality-protect-third-party-mobile-tethering-apps.ars">Does net neutrality protect mobile tethering apps?</a>
</li>
<li id="journal-box-9" class="apple">
<a href="/apple/news/2011/07/wsj-next-iphone-to-be-thinner-and-lighter-than-iphone-4.ars">WSJ: next iPhone to be "thinner and lighter" than iPhone 4</a>
</li>
<li id="journal-box-10" class="apple">
<a href="/apple/news/2011/07/iphone-users-spend-147-hours-a-month-playing-games.ars">iPhone users spend 14.7 hours a month playing games</a>
</li>
<li id="journal-box-11" class="tech-policy">
<a href="/tech-policy/news/2011/07/copyright-troll-righthaven-now-starts-paying-those-it-sued.ars">Copyright troll Righthaven now starts paying those it sued</a>
</li>
<li id="journal-box-12" class="web">
<a href="/web/news/2011/07/june-browser-stats-rapid-release-edition.ars">June Web browser stats: Rapid Release edition</a>
</li>
<li id="journal-box-13" class="gadgets">
<a href="/gadgets/news/2011/07/amazon-appstore-game-developer-pulls-app-highlights-problems.ars">Amazon Appstore problems: why one developer pulled its game</a>
</li>
<li id="journal-box-14" class="science">
<a href="/science/news/2011/07/ocean-sediment-promising-source-of-rare-earth-metals.ars">Why ocean mud might matter to your future iPhone</a>
</li>
</ul>
</div>
<div class="with-divider" id="fb">
<iclint src="http://www.facebook.com/plugins/likebox.php?href=http%3A%2F%2Ffacebook.com%2Farstechnica&amp;width=300&amp;colorscheme=light&amp;show_faces=false&amp;stream=false&amp;header=false&amp;height=62&amp;border_color=%23FFFFFF" scrolling="no" frameborder="0" style="border:none; overflow:hidden; width:300px; height:62px;" allowTransparency="true"></iclint>
<iclint src="http://www.facebook.com/plugins/activity.php?site=arstechnica.com&amp;width=300&amp;height=370&amp;header=false&amp;colorscheme=light&amp;recommendations=false&amp;border_color=%23FFFFFF" scrolling="no" frameborder="0" style="border:none; overflow:hidden; width:300px; height:370px;" allowTransparency="true"></iclint>
<p><a href="#" class="anonymous">Disable Facebook on Ars</a></p>
</div>
<style type="text/css" media="screen">
#mieBfNdjZYK {
height: 250px;
width: 300px;
min-height: 250px;
margin-bottom: 10px;
padding-bottom: 10px;
}
#mieBfNdjZYK.tall {
height: 600px;
}
body.premium-adset #mieBfNdjZYK {
/* height: 600px; */
}
</style>
<kjaskjas></kjaskjas>
<blah></blah>
<sakjasd></sakjasd>
<div></div>
<kjaskjas></kjaskjas>
<div></div>
<span></span>
<clint></clint>
<div id="mieBfNdjZYK" class="">
<noscript>
<div id="help-by-subscribing">
<a href="/gadgets/news/2011/07/dual-core-motorola-droid-3-launches-july-14-for-199-on-verizon.ars/2"><img src="/gadgets/news/2011/07/dual-core-motorola-droid-3-launches-july-14-for-199-on-verizon.ars/4" alt="Please subscribe" /></a></div>
</noscript>
<script type="text/javascript">
try {
var ppanel = cnp.ad.create(cnp.ad.refreshable, false);
ppanel.addParameter({'sz':'300x250'});
ppanel.addParameterString('kw=bottom;kw=june-browser-stats-rapid-release-edition;kw=07;kw=2011;kw=news;kw=web;');
ppanel.load();
} catch(e) {}
</script>
</div>
<div id="jobs-ars" class="with-divider">
<h2 class="title">
<span class="title">Job.Ars</span>:
<span class="subtitle">looking for a new job?</span>
</h2>
<div class="body">
<ul>
<div id="jobs-ars-content">
<ul>
<li>
<div class="job-title"><a href="//jobs.arstechnica.com/list/1027/">Software Engineer</a> at minerva-associates.com</div>
<div class="job-location">San Diego, CA</div>
</li>
<li>
<div class="job-title"><a href="//jobs.arstechnica.com/list/1026/">Software Engineer</a> at minerva-associates.com</div>
<div class="job-location">San Diego, CA</div>
</li>
<li>
<div class="job-title"><a href="//jobs.arstechnica.com/list/1025/">Senior Java / Scala Developer - Sequencing Informatics </a> at The Broad Institute</div>
<div class="job-location">Cambridge, MA</div>
</li>
<li>
<div class="job-title"><a href="//jobs.arstechnica.com/list/1024/">Senior Java / Scala Developer - Sequencing Informatics </a> at The Broad Institute</div>
<div class="job-location">Cambridge, MA</div>
</li>
<li>
<div class="job-title"><a href="//jobs.arstechnica.com/list/1022/">Web Developer for Online Organizing Incubator</a> at Citizen Engagement Laboratory</div>
<div class="job-location">San Francisco Bay Area required</div>
</li>
<li>
<div class="job-title"><a href="//jobs.arstechnica.com/list/1021/">.NET Developer (Oklahoma City &amp; Salt Lake City) </a> at a la mode, inc.</div>
<div class="job-location">Oklahoma City and Salt Lake City</div>
</li>
<li>
<div class="job-title"><a href="//jobs.arstechnica.com/list/1019/">Senior Systems Administrator</a> at Synacor</div>
<div class="job-location">Buffalo, NY</div>
</li>
<li>
<div class="job-title"><a href="//jobs.arstechnica.com/list/1018/">Network Engineer</a> at Box.net</div>
<div class="job-location">Palo Alto, CA</div>
</li>
<li>
<div class="job-title"><a href="//jobs.arstechnica.com/list/1017/">Software Engineer - Operations</a> at imo</div>
<div class="job-location">Palo Alto, CA</div>
</li>
<li>
<div class="job-title"><a href="//jobs.arstechnica.com/list/1016/">Software Engineer</a> at imo</div>
<div class="job-location">Palo Alto, CA</div>
</li>
</ul>
<div id="more-jobs"><a href="//jobs.arstechnica.com">More Job Listings</a></div>
</div> </ul>
</div>
</div>
</div>
<!--googleon: all-->
</div>
<div id="footer">
<div id="slogan">Serving the technologist for <span id="decades">1</span> &#x00d7; 10<sup>-1</sup> centuries</div>
<iframe src="http://static.arstechnica.net//public/v6/footer.html?1309476727" frameborder="0" scrolling="no" width="1000" height="350"></iframe>
</div>
</div>
<script type="text/javascript">
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-31997-1']);
_gaq.push(['_trackPageview']);
_gaq.push(['_trackPageLoadTime']);
(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
</script>
<script type="text/javascript">
var page_class = 'individual',
site_root = "",
site_root_rel = '/',
discussion_url = "",
entry_author = {
"peter bright":true,
"peter bright":true,
"drpizza":true
},
entry_id = 51247,
fp_layout = 'normal',
syntaxhighlighter = "http://arstechnica.com/public/full/scripts/syntaxhighlighter.js",
new_comments = true,
disable_fb = 'false';
</script>
<script src="http://static.arstechnica.net//public/v6/scripts/site.min.js?1309476727" type="text/javascript" charset="utf-8"></script>
<noscript>
<img src="http://b.scorecardresearch.com/b?c1=2&c2=6035094&c3=&c4=&c5=&c6=&c15=&cv=1.3&cj=1" style="position:absolute; bottom: 0px; right:0px;"
width="1" height="1" alt="" />
</noscript>
<span style="display: none" id="ArsTechnicaNews" class="hslice">
<span style="display: none" class="entry-title">Ars Technica News</span>
<a style="display: none" href="http://www.ieaddons.com/en/ie8slice/Content.ashx?id=330" rel="entry-content"></a>
</span>
</body>
</html>

@ -0,0 +1,53 @@
<div id="article"><div id="" class="body">
<p>June brought the first result of Mozilla's new Rapid Release strategy for Firefox. Firefox 4, just three months old, was superceded by the all-new but not-too-different <a href="http://arstechnica.com/open-source/news/2011/06/firefox-5-released-arrives-only-three-months-after-firefox-4.ars">Firefox 5</a>. Firefox's market growth was all but ended by the release of Chrome, and Mozilla is hoping that by adopting a similar release schedule to Google, it will be able to reignite the growth of its user base.</p>
<p>Internet Explorer is down 0.59 points at 53.68 percent. Firefox is essentially unchanged, down 0.04 points to 21.67 percent. Chrome is up 0.59 points to 13.11 percent. Safari is also up, gaining 0.2 points to reach 7.48 percent. Opera dropped 0.3 points to 1.73 percent.</p>
<div class="news-item-figure CenteredImage"><div class="news-item-figure-image"><img src="http://static.arstechnica.com/browsers-june-2011/global-browser-share.png"/></div></div>
<p>The trends established over the last few months are continuing: Firefox is treading water, while Internet Explorer is losing users, which seem to be being picked up by Chrome. In the past two months, Opera has dropped 0.41 points&#8212;that's a loss representing 20% of its market share. Our own Ryan Paul <a href="http://arstechnica.com/software/reviews/2011/06/hands-on-opera-1150s-new-featherweight-interface-packs-a-punch.ars">liked Opera 11.50</a>, which was released just a couple of days ago, so perhaps this will help turn around a perilous slide.</p>
<p>Looking at individual versions, Internet Explorer 6, 7, and 8 are all down, by 0.18, 0.46, and 1.21 points respectively. Internet Explorer 9 made strong gains, of 1.44 points, but not enough to undo the losses. Internet Explorer 9's gains seem to be occurring at the expense of older versions&#8212;Internet Explorer 8 on Windows 7, versions 7 and 8 on Windows Vista&#8212;rather than making converts of the other browsers.</p>
<div class="news-item-figure CenteredImage"><div class="news-item-figure-image"><img src="http://static.arstechnica.com/browsers-june-2011/internet-explorer-transition.png"/></div></div>
<p>Internet Explorer 9 is of course at something of a disadvantage, as it won't run on Windows XP. While we <a href="http://arstechnica.com/microsoft/news/2010/04/why-microsoft-did-the-right-thing-in-ditching-xp-for-ie9.ars">agree with the decision to cut Windows XP off</a>, one consequence is that not a single Internet Explorer 6 user can upgrade to Internet Explorer 9. Nor can anyone using Internet Explorer 7 or 8 on Windows XP. If the focus is narrowed from all users to just those using Windows 7, the Internet Explorer 9 situation looks a little more promising. Though Internet Explorer 8, which ships with Windows 7, commands the highest market share, at 38.47 percent of Windows 7 users, Internet Explorer 9 takes second place, at 15.61 percent&#8212;putting it ahead of Firefox 4 and Chrome 12, at 13.74 and 11.60 percent, respectively.</p>
<p>Internet Explorer 9 seems, therefore, to be performing well among users of Microsoft's latest and greatest operating system; it's just that only 27 percent of the global audience is running that platform. Windows XP still commands a slim majority, with a global share of 51 percent. As Windows XP declines and Windows 7 grows, we can expect to see Internet Explorer 9 lifted by this transition.</p>
<div class="news-item-figure CenteredImage"><div class="news-item-figure-image"><img src="http://static.arstechnica.com/browsers-june-2011/firefox-transition.png"/></div></div>
<p>Firefox versions 3.5 and 3.6 both saw drops last month, by 2.06 and 0.28 points, respectively, and versions 4 and 5 rose by 0.38 and 2.05 points, respectively. This suggests that the transition from "old" Firefox (3.x) to "modern" Firefox (4 and 5) is slowing down; in May, the 3.x versions dropped by an aggregate of more than 4.5 points, with the then-current Firefox 4 picking up all of those users. This month, only around half as many users made the switch. Though "modern" Firefox versions are now used by a majority of Firefox users, it looks like a hard core of "old" users is going to stick around. Over the next few months, we can expect Firefox 3.5 to decline more heavily, as Mozilla intends to push out a patch that will upgrade users to the newest 3.6 version.</p>
<div class="news-item-figure CenteredImage"><div class="news-item-figure-image"><img src="http://static.arstechnica.com/browsers-june-2011/chrome-transition.png"/></div></div>
<p>Chrome as ever shows rapid migration between versions. Over the course of June, the browser's stable version went from 11 to 12, and the rapid cutover we've grown to expect occurred. However, that transition isn't complete. 1.39 percent of users are on Chrome 10 or older, and it looks like Google's generally seamless automatic upgrades aren't touching these users. The source of these users isn't clear, though there a few plausible explanations. Obviously, some individuals and corporate users may simply have opted to disable the updates. Automatic updating is the default, but it can be turned off. Though this gives these users and enterprises greater control over the browser version they're using, this comes at some risk; Google doesn't have security updates for old versions of Chrome, so these people are using browsers with known exploitable flaws.</p>
<p>Chrome's automatic updating is also dependent on a system service. Though the browser can be installed by non-administrators, installation of the service requires administrator privileges. Unlike Firefox, which checks for and performs updates within the browser itself, Chrome depends on its service to do this task. If the service doesn't exist, updates don't happen.</p>
<p>That's probably not enough to account for every legacy Chrome user, however. To do that, we probably have to look towards the East Asian market. A long-standing feature of various markets in the region, most notably China and South Korea, is the entrenchment of Internet Explorer, variously attributed to legal mandates (especially in South Korea, where until last year a specific ActiveX control was required for online banking) and widespread software piracy making users reluctant to use Windows Update (even though Internet Explorer upgrades are available to pirated copies of the operating system).</p>
<p>To support this market, a range of browsers based on Internet Explorer's rendering engine, but with substantially greater features, sprung up. The <a href="http://data.cnzz.com/main.php?s=brow">most popular</a> of these are <a href="http://se.360.cn/">360 Secure Browser</a> with about 19 percent share of the Chinese market, and <a href="http://ie.sogou.com/">Sogou high speed browser</a>, with a little under 6 percent. Though these browsers originally just used the Trident engine that powers Internet Explorer, recent versions extend this by also embedding Chrome. In so doing, they give their users a choice between a relatively modern Chrome browser engine, and the older Internet Explorer engine needed for compatibility. Conceptually, this is very similar to software like <a href="http://code.google.com/chrome/chromeframe/">Chrome Frame</a>, that allows Internet Explorer users to use Chrome for some browser tabs.</p>
<div class="news-item-figure CenteredImage"><div class="news-item-figure-image"><a href="http://static.arstechnica.com/browsers-june-2011/sogou-ie.png"><img src="http://static.arstechnica.com/browsers-june-2011/thumb-sogou-ie.png"/></a></div><div class="news-item-figure-caption"><p class="news-item-figure-caption-text">Sogou browser running as Internet Explorer</p><p class="news-item-figure-caption-byline">Thanks to Ars reader WJ</p></div></div>
<p>These dual-engine browsers tend to modify Chrome in several ways, one of which is that they exclude Google's automatic update service. They also tend to embed stale versions of Chrome; the current Sogou uses Chrome 6. The result is that users of these browsers, who may well prefer using Chrome for day-to-day browsing, will be stuck with obsolete versions of the browser. And because of the way they're using Chrome, they're out of reach of Google's update system.</p>
<div class="news-item-figure CenteredImage"><div class="news-item-figure-image"><a href="http://static.arstechnica.com/browsers-june-2011/sogou-chrome.png"><img src="http://static.arstechnica.com/browsers-june-2011/thumb-sogou-chrome.png"/></a></div><div class="news-item-figure-caption"><p class="news-item-figure-caption-text">Sogou browser using its embedded Chrome</p><p class="news-item-figure-caption-byline">Thanks to Ars reader WJ</p></div></div>
<p>The net result of these various usage scenarios is that Chrome's non-upgrading userbase is likely to grow ever larger, with ten percent of Chrome users, and climbing, sticking with versions of the browser that are no longer supported.</p>
<div class="news-item-figure CenteredImage"><div class="news-item-figure-image"><img src="http://static.arstechnica.com/browsers-june-2011/ars-browser-share.png"/></div></div>
<p>Ars' audience continues to show marked differences from the Internet's norms. Firefox, Safari, Internet Explorer, and Opera all saw drops, of 0.94, 0.37, 0.04, and 0.10 points respectively; Chrome saw gains of 0.88 points, with the remainder of the difference picked up by "other."</p>
</div>
</div>

@ -0,0 +1,2 @@
test_description: standard article from arstechnica
url: http://arstechnica.com/web/news/2011/07/june-browser-stats-rapid-release-edition.ars

File diff suppressed because one or more lines are too long

@ -0,0 +1,11 @@
<div id="article"><div class="comment-content" id="comment-content-4e141229cadcbbb33f050000">
<p class="comment-text">
Yep, you gotta love that almost 90% market share failure. Like I said before, if that's failure than sign me up for some of that. I'm pretty sure the good people over at Apple, Google, etc. would like to be signed up for some of that failure too.<br/><br/>
For the, "If this, if that, (insert scenario)" people, enjoy your new OS and whatever other new software you may choose to use. However, don't be surprised when those metro ui interface imitations start to land on those products too. Did you really think that static grid-icons on a screen was going to last forever? I think 20+ years is enough, it's time for new innovation in design and don't be surprised when the copycats jump on board. That's the way the industry works. One group comes up with a new design or concept and the others tend to follow suit and you don't have to be a market leader to get that following. Just ask the Opera/Chrome developers. That's just one of many, many examples that could be pointed out. The metro ui is a very suitable design for the touch screen world that we're migrating to. Sure, there will be changes and enhancements as time goes on and everyone will put their own spin on it, but I'd get used to similar offerings from MSFT's competitors if I were you.<br/><br/>
Also, for those who like to comment, but seem to have little info about what's expected in things like Windows 8, let me fill you in a bit. The info. out right now is that Windows 8 will let you choose to use the new ui or to use the more, "Windows past" icon ui. I think anyone with some modicum of common sense can see how that would be a wise move from MSFT. For instance: The metro ui may not appeal to the corporate world as much as the consumer world. Plus, it give long-time Window's users the option to stick with what they know, but still gain the newest features and security measures that new OS's tend to bring. So, if your going to use another product, but all means, have fun with it, but don't try to justify it to yourself with reasons that are unlikely to exist. Just say you want to move on and anyone else can respect that, but when you seem to have little knowledge of what your options will be, it just makes you look like the typical sheep some people can be.<br/><br/>
Personally, I love the new direction MSFT is going in and for the first time in years, they seem to be thinking more and more consumer friendly. That's not an easy task for a company who has to appeal to business the way MSFT does and I commend the effort. Believe me, or don't, but Apple, Google and any other group would suffer the same balancing act if they dominated the corporate world the way Microsoft does. Corporate and consumers are very different beasts and it's not always easy to appeal to both, yet Microsoft has kept a large following in both sectors and anyone who doesn't see the skill it takes to do that, has a lot to learn my friends. </p>
</div>
</div>

@ -0,0 +1,3 @@
test_description: businessinsider article
notes: missed the article completely; got a long comment instead
url: http://www.businessinsider.com/where-windows-8-came-from-microsoft-ui-ideas-that-never-took-off-2011-7

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -0,0 +1,2 @@
test_description: cnet article
url: http://howto.cnet.com/8301-11310_39-20078249-285/best-free-alternatives-to-top-selling-software/?tag=epicStories

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -0,0 +1,2 @@
test_description: deadspin article
url: http://deadspin.com/5820463/would-you-kill-a-stranger-to-save-football

File diff suppressed because one or more lines are too long

@ -0,0 +1,31 @@
<div id="article"><div class="mod-article-title">
<div class="datehead"><span class="page-actions">
<p id="fb-root"/><p class="date"><span>Updated: </span>July 12, 2011, 4:52 PM ET</p>
</span></div>
<p class="headline">
</p><h1 class="h2">Roger Clemens' defense sets strategy</h1>
</div>
<div><p>
WASHINGTON -- <a href="http://espn.go.com/mlb/player/_/id/1427/roger-clemens">Roger Clemens</a>' attorney revealed Tuesday that the ex-baseball star plans to begin his defense against charges of lying to Congress by questioning if the lawmakers' investigation into whether he used performance-enhancing drugs was proper.</p><p>Clemens attorney Michael Attanasio said in court that the hearing the House Oversight and Government Reform Committee held in February 2008 had nothing to do with Congress' responsibility for legislation. He said the hearing was only concerned with airing a "credibility contest" between Clemens and his longtime trainer, Brian McNamee, who said he injected the pitcher with steroids and human growth hormone.</p><p/><div class="mod-container mod-inline content-box mod-podcast floatright mod-no-header-footer">
<div class="mod-content"><h4>Mike and Mike in the Morning</h4><p class="podcast-player"/>
<p>ESPN legal analyst Roger Cossack explains what is going on with the Roger Clemens trial.</p>
<p class="footer clear"><a href="http://espn.go.com/espnradio/podcast/"> More Podcasts &#187;</a></p></div></div>
<p>Clemens denied those allegations and has been charged with perjury, false statements and obstruction of Congress. The obstruction count charges Clemens with making 15 false or misleading statements to the committee, including his repeated denials he didn't take performance-enhancing drugs during his 24-season career and even whether he attended a 1998 pool party at then-<a href="http://espn.go.com/mlb/team/_/name/tor/toronto-blue-jays">Toronto Blue Jays</a> teammate Jose Canseco's home in Miami.</p><p>McNamee says he saw Clemens and admitted steroids user Canseco talking at the party with another man and that after they returned to Canada, Clemens asked McNamee to inject him with steroids for the first time. </p><p>
Clemens and Canseco say Clemens was never at the party but was golfing at the time. Attanasio said that dispute suggests how improper the whole inquiry was and that jurors should be able to determine whether a "he said, he said debate" between Clemens and McNamee was a legitimate congressional concern.</p><p>"We're going to have a mini-trial on whether Roger Clemens went swimming," Attanasio said. "We're going to have a trial in U.S. District Court, Congress is going to have a hearing on these things? That's our point."</p><p>Assistant U.S. attorney Daniel Butler responded that the committee has responsibility for oversight that is broad and goes beyond legislation. He said steroids in baseball is a drug matter and pointed out that a 2005 hearing into the issue led to legislation to regulate steroids and triggered Major League Baseball to commission a report by former Sen. George Mitchell into the extent of the problem in the league.</p><p/><div class="mod-container mod-no-footer mod-inline content-box floatright mod-no-header-footer">
<div class="mod-content"><h4>Follow the trial</h4>
<img class="io-img" src="http://a.espncdn.com/photo/2010/0116/quinn_tj_m.jpg" border="0"/><p>ESPN's T.J. Quinn will provide live coverage from the courtroom during the Clemens trial. Follow along with our up-to-the-minute <a href="http://twitter.com/#!/TJQuinnESPN" target="_blank"><b>Twitter coverage</b></a>.<br/>
&#8226;&#160; <b><a href="http://espn.go.com/photo/preview/!pdfs/espn_voir_dire_questions.pdf">Voir dire questions</a></b>
</p></div>
</div><p>The Mitchell report was released in December 2007 and named Clemens and 85 other current and former ballplayers as using drugs. Clemens denied the allegations and Butler pointed out that leaders of the House committee said they needed to investigate Clemens' denials to determine what weight to give the Mitchell report and its recommendations.</p><p>Attanasio argued that if the committee's purpose was to come full circle on the Mitchell report, it had done so with a January 2008 hearing featuring testimony by Mitchell, baseball commissioner Bud Selig and former players union director Donald Fehr.</p><p>"That ship had left. That work was done. And now it becomes a question between Mr. Clemens and Mr. McNamee," Attanasio said.</p><p>But U.S. District Judge Reggie Walton said if "one of the icons of baseball" was taking exception to the Mitchell report, "it seems to me that Congress has the authority to hold hearings to determine which view is correct."</p><p>Attanasio said the issue will be addressed in testimony from the first two witnesses prosecutors plan to call after opening arguments Wednesday morning. He said the first will be retired House Parliamentarian Charles Johnson, followed by Phil Barnett, who was chief counsel for the committee at the time it investigated Clemens.</p><p>The dispute over the committee's proper role came as Walton considered what preliminary instructions to give the jury, which was seated Tuesday afternoon after 3&#189; days of screening potential members.</p><p>The jury of 10 women and two men includes a woman whose cousin, former outfielder Al Bumbry, was a coach for the <a href="http://espn.go.com/mlb/team/_/name/bos/boston-red-sox">Boston Red Sox</a> when Clemens played for the team. 
Another woman on the jury said she believes <a href="http://espn.go.com/nfl/team/_/name/phi/philadelphia-eagles">Philadelphia Eagles</a> quarterback <a href="http://sports.espn.go.com/nfl/players/profile?playerId=2549">Michael Vick</a> was "done wrong" in his criminal conviction in connection with dogfighting.</p><p>Four other people were seated as alternate jurors in case any of the 12 can't serve.</p><p>Prosecutors and Clemens' defense team removed 20 people from the pool of 36 jurors, offering no public explanation for their decisions.</p><p>Clemens' attorney pressed potential jurors not to hold it against Clemens if he chooses not to testify, his strongest hint yet that the ex-pitcher might not take the stand.</p><p>Walton also said he was upset to read a New York Daily News item that members of Clemens' family have been criticizing McNamee and other government witnesses on Twitter and elsewhere online. The judge has a gag order on parties involved in the case, but he said he doesn't have any authority over anyone who isn't before him and hopes that those that are were not involved. </p><p>Clemens' attorney Rusty Hardin said he would look into it but that it's been "extremely difficult" for Clemens' family to see harsh criticisms of the baseball star online and in the media and not be able to respond.</p><p><i>Information from The Associated Press was used in this report.</i>
</p>
</div>
</div>

@ -0,0 +1,2 @@
test_description: espn article
url: http://sports.espn.go.com/mlb/news/story?id=6760720

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -0,0 +1,3 @@
test_description: mit news article
notes: links are broken out into paragraph divs
url: http://web.mit.edu/newsoffice/2011/compare-recommendation-systems-0708.html

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -0,0 +1,2 @@
test_description: nytimes article
url: http://thecaucus.blogs.nytimes.com/2011/07/12/mcconnell-proposal-gives-obama-power-to-increase-debt-limit/?hp

@ -0,0 +1,9 @@
test_description: multi-page article from nytimes
enabled: false
notes: multi-page not yet implemented
url: http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html
url_map:
http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=2: nytimes-000-orig-2.html
http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=3: nytimes-000-orig-3.html
http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=4: nytimes-000-orig-4.html
http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=5: nytimes-000-orig-5.html

File diff suppressed because it is too large Load Diff

@ -0,0 +1,6 @@
<div id="article"><article><p>Put another way, Democrats reacted to the &#8220;grand bargain&#8221; proposed by President Obama and House Speaker John Boehner by squawking, complaining and highlighting elements they didn&#8217;t like. This is known throughout the world as the way to begin a process of negotiation.</p><p>Republicans, by contrast, answered with a definitive &#8220;no&#8221; and then covered their ears. Given the looming Aug. 2 deadline for default if the debt ceiling is not raised, the proper term for this approach is blackmail.</p><p>Yet the &#8220;both sides are to blame&#8221; narrative somehow gained currency after <a href="http://www.washingtonpost.com/business/economy/boehner-abandons-efforts-to-reach-comprehensive-debt-reduction-deal/2011/07/09/gIQARUJ55H_story.html">Boehner announced Saturday</a> that House Republicans would not support any increase in revenue, period. A false equivalence was drawn between the absolute Republican rejection of &#8220;revenue-positive&#8221; tax reform and the less-than-absolute Democratic opposition to &#8220;benefit cuts&#8221; in Medicare and Social Security.</p><p>The bogus story line is that the radical right-wing base of the GOP and the radical left-wing base of the Democratic Party are equally to blame for sinking the deal. </p><p>Leave aside, for the moment, the fact that in the Obama-Boehner proposal, there would be roughly three dollars&#8217; worth of budget cuts for every dollar of new revenue. Don&#8217;t pause to ask whether it makes sense to slash government spending when the economy is still sputtering out of the worst recession in decades. Instead, focus narrowly on the politics of the deal.</p><p>It is true that House Minority Leader Nancy Pelosi howled like a blindsided politician when she learned that entitlement programs were on the table. 
But her objections &#8212; and those of Democrats in general &#8212; are philosophical and tactical, not absolute.</p><p>Progressives understand that Medicare and Social Security are not sustainable on their current trajectories; in the long term, both must have their revenue and costs brought into balance. Pelosi&#8217;s position is that each program should be addressed with an eye toward sustainability &#8212; not as a part of a last-minute deal for a hike in the debt ceiling that covers us for two or three years.</p><p>It&#8217;s also true that Democrats believe they can win back a passel of House seats next year by highlighting the GOP plan to convert Medicare into a voucher program. They don&#8217;t want Republicans to be able to point and say, &#8220;See, the Democrats want to cut Medicare, too.&#8221;</p><p>There&#8217;s nothing in these Democratic objections, however, that couldn&#8217;t be creatively finessed. You can claim you haven&#8217;t actually &#8220;cut&#8221; a benefit, for example, if what you&#8217;ve done is restrained the rate at which its cost will grow. You can offset spending with new revenue, and you can do so in a way that gives low-income taxpayers a break. Democrats left the door open and these options could have been explored.</p><p>The story on the Republican side is entirely different. There are ways to finesse a &#8220;no new taxes&#8221; pledge, too. Instead of raising tax rates, you close loopholes in the name of reform; you add an enhancement here, a &#8220;user fee&#8221; there, and you can manage to get the revenue you need and still claim you haven&#8217;t voted to raise taxes.</p><p>But Republicans are taking the position that not a cent of new revenue can be raised, no matter the euphemism. Some Democrats, yes, are being scratchy and cantankerous. But Republicans are refusing to negotiate at all. 
That&#8217;s not the same thing.</p><p>I understand why President Obama, <a href="http://projects.washingtonpost.com/obama-speeches/speech/736/">in his news conference Monday</a>, chided &#8220;each side&#8221; for taking a &#8220;maximalist position.&#8221; For political and practical reasons, it&#8217;s advantageous for him to be seen as an honest broker.</p><p>Meanwhile, though, the clock ticks toward Aug. 2 and the possibility of a catastrophic default becomes more real. And no one should be confused about what the president confronts: On one side, grousing and grumbling. On the other, a brick wall. </p><p>
<i>
<a href="http://live.washingtonpost.com/eugene-robinson-07-12-11.html">Eugene Robinson will be online</a> to chat with readers at 1 p.m. Eastern time Tuesday. <a href="http://live.washingtonpost.com/eugene-robinson-07-12-11.html">Submit your questions</a> before or during the discussion.</i>
</p></article></div>

@ -0,0 +1,2 @@
test_description: washingtonpost.com op-ed
url: http://www.washingtonpost.com/opinions/dont-blame-both-sides-for-debt-impasse/2011/07/11/gIQA0XDg9H_story.html?hpid=z1

@ -0,0 +1,2 @@
*
!.gitignore

@ -39,6 +39,11 @@ REGEXES = {
'tool|widget'), re.I),
'divToPElementsRe': re.compile(
'<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
# Match: next, continue, >, >>, but not >|, as those usually mean last.
'nextLink': re.compile(r'(next|weiter|continue|>[^\|]|$)', re.I),
'prevLink': re.compile(r'(prev|earl|old|new|<)', re.I),
'page': re.compile(r'pag(e|ing|inat)', re.I),
'firstLast': re.compile(r'(first|last)', re.I)
#'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
#'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
#'trimRe': re.compile('^\s+|\s+$/'),
@ -94,6 +99,12 @@ def text_length(i):
return len(clean(i.text_content() or ""))
def tags(node, *tag_names):
    """Lazily yield every descendant of ``node`` with one of ``tag_names``.

    Results are grouped by tag name in the order the names were given; each
    group is whatever ``node.findall('.//<name>')`` returns for that name.
    """
    for wanted in tag_names:
        descendant_path = './/%s' % wanted
        matches = node.findall(descendant_path)
        for match in matches:
            yield match
def clean_segment_extension(segments, index, segment):
if segment.find('.') == -1:
return segment
@ -206,6 +217,120 @@ def find_base_url(url):
return urlparse.urlunsplit(new_parts)
class CandidatePage(object):
    """A link that may point to the next page of a multi-page article.

    Attributes:
        link_text: text of the link; eval_possible_next_page_link appends
            further texts, separated by ' | ', when several links share
            the same href.
        href: the URL the link points to.
        score: running score, starting at 0; eval_possible_next_page_link
            raises or lowers it, and find_next_page_link compares it.
    """

    def __init__(self, link_text, href):
        self.link_text = link_text
        self.href = href
        self.score = 0

    def __repr__(self):
        # Readable representation for debug logging of candidates.
        return '%s(link_text=%r, href=%r, score=%r)' % (
            type(self).__name__,
            self.link_text,
            self.href,
            self.score,
        )
def same_domain(lhs, rhs):
    """Tell whether two URLs can be considered to be on the same host.

    A URL without a network location (for example a relative link) is
    treated as matching any host, so the answer is True whenever either
    side has an empty netloc. Otherwise the two netlocs must be equal.
    """
    left_host = urlparse.urlsplit(lhs).netloc
    right_host = urlparse.urlsplit(rhs).netloc
    if '' in (left_host, right_host):
        return True
    return left_host == right_host
def strip_trailing_slash(s):
    """Return ``s`` with a single '/' removed from its end, if present.

    The match is anchored with '$', so at most one slash is dropped and
    the rest of the string is left untouched.
    """
    trailing_slash = re.compile(r'/$')
    return trailing_slash.sub('', s)
def eval_possible_next_page_link(
        parsed_urls,
        url,
        base_url,
        candidates,
        link
        ):
    """Score one anchor element as a possible "next page" link.

    Links that are clearly not a next page are ignored. Any other link is
    recorded in ``candidates`` (created on first sight, keyed by its href
    with the trailing slash stripped) and its score is adjusted in place.
    Several anchors sharing one href accumulate into the same candidate.

    Arguments:
        parsed_urls: collection of URLs already handled; only membership
            (``in``) is used here.
        url: URL of the page being processed.
        base_url: base form of ``url`` used to judge how closely a link
            resembles the current article.
        candidates: dict mapping href -> CandidatePage; mutated in place.
        link: anchor element; must provide ``get()`` and ``text_content()``.

    Returns None; all results are communicated through ``candidates``.
    """
    raw_href = link.get('href')
    if raw_href is None:
        # An anchor without an href cannot lead to another page.
        return

    href = strip_trailing_slash(raw_href)
    logging.debug('evaluating next page link: %s', href)

    # If we've already seen this page, ignore it.
    if href == base_url or href == url or href in parsed_urls:
        return

    # If it's on a different domain, skip it.
    if not same_domain(url, href):
        logging.debug('rejecting %s: different domain', href)
        return

    # Skip links whose text looks extraneous or is longer than 25
    # characters.
    link_text = clean(link.text_content() or '')
    if REGEXES['extraneous'].search(link_text) or len(link_text) > 25:
        return

    # Whatever remains of the URL once the base URL is removed must
    # contain a digit, otherwise the link is not considered.
    href_leftover = href.replace(base_url, '')
    if not re.search(r'\d', href_leftover):
        return

    if href in candidates:
        candidates[href].link_text += ' | ' + link_text
    else:
        candidates[href] = CandidatePage(link_text, href)
    candidate = candidates[href]

    # Penalize links that do not start with the base URL.
    if not href.startswith(base_url):
        candidate.score -= 25

    link_class_name = link.get('class') or ''
    link_id = link.get('id') or ''
    link_data = ' '.join([link_text, link_class_name, link_id])

    if REGEXES['nextLink'].search(link_data):
        candidate.score += 50

    if REGEXES['page'].search(link_data):
        candidate.score += 25

    # "first"/"last" only counts against the link when the accumulated
    # link text does not also look like a next link.
    if REGEXES['firstLast'].search(link_data):
        if not REGEXES['nextLink'].search(candidate.link_text):
            candidate.score -= 65

    neg_re = REGEXES['negativeRe']
    ext_re = REGEXES['extraneous']
    if neg_re.search(link_data) or ext_re.search(link_data):
        candidate.score -= 50

    # A previous-page hint outweighs every bonus above.
    if REGEXES['prevLink'].search(link_data):
        candidate.score -= 200

    # TODO: Score ancestry.
    # TODO: Score a bunch of other stuff.
def find_next_page_link(parsed_urls, url, elem):
    """Return the href most likely to be the next page, or None.

    Every ``<a>`` below ``elem`` is scored by eval_possible_next_page_link.
    The candidate with the highest score wins, provided that score is at
    least 50; on a tie the candidate encountered first is kept.

    Arguments:
        parsed_urls: collection of URLs already handled. It must support
            ``in`` and ``add``; the chosen href is added to it.
        url: URL of the page being processed.
        elem: element whose descendant anchors are examined.
    """
    base_url = find_base_url(url)

    # candidates is a mapping from URLs to CandidatePage objects that
    # represent information used to determine if a URL points to the next
    # page in the article.
    candidates = {}
    for link in tags(elem, 'a'):
        eval_possible_next_page_link(
            parsed_urls,
            url,
            base_url,
            candidates,
            link
        )

    top_page = None
    # The loop variable gets its own name so that it does not clobber the
    # ``url`` parameter.
    for candidate_url, page in candidates.items():
        logging.debug('next page score of %s: %s', candidate_url, page.score)
        if page.score < 50:
            continue
        if top_page is None or top_page.score < page.score:
            top_page = page

    if top_page is None:
        return None

    parsed_urls.add(top_page.href)
    return top_page.href
class Document:
"""Class to build a etree document out of html."""
TEXT_LENGTH_THRESHOLD = 25
@ -292,9 +417,9 @@ class Document:
while True:
self.html = self._parse(self.input_doc)
for i in self.tags(self.html, 'script', 'style'):
for i in tags(self.html, 'script', 'style'):
i.drop_tree()
for i in self.tags(self.html, 'body'):
for i in tags(self.html, 'body'):
i.set('id', 'readabilityBody')
if ruthless:
self.remove_unlikely_candidates()
@ -434,8 +559,10 @@ class Document:
'min_text_length',
self.TEXT_LENGTH_THRESHOLD)
candidates = {}
#self.debug(str([describe(node) for node in tags(self.html, "div")]))
ordered = []
for elem in self.tags(self.html, "p", "pre", "td"):
for elem in tags(self.html, "p", "pre", "td"):
self.debug('Scoring %s' % describe(elem))
parent_node = elem.getparent()
if parent_node is None:
@ -540,14 +667,14 @@ class Document:
elem.drop_tree()
def transform_misused_divs_into_paragraphs(self):
for elem in self.tags(self.html, 'div'):
for elem in tags(self.html, 'div'):
# transform <div>s that do not contain other block elements into <p>s
if not REGEXES['divToPElementsRe'].search(unicode(''.join(map(tostring, list(elem))))):
self.debug("Altering %s to p" % (describe(elem)))
elem.tag = "p"
#print "Fixed element "+describe(elem)
for elem in self.tags(self.html, 'div'):
for elem in tags(self.html, 'div'):
if elem.text and elem.text.strip():
p = fragment_fromstring('<p/>')
p.text = elem.text
@ -568,15 +695,6 @@ class Document:
#print 'Dropped <br> at '+describe(elem)
child.drop_tree()
def findNextPageLink(self, elem):
allLinks = self.tags(elem, ['a'])
baseUrl = self.find_base_url(self.options['url'])
def tags(self, node, *tag_names):
for tag_name in tag_names:
for e in node.findall('.//%s' % tag_name):
yield e
def reverse_tags(self, node, *tag_names):
for tag_name in tag_names:
for e in reversed(node.findall('.//%s' % tag_name)):
@ -585,13 +703,13 @@ class Document:
def sanitize(self, node, candidates):
MIN_LEN = self.options.get('min_text_length',
self.TEXT_LENGTH_THRESHOLD)
for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
for header in tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
class_weight = self.class_weight(header)
link_density = self.get_link_density(header)
if class_weight < 0 or link_density > 0.33:
header.drop_tree()
for elem in self.tags(node, "form", "iframe", "textarea"):
for elem in tags(node, "form", "iframe", "textarea"):
elem.drop_tree()
allowed = {}
# Conditionally clean <table>s, <ul>s, and <div>s
@ -663,6 +781,26 @@ class Document:
' many <embed>s')
to_remove = True
# if el.tag == 'div' and counts['img'] >= 1 and to_remove:
# imgs = el.findall('.//img')
# valid_img = False
# self.debug(tounicode(el))
# for img in imgs:
#
# height = img.get('height')
# text_length = img.get('text_length')
# self.debug ("height %s text_length %s" %(repr(height), repr(text_length)))
# if to_int(height) >= 100 or to_int(text_length) >= 100:
# valid_img = True
# self.debug("valid image" + tounicode(img))
# break
# if valid_img:
# to_remove = False
# self.debug("Allowing %s" %el.text_content())
# for desnode in tags(el, "table", "ul", "div"):
# allowed[desnode] = True
# don't really understand what this is doing. Originally
# the i/j were =+ which sets the value to 1. I think that
# was supposed to be += which would increment. But then
@ -670,6 +808,8 @@ class Document:
# ever do one loop in each iteration and don't understand
# it. Will have to investigate when we get to testing more
# pages.
#find x non empty preceding and succeeding siblings
i, j = 0, 0
x = 1
@ -694,7 +834,7 @@ class Document:
if siblings and sum(siblings) > 1000:
to_remove = False
self.debug("Allowing %s" % describe(el))
for desnode in self.tags(el, "table", "ul", "div"):
for desnode in tags(el, "table", "ul", "div"):
allowed[desnode] = True
if to_remove:

@ -0,0 +1,15 @@
import os
# Directory containing this module; the fixture folders live next to it.
_HERE = os.path.dirname(__file__)

# Folder of sample files.
SAMPLES = os.path.join(_HERE, 'samples')
# Folder of regression test data files.
REGRESSION_DATA = os.path.join(_HERE, 'test_data')
def load_sample(filename):
    """Helper to get the content out of the sample files.

    ``filename`` is resolved relative to the SAMPLES directory. The whole
    file is read and returned; the file is closed before returning, even
    if reading fails.
    """
    with open(os.path.join(SAMPLES, filename)) as sample_file:
        return sample_file.read()
def load_regression_data(filename):
    """Get the content of a test_data regression file.

    ``filename`` is resolved relative to the REGRESSION_DATA directory.
    The whole file is read and returned; the file is closed before
    returning, even if reading fails.
    """
    with open(os.path.join(REGRESSION_DATA, filename)) as data_file:
        return data_file.read()

@ -0,0 +1,975 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html xmlns:og="http://opengraphprotocol.org/schema/" xmlns:fb="http://www.facebook.com/2008/fbml">
<head>
<title>The Dark Art of Breaking Bad - NYTimes.com</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="description" content="Vince Gilligan doesnt have a twisted, violent mind. But youd never know that from his twisted, brilliant show.">
<meta name="keywords" content="Television,Gilligan Vince,AMC (TV Network),Breaking Bad (TV Program)">
<meta name="ROBOTS" content="NOARCHIVE">
<meta name="DISPLAYDATE" content="July 6, 2011">
<meta name="hdl" content="The Dark Art of Breaking Bad">
<meta name="hdl_p" content="Art Of Darkness">
<meta name="byl" content="By DAVID SEGAL">
<meta name="lp" content="Vince Gilligan doesnt have a twisted, violent mind. But youd never know that from his twisted, brilliant show.">
<meta name="cre" content="The New York Times">
<meta name="edt" content="NewYork">
<meta name="pdate" content="20110706">
<meta name="ttl" content="Breaking Bad (TV Program)">
<meta name="virtloc" content="">
<meta name="des" content="Television">
<meta name="per" content="Gilligan, Vince">
<meta name="org" content="AMC (TV Network)">
<meta name="geo" content="">
<meta name="ticker" content="McDonald's Corporation|MCD|NYSE;Wal-Mart Stores Inc|WMT|NYSE;CBS Corp|CBS|NYSE">
<meta name="misspelling" content="">
<meta name="dat" content="July 6, 2011">
<meta name="tom" content="News">
<meta name="cat" content="">
<meta name="col" content="">
<meta name="dsk" content="Magazine">
<meta name="articleid" content="1248069794500">
<meta name="ARTICLE_TEMPLATE_VERSION" CONTENT="700">
<meta name="hdr_img" content="/images/article/header/sect_magazine.gif">
<meta name="thumbnail" content="images/2011/07/10/magazine/10bad1/10bad1-thumbStandard.jpg">
<meta name="thumbnail_height" content="75">
<meta name="thumbnail_width" content="75">
<meta name="xlarge" content="images/2011/07/10/magazine/10bad_span/10bad_span-articleLarge.jpg">
<meta name="xlarge_height" content="399">
<meta name="xlarge_width" content="600">
<meta name="sectionfront_jsonp" content="http://json8.nytimes.com/pages/magazine/index.jsonp">
<meta name="CG" content="magazine">
<meta name="SCG" content="">
<meta name="PT" content="Article">
<meta name="PST" content="News">
<meta name="msapplication-starturl" content="http://www.nytimes.com/">
<link rel="canonical" href="http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=all">
<meta property="og:url" content="http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=all"/>
<meta property="og:type" content="article"/>
<meta property="og:title" content="The Dark Art of Breaking Bad"/>
<meta property="og:image" content="http://graphics8.nytimes.com/images/2011/07/10/magazine/10bad1/10bad1-thumbStandard.jpg"/>
<link rel="stylesheet" type="text/css" href="http://graphics8.nytimes.com/css/0.1/screen/build/article/2.0/styles.css"><!--[if IE]>
<style type="text/css">
@import url(http://graphics8.nytimes.com/css/0.1/screen/common/ie.css);
</style>
<![endif]-->
<!--[if IE 6]>
<style type="text/css">
@import url(http://graphics8.nytimes.com/css/0.1/screen/common/ie6.css);
</style>
<![endif]-->
<script type="text/javascript" src="http://graphics8.nytimes.com/js/common.js"></script>
<script type="text/javascript" src="http://graphics8.nytimes.com/js/common/screen/DropDown.js"></script>
<script type="text/javascript" src="http://graphics8.nytimes.com/js/util/tooltip.js"></script>
<script type="text/javascript" src="http://graphics8.nytimes.com/js/common/screen/altClickToSearch.js"></script>
<script type="text/javascript" src="http://graphics8.nytimes.com/js/app/article/upNext.js"></script>
<script type="text/javascript" src="http://graphics8.nytimes.com/js/article/articleShare.js"></script>
<script type="text/javascript" src="http://graphics8.nytimes.com/js/article/comments/crnrXHR.js"></script>
<script type="text/javascript" src="http://graphics8.nytimes.com/js/app/article/articleCommentCount.js"></script>
</head>
<body >
<a name="top"></a>
<div id="shell">
<ul id="memberTools">
<li><a id="memberToolsLogin" href="https://myaccount.nytimes.com/auth/login">Log In</a></li>
<li><a href="https://myaccount.nytimes.com/gst/regi.html">Register Now</a></li>
<li><a href="http://www.nytimes.com/membercenter/sitehelp.html">Help</a></li>
</ul>
<div class="tabsContainer">
<ul id="mainTabs" class="tabs">
<li class="first mainTabHome"><a href="http://www.nytimes.com">Home Page</a></li>
<li class="mainTabTodaysPaper"><a href="http://www.nytimes.com/pages/todayspaper/index.html">Today's Paper</a></li>
<li class="mainTabVideo"><a href="http://www.nytimes.com/video">Video</a></li>
<li class="mainTabMostPopular"><a href="http://www.nytimes.com/mostpopular">Most Popular</a></li>
<li class="mainTabTimesTopics"><a href="http://topics.nytimes.com/top/reference/timestopics">Times Topics</a></li>
</ul>
</div>
<script type="text/javascript">
window.setTimeout(function() {
var login = document.getElementById('memberToolsLogin');
if (login) {
login.href += "?URI=" + window.location.href;
}
}, 0)
</script> <div id="page" class="tabContent active">
<div class="clearfix" id="masthead">
<div class="singleAd" id="Middle1C">
<!-- ADXINFO classification="button" campaign="ING_DirectSiteSearchQ111_1694924-nyt1"--><A HREF="http://www.nytimes.com/adx/bin/adx_click.html?type=goto&opzn&page=www.nytimes.com/yr/mo/day/magazine&pos=Middle1C&sn2=870e9fa9/aa4cef7c&sn1=62ffdd2c/afa1df5a&camp=ING_DirectSiteSearchQ111_1694924-nyt1&ad=NEW.88x31.SiteSearch.Wizard&goto=http://ad.doubleclick.net/jump/N3282.nytimes.comSD6440/B3948326.5;sz=88x31;pc=nyt158514A252821;ord=2011.07.14.23.06.54" TARGET="_blank">
<IMG SRC="http://ad.doubleclick.net/ad/N3282.nytimes.comSD6440/B3948326.5;sz=88x31;pc=nyt158514A252821;ord=2011.07.14.23.06.54"
BORDER=0 WIDTH=88 HEIGHT=31
ALT="Click Here"></A>
</div>
<div id="searchWidget">
<div class="inlineSearchControl">
<form enctype="application/x-www-form-urlencoded" action="http://query.nytimes.com/search/sitesearch" method="get" name="searchForm" id="searchForm">
<input type="hidden" value="full" name="date_select"/>
<label for="searchQuery">Search All NYTimes.com</label>
<input type="text" class="text" value="" size="" name="query" id="searchQuery"/>
<input type="hidden" id="searchAll" name="type" value="nyt"/>
<input id="searchSubmit" title="Search" width="22" height="19" alt="Search" type="image" src="http://graphics8.nytimes.com/images/global/buttons/go.gif">
</form>
</div>
</div>
<div id="branding" >
<a href="http://www.nytimes.com"><span id="nytIhtMastheadLogo">
<a href="http://www.nytimes.com"><img src="http://graphics8.nytimes.com/images/misc/nytlogo152x23.gif" alt="New York Times" id="NYTLogo"/></a>
</span></a>
</div>
<h2>
<a href="http://www.nytimes.com/pages/magazine/index.html">Magazine</a>
</h2>
</div>
<div class="navigation tabsContainer">
<ul class="tabs">
<li id="navWorld" class="first ">
<a href="http://www.nytimes.com/pages/world/index.html">World</a>
</li> <li id="navUs" >
<a href="http://www.nytimes.com/pages/national/index.html">U.S.</a>
</li> <li id="navNyregion" >
<a href="http://www.nytimes.com/pages/nyregion/index.html">N.Y. / Region</a>
</li> <li id="navBusiness" >
<a href="http://www.nytimes.com/pages/business/index.html">Business</a>
</li> <li id="navTechnology" >
<a href="http://www.nytimes.com/pages/technology/index.html">Technology</a>
</li> <li id="navScience" >
<a href="http://www.nytimes.com/pages/science/index.html">Science</a>
</li> <li id="navHealth" >
<a href="http://www.nytimes.com/pages/health/index.html">Health</a>
</li> <li id="navSports" >
<a href="http://www.nytimes.com/pages/sports/index.html">Sports</a>
</li> <li id="navOpinion" >
<a href="http://www.nytimes.com/pages/opinion/index.html">Opinion</a>
</li> <li id="navArts" >
<a href="http://www.nytimes.com/pages/arts/index.html">Arts</a>
</li> <li id="navStyle" >
<a href="http://www.nytimes.com/pages/style/index.html">Style</a>
</li> <li id="navTravel" >
<a href="http://www.nytimes.com/pages/travel/index.html">Travel</a>
</li> <li id="navJobs" >
<a href="http://www.nytimes.com/pages/jobs/index.html">Jobs</a>
</li> <li id="navRealestate" >
<a href="http://www.nytimes.com/pages/realestate/index.html">Real Estate</a>
</li> <li id="navAutomobiles" >
<a href="http://www.nytimes.com/pages/automobiles/index.html">Autos</a>
</li></ul>
</div>
<div class="singleAd" id="TopAd">
<!-- ADXINFO classification="leaderboard_728" campaign="Google_2011_ROS_LB"--><div class="clearfix">
<script type="text/javascript" language="JavaScript">
<!--
google_ad_client = 'ca-nytimes_display_html';
google_alternate_ad_url = 'http://www.nytimes.com/ads/remnant/networkredirect-leaderboard.html';
google_ad_width = 728;
google_ad_height = 90;
google_ad_format = '728x90_pas_abgc';
google_ad_type = 'image,flash';
google_encoding = 'utf8';
google_safe = 'high';
google_targeting = 'site_content';
google_ad_channel = 'ROS_leaderboard';
// -->
</script>
<script type="text/javascript" language="JavaScript" src="http://pagead2.googlesyndication.com/pagead/show_ads.js"></script>
<noscript>
<img height="1" width="1" border="0" src="http://pagead2.googlesyndication.com/pagead/imp.gif?client=ca-nytimes_display_html&event=noscript" />
</noscript>
<div style="font-family: Arial; font-size: 10px; color:#004276; float: right; margin-right: 125px;"><a href="http://www.nytimes.whsites.net/mediakit/">Advertise on NYTimes.com</a></div></div>
</div>
<div id="main">
<div class="spanAB wrap closing">
<div id="abColumn" class="abColumn"><!--open abColumn -->
<div id="article">
<!--cur: prev:-->
<div class="columnGroup first">
<h1 class="articleHeadline"><NYT_HEADLINE version="1.0" type=" ">The Dark Art of Breaking Bad</NYT_HEADLINE></h1>
<div class="articleSpanImage"><img src="http://graphics8.nytimes.com/images/2011/07/10/magazine/10bad_span/10bad_span-articleLarge.jpg" width="600" height="399" alt="" border="0">
<div class="credit">Robert Yager for The New York Times</div>
<p class="caption"><strong></strong>Gilligan on the set with the actors Bryan Cranston and Aaron Paul. </p>
</div> <!--[if lt IE 8]>
<script type="text/javascript">
if($$('div.articleSpanImage') != null) {
var articleSpanImage = $$('div.articleSpanImage')[0].getElementsByTagName("img")[0];
var articleSpanImageSrc = articleSpanImage.getAttribute('src');
articleSpanImage.setAttribute('src',"http://graphics8.nytimes.com/images/global/backgrounds/transparentBG.gif");
var filter = "progId:DXImageTransform.Microsoft.AlphaImageLoader(src='"+articleSpanImageSrc+"', sizingMethod='scale' )";
articleSpanImage.style.filter = filter;
}
</script>
<![endif]-->
<NYT_BYLINE > <h6 class="byline">By <a rel="author" href="http://topics.nytimes.com/top/reference/timestopics/people/s/david_segal/index.html?inline=nyt-per" title="More Articles by David Segal" class="meta-per">DAVID SEGAL</a></h6>
</NYT_BYLINE>
<h6 class="dateline">Published: July 6, 2011 </h6>
<script type="text/javascript">
var articleToolsShareData = {"url":"http:\/\/www.nytimes.com\/2011\/07\/10\/magazine\/the-dark-art-of-breaking-bad.html","headline":"The Dark Art of \u2018Breaking Bad\u2019","description":"Vince Gilligan doesn\u2019t have a twisted, violent mind. But you\u2019d never know that from his twisted, brilliant show.","keywords":"Television,Gilligan Vince,AMC (TV Network)","section":"magazine","sub_section":null,"section_display":"Magazine","sub_section_display":null,"byline":"By <a rel=\"author\" href=\"http:\/\/topics.nytimes.com\/top\/reference\/timestopics\/people\/s\/david_segal\/index.html?inline=nyt-per\" title=\"More Articles by David Segal\" class=\"meta-per\">DAVID SEGAL<\/a>","pubdate":"July 6, 2011","passkey":null};
function getShareURL() {
return encodeURIComponent(articleToolsShareData.url);
}
function getShareHeadline() {
return encodeURIComponent(articleToolsShareData.headline);
}
function getShareDescription() {
return encodeURIComponent(articleToolsShareData.description);
}
function getShareKeywords() {
return encodeURIComponent(articleToolsShareData.keywords);
}
function getShareSection() {
return encodeURIComponent(articleToolsShareData.section);
}
function getShareSubSection() {
return encodeURIComponent(articleToolsShareData.sub_section);
}
function getShareSectionDisplay() {
return encodeURIComponent(articleToolsShareData.section_display);
}
function getShareSubSectionDisplay() {
return encodeURIComponent(articleToolsShareData.sub_section_display);
}
function getShareByline() {
return encodeURIComponent(articleToolsShareData.byline);
}
function getSharePubdate() {
return encodeURIComponent(articleToolsShareData.pubdate);
}
function getSharePasskey() {
return encodeURIComponent(articleToolsShareData.passkey);
}
</script>
<div id="articleToolsTop" class="articleTools">
<div class="box">
<div class="inset">
<ul id="toolsList" class="toolsList wrap">
<li class="comments"><a onClick="javascript:dcsMultiTrack('DCS.dcssip','www.nytimes.com','DCS.dcsuri','/article comments/view-tools.html','WT.ti','Article Comments View Tools','WT.z_aca','Tools-View','WT.gcom','Com');" href="http://community.nytimes.com/comments/www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html" >comments <span id="commentCount"></span></a></li>
<li class="email">
<a id="emailThis" onClick="s_code_linktrack('Article-Tool-EmailSignIn');"
href="http://www.nytimes.com/auth/login?URI=http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html">Sign In to E-Mail</a>
</li>
<li class="print">
<A HREF="/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?_r=1&pagewanted=print">Print</a>
</li>
<li class="singlePage">
<A HREF="/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?_r=1&pagewanted=all"> Single Page</a>
</li>
<NYT_REPRINTS_FORM>
<script name="javascript">
function submitCCCForm(){
var PopUp = window.open('', '_Icon','location=no,toolbar=no,status=no,width=650,height=550,scrollbars=yes,resizable=yes');
var form = document.forms["cccform"];
// ensure that we are operating on the Form, not a NodeList
if (form.nodeName == "FORM") {
form.submit();
} else if (form[0] && form[0].nodeName == "FORM") {
form[0].submit();
}
}
</script>
<li class="reprints"> <form name="cccform" action="https://s100.copyright.com/CommonApp/LoadingApplication.jsp" target="_Icon">
<input type="hidden" name="Title" value="The Dark Art of Breaking Bad">
<input type="hidden" name="Author" value="By DAVID SEGAL ">
<input type="hidden" name="ContentID" value="http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html">
<input type="hidden" name="FormatType" value="default">
<input type="hidden" name="PublicationDate" value="July 10, 2011">
<input type="hidden" name="PublisherName" value="The New York Times">
<input type="hidden" name="Publication" value="nytimes.com">
<input type="hidden" name="wordCount" value="12">
</form>
<a href="#" onClick="submitCCCForm()">Reprints</a>
</li>
</NYT_REPRINTS_FORM>
</ul>
<div class="articleToolsSponsor" id="Frame4A"><!-- ADXINFO classification="Button120x60" campaign="foxsearch2011_emailtools_1629903c_nyt5"--><a href="http://www.nytimes.com/adx/bin/adx_click.html?type=goto&opzn&page=www.nytimes.com/yr/mo/day/magazine&pos=Frame4A&sn2=8aedd159/145f16b9&sn1=8c7a6250/71b7f853&camp=foxsearch2011_emailtools_1629903c_nyt5&ad=MMMM_120x60&goto=http%3A%2F%2Fwww%2Efoxsearchlight%2Ecom%2Fmarthamarcymaymarlene%2F" target="_blank">
<img src="http://graphics8.nytimes.com/adx/images/ADS/26/57/ad.265768/MMMM_120X60_b.gif" width="120" height="60" border="0"></a>
</div> </div>
</div>
</div>
<div class="articleBody">
<NYT_TEXT >
<NYT_CORRECTION_TOP>
</NYT_CORRECTION_TOP>
<p>
In the first three seasons of the AMC series &ldquo;Breaking Bad,&rdquo; Aaron Paul &mdash; or rather, his meth-dealing character, Jesse Pinkman &mdash; has been slapped, mauled and beaten purple by, respectively, a hit man, a sociopath and a federal drug-enforcement agent. If he were a pi&ntilde;ata, the candy would have poured out of this guy long ago. And apparently there is little mercy for Paul in the new season on the way. For there Paul was, one day in late May, standing on Tijeras Avenue in downtown Albuquerque, being tasered by a brawny man in sunglasses. </p>
</div>
<div class="articleInline runaroundLeft">
<!--forceinline-->
<div class="columnGroup doubleRule"> </div></div> <script type="text/javascript">
/**
 * Hand a set of companion banners to tmDisplayBanner, targeting "videoAdContent".
 *
 * @param banners  passed through unchanged as the first argument
 * @param tracking passed through unchanged as the last argument
 */
function displayCompanionBanners(banners, tracking) {
    // NOTE(review): 300 / 250 are presumably the banner width and height in pixels;
    // tmDisplayBanner is defined outside this page fragment, so confirm there.
    tmDisplayBanner(
        banners,
        "videoAdContent",
        300,
        250,
        null,
        tracking
    );
}

// Only touch the video manager when its script has actually been loaded.
if (typeof NYTDVideoManager !== "undefined") {
    NYTDVideoManager.setAllowMultiPlayback(false);
}
</script>
<div class="articleInline runaroundLeft" style="margin-top: -11px"> <h6 class="sectionHeader flushBottom">Multimedia</h6>
</div>
<div class="articleInline runaroundLeft firstArticleInline">
<div class="story">
<div class="wideThumb">
<a href="javascript:pop_me_up2('http://www.nytimes.com/imagepages/2011/07/10/magazine/10bad2.html?ref=magazine','776_1024','width=776,height=1024,location=no,scrollbars=yes,toolbars=no,resizable=yes')">
<img src="http://graphics8.nytimes.com/images/2011/07/10/magazine/10bad2/10bad2-thumbWide.jpg" width="190" height="126" alt="" border="0" />
<span class="mediaOverlay graphic">Graphic</span>
</a>
</div>
<h6><a href="javascript:pop_me_up2('http://www.nytimes.com/imagepages/2011/07/10/magazine/10bad2.html?ref=magazine','776_1024','width=776,height=1024,location=no,scrollbars=yes,toolbars=no,resizable=yes')">
</a></h6>
<h6 class="byline">
</h6>
</div>
</div>
<div class="articleInline runaroundLeft"><div class="articleInline runaroundLeft"></div>
<div class="inlineImage module">
<div class="image">
<div class="icon enlargeThis"><a href="javascript:pop_me_up2('http://www.nytimes.com/imagepages/2011/07/10/magazine/10bad1.html','10bad1_html','width=469,height=730,scrollbars=yes,toolbars=no,resizable=yes')">Enlarge This Image</a></div>
<a href="javascript:pop_me_up2('http://www.nytimes.com/imagepages/2011/07/10/magazine/10bad1.html','10bad1_html','width=469,height=730,scrollbars=yes,toolbars=no,resizable=yes')">
<img src="http://graphics8.nytimes.com/images/2011/07/10/magazine/10bad1/mag-10Bad-t_CA1-articleInline.jpg" width="190" height="286" alt="">
</a>
</div>
<h6 class="credit">Robert Yager for The New York Times</h6>
<p class="caption">The goal, Gilligan says, was to turn "Mr. Chips into Scarface." </p>
</div>
</div>
<div id="readerscomment" class="inlineLeft"></div>
<div class="articleBody">
<p>
The street had been blocked off, and a crew of dozens waited as the actors rehearsed the assault with Vince Gilligan, the creator, head writer and show runner, who was also directing the episode. </p><p>
&ldquo;Maybe we play this moment just a little longer, so we know for sure he got zapped,&rdquo; Gilligan said. &ldquo;Otherwise, Jesse would fight back more.&rdquo; </p><p>
&ldquo;Yeah, I like that,&rdquo; Paul said. </p><p>
&ldquo;And let&rsquo;s go back to the brass-knuckle-looking taser,&rdquo; Gilligan said. </p><p>
&ldquo;Fly in the brass-knuckle taser!&rdquo; a nearby crew member shouted into a walkie-talkie. </p><p>
As the cameras were moved into place, Gilligan, who is 44 and speaks in a lyrical Southern drawl, reminisced fondly about some of the torments he has inflicted on Jesse Pinkman. One of the most gruesome was a plunge through the roof of a Port-a-Potty in a junkyard in Season 2. </p><p>
&ldquo;The original version was that he was going to get bit by a guard dog,&rdquo; Gilligan said, leaning up against a rail and squinting against the New Mexico sun. &ldquo;But the guard dog would have cost us $25,000, and we didn&rsquo;t have the money. So we came up with the $5,000 outhouse gag. Which is quite a bit more memorable.&rdquo; </p><p>
Mordantly amusing ordeals are a specialty on &ldquo;Breaking Bad,&rdquo; which begins its fourth season on July 17. Credit the show&rsquo;s forbiddingly grim premise: A 50-year-old high-school chemistry teacher named Walter White (played by Bryan Cranston) finds out he has terminal lung cancer and starts making crystal meth, hoping to leave behind a nest egg for his son and pregnant wife. Walter, it emerges, is a chemistry wizard, and after teaming up with Pinkman, a burnout student he once flunked, the pair drive a ramshackle R.V. into the desert and confect the purest, most coveted meth that local dealers have ever known. With the death penalty of his diagnosis looming, Walt wakes from the slumber of an unfulfilling life, evolving from feckless drudge to reluctant part-time criminal, then gradually to something worse. </p><p>
In its first season, &ldquo;Breaking Bad&rdquo; seemed like the story of the nuttiest midlife crisis ever, told with elements that felt vaguely familiar. The structure &mdash; felonious dad copes with stress of work and family; complications ensue &mdash; owed an obvious debt to &ldquo;The Sopranos,&rdquo; and the collision of regular people and colorfully violent thugs nodded to Tarantino. The story and setting were an update of the spaghetti Western, minus the cowboys and set in the present. </p><p>
But it was soon clear that &ldquo;Breaking Bad&rdquo; was something much more satisfying and complex: a revolutionary take on the serial drama. What sets the show apart from its small-screen peers is a subtle metaphysical layer all its own. As Walter inches toward damnation, Gilligan and his writers have posed some large questions about good and evil, questions with implications for every kind of malefactor you can imagine, from Ponzi schemers to terrorists. Questions like: Do we live in a world where terrible people go unpunished for their misdeeds? Or do the wicked ultimately suffer for their sins? </p><p>
Gilligan has the nerve to provide his own hopeful answer. &ldquo;Breaking Bad&rdquo; takes place in a universe where nobody gets away with anything and karma is the great uncredited player in the cast. This moral dimension might explain why &ldquo;Breaking Bad&rdquo; has yet to achieve pop cultural breakthrough status, at least on the scale of other cable hits set in decidedly amoral universes, like &ldquo;True Blood&rdquo; or &ldquo;Mad Men,&rdquo; AMC&rsquo;s far-more-buzzed-about series that takes place in an ad agency in the &rsquo;60s. The total audience for &ldquo;Breaking Bad&rdquo; is only slightly smaller than that of &ldquo;Mad Men&rdquo; &mdash; 19.5 million versus 22.4 million cumulative viewers in their respective third seasons &mdash; but the top three markets for &ldquo;Breaking Bad&rdquo; are Albuquerque/Santa Fe, Kansas City and Memphis; neither New York nor Los Angeles are in its top 10. The show, in other words, doesn&rsquo;t play on the coasts. It gets chatter, just not among what has long been considered the chattering class. </p><div id="pageLinks"><ul id="pageNumbers"><li> 1 </li><li> <a onClick="s_code_linktrack('Article-MultiPagePageNum2');" title="Page 2" href="/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=2&_r=1">2</a> </li><li> <a onClick="s_code_linktrack('Article-MultiPagePageNum3');" title="Page 3" href="/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=3&_r=1">3</a> </li><li> <a onClick="s_code_linktrack('Article-MultiPagePageNum4');" title="Page 4" href="/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=4&_r=1">4</a> </li><li> <a onClick="s_code_linktrack('Article-MultiPagePageNum5+');" title="Page 5" href="/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=5&_r=1">5</a> </li></ul><a class="next" onClick="s_code_linktrack('Article-MultiPage-Next');"
title="Next Page"
href="/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=2&_r=1">Next Page &#x00bb;</a></div><NYT_AUTHOR_ID> <div class="authorIdentification">
<p>David Segal is a reporter for the Business section of The New York Times. His most recent article in the magazine was about the New York Cosmos. Editor: Adam Sternbergh (a.sternbergh-MagGroup@nytimes.com)</p> </div>
</NYT_AUTHOR_ID><NYT_CORRECTION_BOTTOM> <div class="articleCorrection">
</div>
</NYT_CORRECTION_BOTTOM><NYT_UPDATE_BOTTOM>
</NYT_UPDATE_BOTTOM>
</NYT_TEXT>
</div> </div>
<!--cur: prev:-->
<div class="columnGroup ">
<div class="articleFooter">
<div class="articleMeta">
<div class="opposingFloatControl wrap">
<div class="element1">
<h6 class="metaFootnote">A version of this article appeared in print on July 10, 2011, on page MM18 of the Sunday Magazine with the headline: Art Of Darkness.</h6>
</div>
</div>
</div>
</div> </div>
<!--cur: prev:-->
<div class="columnGroup ">
<div id="articleExtras">
<div class="expandedToolsRight">
<div class="articleTools">
<div class="box">
<div class="inset">
<ul id="toolsList" class="toolsList wrap">
<li class="comments"><a onClick="javascript:dcsMultiTrack('DCS.dcssip','www.nytimes.com','DCS.dcsuri','/article comments/view-tools.html','WT.ti','Article Comments View Tools','WT.z_aca','Tools-View','WT.gcom','Com');" href="http://community.nytimes.com/comments/www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html" >comments <span id="commentCount"></span></a></li>
<li class="email">
<a id="emailThis" onClick="s_code_linktrack('Article-Tool-EmailSignIn');"
href="http://www.nytimes.com/auth/login?URI=http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html">Sign In to E-Mail</a>
</li>
<li class="print">
<A HREF="/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?_r=1&pagewanted=print">Print</a>
</li>
<li class="singlePage">
<A HREF="/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?_r=1&pagewanted=all"> Single Page</a>
</li>
<NYT_REPRINTS_FORM>
<script name="javascript">
/**
 * Submit the reprint-request form named "cccform" into a popup window.
 *
 * The form on this page declares target="_Icon", so the popup opened here
 * under that same window name receives the form response.
 *
 * Does nothing (and opens no popup) when no usable form is found; the
 * previous version threw a TypeError in that case after opening a blank popup.
 */
function submitCCCForm() {
    var lookup = document.forms["cccform"];
    var form = null;

    // The lookup may yield the form itself or a list-like collection of forms
    // (more than one form on the page can carry this name); in the latter case
    // the first entry is used.
    if (lookup) {
        if (lookup.nodeName == "FORM") {
            form = lookup;
        } else if (lookup[0] && lookup[0].nodeName == "FORM") {
            form = lookup[0];
        }
    }
    if (!form) {
        return;
    }

    // Open the named window first so the submission below lands in it.
    window.open('', '_Icon', 'location=no,toolbar=no,status=no,width=650,height=550,scrollbars=yes,resizable=yes');
    form.submit();
}
</script>
<li class="reprints"> <form name="cccform" action="https://s100.copyright.com/CommonApp/LoadingApplication.jsp" target="_Icon">
<input type="hidden" name="Title" value="The Dark Art of Breaking Bad">
<input type="hidden" name="Author" value="By DAVID SEGAL ">
<input type="hidden" name="ContentID" value="http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html">
<input type="hidden" name="FormatType" value="default">
<input type="hidden" name="PublicationDate" value="July 10, 2011">
<input type="hidden" name="PublisherName" value="The New York Times">
<input type="hidden" name="Publication" value="nytimes.com">
<input type="hidden" name="wordCount" value="12">
</form>
<a href="#" onClick="submitCCCForm()">Reprints</a>
</li>
</NYT_REPRINTS_FORM>
</ul>
</div>
</div>
</div>
<script type="text/javascript">
// Invoke writePost(); its definition is not part of this section of the page.
// NOTE(review): presumably supplied by an externally loaded script — confirm.
writePost();
</script>
</div>
</div>
<div class="singleAd" id="Bottom1">
<!-- ADXINFO classification="text_ad" campaign="nyt2011-circ-sf-bottom1-3844L"--><table width="468" border="0">
<tr>
<td width="84"><a href="http://www.nytimes.com/adx/bin/adx_click.html?type=goto&opzn&page=www.nytimes.com/yr/mo/day/magazine&pos=Bottom1&sn2=77e5bd1c/ebcd09ea&sn1=c0389032/4694fa7c&camp=nyt2011-circ-sf-bottom1-3844L&ad=050511-sf-bottom1-3844L&goto=https%3A%2F%2Fwww%2Enytimesathome%2Ecom%2Fhd%2F150%3FMediaCode%3DW16AK%26CMP%3D3844L"><img src="http://graphics8.nytimes.com/adx/images/ADS/24/26/ad.242614/90x79_newspaper.gif" width="90" height="70" border="0"></a></td>
<td width="381">
<p align="left">
<font size="-1" class="nav" color="#000066"><a href="http://www.nytimes.com/adx/bin/adx_click.html?type=goto&opzn&page=www.nytimes.com/yr/mo/day/magazine&pos=Bottom1&sn2=77e5bd1c/ebcd09ea&sn1=c0389032/4694fa7c&camp=nyt2011-circ-sf-bottom1-3844L&ad=050511-sf-bottom1-3844L&goto=https%3A%2F%2Fwww%2Enytimesathome%2Ecom%2Fhd%2F150%3FMediaCode%3DW16AK%26CMP%3D3844L">The New York Times and the Bay Area -- save 50% on home delivery plus FREE All Digital Access.
</a></font>
</td>
</tr>
</table>
<br>
</div>
</div>
<!--cur: prev:-->
<div class="columnGroup ">
<div class="singleRuleDivider"></div>
<div class="articleBottomExtra subColumns">
<div class="column">
<script type="text/javascript" src="http://graphics8.nytimes.com/js/app/article/outbrain.js"></script> </div>
<div class="column lastColumn">
<div class="emailAlertModule module">
<h5 class="sectionHeaderSm">Get Free E-mail Alerts on These Topics</h5>
<form action="https://myaccount.nytimes.com/mem/tnt.html" method="GET" enctype="application/x-www-form-urlencoded">
<input type="hidden" name="retA" value="http://www.nytimes.com//2011/07/10/magazine/the-dark-art-of-breaking-bad.html" >
<input type="hidden" name="retT" value="The Dark Art of Breaking Bad">
<input type="hidden" name="module" value="call">
<input type="hidden" name="alert_context" value="1">
<ul class="flush">
<li>
<input type="hidden" name="topic1" value="Television">
<input type="hidden" name="topic_field1" value="des">
<a class="inTextReferEmail" href="https://myaccount.nytimes.com/mem/tnt.html?module=call&alert_context=1&topic1=Television&topic_field1=des&topic1_check=y&retA=&retT=&cskey=" onClick="javascript:s_code_linktrack('Article-RelatedTopics'); dcsMultiTrack('DCS.dcssip','www.nytimes.com','DCS.dcsuri','/newstracker/add.html','WT.ti','Newstracker Add','WT.z_nta','Add','WT.pers','Per','WT.z_dcsm','1');" onmousedown="NYTD.relatedSearches.clickHandler(event);" >Television</a>
</li>
<li>
<input type="hidden" name="topic1" value="AMC+%28TV+Network%29">
<input type="hidden" name="topic_field1" value="org">
<a class="inTextReferEmail" href="https://myaccount.nytimes.com/mem/tnt.html?module=call&alert_context=1&topic1=AMC+%28TV+Network%29&topic_field1=org&topic1_check=y&retA=&retT=&cskey=" onClick="javascript:s_code_linktrack('Article-RelatedTopics'); dcsMultiTrack('DCS.dcssip','www.nytimes.com','DCS.dcsuri','/newstracker/add.html','WT.ti','Newstracker Add','WT.z_nta','Add','WT.pers','Per','WT.z_dcsm','1');" onmousedown="NYTD.relatedSearches.clickHandler(event);" >AMC (TV Network)</a>
</li>
<li>
<input type="hidden" name="topic1" value="Gilligan%2C+Vince">
<input type="hidden" name="topic_field1" value="per">
<a class="inTextReferEmail" href="https://myaccount.nytimes.com/mem/tnt.html?module=call&alert_context=1&topic1=Gilligan%2C+Vince&topic_field1=per&topic1_check=y&retA=&retT=&cskey=" onClick="javascript:s_code_linktrack('Article-RelatedTopics'); dcsMultiTrack('DCS.dcssip','www.nytimes.com','DCS.dcsuri','/newstracker/add.html','WT.ti','Newstracker Add','WT.z_nta','Add','WT.pers','Per','WT.z_dcsm','1');" onmousedown="NYTD.relatedSearches.clickHandler(event);" >Gilligan, Vince</a>
</li>
</ul>
</form>
</div>
</div>
</div> </div>
<!--cur: prev:-->
<div class="columnGroup last">
<div class="columnGroup" id="adxSponLink"></div>
<script language="JavaScript">
// Publish the article title and the ad channel list as the globals
// google_hints and google_ad_channel. A later inline script on this page
// copies them into nyt_google_hints / nyt_google_ad_channel when those
// copies are not already set.
google_hints="The Dark Art of Breaking Bad";google_ad_channel="archive, archive_magazine, archive_Magazine";
</script>
<script language="JavaScript" type="text/javascript">
// Sponlink_short
// Calls NYTD.GoogleAds.getGoogleAds with the key "AFC" and an options object
// carrying the ad client, the channel 'left' and the target list 'sponLink'.
// NOTE(review): NYTD.GoogleAds is defined outside this page fragment;
// presumably ad_target_list names the placeholder the ads are rendered into
// (the nearby placeholder div has id "adxSponLink") — confirm.
NYTD.GoogleAds.getGoogleAds("AFC", {
google_ad_client:'nytimes_article_var',
google_ad_channel:'left',
ad_target_list:'sponLink'
});
</script>
</div>
</div>
</div><!--close abColumn -->
<div class="cColumn">
<div class="columnGroup">
</div>
<!----> <div class="columnGroup first">
</div>
<!----> <div class="columnGroup ">
<div class="singleAd" id="MiddleRight">
<!-- ADXINFO classification="bigad" campaign="Google_2011_ROS_BA_6"--><div class="clearfix">
<script language="JavaScript">
<!--
// One-time initialisation of the nyt_google_* globals. Each guard only fires
// when the nyt_* global is still unset (falsy), so values stored by an
// earlier run of this same snippet are kept.
// Counter that google_ad_request_done increases by the number of text ads.
if (!window.nyt_google_count) { var nyt_google_count = 0; }
// Keep copies of the page-level google_ad_channel / google_hints /
// google_contents values (when present); the configuration at the end of
// this script overwrites google_ad_channel and restores contents or hints
// from these copies.
if ((!window.nyt_google_ad_channel) && (window.google_ad_channel)) { var nyt_google_ad_channel = google_ad_channel; }
if ((!window.nyt_google_hints) && (window.google_hints)) { var nyt_google_hints = google_hints; }
if ((!window.nyt_google_contents) && (window.google_contents)) { var nyt_google_contents = google_contents; }
/**
 * Show `w` in the browser status bar.
 * The second argument is accepted but never used. Always returns true.
 */
function ss(w, id) {
    window.status = w;
    return true;
}

/** Clear the browser status bar text. */
function cs() {
    window.status = '';
}

/**
 * Append "&nh=1" to the href of the element with id `a`, unless the href
 * already contains "&nh=".
 *
 * Fix: the href is now only written when the marker is actually added. The
 * previous version always assigned a shared global (`phb`) back to the href,
 * so a repeat call could overwrite this link with the URL computed for a
 * different link, or fail outright when the global had never been set.
 */
function ha(a) {
    var link = document.getElementById(a);
    // "< 1" (not "< 0") is kept from the original: a marker at index 0 is
    // treated as absent.
    if (link.href.indexOf("&nh=") < 1) {
        link.href = link.href + "&nh=1";
    }
}

/**
 * Append "&nc=1" to the href of the element with id `a` (once), then open
 * that href in a new window.
 */
function ca(a) {
    var link = document.getElementById(a);
    if (link.href.indexOf("&nc=") < 1) {
        link.href = link.href + "&nc=1";
    }
    window.open(link.href);
}

/**
 * Click handler for the container `o` of an ad link.
 *
 * The link id is the container id minus its first character (the generated
 * ad rows use id "taw<i>" for the cell and "aw<i>" for the link). When the
 * click landed on the link itself, or on an element whose parent or
 * grandparent is the link, the function returns true and does nothing else.
 * Otherwise it appends "&nb=1" to the link href (once) and opens it in a
 * new window.
 *
 * @param o element whose id is the link id prefixed by one character
 * @param e click event; `target` is used when present, else `srcElement`
 * @returns true when the click originated inside the link, otherwise undefined
 */
function ga(o, e) {
    if (document.getElementById) {
        var a = o.id.substring(1);
        var p = "";
        var r = "";
        var t;
        var f;
        var h;
        var g = e.target;
        if (g) {
            t = g.id;
            f = g.parentNode;
            if (f) {
                p = f.id;
                h = f.parentNode;
                if (h) {
                    r = h.id;
                }
            }
        } else {
            // Fallback for event objects that expose srcElement instead of target.
            h = e.srcElement;
            f = h.parentNode;
            if (f) {
                p = f.id;
            }
            t = h.id;
        }
        if (t == a || p == a || r == a) {
            return true;
        }
        var link = document.getElementById(a);
        if (link.href.indexOf("&nb=") < 1) {
            link.href = link.href + "&nb=1";
        }
        window.open(link.href);
    }
}
/**
 * Render the ads in `ads` into the page with document.write.
 *
 * NOTE(review): presumably invoked as a callback by the ad script loaded
 * right after this block (google_ad_output is set to 'js') — confirm.
 *
 * The type of the first ad selects the markup:
 *   - "image": a single linked image built from ads[0]
 *   - "flash": a single object/embed pair built from ads[0]
 *   - "text":  an "Ads by Google" table with one row per ad; also adds
 *              ads.length to the global nyt_google_count
 * Any other type writes an empty string. An empty `ads` array writes nothing.
 *
 * Fixes over the previous version:
 *   - the flash branch read its URL and dimensions from an undefined
 *     `google_ad` variable; it now reads them from ads[0]
 *   - the image branch emitted src/width/height with no whitespace between
 *     the attributes; a space now separates them
 *
 * @param ads array of ad objects (fields used: type, url, visible_url,
 *            image_url, image_width, image_height, line1, line2, line3)
 */
function google_ad_request_done(ads) {
    var s = '';
    var i;
    if (ads.length == 0) {
        return;
    }
    if (ads[0].type == "image") {
        s += '<a href="' + ads[0].url +
            '" target="_blank" title="go to ' + ads[0].visible_url +
            '"><img border="0" src="' + ads[0].image_url +
            '" width="' + ads[0].image_width +
            '" height="' + ads[0].image_height + '"></a>';
    } else if (ads[0].type == "flash") {
        s += '<object classid="clsid:D27CDB6E-AE6D-11cf-96B8-444553540000"' +
            ' codebase="http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version=6,0,0,0"' +
            ' WIDTH="' + ads[0].image_width +
            '" HEIGHT="' + ads[0].image_height + '">' +
            '<PARAM NAME="movie" VALUE="' + ads[0].image_url + '">' +
            '<PARAM NAME="quality" VALUE="high">' +
            '<PARAM NAME="AllowScriptAccess" VALUE="never">' +
            '<EMBED src="' + ads[0].image_url +
            '" WIDTH="' + ads[0].image_width +
            '" HEIGHT="' + ads[0].image_height +
            '" TYPE="application/x-shockwave-flash"' +
            ' AllowScriptAccess="never" ' +
            ' PLUGINSPAGE="http://www.macromedia.com/go/getflashplayer"></EMBED></OBJECT>';
    } else if (ads[0].type == "text") {
        nyt_google_count += ads.length;
        // These style values are assigned without `var`, exactly as before, so
        // they remain globals. NOTE(review): nothing visible here reads them
        // outside this function — confirm before making them local.
        google_ad_section_line_height = "14px";
        google_ad_section_padding_left = "7px";
        google_title_link_font_size = "12px";
        google_ad_text_font_size = "11px";
        google_visible_url_font_size = "10px";
        // Table header ("Ads by Google" / "what's this?") and the opening of the ad list table.
        s += '<table width="100%" height="" border="0" cellspacing="0" cellpadding="0" style="text-align:left; width:100%; border-style: solid; border-width: 1px; border-color: #9da3ad" >\n<tr>\n<td style="font-family:Arial,Helvetica,sans-serif; font-size:12px; color:#333333;" valign="top"><table width="100%" height="100%" cellspacing="0" cellpadding="0" border="0" style="width:100%; height:100%;">\n<tr>\n <td style="background-color:#9da3ad; width:70%; height:20px; padding-top:2px; padding-left:11px; padding-bottom:2px; font-family:Arial,Helvetica,sans-serif; font-size:12px; color:#333333;" width="70%" height="20" bgcolor="#9da3ad" ><span style="font-size: 12px; font-weight: normal; color:#ffffff;" >Ads by Google</span></td>\n<td style="padding-top:2px; padding-bottom:2px; width:30%; height:20px; align:right; background-color:#9da3ad; font-family:Arial,Helvetica,sans-serif; font-size:12px; color:#333333;" width="30%" height="20" align="right" bgcolor="#9da3ad" ><span><a style="font-family:Arial,Helvetica,sans-serif; color: white; font-size:12px; padding-right:7px;" href="http://www.nytimes.com/ref/membercenter/faq/linkingqa16.html" onclick="window.open(\'\',\'popupad\',\'left=100,top=100,width=390,height=390,resizable,scrollbars=no\')" target="popupad">what\'s this?</a></span></td>\n</tr>\n</table>\n</td>\n</tr>\n<tr>\n<td style="height:110px; font-family:Arial,Helvetica,sans-serif; font-size:12px; color:#333333;" valign="top" height="110"><table height="100%" width="100%" cellpadding="4" cellspacing="0" border="0" bgcolor="#f8f8f9" style="height:100%; width:100%; padding:4px; background-color:#f8f8f9;">\n';
        // One row per ad: cell id "taw<i>" wraps the title link "aw<i>"; the
        // inline handlers call the ss / cs / ha / ga helpers defined on this page.
        for (i = 0; i < ads.length; ++i) {
            s += '<tr>\n<td style="cursor:pointer; cursor:hand; font-family:Arial,Helvetica,sans-serif; font-size:12px; color:#333333; background-color:#f8f8f9;" id="taw' + i + '" valign="middle" onFocus="ss(\'go to ' + ads[i].visible_url + '\',\'aw' + i + '\')" onMouseOver="ss(\'go to ' + ads[i].visible_url + '\',\'aw' + i + '\')" onMouseOut="cs()" onClick="ga(this,event)"><div style="line-height:' + google_ad_section_line_height + '; padding-left:' + google_ad_section_padding_left + '; padding-bottom:5px;" ><a id="aw' + i + '" href="' + ads[i].url + '" target="_blank" style="font-size:' + google_title_link_font_size + '; color:#000066; font-weight:bold; text-decoration:underline;" onFocus="ss(\'go to ' + ads[i].visible_url + '\',\'aw' + i + '\')" onClick="ha(\'aw' + i + '\')" onMouseOver="return ss(\'go to ' + ads[i].visible_url + '\',\'aw' + i + '\')" onMouseOut="cs()">' + ads[i].line1 + '</a><br>\n<a href="' + ads[i].url + '" target="_blank" style="font-family:Arial,Helvetica,sans-serif; font-size:' + google_ad_text_font_size + ';color:#333333;text-decoration:none;">' + ads[i].line2 + ' ' + ads[i].line3 + '</a><br>\n<a href="' + ads[i].url + '" target="_blank" style="font-size:' + google_visible_url_font_size + '; color:#000066; font-weight:normal; text-decoration:none;">' + ads[i].visible_url + '</a></div>\n</td>\n</tr>\n';
        }
        s += '</table>\n</td>\n</tr>\n</table>';
    }
    document.write(s);
    return;
}
// Request configuration for this ad slot, set as globals.
// NOTE(review): presumably read by the show_ads.js script loaded immediately
// after this block — confirm.
google_ad_client = 'ca-nytimes_display_html';
// Overwrites any page-level google_ad_channel; the earlier guard in this
// script keeps a copy of the previous value in nyt_google_ad_channel.
google_ad_channel = 'ROS_big_ad';
google_ad_output = 'js';
google_max_num_ads = '6';
// Matches the three ad types handled by google_ad_request_done.
google_ad_type = 'text, image, flash';
google_image_size = '336x280';
google_safe = 'high';
google_targeting = 'site_content';
// Restore the page-level targeting text from the saved copies: contents win
// over hints, and nothing is set when neither copy exists.
if (window.nyt_google_contents) { google_contents = nyt_google_contents; }
else if (window.nyt_google_hints) { google_hints = nyt_google_hints; }
// -->
</script>
<script language="JavaScript" src="http://pagead2.googlesyndication.com/pagead/show_ads.js"></script>
<div style="font-family: Arial; font-size: 10px; color:#004276; float: right; margin-right: 9px;"><a href="http://www.nytimes.whsites.net/mediakit/">Advertise on NYTimes.com</a></div></div>
</div>
</div>
<!----> <div class="columnGroup ">
<div class="singleAd" id="Box3">
<!-- ADXINFO classification="feature_squares" campaign="nyt2011-regilite-P1-ticketwatch"--><IFRAME title="regilite" src="https://myaccount.nytimes.com/regilite?product=TR" width="336" height="90" marginheight="0" marginwidth="0" frameborder="0" vspace="0" hspace="0" scrolling="no"></IFRAME>
</div>
</div>
<!----> <div class="columnGroup ">
</div>
<!----> <div class="columnGroup ">
</div>
<!----> <div class="columnGroup ">
<div id="mostPopWidget" class="doubleRule"></div>
<script src="http://graphics8.nytimes.com/js/app/recommendations/recommendationsModule.js" type="text/javascript" charset="utf-8"></script>
</div>
<!----> <div class="columnGroup ">
</div>
<!----> <div class="columnGroup ">
<div class="bigAd" id="Box1">
<!-- ADXINFO classification="Module" campaign="NYT2011_marketingmodule_TMag"--><!-- MARKETING MODULE -->
<div style="border:solid #999;border-width:1px;font-family:Arial,sans-serif;text-align:left; width:334px;background:#fff;" class="clearfix wrap">
<a href="http://www.nytimes.com/adx/bin/adx_click.html?type=goto&opzn&page=www.nytimes.com/yr/mo/day/magazine&pos=Box1&sn2=585d7930/c4351de1&sn1=66c084c/ffbd4040&camp=NYT2011_marketingmodule_TMag&ad=TM-D-I-NYT-MOD-MOD-M209-ROS-0711&goto=http://tmagazine.blogs.nytimes.com/tag/haute-couture-paris%3FWT.mc_id=TM-D-I-NYT-MOD-MOD-M209-ROS-0711-PH%26WT.mc_ev=click" target="_blank"><img src="http://graphics8.nytimes.com/ads/marketing/mm11/tmagazine_071411.jpg" width="334" height="154" border="0" alt=""></a>
<div style="padding:7px 9px 0;background:#fff">
<h2 style="font-size:22px;line-height:24px; margin:0;padding:0 0 4px;"><a style="color:#504448;" target="_blank" href="http://www.nytimes.com/adx/bin/adx_click.html?type=goto&opzn&page=www.nytimes.com/yr/mo/day/magazine&pos=Box1&sn2=585d7930/c4351de1&sn1=66c084c/ffbd4040&camp=NYT2011_marketingmodule_TMag&ad=TM-D-I-NYT-MOD-MOD-M209-ROS-0711&goto=http://tmagazine.blogs.nytimes.com/tag/haute-couture-paris%3FWT.mc_id=TM-D-I-NYT-MOD-MOD-M209-ROS-0711-HDR%26WT.mc_ev=click">Haute couture in Paris</a></h2>
<p style="margin:0 0 3px; padding:0;font-size: 11px;"><a href="http://www.nytimes.com/adx/bin/adx_click.html?type=goto&opzn&page=www.nytimes.com/yr/mo/day/magazine&pos=Box1&sn2=585d7930/c4351de1&sn1=f72cae84/3f0445bd&camp=NYT2011_marketingmodule_TMag&ad=TM-D-I-NYT-MOD-MOD-M209-ROS-0711&goto=http://www.nytimes.com/pages/t-magazine/index.html%3FWT.mc_id=TM-D-I-NYT-MOD-MOD-M209-ROS-0711-URL%26WT.mc_ev=click" target="_blank" style="font-size:11px;margin:3px 0;padding:0;font-family:Arial,sans-serif; color:#000; text-transform:uppercase;">Also in T Magazine &raquo;</a></p>
<ul style="font-size:12px;margin:0; padding-bottom: 10px; border-bottom:1px solid #ccc;" class="refer">
<li style="font-size:12px"><a target="_blank" href="http://www.nytimes.com/adx/bin/adx_click.html?type=goto&opzn&page=www.nytimes.com/yr/mo/day/magazine&pos=Box1&sn2=585d7930/c4351de1&sn1=66c084c/ffbd4040&camp=NYT2011_marketingmodule_TMag&ad=TM-D-I-NYT-MOD-MOD-M209-ROS-0711&goto=http://tmagazine.blogs.nytimes.com/2011/06/24/now-showing-primp-my-ride%3FWT.mc_id=TM-D-I-NYT-MOD-MOD-M209-ROS-0711-L1%26WT.mc_ev=click" style="color: #004276">Primp my ride</a></li>
<li style="font-size:12px"><a target="_blank" href="http://www.nytimes.com/adx/bin/adx_click.html?type=goto&opzn&page=www.nytimes.com/yr/mo/day/magazine&pos=Box1&sn2=585d7930/c4351de1&sn1=66c084c/ffbd4040&camp=NYT2011_marketingmodule_TMag&ad=TM-D-I-NYT-MOD-MOD-M209-ROS-0711&goto=http://tmagazine.blogs.nytimes.com/2011/06/27/look-of-the-moment-catherine-duchess-of-cambridge%3FWT.mc_id=TM-D-I-NYT-MOD-MOD-M209-ROS-0711-L2%26WT.mc_ev=click" style="color: #004276">Look of the Moment: Catherine, Duchess of Cambridge</a></li>
</ul>
</div>
<div style="padding:5px 9px; float:left; width:316px; background:#fff"> <a style="float:left" href="http://www.nytimes.com/adx/bin/adx_click.html?type=goto&opzn&page=www.nytimes.com/yr/mo/day/magazine&pos=Box1&sn2=585d7930/c4351de1&sn1=252d6d80/ea8ec045&camp=NYT2011_marketingmodule_TMag&ad=TM-D-I-NYT-MOD-MOD-M209-ROS-0711&goto=http://nytimes.com%3FWT.mc_id=TM-D-I-NYT-MOD-MOD-M209-ROS-0711-LOGO%26WT.mc_ev=click" target="_blank"><img src="http://graphics8.nytimes.com/ads/marketing/mm09/verticalst/nytimes.gif" alt="nytimes.com" width="116" height="18" border="0"></a><a style="float:right" href="http://www.nytimes.com/adx/bin/adx_click.html?type=goto&opzn&page=www.nytimes.com/yr/mo/day/magazine&pos=Box1&sn2=585d7930/c4351de1&sn1=f72cae84/3f0445bd&camp=NYT2011_marketingmodule_TMag&ad=TM-D-I-NYT-MOD-MOD-M209-ROS-0711&goto=http://www.nytimes.com/pages/t-magazine/index.html%3FWT.mc_id=TM-D-I-NYT-MOD-MOD-M209-ROS-0711-VRT%26WT.mc_ev=click" target="_blank"><img src="http://graphics8.nytimes.com/ads/marketing/mm09/verticalst/verticals_tmagazine.gif" alt="T Magazine" width="120" height="18" border="0"></a></div><br clear="all">
</div>
<!-- /MARKETING MODULE -->
</div>
</div>
<!----> <div class="columnGroup ">
<!--[TwoColumnAdLeft - Begin] -->
<div class="adHeader">
<h4>
Advertisements </h4>
</div>
<div class="cColumn-TextAdsBox">
<div class="cColumn-TextAdsLeft">
<div class="cColumn-TextAd">
<!-- ADXINFO classification="SiteForADay" campaign="NYT2011-Mktg-TimesLimited-S4D-ROS"--><div style="width:320px; height:60px;">
<a href="http://www.nytimes.com/adx/bin/adx_click.html?type=goto&opzn&page=www.nytimes.com/yr/mo/day/magazine&pos=Right5A&sn2=2e9fe2cb/b1bf5670&sn1=d6176eb9/3ff0a8c&camp=NYT2011-Mktg-TimesLimited-S4D-ROS&ad=TL-D-I-NYT-AD-S4D-TLP-ROS-0211-NA&goto=http%3A%2F%2Fwww%2Enytimes%2Ecom%2Fmarketing%2Ftimeslimited%2F%3FWT%2Emc%5Fid%3DTL%2DD%2DI%2DNYT%2DAD%2DS4D%2DTLP%2DROS%2D0211%2DNA%26WT%2Emc%5Fev%3Dclick" target="_blank">
<img style="float:left; margin-right:10px;" src="http://graphics8.nytimes.com/adx/images/ADS/25/86/ad.258614/Times_Limited_86x60.gif" width="86" height="60" border="0" style="vertical-align:text-top;"></a>
<a href="http://www.nytimes.com/adx/bin/adx_click.html?type=goto&opzn&page=www.nytimes.com/yr/mo/day/magazine&pos=Right5A&sn2=2e9fe2cb/b1bf5670&sn1=d6176eb9/3ff0a8c&camp=NYT2011-Mktg-TimesLimited-S4D-ROS&ad=TL-D-I-NYT-AD-S4D-TLP-ROS-0211-NA&goto=http%3A%2F%2Fwww%2Enytimes%2Ecom%2Fmarketing%2Ftimeslimited%2F%3FWT%2Emc%5Fid%3DTL%2DD%2DI%2DNYT%2DAD%2DS4D%2DTLP%2DROS%2D0211%2DNA%26WT%2Emc%5Fev%3Dclick" target="_blank"> <br /> Exclusive offers delivered to your inbox</a>
</div>
</div>
<div class="cColumn-TextAd">
</div>
<div class="cColumn-TextAd">
</div>
<div class="cColumn-TextAd">
</div>
</div>
</div>
<!--[TwoColumnAdLeft - End] -->
</div>
<!----> <div class="columnGroup ">
<div class="singleAd" id="Middle5">
<!-- ADXINFO classification="feature_position" campaign="nyt2011-circ-sf-middle5-3844X"--><a href="http://www.nytimes.com/adx/bin/adx_click.html?type=goto&opzn&page=www.nytimes.com/yr/mo/day/magazine&pos=Middle5&sn2=d52677ee/70a9f641&sn1=b206cbbd/9e867ac1&camp=nyt2011-circ-sf-middle5-3844X&ad=050911-nyt2011-circ-sf-middle5-3844X&goto=https%3A%2F%2Fwww%2Enytimesathome%2Ecom%2Fhd%2F150%3FMediaCode%3DW22AS%26CMP%3D3844X" target="_blank">
<img src="http://graphics8.nytimes.com/adx/images/ADS/26/58/ad.265876/101452_SomePromiseHD_336x79_sf.jpg" width="336" height="79" border="0"></a>
</div>
</div>
<!----> <div class="columnGroup last">
<div class="singleAd" id="BigAd2">
<!-- ADXINFO classification="bigad" campaign="Amex_AcqEngine_1758529-nyt1"--><div align="center">
<SCRIPT type="text/javascript" SRC="http://ad.doubleclick.net/adj/N553.newyorktimes.com/B5114832;sz=300x250;pc=nyt164708A262770;ord=2011.07.14.23.06.54;click=http://www.nytimes.com/adx/bin/adx_click.html?type=goto&opzn&page=www.nytimes.com/yr/mo/day/magazine&pos=BigAd2&camp=Amex_AcqEngine_1758529-nyt1&ad=Amex_300x250_B5114832&sn2=f6b0b96f/33f9b85c&snr=doubleclick&snx=1310683590&sn1=f0e9b1b/2cc3cfd5&goto=">
</SCRIPT>
<NOSCRIPT>
<A HREF="http://www.nytimes.com/adx/bin/adx_click.html?type=goto&opzn&page=www.nytimes.com/yr/mo/day/magazine&pos=BigAd2&sn2=f6b0b96f/33f9b85c&sn1=242036d4/2040e7c&camp=Amex_AcqEngine_1758529-nyt1&ad=Amex_300x250_B5114832&goto=http://ad.doubleclick.net/jump/N553.newyorktimes.com/B5114832;sz=300x250;pc=nyt164708A262770;ord=2011.07.14.23.06.54" TARGET="_blank">
<IMG SRC="http://ad.doubleclick.net/ad/N553.newyorktimes.com/B5114832;sz=300x250;pc=nyt164708A262770;ord=2011.07.14.23.06.54"
BORDER=0 WIDTH=300 HEIGHT=250
ALT="Click Here"></A>
</NOSCRIPT>
</div>
</div>
</div>
<div class="columnGroup">
<div id="adxSponLinkA"></div>
<script language="JavaScript" type="text/javascript">
// Sponlink_A_Short
if (document.getElementById("MiddleRight")) { google_targeting = 'content'; }
NYTD.GoogleAds.getGoogleAds("AFC", {
google_ad_client:'nytimes_article_var',
ad_target_list:'sponLinkA'
});
</script>
</div>
</div>
</div><!--close spanAB -->
<!-- start MOTH -->
<div id="insideNYTimes" class="doubleRule">
<script type="text/javascript" src="http://js.nyt.com/js/app/moth/moth.js"></script>
<div id="insideNYTimesHeader">
<div class="navigation"><span id="leftArrow"><img id="mothReverse" src="http://i1.nyt.com/images/global/buttons/moth_reverse.gif" /></span>&nbsp;<span id="rightArrow"><img id="mothForward" src="http://i1.nyt.com/images/global/buttons/moth_forward.gif" /></span></div>
<h4>
Inside NYTimes.com </h4>
</div>
<div id="insideNYTimesScrollWrapper">
<table id="insideNYTimesBrowser" cellspacing="0">
<tbody>
<tr>
<td class="first">
<div class="story">
<h6 class="kicker">
<a href="http://www.nytimes.com/pages/arts/music/index.html">Music &raquo;</a>
</h6>
<div class="mothImage">
<a href="http://www.nytimes.com/2011/07/14/arts/music/new-york-city-operas-troubled-vision-quest.html"><img src="http://i1.nyt.com/images/2011/07/14/arts/music/14moth_opera/14moth_opera-moth.jpg" alt="City Opera&rsquo;s Troubled Vision Quest" width="151" height="151" /></a>
</div>
<h6 class="headline"><a href="http://www.nytimes.com/2011/07/14/arts/music/new-york-city-operas-troubled-vision-quest.html">City Opera&rsquo;s Troubled Vision Quest</a></h6>
</div>
</td>
<td>
<div class="story">
<h6 class="kicker">
<a href="http://www.nytimes.com/pages/fashion/index.html">Fashion & Style &raquo;</a>
</h6>
<div class="mothImage">
<a href="http://www.nytimes.com/2011/07/14/fashion/skin-deep-can-a-fragrance-attract-romance.html"><img src="http://i1.nyt.com/images/2011/07/14/fashion/14moth-skin/14moth-skin-moth.jpg" alt="Pumpkin Pie: Provocative or Just Tasty?" width="151" height="151" /></a>
</div>
<h6 class="headline"><a href="http://www.nytimes.com/2011/07/14/fashion/skin-deep-can-a-fragrance-attract-romance.html">Pumpkin Pie: Provocative or Just Tasty?</a></h6>
</div>
</td>
<td>
<div class="story">
<h6 class="kicker"><a href="http://www.nytimes.com/pages/opinion/index.html">Opinion &raquo;</a></h6>
<h3><a href="http://video.nytimes.com/video/2011/07/13/opinion/100000000944503/bloggingheads-libya-and-impeachment.html">Bloggingheads: Libya and Impeachment</a></h3>
<p class="summary">Glenn Greenwald of Salon.com and Ilya Somin of George Mason University School of Law debate options on Libya.</p>
</div>
</td>
<td>
<div class="story">
<h6 class="kicker">
<a href="http://www.nytimes.com/pages/fashion/index.html">Fashion & Style &raquo;</a>
</h6>
<div class="mothImage">
<a href="http://www.nytimes.com/2011/07/14/fashion/oscar-pistorius-a-model-and-front-runner.html"><img src="http://i1.nyt.com/images/2011/07/14/fashion/14moth-oscar/14moth-oscar-moth.jpg" alt="Model and Front-Runner" width="151" height="151" /></a>
</div>
<h6 class="headline"><a href="http://www.nytimes.com/2011/07/14/fashion/oscar-pistorius-a-model-and-front-runner.html">Model and Front-Runner</a></h6>
</div>
</td>
<td>
<div class="story">
<h6 class="kicker">
<a href="http://www.nytimes.com/pages/opinion/index.html">Opinion &raquo;</a>
</h6>
<div class="mothImage">
<a href="http://www.nytimes.com/2011/07/14/opinion/14mccullough.html"><img src="http://i1.nyt.com/images/2011/07/14/opinion/14moth_opart/14moth_opart-moth.jpg" alt="Op-Ed: Vive la Similarit&eacute;" width="151" height="151" /></a>
</div>
<h6 class="headline"><a href="http://www.nytimes.com/2011/07/14/opinion/14mccullough.html">Op-Ed: Vive la Similarit&eacute;</a></h6>
</div>
</td>
<td>
<div class="story">
<h6 class="kicker">
<a href="http://www.nytimes.com/pages/nyregion/index.html">N.Y. / Region &raquo;</a>
</h6>
<div class="mothImage">
<a href="http://www.nytimes.com/2011/07/14/nyregion/hill-cumorah-pageant-offers-mormon-spectacle-way-off-broadway.html"><img src="http://i1.nyt.com/images/2011/07/14/nyregion/14moth_mormon/14moth_mormon-moth.jpg" alt="A Mormon Spectacle, Way Off Broadway" width="151" height="151" /></a>
</div>
<h6 class="headline"><a href="http://www.nytimes.com/2011/07/14/nyregion/hill-cumorah-pageant-offers-mormon-spectacle-way-off-broadway.html">A Mormon Spectacle, Way Off Broadway</a></h6>
</div>
</td>
<td class="hidden">
<div class="story">
<h6 class="kicker">
<a href="http://www.nytimes.com/pages/garden/index.html">Home & Garden &raquo;</a>
</h6>
<div class="mothImage">
<a href="http://www.nytimes.com/2011/07/14/garden/the-how-tos-of-house-painting-the-pragmatist.html"><span class="img" src="http://i1.nyt.com/images/2011/07/14/garden/14moth_pragmatist/14moth_pragmatist-moth.jpg" alt="The How-Tos of House Painting" width="151" height="151" /></a>
</div>
<h6 class="headline"><a href="http://www.nytimes.com/2011/07/14/garden/the-how-tos-of-house-painting-the-pragmatist.html">The How-Tos of House Painting</a></h6>
</div>
</td>
<td class="hidden">
<div class="story">
<h6 class="kicker">
<a href="http://www.nytimes.com/pages/garden/index.html">Home & Garden &raquo;</a>
</h6>
<div class="mothImage">
<a href="http://www.nytimes.com/2011/07/14/garden/in-the-catskills-building-stone-by-stone-bale-by-bale.html"><span class="img" src="http://i1.nyt.com/images/2011/07/14/garden/14moth_straw/14moth_straw-moth.jpg" alt="Stone By Stone, Bale by Bale" width="151" height="151" /></a>
</div>
<h6 class="headline"><a href="http://www.nytimes.com/2011/07/14/garden/in-the-catskills-building-stone-by-stone-bale-by-bale.html">Stone By Stone, Bale by Bale</a></h6>
</div>
</td>
<td class="hidden">
<div class="story">
<h6 class="kicker">
<a href="http://www.nytimes.com/pages/arts/design/index.html">Art & Design &raquo;</a>
</h6>
<div class="mothImage">
<a href="http://www.nytimes.com/interactive/2011/07/13/arts/design/kimmelman-postcards-da-vinci-last-supper.html?ref=arts"><span class="img" src="http://i1.nyt.com/images/2011/07/14/arts/design/14moth_postcard/14moth_postcard-moth.jpg" alt="15 (Long) Minutes With &lsquo;The Last Supper&rsquo;" width="151" height="151" /></a>
</div>
<h6 class="headline"><a href="http://www.nytimes.com/interactive/2011/07/13/arts/design/kimmelman-postcards-da-vinci-last-supper.html?ref=arts">15 (Long) Minutes With &lsquo;The Last Supper&rsquo;</a></h6>
</div>
</td>
<td class="hidden">
<div class="story">
<h6 class="kicker"><a href="http://www.nytimes.com/pages/opinion/index.html">Opinion &raquo;</a></h6>
<h3><a href="http://opinionator.blogs.nytimes.com/2011/07/13/phantoms-of-the-east-river/">Townies: Phantoms of the East River</a></h3>
<p class="summary">What started out as a birding tour on the East River becomes a journey into New York&rsquo;s dark history.</p>
</div>
</td>
<td class="hidden">
<div class="story">
<h6 class="kicker">
<a href="http://www.nytimes.com/pages/business/index.html">Business &raquo;</a>
</h6>
<div class="mothImage">
<a href="http://www.nytimes.com/2011/07/14/business/smallbusiness/how-a-small-business-can-survive-an-immigration-audit.html"><span class="img" src="http://i1.nyt.com/images/2011/07/14/business/14moth_sbiz/14moth_sbiz-moth.jpg" alt="High Price for an Immigration Audit" width="151" height="151" /></a>
</div>
<h6 class="headline"><a href="http://www.nytimes.com/2011/07/14/business/smallbusiness/how-a-small-business-can-survive-an-immigration-audit.html">High Price for an Immigration Audit</a></h6>
</div>
</td>
<td class="hidden">
<div class="story">
<h6 class="kicker">
<a href="http://www.nytimes.com/pages/opinion/index.html">Opinion &raquo;</a>
</h6>
<div class="mothImage">
<a href="http://www.nytimes.com/roomfordebate/2011/07/13/europes-cities-where-would-hemingway-go-in-2011"><span class="img" src="http://i1.nyt.com/images/2011/07/14/opinion/14moth_rfd/14moth_rfd-moth.jpg" alt="Room for Debate: What&rsquo;s the Best City in Europe?" width="151" height="151" /></a>
</div>
<h6 class="headline"><a href="http://www.nytimes.com/roomfordebate/2011/07/13/europes-cities-where-would-hemingway-go-in-2011">Room for Debate: What&rsquo;s the Best City in Europe?</a></h6>
</div>
</td>
</tr>
</tbody>
</table>
</div>
</div><!-- end #insideNYTimes -->
</div><!--close main -->
<div id="footer">
<ul class="first">
<li class="first"><a href="http://www.nytimes.com">Home</a></li>
<li >
<a href="http://www.nytimes.com/pages/world/index.html">World</a>
</li>
<li >
<a href="http://www.nytimes.com/pages/national/index.html">U.S.</a>
</li>
<li >
<a href="http://www.nytimes.com/pages/nyregion/index.html">N.Y. / Region</a>
</li>
<li >
<a href="http://www.nytimes.com/pages/business/index.html">Business</a>
</li>
<li >
<a href="http://www.nytimes.com/pages/technology/index.html">Technology</a>
</li>
<li >
<a href="http://www.nytimes.com/pages/science/index.html">Science</a>
</li>
<li >
<a href="http://www.nytimes.com/pages/health/index.html">Health</a>
</li>
<li >
<a href="http://www.nytimes.com/pages/sports/index.html">Sports</a>
</li>
<li >
<a href="http://www.nytimes.com/pages/opinion/index.html">Opinion</a>
</li>
<li >
<a href="http://www.nytimes.com/pages/arts/index.html">Arts</a>
</li>
<li >
<a href="http://www.nytimes.com/pages/style/index.html">Style</a>
</li>
<li >
<a href="http://www.nytimes.com/pages/travel/index.html">Travel</a>
</li>
<li >
<a href="http://www.nytimes.com/pages/jobs/index.html">Jobs</a>
</li>
<li >
<a href="http://www.nytimes.com/pages/realestate/index.html">Real Estate</a>
</li>
<li >
<a href="http://www.nytimes.com/pages/automobiles/index.html">Autos</a>
</li>
<li><a href="http://spiderbites.nytimes.com/">Site Map</a></li>
</ul> <ul>
<li class="first"><a href="http://www.nytimes.com/ref/membercenter/help/copyright.html">&copy; 2011</a> <a href="http://www.nytco.com/">The New York Times Company</a></li>
<li><a href="http://www.nytimes.com/privacy">Privacy</a></li>
<li><a href="http://www.nytimes.com/ref/membercenter/help/privacy.html#pp">Your Ad Choices</a></li>
<li><a href="http://www.nytimes.com/ref/membercenter/help/agree.html">Terms of Service</a></li>
<li class="termsOfSale"><a href="http://www.nytimes.com/content/help/rights/sale/terms-of-sale.html">Terms of Sale</a></li>
<li><a href="http://www.nytimes.com/corrections.html">Corrections</a></li>
<li><a class="rssButton" href="http://www.nytimes.com/rss">RSS</a></li>
<li><a href="http://www.nytimes.com/membercenter/sitehelp.html">Help</a></li>
<li><a href="http://www.nytimes.com/ref/membercenter/help/infoservdirectory.html">Contact Us</a></li>
<li><a href="http://www.nytco.com/careers">Work for Us</a></li>
<li><a href="http://www.nytimes.whsites.net/mediakit/">Advertise</a></li>
</ul>
</div>
</div><!--close page -->
</div><!--close shell -->
<IMG SRC="/adx/bin/clientside/73acaa16Q2FN3n(!N4Q5DQ2A!hQ5EQ5DJQ2034!eYQ20Bo3Q2A3DQ22y,yoQ7DleoBQ3C(" height="1" width="3">
</body>
<!-- Start UPT call -->
<img height="1" width="3" border=0 src="http://up.nytimes.com/?d=0/15/&t=2&s=0&ui=0&r=&u=www%2enytimes%2ecom%2f2011%2f07%2f10%2fmagazine%2fthe%2ddark%2dart%2dof%2dbreaking%2dbad%2ehtml%3f%5fr%3d1">
<!-- End UPT call -->
<script language="JavaScript"><!--
var dcsvid="0";
var regstatus="non-registered";
//--></script>
<script src="http://graphics8.nytimes.com/js/app/analytics/trackingTags_v1.1.js" type="text/javascript"></script>
<noscript>
<div><img alt="DCSIMG" id="DCSIMG" width="1" height="1" src="http://wt.o.nytimes.com/dcsym57yw10000s1s8g0boozt_9t1x/njs.gif?dcsuri=/nojavascript&amp;WT.js=No&amp;WT.tv=1.0.7"/></div>
</noscript>
</html>

@ -1,5 +1,6 @@
import unittest
from helpers import load_regression_data
from readability_lxml.readability import Document
from readability_lxml import readability as r
@ -139,3 +140,18 @@ class TestFindBaseUrl(unittest.TestCase):
)
]
self._run_urls(specs)
class TestFindNextPageLink(unittest.TestCase):
    """Checks ``find_next_page_link`` against a saved multi-page article."""

    def test_nytimes(self):
        """The saved New York Times article yields its page-2 link."""
        # Imported locally so the debug line below cannot fail with a
        # NameError when the module's import block does not bring in logging.
        import logging

        # This better work for the New York Times.
        html = load_regression_data('nytimes-next-page.html')
        expected = '/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=2&_r=1'
        doc = r.document_fromstring(html)
        url = 'http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html'
        # Seed the set of parsed URLs with the article's own URL.
        parsed_urls = {url}
        actual = r.find_next_page_link(parsed_urls, url, doc)
        # Lazy %-formatting: the message is only built if DEBUG is enabled.
        logging.debug('next page link: %s', actual)
        # Without this comparison the test passes whatever link was returned.
        self.assertEqual(expected, actual)

@ -2,22 +2,15 @@
import os
import unittest
from helpers import load_sample
from readability_lxml.readability import Document
# Directory of sample pages, resolved relative to this module's location.
SAMPLES = os.path.join(os.path.dirname(__file__), 'samples')

# Sample file names that test_processes iterates over.
sample_list = ['nyt.sample.html', 'si-game.sample.html']
def load_sample(filename):
    """Helper to get the content out of the sample files.

    :param filename: name of a file inside the ``SAMPLES`` directory.
    :returns: the complete contents of that file, as given by ``read()``.
    """
    sample_path = os.path.join(SAMPLES, filename)
    # Use a context manager so the handle is closed as soon as the read
    # finishes instead of being left for the garbage collector.
    with open(sample_path) as sample_file:
        return sample_file.read()
def test_processes():
    """Yield a ``(process_article, sample name)`` pair for every listed sample."""
    for sample_name in sample_list:
        yield (process_article, sample_name)

Loading…
Cancel
Save