Add regression tests for readability results

These test cases provide a baseline from which we can start improving the
readability algorithm and making sure that we do not horribly break anything.

Conflicts:

	src/tests/regression.py
0.3.0.dev
Jerry Charumilind 13 years ago committed by Richard Harding
parent a700bb8bd4
commit 7980ca84c9

@ -0,0 +1,71 @@
import errno
import os
import os.path
import readability
import sys
import test
import urllib2
import yaml
# Prompt shown when a target file already exists; %s is filled with its path.
OVERWRITE_QUESTION = '%s exists; overwrite and continue (y/n)? '
def y_or_n(question):
    """Ask *question* on the console until a non-blank answer is given.

    Returns True when the first non-whitespace character of the answer is
    'y' or 'Y', and False for any other non-blank answer.
    """
    answer = ''
    # Blank (or whitespace-only) replies are ignored and the prompt repeats.
    while not answer:
        answer = raw_input(question).strip()
    return answer[0] in ('y', 'Y')
def write_file(test_name, suffix, data):
    """Write *data* to the test-data file named ``test_name + suffix``.

    The file is first opened with O_EXCL so an existing file is never
    overwritten silently.  If the file already exists, the user is asked
    whether to overwrite it; on a yes the file is truncated and rewritten.

    Returns True when the data was written, and False when the user declined
    to overwrite an existing file.  Any OSError other than EEXIST raised
    while opening the file propagates to the caller.
    """
    path = os.path.join(test.TEST_DATA_PATH, test_name + suffix)
    # 0o644 is the same value as the legacy literal 0644, but is also valid
    # syntax on Python 3.
    mode = 0o644
    try:
        fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_EXCL, mode)
    except OSError as e:
        if e.errno != errno.EEXIST:
            # Bare raise preserves the original traceback.
            raise
        if not y_or_n(OVERWRITE_QUESTION % path):
            return False
        fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, mode)
    # Close the file explicitly so the data is flushed to disk and the
    # descriptor is released before returning.
    with os.fdopen(fd, 'w') as f:
        f.write(data)
    return True
def gen_test(url, test_name, test_description):
    """Generate the files for one regression test case.

    Three files are written, in this order, via write_file():
      1. the YAML spec holding *url* and *test_description*,
      2. the original page content fetched from *url*,
      3. the html of the readability summary of that page.

    Returns True when all three files were written.  Returns False as soon
    as one write_file() call reports failure; later steps (including the
    download) are then not attempted.
    """
    spec = yaml.dump(
        {'url': url, 'test_description': test_description},
        default_flow_style = False
    )
    if write_file(test_name, test.YAML_EXTENSION, spec):
        orig = urllib2.urlopen(url).read()
        if write_file(test_name, test.ORIGINAL_SUFFIX, orig):
            summary = readability.Document(orig).summary()
            if write_file(test_name, test.READABLE_SUFFIX, summary.html):
                return True
    return False
# Usage text; %s is replaced with the program name.
USAGE = '''
usage: %s <url> <test name> <test description>
'''


def usage(prog_name):
    """Print the usage message with *prog_name* as the program name."""
    message = USAGE % prog_name
    print(message)
def main():
    """Command-line entry point: generate one regression test case.

    Expects exactly three arguments after the program name: the URL to
    fetch, the test name, and the test description.  With any other number
    of arguments the usage message is printed and nothing is generated.
    A notice is printed when gen_test() reports that generation stopped
    before completing.
    """
    argv = sys.argv
    if len(argv) == 4:
        url, test_name, test_description = argv[1:]
        if not gen_test(url, test_name, test_description):
            print('test was not fully generated')
    else:
        usage(argv[0])


# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

@ -1,3 +1,14 @@
"""
This module provides a regression test for results of running the readability
algorithm on a variety of different real-world examples. For each page in the
test suite, a benchmark was captured that represents the current readability
results. Note that these are not necessarily ideal results, just the ones used
as a benchmark.
This allows you to tweak and change the readability algorithm and see how it
changes existing results, hopefully for the better.
"""
import lxml.html
import lxml.html.diff
import os
@ -25,11 +36,14 @@ TEST_SUMMARY_PATH = os.path.join(TEST_OUTPUT_PATH, 'index.html')
class ReadabilityTest:
def __init__(self, dir_path, enabled, name, desc, orig_path, rdbl_path):
def __init__(
self, dir_path, enabled, name, desc, notes, orig_path, rdbl_path
):
self.dir_path = dir_path
self.enabled = enabled
self.name = name
self.desc = desc
self.notes = notes
self.orig_path = orig_path
self.rdbl_path = rdbl_path
@ -63,11 +77,16 @@ def make_readability_test(dir_path, name, spec_dict):
enabled = spec_dict['enabled']
else:
enabled = True
if 'notes' in spec_dict:
notes = spec_dict['notes']
else:
notes = ''
return ReadabilityTest(
dir_path,
enabled,
name,
spec_dict['test_description'],
notes,
make_path(dir_path, name, ORIGINAL_SUFFIX),
make_path(dir_path, name, READABLE_SUFFIX)
)
@ -99,8 +118,6 @@ def execute_test(test_data):
else:
doc = readability.Document(test_data.orig_html)
summary = doc.summary()
benchmark_doc = (test_data.rdbl_html, 'benchmark')
result_doc = (summary.html, 'result')
diff = lxml.html.diff.htmldiff(test_data.rdbl_html, summary.html)
return ReadabilityTestResult(test_data, summary.html, diff)
@ -139,12 +156,14 @@ class ResultSummary():
def __init__(self, result):
doc = lxml.html.fragment_fromstring(result.diff_html)
insertions = doc.xpath('//ins')
insertion_lengths = element_string_lengths(insertions)
deletions = doc.xpath('//del')
deletion_lengths = element_string_lengths(deletions)
self.insertions = sum(insertion_lengths)
self.insertion_blocks = len(insertions)
deletions = doc.xpath('//del')
deletion_lengths = element_string_lengths(deletions)
self.deletions = sum(deletion_lengths)
self.deletion_blocks = len(deletions)
pass
@ -169,7 +188,8 @@ def make_summary_row(test, result):
B.A('result', href = output(RESULT_SUFFIX)),
' ',
B.A('diff', href = output(DIFF_SUFFIX))
)
),
B.TD(test.notes)
)
else:
return B.TR(
@ -177,7 +197,8 @@ def make_summary_row(test, result):
B.TD('%s (SKIPPED)' % test.name),
B.TD('N/A'),
B.TD('N/A'),
B.TD('N/A')
B.TD('N/A'),
B.TD(test.notes)
)
@ -187,7 +208,8 @@ def make_summary_doc(tests_w_results):
B.TH('Test Name'),
B.TH('Inserted (in # of blocks)'),
B.TH('Deleted (in # of blocks)'),
B.TH('Links')
B.TH('Links'),
B.TH('Notes')
)
)
for (test, result) in tests_w_results:
@ -253,7 +275,7 @@ del img {
def add_css(doc):
style = B.STYLE(CSS, type = 'text/css')
head = B.HEAD(style)
head = B.HEAD(style, content = 'text/html; charset=utf-8')
doc.insert(0, head)

File diff suppressed because one or more lines are too long

@ -0,0 +1,11 @@
<div id="article"><div class="comment-content" id="comment-content-4e141229cadcbbb33f050000">
<p class="comment-text">
Yep, you gotta love that almost 90% market share failure. Like I said before, if that's failure than sign me up for some of that. I'm pretty sure the good people over at Apple, Google, etc. would like to be signed up for some of that failure too.<br/><br/>
For the, "If this, if that, (insert scenario)" people, enjoy your new OS and whatever other new software you may choose to use. However, don't be surprised when those metro ui interface imitations start to land on those products too. Did you really think that static grid-icons on a screen was going to last forever? I think 20+ years is enough, it's time for new innovation in design and don't be surprised when the copycats jump on board. That's the way the industry works. One group comes up with a new design or concept and the others tend to follow suit and you don't have to be a market leader to get that following. Just ask the Opera/Chrome developers. That's just one of many, many examples that could be pointed out. The metro ui is a very suitable design for the touch screen world that we're migrating to. Sure, there will be changes and enhancements as time goes on and everyone will put their own spin on it, but I'd get used to similar offerings from MSFT's competitors if I were you.<br/><br/>
Also, for those who like to comment, but seem to have little info about what's expected in things like Windows 8, let me fill you in a bit. The info. out right now is that Windows 8 will let you choose to use the new ui or to use the more, "Windows past" icon ui. I think anyone with some modicum of common sense can see how that would be a wise move from MSFT. For instance: The metro ui may not appeal to the corporate world as much as the consumer world. Plus, it give long-time Window's users the option to stick with what they know, but still gain the newest features and security measures that new OS's tend to bring. So, if your going to use another product, but all means, have fun with it, but don't try to justify it to yourself with reasons that are unlikely to exist. Just say you want to move on and anyone else can respect that, but when you seem to have little knowledge of what your options will be, it just makes you look like the typical sheep some people can be.<br/><br/>
Personally, I love the new direction MSFT is going in and for the first time in years, they seem to be thinking more and more consumer friendly. That's not an easy task for a company who has to appeal to business the way MSFT does and I commend the effort. Believe me, or don't, but Apple, Google and any other group would suffer the same balancing act if they dominated the corporate world the way Microsoft does. Corporate and consumers are very different beasts and it's not always easy to appeal to both, yet Microsoft has kept a large following in both sectors and anyone who doesn't see the skill it takes to do that, has a lot to learn my friends. </p>
</div>
</div>

@ -0,0 +1,3 @@
test_description: businessinsider article
notes: missed the article completely; got a long comment instead
url: http://www.businessinsider.com/where-windows-8-came-from-microsoft-ui-ideas-that-never-took-off-2011-7

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -0,0 +1,2 @@
test_description: cnet article
url: http://howto.cnet.com/8301-11310_39-20078249-285/best-free-alternatives-to-top-selling-software/?tag=epicStories

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -0,0 +1,2 @@
test_description: deadspin article
url: http://deadspin.com/5820463/would-you-kill-a-stranger-to-save-football

File diff suppressed because one or more lines are too long

@ -0,0 +1,31 @@
<div id="article"><div class="mod-article-title">
<div class="datehead"><span class="page-actions">
<p id="fb-root"/><p class="date"><span>Updated: </span>July 12, 2011, 4:52 PM ET</p>
</span></div>
<p class="headline">
</p><h1 class="h2">Roger Clemens' defense sets strategy</h1>
</div>
<div><p>
WASHINGTON -- <a href="http://espn.go.com/mlb/player/_/id/1427/roger-clemens">Roger Clemens</a>' attorney revealed Tuesday that the ex-baseball star plans to begin his defense against charges of lying to Congress by questioning if the lawmakers' investigation into whether he used performance-enhancing drugs was proper.</p><p>Clemens attorney Michael Attanasio said in court that the hearing the House Oversight and Government Reform Committee held in February 2008 had nothing to do with Congress' responsibility for legislation. He said the hearing was only concerned with airing a "credibility contest" between Clemens and his longtime trainer, Brian McNamee, who said he injected the pitcher with steroids and human growth hormone.</p><p/><div class="mod-container mod-inline content-box mod-podcast floatright mod-no-header-footer">
<div class="mod-content"><h4>Mike and Mike in the Morning</h4><p class="podcast-player"/>
<p>ESPN legal analyst Roger Cossack explains what is going on with the Roger Clemens trial.</p>
<p class="footer clear"><a href="http://espn.go.com/espnradio/podcast/"> More Podcasts &#187;</a></p></div></div>
<p>Clemens denied those allegations and has been charged with perjury, false statements and obstruction of Congress. The obstruction count charges Clemens with making 15 false or misleading statements to the committee, including his repeated denials he didn't take performance-enhancing drugs during his 24-season career and even whether he attended a 1998 pool party at then-<a href="http://espn.go.com/mlb/team/_/name/tor/toronto-blue-jays">Toronto Blue Jays</a> teammate Jose Canseco's home in Miami.</p><p>McNamee says he saw Clemens and admitted steroids user Canseco talking at the party with another man and that after they returned to Canada, Clemens asked McNamee to inject him with steroids for the first time. </p><p>
Clemens and Canseco say Clemens was never at the party but was golfing at the time. Attanasio said that dispute suggests how improper the whole inquiry was and that jurors should be able to determine whether a "he said, he said debate" between Clemens and McNamee was a legitimate congressional concern.</p><p>"We're going to have a mini-trial on whether Roger Clemens went swimming," Attanasio said. "We're going to have a trial in U.S. District Court, Congress is going to have a hearing on these things? That's our point."</p><p>Assistant U.S. attorney Daniel Butler responded that the committee has responsibility for oversight that is broad and goes beyond legislation. He said steroids in baseball is a drug matter and pointed out that a 2005 hearing into the issue led to legislation to regulate steroids and triggered Major League Baseball to commission a report by former Sen. George Mitchell into the extent of the problem in the league.</p><p/><div class="mod-container mod-no-footer mod-inline content-box floatright mod-no-header-footer">
<div class="mod-content"><h4>Follow the trial</h4>
<img class="io-img" src="http://a.espncdn.com/photo/2010/0116/quinn_tj_m.jpg" border="0"/><p>ESPN's T.J. Quinn will provide live coverage from the courtroom during the Clemens trial. Follow along with our up-to-the-minute <a href="http://twitter.com/#!/TJQuinnESPN" target="_blank"><b>Twitter coverage</b></a>.<br/>
&#8226;&#160; <b><a href="http://espn.go.com/photo/preview/!pdfs/espn_voir_dire_questions.pdf">Voir dire questions</a></b>
</p></div>
</div><p>The Mitchell report was released in December 2007 and named Clemens and 85 other current and former ballplayers as using drugs. Clemens denied the allegations and Butler pointed out that leaders of the House committee said they needed to investigate Clemens' denials to determine what weight to give the Mitchell report and its recommendations.</p><p>Attanasio argued that if the committee's purpose was to come full circle on the Mitchell report, it had done so with a January 2008 hearing featuring testimony by Mitchell, baseball commissioner Bud Selig and former players union director Donald Fehr.</p><p>"That ship had left. That work was done. And now it becomes a question between Mr. Clemens and Mr. McNamee," Attanasio said.</p><p>But U.S. District Judge Reggie Walton said if "one of the icons of baseball" was taking exception to the Mitchell report, "it seems to me that Congress has the authority to hold hearings to determine which view is correct."</p><p>Attanasio said the issue will be addressed in testimony from the first two witnesses prosecutors plan to call after opening arguments Wednesday morning. He said the first will be retired House Parliamentarian Charles Johnson, followed by Phil Barnett, who was chief counsel for the committee at the time it investigated Clemens.</p><p>The dispute over the committee's proper role came as Walton considered what preliminary instructions to give the jury, which was seated Tuesday afternoon after 3&#189; days of screening potential members.</p><p>The jury of 10 women and two men includes a woman whose cousin, former outfielder Al Bumbry, was a coach for the <a href="http://espn.go.com/mlb/team/_/name/bos/boston-red-sox">Boston Red Sox</a> when Clemens played for the team. 
Another woman on the jury said she believes <a href="http://espn.go.com/nfl/team/_/name/phi/philadelphia-eagles">Philadelphia Eagles</a> quarterback <a href="http://sports.espn.go.com/nfl/players/profile?playerId=2549">Michael Vick</a> was "done wrong" in his criminal conviction in connection with dogfighting.</p><p>Four other people were seated as alternate jurors in case any of the 12 can't serve.</p><p>Prosecutors and Clemens' defense team removed 20 people from the pool of 36 jurors, offering no public explanation for their decisions.</p><p>Clemens' attorney pressed potential jurors not to hold it against Clemens if he chooses not to testify, his strongest hint yet that the ex-pitcher might not take the stand.</p><p>Walton also said he was upset to read a New York Daily News item that members of Clemens' family have been criticizing McNamee and other government witnesses on Twitter and elsewhere online. The judge has a gag order on parties involved in the case, but he said he doesn't have any authority over anyone who isn't before him and hopes that those that are were not involved. </p><p>Clemens' attorney Rusty Hardin said he would look into it but that it's been "extremely difficult" for Clemens' family to see harsh criticisms of the baseball star online and in the media and not be able to respond.</p><p><i>Information from The Associated Press was used in this report.</i>
</p>
</div>
</div>

@ -0,0 +1,2 @@
test_description: espn article
url: http://sports.espn.go.com/mlb/news/story?id=6760720

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -0,0 +1,3 @@
test_description: mit news article
notes: links are broken out into paragraph divs
url: http://web.mit.edu/newsoffice/2011/compare-recommendation-systems-0708.html

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -0,0 +1,2 @@
test_description: nytimes article
url: http://thecaucus.blogs.nytimes.com/2011/07/12/mcconnell-proposal-gives-obama-power-to-increase-debt-limit/?hp

@ -0,0 +1,9 @@
test_description: multi-page article from nytimes
enabled: false
notes: multi-page not yet implemented
url: http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html
url_map:
http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=2: nytimes-000-orig-2.html
http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=3: nytimes-000-orig-3.html
http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=4: nytimes-000-orig-4.html
http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=5: nytimes-000-orig-5.html

File diff suppressed because it is too large Load Diff

@ -0,0 +1,6 @@
<div id="article"><article><p>Put another way, Democrats reacted to the &#8220;grand bargain&#8221; proposed by President Obama and House Speaker John Boehner by squawking, complaining and highlighting elements they didn&#8217;t like. This is known throughout the world as the way to begin a process of negotiation.</p><p>Republicans, by contrast, answered with a definitive &#8220;no&#8221; and then covered their ears. Given the looming Aug. 2 deadline for default if the debt ceiling is not raised, the proper term for this approach is blackmail.</p><p>Yet the &#8220;both sides are to blame&#8221; narrative somehow gained currency after <a href="http://www.washingtonpost.com/business/economy/boehner-abandons-efforts-to-reach-comprehensive-debt-reduction-deal/2011/07/09/gIQARUJ55H_story.html">Boehner announced Saturday</a> that House Republicans would not support any increase in revenue, period. A false equivalence was drawn between the absolute Republican rejection of &#8220;revenue-positive&#8221; tax reform and the less-than-absolute Democratic opposition to &#8220;benefit cuts&#8221; in Medicare and Social Security.</p><p>The bogus story line is that the radical right-wing base of the GOP and the radical left-wing base of the Democratic Party are equally to blame for sinking the deal. </p><p>Leave aside, for the moment, the fact that in the Obama-Boehner proposal, there would be roughly three dollars&#8217; worth of budget cuts for every dollar of new revenue. Don&#8217;t pause to ask whether it makes sense to slash government spending when the economy is still sputtering out of the worst recession in decades. Instead, focus narrowly on the politics of the deal.</p><p>It is true that House Minority Leader Nancy Pelosi howled like a blindsided politician when she learned that entitlement programs were on the table. 
But her objections &#8212; and those of Democrats in general &#8212; are philosophical and tactical, not absolute.</p><p>Progressives understand that Medicare and Social Security are not sustainable on their current trajectories; in the long term, both must have their revenue and costs brought into balance. Pelosi&#8217;s position is that each program should be addressed with an eye toward sustainability &#8212; not as a part of a last-minute deal for a hike in the debt ceiling that covers us for two or three years.</p><p>It&#8217;s also true that Democrats believe they can win back a passel of House seats next year by highlighting the GOP plan to convert Medicare into a voucher program. They don&#8217;t want Republicans to be able to point and say, &#8220;See, the Democrats want to cut Medicare, too.&#8221;</p><p>There&#8217;s nothing in these Democratic objections, however, that couldn&#8217;t be creatively finessed. You can claim you haven&#8217;t actually &#8220;cut&#8221; a benefit, for example, if what you&#8217;ve done is restrained the rate at which its cost will grow. You can offset spending with new revenue, and you can do so in a way that gives low-income taxpayers a break. Democrats left the door open and these options could have been explored.</p><p>The story on the Republican side is entirely different. There are ways to finesse a &#8220;no new taxes&#8221; pledge, too. Instead of raising tax rates, you close loopholes in the name of reform; you add an enhancement here, a &#8220;user fee&#8221; there, and you can manage to get the revenue you need and still claim you haven&#8217;t voted to raise taxes.</p><p>But Republicans are taking the position that not a cent of new revenue can be raised, no matter the euphemism. Some Democrats, yes, are being scratchy and cantankerous. But Republicans are refusing to negotiate at all. 
That&#8217;s not the same thing.</p><p>I understand why President Obama, <a href="http://projects.washingtonpost.com/obama-speeches/speech/736/">in his news conference Monday</a>, chided &#8220;each side&#8221; for taking a &#8220;maximalist position.&#8221; For political and practical reasons, it&#8217;s advantageous for him to be seen as an honest broker.</p><p>Meanwhile, though, the clock ticks toward Aug. 2 and the possibility of a catastrophic default becomes more real. And no one should be confused about what the president confronts: On one side, grousing and grumbling. On the other, a brick wall. </p><p>
<i>
<a href="http://live.washingtonpost.com/eugene-robinson-07-12-11.html">Eugene Robinson will be online</a> to chat with readers at 1 p.m. Eastern time Tuesday. <a href="http://live.washingtonpost.com/eugene-robinson-07-12-11.html">Submit your questions</a> before or during the discussion.</i>
</p></article></div>

@ -0,0 +1,2 @@
test_description: washingtonpost.com op-ed
url: http://www.washingtonpost.com/opinions/dont-blame-both-sides-for-debt-impasse/2011/07/11/gIQA0XDg9H_story.html?hpid=z1
Loading…
Cancel
Save