# Patch metadata (kept from the collapsed diff this module was recovered from):
#   diff --git a/regression_test.py b/regression_test.py
#   new file mode 100644
#   index 0000000..b7582c0
#   --- /dev/null
#   +++ b/regression_test.py
#   @@ -0,0 +1,307 @@
"""
This module provides a regression test for results of running the readability
algorithm on a variety of different real-world examples. For each page in the
test suite, a benchmark was captured that represents the current readability
results. Note that these are not necessarily ideal results, just the ones used
as a benchmark.

This allows you to tweak and change the readability algorithm and see how it
changes existing results, hopefully for the better.
"""
import os
import os.path
import re
import sys
import unittest

from lxml.html import builder as B
import lxml.html
import lxml.html.diff
import readability
import yaml

YAML_EXTENSION = '.yaml'
ORIGINAL_SUFFIX = '-orig.html'
READABLE_SUFFIX = '-rdbl.html'
RESULT_SUFFIX = '-result.html'
DIFF_SUFFIX = '-diff.html'

TEST_DATA_PATH = 'regression_test_data'
TEST_OUTPUT_PATH = 'regression_test_output'
TEST_SUMMARY_PATH = os.path.join(TEST_OUTPUT_PATH, 'index.html')

SUMMARY_CSS = '''
table, th, td {
    border: 1px solid black;
    border-collapse: collapse;
    font-family: Georgia, 'Times New Roman', serif;
}
table {
    margin: auto;
}
.skipped {
    color: gray;
}
td, th {
    font-size: 1.2em;
    border: 1px solid black;
    padding: 3px 7px 2px 7px;
}
th {
    font-size: 16px;
    text-align: left;
    padding-top: 5px;
    padding-bottom: 4px;
}
'''

READABILITY_CSS = '''
#article {
    margin: 0 auto;
    max-width: 705px;
    min-width: 225px;
    font-family: Georgia, 'Times New Roman', serif;
    font-size: 19px;
    line-height: 29px;
}

#article p {
    font-size: 19px;
    line-height: 29px;
    margin: 19px 0px 19px 0px;
}

ins {
    background-color: #C6F7C3;
    text-decoration: none;
}

ins img {
    border-width: 3px;
    border-style: dotted;
    border-color: #51B548;
}

del {
    background-color: #F7C3C3;
    text-decoration: none;
}

del img {
    border-width: 3px;
    border-style: dotted;
    border-color: #D12626;
}
'''


class ReadabilityTest:
    """Specification of a single regression test case.

    Holds the directory the test lives in, whether it is enabled, its name,
    description and notes, and the paths of the original page and of the
    benchmark ("readable") page.
    """

    def __init__(
            self, dir_path, enabled, name, desc, notes, orig_path, rdbl_path
            ):
        self.dir_path = dir_path
        self.enabled = enabled
        self.name = name
        self.desc = desc
        self.notes = notes
        self.orig_path = orig_path
        self.rdbl_path = rdbl_path


class ReadabilityTestData:
    """A test specification together with the contents of its two files."""

    def __init__(self, test, orig_html, rdbl_html):
        self.test = test
        self.orig_html = orig_html
        self.rdbl_html = rdbl_html


class ReadabilityTestResult:
    """The outcome of one test: the produced HTML and its diff to the benchmark."""

    def __init__(self, test_data, result_html, diff_html):
        self.test_data = test_data
        self.result_html = result_html
        self.diff_html = diff_html


def read_yaml(path):
    """Parse the YAML file at ``path`` and return the resulting object.

    ``yaml.safe_load`` is used instead of ``yaml.load``: the test specs only
    contain plain scalars and mappings, and a bare ``yaml.load(f)`` both
    allows arbitrary object construction and is rejected by PyYAML >= 6,
    which requires an explicit ``Loader``.
    """
    with open(path, 'r') as f:
        return yaml.safe_load(f)


def make_path(dir_path, name, suffix):
    """Return ``dir_path/<name><suffix>``."""
    return os.path.join(dir_path, ''.join([name, suffix]))


def make_readability_test(dir_path, name, spec_dict):
    """Build a ``ReadabilityTest`` from a parsed YAML spec.

    ``enabled`` defaults to ``True`` and ``notes`` to ``''`` when absent.
    Raises ``KeyError`` if the spec has no ``test_description`` entry.
    """
    return ReadabilityTest(
        dir_path,
        spec_dict.get('enabled', True),
        name,
        spec_dict['test_description'],
        spec_dict.get('notes', ''),
        make_path(dir_path, name, ORIGINAL_SUFFIX),
        make_path(dir_path, name, READABLE_SUFFIX)
        )


def _read_text(path):
    """Return the full contents of ``path``, closing the file afterwards."""
    with open(path, 'r') as f:
        return f.read()


def load_test_data(test):
    """Read the original and benchmark HTML for ``test``.

    Returns ``None`` for a disabled test, so that callers can keep tests and
    their data in parallel lists.
    """
    if not test.enabled:
        return None
    orig = _read_text(test.orig_path)
    rdbl = _read_text(test.rdbl_path)
    return ReadabilityTestData(test, orig, rdbl)


def _strip_yaml_extension(file_name):
    """Return ``file_name`` without a trailing ``YAML_EXTENSION``."""
    # The extension is escaped so that its '.' only matches a literal dot.
    return re.sub(re.escape(YAML_EXTENSION) + '$', '', file_name)


def load_readability_tests(dir_path, files):
    """Create one ``ReadabilityTest`` per YAML spec found among ``files``.

    ``files`` is a list of file names relative to ``dir_path``; entries not
    ending in ``YAML_EXTENSION`` are ignored.  The specs are processed in
    sorted order so that the report does not depend on directory listing
    order.
    """
    yaml_files = sorted(f for f in files if f.endswith(YAML_EXTENSION))
    return [
        make_readability_test(
            dir_path,
            _strip_yaml_extension(f),
            read_yaml(os.path.join(dir_path, f))
            )
        for f in yaml_files
        ]


def execute_test(test_data):
    """Run readability on the original page and diff it against the benchmark.

    Returns ``None`` when ``test_data`` is ``None`` (i.e. a skipped test),
    otherwise a ``ReadabilityTestResult``.
    """
    if test_data is None:
        return None
    doc = readability.Document(test_data.orig_html)
    summary = doc.summary()
    diff = lxml.html.diff.htmldiff(test_data.rdbl_html, summary.html)
    return ReadabilityTestResult(test_data, summary.html, diff)


def element_string_lengths(elems):
    """Return the length of the text content of each element in ``elems``."""
    return [len(e.xpath('string()')) for e in elems]


class ResultSummary:
    """Counts of inserted and deleted text in a test result's diff.

    ``insertions`` / ``deletions`` are total character counts of the text
    inside ``<ins>`` / ``<del>`` elements; ``insertion_blocks`` /
    ``deletion_blocks`` are the numbers of such elements.
    """

    def __init__(self, result):
        # create_parent wraps the diff in a container element, so a diff made
        # of several top-level nodes (or starting with bare text) still parses.
        doc = lxml.html.fragment_fromstring(
            result.diff_html, create_parent=True
            )

        insertions = doc.xpath('//ins')
        self.insertions = sum(element_string_lengths(insertions))
        self.insertion_blocks = len(insertions)

        deletions = doc.xpath('//del')
        self.deletions = sum(element_string_lengths(deletions))
        self.deletion_blocks = len(deletions)


def make_summary_row(test, result):
    """Build the summary table row (a ``<tr>``) for one test.

    Enabled tests get insertion/deletion counts and links to their files;
    disabled tests get a greyed-out "SKIPPED" row.
    """
    def data(suffix):
        # The summary lives in the output directory, so input files are
        # linked relative to it.
        return os.path.join('..', TEST_DATA_PATH, test.name + suffix)

    def output(suffix):
        return test.name + suffix

    if not test.enabled:
        return B.TR(
            B.CLASS('skipped'),
            B.TD('%s (SKIPPED)' % test.name),
            B.TD('N/A'),
            B.TD('N/A'),
            B.TD('N/A'),
            B.TD(test.notes)
            )

    s = ResultSummary(result)
    return B.TR(
        B.TD(test.name),
        B.TD('%d (%d)' % (s.insertions, s.insertion_blocks)),
        B.TD('%d (%d)' % (s.deletions, s.deletion_blocks)),
        B.TD(
            B.A('original', href=data(ORIGINAL_SUFFIX)),
            ' ',
            B.A('benchmark', href=output(READABLE_SUFFIX)),
            ' ',
            B.A('result', href=output(RESULT_SUFFIX)),
            ' ',
            B.A('diff', href=output(DIFF_SUFFIX))
            ),
        B.TD(test.notes)
        )


def make_summary_doc(tests_w_results):
    """Build the summary HTML document from ``(test, result)`` pairs."""
    tbody = B.TBODY(
        B.TR(
            B.TH('Test Name'),
            B.TH('Inserted (in # of blocks)'),
            B.TH('Deleted (in # of blocks)'),
            B.TH('Links'),
            B.TH('Notes')
            )
        )
    for (test, result) in tests_w_results:
        tbody.append(make_summary_row(test, result))
    return B.HTML(
        B.HEAD(
            B.TITLE('Readability Test Summary'),
            B.STYLE(SUMMARY_CSS, type='text/css')
            ),
        B.BODY(
            B.TABLE(
                tbody
                )
            )
        )


def write_summary(path, tests_w_results):
    """Write the summary document for ``tests_w_results`` to ``path``."""
    doc = make_summary_doc(tests_w_results)
    # lxml.html.tostring returns bytes by default, so the file is opened in
    # binary mode; a text-mode file rejects bytes on Python 3.
    with open(path, 'wb') as f:
        f.write(lxml.html.tostring(doc))


def add_css(doc):
    """Insert a ``<head>`` carrying ``READABILITY_CSS`` as first child of ``doc``."""
    style = B.STYLE(READABILITY_CSS, type='text/css')
    # NOTE(review): this sets ``content`` as an attribute of <head>;
    # presumably a <meta http-equiv="Content-Type"> was intended - confirm.
    head = B.HEAD(style, content='text/html; charset=utf-8')
    doc.insert(0, head)


def write_output_fragment(fragment, output_dir_path, test_name, suffix):
    """Wrap ``fragment`` in a styled document and write it to the output dir.

    The file is written to ``output_dir_path/<test_name><suffix>``.
    """
    doc = lxml.html.document_fromstring(fragment)
    add_css(doc)
    html = lxml.html.tostring(doc)
    path = make_path(output_dir_path, test_name, suffix)
    # Binary mode: ``html`` is bytes (see write_summary).
    with open(path, 'wb') as f:
        f.write(html)


def write_result(output_dir_path, result):
    """Write the benchmark, diff and result pages for one test result."""
    test_name = result.test_data.test.name
    specs = [
        (result.test_data.rdbl_html, READABLE_SUFFIX),
        (result.diff_html, DIFF_SUFFIX),
        (result.result_html, RESULT_SUFFIX)
        ]
    for (html, suffix) in specs:
        write_output_fragment(html, output_dir_path, test_name, suffix)


def print_test_info(test):
    """Print a one-line description of ``test``, flagging skipped tests."""
    name_string = '%s' % test.name
    skipped = '' if test.enabled else ' (SKIPPED)'
    print('%20s: %s%s' % (name_string, test.desc, skipped))


def run_readability_tests():
    """Run every test under ``TEST_DATA_PATH`` and write the HTML report.

    Per-test pages and the summary index are written to
    ``TEST_OUTPUT_PATH``, which is created if it does not exist yet.
    """
    files = os.listdir(TEST_DATA_PATH)
    tests = load_readability_tests(TEST_DATA_PATH, files)
    test_datas = [load_test_data(t) for t in tests]
    results = [execute_test(t) for t in test_datas]
    tests_w_results = list(zip(tests, results))

    # Without this, the first write fails on a fresh checkout.
    if not os.path.isdir(TEST_OUTPUT_PATH):
        os.makedirs(TEST_OUTPUT_PATH)

    for (test, result) in tests_w_results:
        print_test_info(test)
        if result is not None:
            write_result(TEST_OUTPUT_PATH, result)
    write_summary(TEST_SUMMARY_PATH, tests_w_results)


def main():
    """Entry point: ``unittest`` as first argument runs unittest instead."""
    if len(sys.argv) > 1 and sys.argv[1] == 'unittest':
        # Drop our own switch so unittest does not try to interpret it.
        del sys.argv[1]
        return unittest.main()
    run_readability_tests()


if __name__ == '__main__':
    main()

# Patch metadata for the next file in the diff (its content continues below):
#   diff --git a/regression_test_data/arstechnica-000-orig.html b/regression_test_data/arstechnica-000-orig.html
#   new file mode 100644
#   index 0000000..1dec754
#   --- /dev/null
#   +++ b/regression_test_data/arstechnica-000-orig.html
#   @@ -0,0 +1,664 @@
#   <title> text: June Web browser stats: Rapid Release edition
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ +
+ +
+
+ +
+ + +
+ +
+ +
+

The Web

+
+ +
+
+

June Web browser stats: Rapid Release edition

+ + +
+ + +
+ + + +
+

June brought the first result of Mozilla's new Rapid Release strategy for Firefox. Firefox 4, just three months old, was superceded by the all-new but not-too-different Firefox 5. Firefox's market growth was all but ended by the release of Chrome, and Mozilla is hoping that by adopting a similar release schedule to Google, it will be able to reignite the growth of its user base.

+ +

Internet Explorer is down 0.59 points at 53.68 percent. Firefox is essentially unchanged, down 0.04 points to 21.67 percent. Chrome is up 0.59 points to 13.11 percent. Safari is also up, gaining 0.2 points to reach 7.48 percent. Opera dropped 0.3 points to 1.73 percent.

+ + +

The trends established over the last few months are continuing: Firefox is treading water, while Internet Explorer is losing users, which seem to be being picked up by Chrome. In the past two months, Opera has dropped 0.41 points—that's a loss representing 20% of its market share. Our own Ryan Paul liked Opera 11.50, which was released just a couple of days ago, so perhaps this will help turn around a perilous slide.

+ +

Looking at individual versions, Internet Explorer 6, 7, and 8 are all down, by 0.18, 0.46, and 1.21 points respectively. Internet Explorer 9 made strong gains, of 1.44 points, but not enough to undo the losses. Internet Explorer 9's gains seem to be occurring at the expense of older versions—Internet Explorer 8 on Windows 7, versions 7 and 8 on Windows Vista—rather than making converts of the other browsers.

+ + + +

Internet Explorer 9 is of course at something of a disadvantage, as it won't run on Windows XP. While we agree with the decision to cut Windows XP off, one consequence is that not a single Internet Explorer 6 user can upgrade to Internet Explorer 9. Nor can anyone using Internet Explorer 7 or 8 on Windows XP. If the focus is narrowed from all users to just those using Windows 7, the Internet Explorer 9 situation looks a little more promising. Though Internet Explorer 8, which ships with Windows 7, commands the highest market share, at 38.47 percent of Windows 7 users, Internet Explorer 9 takes second place, at 15.61 percent—putting it ahead of Firefox 4 and Chrome 12, at 13.74 and 11.60 percent, respectively.

+ +

Internet Explorer 9 seems, therefore, to be performing well among users of Microsoft's latest and greatest operating system; it's just that only 27 percent of the global audience is running that platform. Windows XP still commands a slim majority, with a global share of 51 percent. As Windows XP declines and Windows 7 grows, we can expect to see Internet Explorer 9 lifted by this transition.

+ + + +

Firefox versions 3.5 and 3.6 both saw drops last month, by 2.06 and 0.28 points, respectively, and versions 4 and 5 rose by 0.38 and 2.05 points, respectively. This suggests that the transition from "old" Firefox (3.x) to "modern" Firefox (4 and 5) is slowing down; in May, the 3.x versions dropped by an aggregate of more than 4.5 points, with the then-current Firefox 4 picking up all of those users. This month, only around half as many users made the switch. Though "modern" Firefox versions are now used by a majority of Firefox users, it looks like a hard core of "old" users is going to stick around. Over the next few months, we can expect Firefox 3.5 to decline more heavily, as Mozilla intends to push out a patch that will upgrade users to the newest 3.6 version.

+ + + +

Chrome as ever shows rapid migration between versions. Over the course of June, the browser's stable version went from 11 to 12, and the rapid cutover we've grown to expect occurred. However, that transition isn't complete. 1.39 percent of users are on Chrome 10 or older, and it looks like Google's generally seamless automatic upgrades aren't touching these users. The source of these users isn't clear, though there a few plausible explanations. Obviously, some individuals and corporate users may simply have opted to disable the updates. Automatic updating is the default, but it can be turned off. Though this gives these users and enterprises greater control over the browser version they're using, this comes at some risk; Google doesn't have security updates for old versions of Chrome, so these people are using browsers with known exploitable flaws.

+ +

Chrome's automatic updating is also dependent on a system service. Though the browser can be installed by non-administrators, installation of the service requires administrator privileges. Unlike Firefox, which checks for and performs updates within the browser itself, Chrome depends on its service to do this task. If the service doesn't exist, updates don't happen.

+ +

That's probably not enough to account for every legacy Chrome user, however. To do that, we probably have to look towards the East Asian market. A long-standing feature of various markets in the region, most notably China and South Korea, is the entrenchment of Internet Explorer, variously attributed to legal mandates (especially in South Korea, where until last year a specific ActiveX control was required for online banking) and widespread software piracy making users reluctant to use Windows Update (even though Internet Explorer upgrades are available to pirated copies of the operating system).

+ +

To support this market, a range of browsers based on Internet Explorer's rendering engine, but with substantially greater features, sprung up. The most popular of these are 360 Secure Browser with about 19 percent share of the Chinese market, and Sogou high speed browser, with a little under 6 percent. Though these browsers originally just used the Trident engine that powers Internet Explorer, recent versions extend this by also embedding Chrome. In so doing, they give their users a choice between a relatively modern Chrome browser engine, and the older Internet Explorer engine needed for compatibility. Conceptually, this is very similar to software like Chrome Frame, that allows Internet Explorer users to use Chrome for some browser tabs.

+ +
Sogou browser running as Internet Explorer
+ +

These dual-engine browsers tend to modify Chrome in several ways, one of which is that they exclude Google's automatic update service. They also tend to embed stale versions of Chrome; the current Sogou uses Chrome 6. The result is that users of these browsers, who may well prefer using Chrome for day-to-day browsing, will be stuck with obsolete versions of the browser. And because of the way they're using Chrome, they're out of reach of Google's update system.

+ +
Sogou browser using its embedded Chrome
+ +

The net result of these various usage scenarios is that Chrome's non-upgrading userbase is likely to grow ever larger, with ten percent of Chrome users, and climbing, sticking with versions of the browser that are no longer supported.

+ +
+ +

Ars' audience continues to show marked differences from the Internet's norms. Firefox, Safari, Internet Explorer, and Opera all saw drops, of 0.94, 0.37, 0.04, and 0.10 points respectively; Chrome saw gains of 0.88 points, with the remainder of the difference picked up by "other."

+ + + + +
+ + + + + + +
+ + + + + + + + + +
+ + + + + + + +
+
+ + + +
+ +
+ + + + + + + + + + + + + diff --git a/regression_test_data/arstechnica-000-rdbl.html b/regression_test_data/arstechnica-000-rdbl.html new file mode 100644 index 0000000..e300cbb --- /dev/null +++ b/regression_test_data/arstechnica-000-rdbl.html @@ -0,0 +1,53 @@ +
+

June brought the first result of Mozilla's new Rapid Release strategy for Firefox. Firefox 4, just three months old, was superceded by the all-new but not-too-different Firefox 5. Firefox's market growth was all but ended by the release of Chrome, and Mozilla is hoping that by adopting a similar release schedule to Google, it will be able to reignite the growth of its user base.

+ +

Internet Explorer is down 0.59 points at 53.68 percent. Firefox is essentially unchanged, down 0.04 points to 21.67 percent. Chrome is up 0.59 points to 13.11 percent. Safari is also up, gaining 0.2 points to reach 7.48 percent. Opera dropped 0.3 points to 1.73 percent.

+ +
+

The trends established over the last few months are continuing: Firefox is treading water, while Internet Explorer is losing users, which seem to be being picked up by Chrome. In the past two months, Opera has dropped 0.41 points—that's a loss representing 20% of its market share. Our own Ryan Paul liked Opera 11.50, which was released just a couple of days ago, so perhaps this will help turn around a perilous slide.

+ +

Looking at individual versions, Internet Explorer 6, 7, and 8 are all down, by 0.18, 0.46, and 1.21 points respectively. Internet Explorer 9 made strong gains, of 1.44 points, but not enough to undo the losses. Internet Explorer 9's gains seem to be occurring at the expense of older versions—Internet Explorer 8 on Windows 7, versions 7 and 8 on Windows Vista—rather than making converts of the other browsers.

+ +
+ +

Internet Explorer 9 is of course at something of a disadvantage, as it won't run on Windows XP. While we agree with the decision to cut Windows XP off, one consequence is that not a single Internet Explorer 6 user can upgrade to Internet Explorer 9. Nor can anyone using Internet Explorer 7 or 8 on Windows XP. If the focus is narrowed from all users to just those using Windows 7, the Internet Explorer 9 situation looks a little more promising. Though Internet Explorer 8, which ships with Windows 7, commands the highest market share, at 38.47 percent of Windows 7 users, Internet Explorer 9 takes second place, at 15.61 percent—putting it ahead of Firefox 4 and Chrome 12, at 13.74 and 11.60 percent, respectively.

+ +

Internet Explorer 9 seems, therefore, to be performing well among users of Microsoft's latest and greatest operating system; it's just that only 27 percent of the global audience is running that platform. Windows XP still commands a slim majority, with a global share of 51 percent. As Windows XP declines and Windows 7 grows, we can expect to see Internet Explorer 9 lifted by this transition.

+ +
+ +

Firefox versions 3.5 and 3.6 both saw drops last month, by 2.06 and 0.28 points, respectively, and versions 4 and 5 rose by 0.38 and 2.05 points, respectively. This suggests that the transition from "old" Firefox (3.x) to "modern" Firefox (4 and 5) is slowing down; in May, the 3.x versions dropped by an aggregate of more than 4.5 points, with the then-current Firefox 4 picking up all of those users. This month, only around half as many users made the switch. Though "modern" Firefox versions are now used by a majority of Firefox users, it looks like a hard core of "old" users is going to stick around. Over the next few months, we can expect Firefox 3.5 to decline more heavily, as Mozilla intends to push out a patch that will upgrade users to the newest 3.6 version.

+ +
+ +

Chrome as ever shows rapid migration between versions. Over the course of June, the browser's stable version went from 11 to 12, and the rapid cutover we've grown to expect occurred. However, that transition isn't complete. 1.39 percent of users are on Chrome 10 or older, and it looks like Google's generally seamless automatic upgrades aren't touching these users. The source of these users isn't clear, though there a few plausible explanations. Obviously, some individuals and corporate users may simply have opted to disable the updates. Automatic updating is the default, but it can be turned off. Though this gives these users and enterprises greater control over the browser version they're using, this comes at some risk; Google doesn't have security updates for old versions of Chrome, so these people are using browsers with known exploitable flaws.

+ +

Chrome's automatic updating is also dependent on a system service. Though the browser can be installed by non-administrators, installation of the service requires administrator privileges. Unlike Firefox, which checks for and performs updates within the browser itself, Chrome depends on its service to do this task. If the service doesn't exist, updates don't happen.

+ +

That's probably not enough to account for every legacy Chrome user, however. To do that, we probably have to look towards the East Asian market. A long-standing feature of various markets in the region, most notably China and South Korea, is the entrenchment of Internet Explorer, variously attributed to legal mandates (especially in South Korea, where until last year a specific ActiveX control was required for online banking) and widespread software piracy making users reluctant to use Windows Update (even though Internet Explorer upgrades are available to pirated copies of the operating system).

+ +

To support this market, a range of browsers based on Internet Explorer's rendering engine, but with substantially greater features, sprung up. The most popular of these are 360 Secure Browser with about 19 percent share of the Chinese market, and Sogou high speed browser, with a little under 6 percent. Though these browsers originally just used the Trident engine that powers Internet Explorer, recent versions extend this by also embedding Chrome. In so doing, they give their users a choice between a relatively modern Chrome browser engine, and the older Internet Explorer engine needed for compatibility. Conceptually, this is very similar to software like Chrome Frame, that allows Internet Explorer users to use Chrome for some browser tabs.

+ +

Sogou browser running as Internet Explorer

+ +

These dual-engine browsers tend to modify Chrome in several ways, one of which is that they exclude Google's automatic update service. They also tend to embed stale versions of Chrome; the current Sogou uses Chrome 6. The result is that users of these browsers, who may well prefer using Chrome for day-to-day browsing, will be stuck with obsolete versions of the browser. And because of the way they're using Chrome, they're out of reach of Google's update system.

+ +

Sogou browser using its embedded Chrome

+ +

The net result of these various usage scenarios is that Chrome's non-upgrading userbase is likely to grow ever larger, with ten percent of Chrome users, and climbing, sticking with versions of the browser that are no longer supported.

+ +
+ +

Ars' audience continues to show marked differences from the Internet's norms. Firefox, Safari, Internet Explorer, and Opera all saw drops, of 0.94, 0.37, 0.04, and 0.10 points respectively; Chrome saw gains of 0.88 points, with the remainder of the difference picked up by "other."

+ + + + +
+ + + + + + +
diff --git a/regression_test_data/arstechnica-000.yaml b/regression_test_data/arstechnica-000.yaml new file mode 100644 index 0000000..0b33706 --- /dev/null +++ b/regression_test_data/arstechnica-000.yaml @@ -0,0 +1,2 @@ +test_description: standard article from arstechnica +url: http://arstechnica.com/web/news/2011/07/june-browser-stats-rapid-release-edition.ars diff --git a/regression_test_data/businessinsider-000-orig.html b/regression_test_data/businessinsider-000-orig.html new file mode 100644 index 0000000..be80ec3 --- /dev/null +++ b/regression_test_data/businessinsider-000-orig.html @@ -0,0 +1,3602 @@ + + + + + How 20 Years Of Failed Designs From Microsoft Shaped Its Newest Product + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
+ + + + + + + +
+
+
+
+
+ +
+
+ +
+ Business Insider + Business Insider + + +
+
+ + + + + + +
+
+
+
+
+ +
    +
  • +
  • +
  • + + +
  • + +
+
+
+
+
+ +
+ +
+ +
+ + + + + + + + + +
+
+ +
+
+ + +

How 20 Years Of Failed Designs From Microsoft Shaped Its Newest Product

+ +
+
+ + + + + +
+ +
+
+

Windows 8 start screen

Image: AllThingsD

+

See Also:

+ +
+ + +
Last month, Microsoft showed what the next version of Windows will look like on tablets.

+

Although it was only a canned demo, and Windows 8 won't ship until next year at the earliest, it showed that Microsoft is doing some fresh thinking about interface design.

+

But unless you follow the company closely, you might not realize that Windows 8 descends from a long line of other Microsoft ideas.

+

A lot of these were experimental products. Some never made it to market. None became popular enough to challenge the old "icons on a desktop" model that's been around since the early 1990s, and has spread from the Mac and Windows to the mobile world.

+

But they are all a part of Microsoft's history -- and intellectual property -- and they all contributed to the thinking behind Windows 8.

+

Join us as we take a look back at the Windows 8 family tree.

+ + + +
+
+ + + +
+ + + +

+ Please follow SAI on Twitter and Facebook. +
Follow Matt Rosoff on Twitter. +

+ + + + + +
+
+ +
+ +
+ +

+
+
+ + +
+
+ + Share: +

+ + + + + + + +
+ + + + + +
+
+
+
+
+
+
+ Matt Rosoff is Silicon Alley Insider's West Coast Editor
+ Contact: + +
+
e-mail:
+
+ +
AIM:
+
mattrosoff
+ + + +
+ + Subscribe to his + + RSS feed + + | + + twitter feed +
+
+
+
+ +
+
+ + + +
+ + + + +
+ + +
+

The Water Cooler
+
+
+ Receive email updates on new comments! +
+ +
+ 32 Comments + +

+
+ +
+ + +
+
+
+
+ + + +
+ + + + geez + + + + + on + + Jul 5, 1:12 PM + said: + + + +
+ + +
+ Yeah...20 years of crappy design.
+
+That's a great pedigree.
+
+Nobody cares about MSFT anymore. They are like GM during the 80's. A company focused on market share, not product. We know how that turned out.
+ +
+
+ +
+ Reply +
+
+ +
+ + +
+
+
+
+ + + +
+ + + + freddy bee + + + + + on + + Jul 5, 2:06 PM + said: + + + +
+ + @geez: + +
+ 'we know how that turned out'...
+
+do you mean billions in dividends, billions in the bank, and continued growth?
+ +
+
+ +
+ Reply +
+
+ +
+
+ + + +
+
+
+
+ + + +
+ +
+ + + lxa374 + + + + + on + + Jul 5, 2:32 PM + said: + + + +
+ + @geez: + +
+ Wow. you are so delusional! Just b/c the press gives Windows a bad name, it doesnt mean it's the truth, and you shouldnt just jump on the band-wagon. You end up sounding like an idiot.
+ +
+
+ +
+ Reply +
+
+ +
+
+ + + +
+
+
+
+ + + +
+ +
+ + + jesushatesapple +
+ + What are these? Strikes! Earn three of them in a month, and you'll be sent + to the Penalty Box for 24 hours. How do you earn strikes? Write comments that our editors kick to the Bleachers. + Want to get rid of the strikes and start fresh? Write excellent comments that our editors promote + to the Board Room. + +
+
+ + (URL) + + + on + + Jul 5, 4:51 PM + said: + + + +
+ + @geez: + +
+ Jesus Says: This is a typical comment from an Apple Disciple.
+
+METRO UI, Zune's UI and WP7s GUI, just got even better, Lync with Skype with WP7 will be formidable - and an iPhone and RIM killer rolled into one. The Sanyo Infobar 3 (bing or google iida.jp) is straight out of the METRO UI school or design.
+
+All Apple needs to do now to beat Google, MS, Samsung and HTC is to invent their own social network. That way the Apple People can stop bothering the rest of the Earthlings with their insufferable hubris about how paying $2000 for 18 month old Intel processors in a shiny "unibody" box makes them somehow superior to the rest of mankind.
+ +
+
+ +
+ Reply +
+
+ +
+
+ + + +
+
+
+
+ + + +
+ + + + Gret + + + + + on + + Jul 6, 9:36 PM + said: + + + +
+ + @jesushatesapple: + +
+ "...and an iPhone and RIM killer rolled into one."
+
+Umm... RIM (or BlackBerry) is already dead, at least in the consumer market.
+ +
+
+ +
+ Reply +
+
+ +
+
+ +
+
+
+ + + +
+
+
+
+ + + +
+ +
+ + + radix + + + + + on + + Jul 5, 1:27 PM + said: + + + +
+ + +
+ The real question has nothing to do with the eye candy. Are they going to re-write the kernel and file structure?
+ +
+
+ +
+ Reply +
+
+ +
+
+ + + +
+
+
+
+ + + +
+ + + + Depression + + + + + on + + Jul 5, 1:27 PM + said: + + + +
+ + +
+ 20 years of failure rolled into one EPIC soon to be FAIL .. awsome
+ +
+
+ +
+ Reply +
+
+ +
+
+ + + +
+
+
+
+ + + +
+ + + + that_does_it + + + + + on + + Jul 5, 1:28 PM + said: + + + +
+ + +
+ this ugly tile interface will be what finally drives me away from windows. goodbye and good riddance.
+ +
+
+ +
+ Reply +
+
+ +
+ + +
+
+
+
+ + + +
+ +
+ + + radix + + + + + on + + Jul 5, 1:35 PM + said: + + + +
+ + @that_does_it: + +
+ Actually, you'll be able to choose icons or tiles.
+ +
+
+ +
+ Reply +
+
+ +
+
+ +
+
+
+ + + +
+
+
+
+ + + +
+ + + + Stu + + + + + on + + Jul 5, 1:29 PM + said: + + + +
+ + +
+ I can still remember good old WebTV, Bill Gates pretty much saying the Internet was a fad and dialup MSN was where the future was, and most recently the Zune. Without the OS division they would have gone bust years ago.
+ +
+
+ +
+ Reply +
+
+ +
+ + +
+
+
+
+ + + +
+ + + + Sam + + + + + on + + Jul 5, 2:04 PM + said: + + + +
+ + @Stu: + +
+ I also remember Bill Gates and Steve Jobs sitting in the same room, Bill trying to convince Jobs that tablet was the way to go, but Jobs insisted that people wanted keyboards.
+ +
+
+ +
+ Reply +
+
+ +
+
+ + + +
+
+
+
+ + + +
+ + + + nerdbert + + + + + on + + Jul 5, 3:08 PM + said: + + + +
+ + @Sam: + +
+ Yeah, but doing tablets the way Bill wanted to sucked, or didn't you watch the slideshow of doomed products and demos?
+
+This is one of those cases where Steve was right: keyboards made sense up until the time when computing got cheap enough to make a table the right answer for most consumers.
+ +
+
+ +
+ Reply +
+
+ +
+
+ + + +
+
+
+
+ + + +
+ + + + nerdbert + + + + + on + + Jul 5, 3:21 PM + said: + + + +
+ + @Stu: + +
+ The OS division is the cash cow for the company, and it's the bludgeon that keeps the PC industry at MS's beck and call. But personally I like their dev tools and think that the combo of superior dev tools and workable OS that's the key to third party aps that really make the platform popular.
+
+The problem for MS is that every other industry has seen how MS treats its corporate "allies" and actively looks for alternatives and that makes it hard to break out of the traditional computing environment. WinMob7 isn't half bad but nobody wants to be in MS's thrall anymore. With Android a viable alternative they're more willing to tie their future to Google than MS. It's a case of the stranger you don't know is better than the devil you do.
+ +
+
+ +
+ Reply +
+
+ +
+
+ +
+
+
+ + + +
+
+
+
+ + + +
+ + + + Charlie + + + + + on + + Jul 5, 1:34 PM + said: + + + +
+ + +
+ Yes, let's all "Think Different" with Apple.
+ +
+
+ +
+ Reply +
+
+ +
+ + +
+
+
+
+ + + +
+ + + + freddy bee + + + + + on + + Jul 5, 3:51 PM + said: + + + +
+ + @Charlie: + +
+ so funny... from different thinkers to today's followers...
+ +
+
+ +
+ Reply +
+
+ +
+
+ +
+
+
+ + + +
+
+
+
+ + + +
+ + + + FMS + + + + + on + + Jul 5, 2:01 PM + said: + + + +
+ + +
+ What a great advertising story line: Twenty years of failures brought us to this point. You, look at Zune, Xbox, SPOT, Encarta, bing, UMPC, Slate, WinMo, WinPho7, Vista, etc., etc.
+
+This is like saying that a rogues gallery of crap design is good for new products.
+
+M$H!T is still M$H!T.
+
+Always was and always will be.
+
+The bloated, rotting dungheap is just more obviously bloated and rotting.
+ +
+
+ +
+ Reply +
+
+ +
+ + +
+
+
+
+ + + +
+ + + + Troll Hunter + + + + + on + + Jul 5, 7:21 PM + said: + + + +
+ + @FMS: + +
+ Your bridge is missing you, please return to your home.
+ +
+
+ +
+ Reply +
+
+ +
+
+ +
+
+
+ + + +
+
+
+
+ + + +
+ + + + bobobob + + + + + on + + Jul 5, 2:26 PM + said: + + + +
+ + +
+ Why did you choose a parody picture for the Bob slide? It was stupid enough you had to edit and explain it... youre an idiot.
+ +
+
+ +
+ Reply +
+
+ +
+ + +
+
+
+
+ + + +
+ + + + nerdbert + + + + + on + + Jul 5, 3:10 PM + said: + + + +
+ + @bobobob: + +
+ No, Bob was stupid/bad enough that they had to explain it was a parody and not an actual image. People couldn't have told the difference without the hint.
+ +
+
+ +
+ Reply +
+
+ +
+
+ +
+
+
+ + + +
+
+
+
+ + + +
+ + + + Jungle Jim + + + + + on + + Jul 5, 2:31 PM + said: + + + +
+ + +
+ I can see it now, "the Attack of the Zombie Concepts" coming soon to a PC near you. This is company so exalted that Vista couldn't make the grade. You couldn't make this stuff up !
+ +
+
+ +
+ Reply +
+
+ +
+
+ + + +
+
+
+
+ + + +
+ + + + Stu + + + + + on + + Jul 5, 2:32 PM + said: + + + +
+ + +
+ I love how you guys take one idiot writers opinion as gospel. If dominating the PC market for the last 20 years is your idea of failed design than sign me up for some of that failure. Also, becoming more and more dominant in the console market and at one time being one of the dominators in the mobile market and trying to be so again. Let's also not forget dominating the workspace. I guess we can also mention the incredible developer ecosystem that MSFT has that is unmatched by anyone.
+
+You all enjoy your Linux, Macs, Androids and ios... I have no problem with any of the above, but don't be surprised when MSFT gains in the areas they're currently weak in and continues dominating where they have for a long time. Most intelligent and unbiased people see changes in Microsoft and how they starting to approach the consumer market and can see the good in that for all consumers. Those same people also know those claims of 20 years of failed design are just as ignorant as they are untrue. It's funny that some of the products, ie Zune, you people point out as failures in design, we're some of MSFT's best consumer designed products. You see, a product not selling well doesn't necessarily mean the design was a bad one, that's the way it works out sometimes. If not getting a strong hold on the market = bad design than Apple, Linux, Opera, etc. wouldn't exist right now.
+
+Enjoy your hating. I enjoy seeing all the products, from all the companies and the open source community that I now have to choose from. Hating or loving one over the other won't make any of them better/worse, it will simply give all of us more options. Whichever products I do pick to use, you can bet that I won't feel superior/inferior and I won't find it necessary to hate everything that I don't use or everyone who chooses differently. To each their own. All that being said, don't be surprised to see MSFT staying a major player for a long time to come, in many avenues. You don't have to like or hate them to have common sense.
+ +
+
+ +
+ Reply +
+
+ +
+ + +
+
+
+
+ + + +
+ + + + Max Peck + + + + + on + + Jul 7, 9:56 AM + said: + + + +
+ + @Stu: + +
+ Well said, Stu.
+
+-Max
+ +
+
+ +
+ Reply +
+
+ +
+
+ +
+
+
+ + +
+ Comment flagged as offensive. +
+ + +
+
+
+
+ + + +
+ + + + Gordon + + + + + on + + Jul 5, 3:02 PM + said: + + + +
+ + +
+ Best of Breed -
+Chrome Browser
+Android Phone
+Gmail
+Windows 7
+MS Office
+
+i'll dump Windows and Office if o have to. i already have one foot out the door.
+ +
+
+ +
+ Reply +
+
+ +
+
+ + + +
+
+
+
+ + + +
+ + + + Prince Albert + + + + + on + + Jul 5, 3:24 PM + said: + + + +
+ + +
+ To me, it looks simply ugly.
+ +
+
+ +
+ Reply +
+
+ +
+ + +
+
+
+
+ + + +
+ + + + Max Peck + + + + + on + + Jul 7, 9:58 AM + said: + + + +
+ + @Prince Albert: + +
+ I don't care for Metro, either ... I think it's ugly. However it's only one front-end choice that Windows 8 will have. If and when I upgrade to it I'll just use Aero. No big deal. Everybody is really in a froth over this one!
+ +
+
+ +
+ Reply +
+
+ +
+
+ +
+
+
+ + + +
+
+
+
+ + + +
+ + + + Greg McGavock + + + (URL) + + + on + + Jul 5, 3:33 PM + said: + + + +
+ + +
+ I've heard from a number of sources that say that the Metro theme will be the default on all versions of Windows 8 shipped on desktops and laptops as well. Designing an entire operating system's interface for what will probably amount to less than 1% of the devices it's actually sold on... yeah, that makes a lot of sense.
+ +
+
+ +
+ Reply +
+
+ +
+
+ + + +
+
+
+
+ + + +
+ + + + tablet + + + (URL) + + + on + + Jul 5, 9:44 PM + said: + + + +
+ + +
+ MS's OS are always failures. They alway's walk the apple's way, never ahead of apple's
+ +
+
+ +
+ Reply +
+
+ +
+
+ + + +
+
+
+
+ + + +
+ + + + Stu + + + + + on + + Jul 6, 3:43 AM + said: + + + +
+ + +
+ Yep, you gotta love that almost 90% market share failure. Like I said before, if that's failure than sign me up for some of that. I'm pretty sure the good people over at Apple, Google, etc. would like to be signed up for some of that failure too.
+
+For the, "If this, if that, (insert scenario)" people, enjoy your new OS and whatever other new software you may choose to use. However, don't be surprised when those metro ui interface imitations start to land on those products too. Did you really think that static grid-icons on a screen was going to last forever? I think 20+ years is enough, it's time for new innovation in design and don't be surprised when the copycats jump on board. That's the way the industry works. One group comes up with a new design or concept and the others tend to follow suit and you don't have to be a market leader to get that following. Just ask the Opera/Chrome developers. That's just one of many, many examples that could be pointed out. The metro ui is a very suitable design for the touch screen world that we're migrating to. Sure, there will be changes and enhancements as time goes on and everyone will put their own spin on it, but I'd get used to similar offerings from MSFT's competitors if I were you.
+
+Also, for those who like to comment, but seem to have little info about what's expected in things like Windows 8, let me fill you in a bit. The info. out right now is that Windows 8 will let you choose to use the new ui or to use the more, "Windows past" icon ui. I think anyone with some modicum of common sense can see how that would be a wise move from MSFT. For instance: The metro ui may not appeal to the corporate world as much as the consumer world. Plus, it give long-time Window's users the option to stick with what they know, but still gain the newest features and security measures that new OS's tend to bring. So, if your going to use another product, but all means, have fun with it, but don't try to justify it to yourself with reasons that are unlikely to exist. Just say you want to move on and anyone else can respect that, but when you seem to have little knowledge of what your options will be, it just makes you look like the typical sheep some people can be.
+
+Personally, I love the new direction MSFT is going in and for the first time in years, they seem to be thinking more and more consumer friendly. That's not an easy task for a company who has to appeal to business the way MSFT does and I commend the effort. Believe me, or don't, but Apple, Google and any other group would suffer the same balancing act if they dominated the corporate world the way Microsoft does. Corporate and consumers are very different beasts and it's not always easy to appeal to both, yet Microsoft has kept a large following in both sectors and anyone who doesn't see the skill it takes to do that, has a lot to learn my friends.
+ +
+
+ +
+ Reply +
+
+ +
+ +
+ Comment flagged as offensive. +
+
+
+
+ + + +
+
+
+
+ + + +
+ + + + Conrad S + + + + + on + + Jul 6, 9:23 AM + said: + + + +
+ + +
+ Interesting article, however I feel that it misses three key points:
+
+1) The reason that any (happened to be Apple) mobile application was wildly successful was not the design, but the wide availability of high-speed wireless connectivity. Had that existed in 1991 when the Microsoft Tablet was introduced, the tablet would probably have started the handheld revolution (although others would have jumped on quickly - like Apple did once high-speed wireless connectivity became widespread).
+
+2) Microsoft does not seem to understand the KISS principle (keep it simple stupid). Instead of a long explanation, go to google.com (picture, text box, button), or your iPhone (square piece of glass, button, icons). Microsoft tried to squeeze a full-size version of Windows into a 3x4 inch square. Even Windows phone 7 has excessive graphics and text. Hopefully Windows 8 will be nicknamed Windows KISS.
+
+3) New markets need an innovator, not a maintainer. In 1985, Microsoft incorporated what IBM and Xerox already had to create Windows; IBM and Xerox seemed to be in maintenance mode as opposed to innovation mode and Microsoft took advantage of that. And now, the innovative torch is handed to a new generation with Apple capitalizing on Microsoft's iterations at mobile application design - and market timing (availability of high-speed wireless) - to become the innovator.
+ +
+
+ +
+ Reply +
+
+ +
+
+ + + +
+
+
+
+ + + +
+ + + + TinaD + + + + + on + + Jul 6, 3:19 PM + said: + + + +
+ + +
+ An ugly design but it's work, who need and beautiful design but useless?
+
+http://www.yensao.co
+ +
+
+ +
+ Reply +
+
+ +
+
+ +
+ +
+ + + +
+ + + + + +
+

+ Join the discussion with Business Insider +
+ Login With Facebook + +

+ + +
+
+ + + + + +
+ +
+ + +
+
+ + +
+
+ + +
+ + +
+ + +
+ + + + + +
+ +
+
+
+ +
+ + + + + + +
+ + + + +
+ +
+
+ + + + +
+
+
+ + + + + + + +
+
+
+

Get SAI Emails & Alerts

+ + +

Learn More »

+ + +
+
+
+ + + + +
+

Advertisement

+ + +
+ + + + +
+

+ Your Money +

+
+ +
+
+ + + + + + + + + + +
+ + Get Business Insider Mobile + +
+ + + + + +
+ + + + + +
+ + + + + + + +
+ +
+ + + + + + + + +
+

Thanks to our partners

+
+ Datapipe + OpenX + Catchpoint - Web Performance Monitoring + Ooyala + Ad-Juster + Financial Content +
+
+
+ + + + +
+ + +
+
+ + + + +
+ +
+ +
+ + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/regression_test_data/businessinsider-000-rdbl.html b/regression_test_data/businessinsider-000-rdbl.html new file mode 100644 index 0000000..d5751f9 --- /dev/null +++ b/regression_test_data/businessinsider-000-rdbl.html @@ -0,0 +1,11 @@ +
+ + +

+ Yep, you gotta love that almost 90% market share failure. Like I said before, if that's failure than sign me up for some of that. I'm pretty sure the good people over at Apple, Google, etc. would like to be signed up for some of that failure too.

+For the, "If this, if that, (insert scenario)" people, enjoy your new OS and whatever other new software you may choose to use. However, don't be surprised when those metro ui interface imitations start to land on those products too. Did you really think that static grid-icons on a screen was going to last forever? I think 20+ years is enough, it's time for new innovation in design and don't be surprised when the copycats jump on board. That's the way the industry works. One group comes up with a new design or concept and the others tend to follow suit and you don't have to be a market leader to get that following. Just ask the Opera/Chrome developers. That's just one of many, many examples that could be pointed out. The metro ui is a very suitable design for the touch screen world that we're migrating to. Sure, there will be changes and enhancements as time goes on and everyone will put their own spin on it, but I'd get used to similar offerings from MSFT's competitors if I were you.

+Also, for those who like to comment, but seem to have little info about what's expected in things like Windows 8, let me fill you in a bit. The info. out right now is that Windows 8 will let you choose to use the new ui or to use the more, "Windows past" icon ui. I think anyone with some modicum of common sense can see how that would be a wise move from MSFT. For instance: The metro ui may not appeal to the corporate world as much as the consumer world. Plus, it give long-time Window's users the option to stick with what they know, but still gain the newest features and security measures that new OS's tend to bring. So, if your going to use another product, but all means, have fun with it, but don't try to justify it to yourself with reasons that are unlikely to exist. Just say you want to move on and anyone else can respect that, but when you seem to have little knowledge of what your options will be, it just makes you look like the typical sheep some people can be.

+Personally, I love the new direction MSFT is going in and for the first time in years, they seem to be thinking more and more consumer friendly. That's not an easy task for a company who has to appeal to business the way MSFT does and I commend the effort. Believe me, or don't, but Apple, Google and any other group would suffer the same balancing act if they dominated the corporate world the way Microsoft does. Corporate and consumers are very different beasts and it's not always easy to appeal to both, yet Microsoft has kept a large following in both sectors and anyone who doesn't see the skill it takes to do that, has a lot to learn my friends.

+ +
+
\ No newline at end of file diff --git a/regression_test_data/businessinsider-000.yaml b/regression_test_data/businessinsider-000.yaml new file mode 100644 index 0000000..7485117 --- /dev/null +++ b/regression_test_data/businessinsider-000.yaml @@ -0,0 +1,3 @@ +test_description: businessinsider article +notes: missed the article completely; got a long comment instead +url: http://www.businessinsider.com/where-windows-8-came-from-microsoft-ui-ideas-that-never-took-off-2011-7 diff --git a/regression_test_data/cnet-000-orig.html b/regression_test_data/cnet-000-orig.html new file mode 100644 index 0000000..4ba82e4 --- /dev/null +++ b/regression_test_data/cnet-000-orig.html @@ -0,0 +1,777 @@ + + + Best free alternatives to top-selling software | How To - CNET + + + + + + +
+ + +
What do you think about CNET How To? +
advertisement
+
+ +
July 11, 2011 10:35 AM PDT

Best free alternatives to top-selling software

+

Later this month I'll be canceling my subscription to a leading security suite that runs on two of my home-office PCs. I'll replace it with Microsoft's free Security Essentials, which I've been using on my notebook since I bought it two years ago. I realized several months ago that I simply no longer needed to spend money for the convenience of an all-in-one security app.

That got me thinking: Is there any software that the average PC user needs to pay for? Most of us bought our current operating system--usually Windows or Mac OS X--as part of the purchase of the computer itself. Do-it-yourselfers have Linux as a free-OS alternative.

The programs we use for work, such as +Microsoft Office and specialty apps like Adobe Photoshop or Intuit's Quicken/QuickBooks accounting software, are likely provided by our employer. (People who work from home and/or for themselves have to buy their own software, but they can at least write off the cost of the programs they use in their work.)

What about all those commercial security suites and system utilities? I ran down Amazon's list of the 20 best-selling software titles to find those for which no viable free alternative is available. Granted, my criteria are pretty broad: the freebie has to offer only the basic functionality of the fee-based product and an interface that won't stymie the average user.

Excluding +Mac OS X Snow Leopard (number 8 on the list) and two +Windows 7 Home Premium upgrades (standard and three-user family pack at 10 and 11, respectively), only two titles on Amazon's top 20 have no free equivalent that I'm aware of: Honest Technology's VHS to DVD Deluxe, which tops the Amazon list, and Nuance Communications' Dragon Naturally Speaking, which comes in at number 18.

Here's a rundown of Amazon's top-selling programs and their free counterparts.

Keep Office on the shelf
It's no surprise that Microsoft Office 2010 takes four of the top 20 spots on software-sales list: Office 2010 Home & Student is number 2, Office 2010 Home & Business is 12th, Office for Mac 2011 Home & Student Family Pack is 13th, and Office for Mac 2011 Home & Student 1 Pack comes in 17th.

In September 2009, I described several free alternatives to the Office suite and to the individual apps bundled in Office. My favorites remain OpenOffice.org--despite its bulk--and the Jarte word processor, which is based on the WordPad app that's bundled in Windows.

OpenOffice.org and other alternative suites support the standard Office file formats, although not Office 2007/2010 XML file types (.docx, .xlsx, and .pptx). An advantage for many people is the programs' use of the old-style menus rather than the Office ribbons. Personally, I'm accustomed to the ribbon look and have no problem switching between the new and old interfaces.

Since the introduction of the free Google Cloud Connect add-on for Office earlier this year, I've come to depend on the ability to sync Word documents, Excel spreadsheets, and PowerPoint presentations with Google Docs automatically. There's no version of Cloud Connect for OpenOffice.org, but a rough equivalent is to use the free DropBox service, which lets you save up to 2GB of data online (pay versions support up to 100GB for $20 a month of $200 a year).

The service adds a DropBox folder to your PC that you access in Windows Explorer just like any other folder. It's easy to share whole folders or individual files by sending people links via e-mail. The files are accessible from any Internet-connected device, including iPads and smart phones, using an Explorer-like directory.

+DropBox file list +

The free DropBox service lets you access and share files easily from any Web-enabled device (the PC interface is shown).

+(Credit: +screenshot by Dennis O'Reilly/CNET) +

Using the default OpenOffice.org file formats can cause problems when you share files with people who don't use the suite, so it's safest to stick with the more-universal .doc, .xls, and .ppt formats when creating files in OpenOffice.org or other Office alternatives. Documents, spreadsheets, and presentations created in OpenOffice.org that you save as Office files work without a hitch in Word, Excel, and PowerPoint. Only your accountant will know the difference.

Take the freeware approach to security
Security programs take 6 of the top 20 spots on Amazon's software-sales list: Norton 360 (No. 3), Kaspersky Internet Security (No. 6), Norton Internet Security 1 user-3 PCs (No. 9), Norton AntiVirus (No. 14), McAfee Total Protection (No. 15), and Norton Internet Security for a singe PC (No. 19).

As I mentioned above, I will soon replace the commercial security suite I've been using on the PCs in my home office with Microsoft's free alternative, Security Essentials. Vendors of commercial security apps are quick to point out the many other benefits their products provide, including backups and other system-maintenance tools. There may be a convenience benefit in taking the all-in-one approach, but the fact is, you can keep your PC safe and running smoothly without spending a penny for extra software.

Getting by with free maintenance tools
Last fall I wrote about a commercial utility program I thought was worth its $40 price. Several readers commented that the software caused them more problems than it solved, and since then I've heard from one reader who blames the program for wiping out her laptop PC entirely.

That experience helped convince me that most PC users have no need to pay for any application or online service that promises to fix their machine or improve its performance. It was heartening to see that no special-purpose system utility made Amazon's list of the 20 best-selling titles. That's not to say the tools that come with Windows are necessarily best of breed.

In particular, I prefer the free Easeus Todo Backup to the backup utility built into Windows, as I explained in March 2010. Back in February 2009 I compared several free Windows system tools, including the popular CCleaner. And in April 2010, I compared CCleaner with the free version of IObit's Advanced SystemCare.

Other free alternatives to popular commercial apps
You can't expect a free program to provide the range of features and functionality offered by such programs as Adobe Photoshop and QuickBooks Pro. But if you can get by with less, you may find freeware that meets your needs--with the two noteworthy exceptions I mentioned above (VHS TO DVD Deluxe and Dragon Naturally Speaking).

For example, No. 4 on Amazon's software-sales list is Adobe'sPhotoshop Elements image-processing application. Yet the free, open-source GIMP image editor provides all the photo-editing and touch-up tools many amateur photographers require.

The same may not be true for the $299 Adobe Photoshop Lightroom, which is No. 7 on the Amazon software-sales list. Professional photographers and graphic artists use Lightroom to finish, organize, and manage their images. Many of Lightroom's most powerful and useful tools simply aren't available in GIMP or any other freebie.

Similarly, people who keep a business' books may require the professional accounting tools in QuickBooks Pro (No. 5 on the Amazon sales list) and Quicken Deluxe (No. 20).

If your accounting needs are simpler, Express Accounts Free Accounting Software from NCH Software may accommodate your bookkeeping requirements. Note that I haven't tried the program, and the vendor offers the free version to entice you into upgrading to the Pro version as well as to the company's other commercial apps. As many users of "free" software have found, you often pay a price for using the programs that isn't obvious from the outset.

I was surprised to see Parallels Software's Parallels Desktop for Mac at No. 16 on the list. I always assumed most Mac users who want to run Windows or Linux on their systems would use Apple's free Boot Camp utility. But Parallels Desktop lets you switch between OSes without having to reboot, according to CNET editor Jason Parker's review.

Jason points out several other useful Parallels features: the program automatically recognizes file types associated with a particular OS and switches to that system when you open the file. And gamers will appreciate the enhanced video playback of Windows games run on Macs under Parallels. The program also lets you manage Windows apps via Mac OS's Spaces, Expose, and other features, according to Jason.

Sometimes, there's just no substitute for the convenience and functionality of commercial software. But more and more frequently PC users can find a free alternative to programs they once paid for. The next time you go shopping for software, consider whether you can save some dough by going the freeware route.

E-mail Dennis O'Reilly

If you have a question or comment for Dennis O'Reilly, you can submit it here. However, because our editors and writers receive hundreds of requests, we cannot tell you when you may receive a response.

Submit your question or comment here: 0 of 1500 characters

Dennis O'Reilly has covered PCs and other technologies in print and online since 1985. Along with more than a decade as editor for Ziff-Davis's Computer Select, Dennis edited PC World's award-winning Here's How section for more than seven years. He is a member of the CNET blog Network, and is not an employee of CNET.

+ + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + +

Comments

Add a Comment (Log in or register) (25 Comments)
  • prev
  • next
by SustainedHavoc (108 comments ) July 11, 2011 11:10 AM PDT
Mentioning 'nagware' without actually mentioning 'nagware'? There's TOO many quality open source programs to ever have to deal with 'free-but-not-really' software again. Kind of expected to see a mention of Sourceforge or eConsultant, with the usual YMMV warning.
I've used MS Security Essentials since I got 'converted' from MS Live One Care before its' passing, and have been pretty satisfied. Add and run Spybot or Malwarebytes for a little more spyware detection, and have been completely trouble free.
Reply to this comment
by Chrisboombastic (15 comments ) July 11, 2011 1:52 PM PDT
Yap me to MS Security Essentials makes my pc runs faster then when I ran Norton 360 or McAfee that I was getting free from Comcast.
+
by frankwick (389 comments ) July 11, 2011 11:13 AM PDT
+1 for MIcrosoft Security Essentials. It's fast and efficient and you won't even know it is there.
Reply to this comment 5 people like this comment
by pokeredfaced (14 comments ) July 11, 2011 4:14 PM PDT
Definitely, before I was used to spend money in using Symantec and McAfee's security suites but only to found that they are nothing but resource hog bloatwares where in a scan would consume 250000 K memory...unlike MSE which runs quitely and a scan would only consume 10,000K memory...the only thing I didn't like about mse is that its updates aren't as automatic as others but still the best AV i had ever used
2 people like this comment
+
by solitare_pax (5362 comments ) July 11, 2011 11:14 AM PDT
Of note, Openoffice is available for both Windows and Mac; and features a basic draw program in addition to word processing and spreadsheet functions. +
+
Inkscape offers a free vector-design program, and Scribus offers a free desktop Publishing program; both are good for light-duty work I will admit, and again, they are available in Windows & Mac versions - and Scribus has a Unix version as well I believe.
Reply to this comment 1 person likes this comment
by Indian_art (50 comments ) July 11, 2011 9:37 PM PDT
Don't forget LibreOffice, its:
* Great productivity software
* Cross Platform (Windows, Mac & Linux)
* Free
* Supported in many languages
* Helpful & passionate community support worlldwide
* Supported by Giants like Google
* Great free extensions like:
http://code.google.com/p/ooo2gd/downloads/detail?name=ooo2gd_3.0.0.oxt&can=2&q=
that let you quickly & easily upload to Google Docs
3 people like this comment
+
by genghizkhan91 (3 comments ) July 12, 2011 7:19 AM PDT
If I am to understand it correctly, Libreoffice is preferred over Openoffice at this point of time. They're cleaning Openoffice's code right now.
1 person likes this comment
+
by Yelonde (2172 comments ) July 12, 2011 9:44 AM PDT
The reason LibreOffice is "preferred" over openoffice is due to philosophical reasons more than reasons of functionality. Average users really don't care about Oracle's acquisition (and subsequent release) of the software, and they probably have no idea, and have never heard of the Document Foundation.

That said, LibreOffice is great, and it has replaced openoffice simply because I dislike Oracle's practices.
1 person likes this comment
+
by coldReactive (483 comments ) July 11, 2011 11:18 AM PDT
I wish there was a free flash editor that could bend lines like flash does. There is an AJAX flash editor, but it can't bend lines.
Reply to this comment 1 person likes this comment
by coldReactive (483 comments ) July 11, 2011 11:19 AM PDT
And Inkscape can't bend lines the same as flash does (the entire line at once, no matter where you bend from.)
1 person likes this comment
+
by dustinsc (111 comments ) July 11, 2011 11:23 AM PDT
Parallels has another good free alternative in Virtualbox. It's not super speedy or convenient, but for simple tasks you need to use Windows for, Virtualbox is fine and doesn't require you to reboot your entire computer.
Reply to this comment
by jtjenkins213 (7 comments ) July 11, 2011 11:27 AM PDT
Be weary of where you install some of this software - I know for a fact that you cannot install MS Security Essentials in a classroom/education environment as it is against the EULA. Same goes for most "free" anti-virus solutions on the net. The EULA specifically states that it is for individual use only.
So "free but not really" doesn't apply to just nagware.
Reply to this comment
by SustainedHavoc (108 comments ) July 11, 2011 11:38 AM PDT
While true, educational/volume pricing is almost free anyway, IF your procurement department knows what they're doing.
1 person likes this comment
+
by ckm5 (24 comments ) July 11, 2011 11:40 AM PDT
Parellels can be replaced with Virtual Box. It does roughly the same thing, e.g. running a OS in a window, although it has less integration with the OSX UI than Parallels.

The open alternative to Quickbooks is probably either SQL Ledger or GnuCash. Of course, if you have any sense, you'll just use an online service like Xero or Freshbooks. Neither are free, but considering the criticality of a company's books, it's probably worth paying for.

GIMP is probably overkill for most users, and Picassa has lots of image management tools plus library management. If you use Linux, F-Spot is a good alternative. For an alternative to LightRoom, check out Blue Marine.

Finally, for anti-virus software, there is the cross-platform ClamAV.
Reply to this comment
by zyxxy (1774 comments ) July 11, 2011 11:53 AM PDT
If you are going to downgrade all the way from GIMP to Picassa, Windows Live Photo does much of what Picassa does, and just works better for me, particularly when emailing photos so someone or uploading for printing. It is also free, though Windows only.

For $30 once and never again, pogoplug software on your PC is almost like magic.
+
by danjferguson (5 comments ) July 11, 2011 12:36 PM PDT
If you got a Mac, it has all the music and photo creation tools already installed. The iCloud service will eliminate the need for programs such as Drop Box, which will seamlessly integrate the iCloud into the OSX experience. The only thing I've added to my Mac was Microsoft Office, which has unparalleled usability compared to Open Office. Lucky for me I won a free copy from my job before I had to shell out money for it. The Apple published tools are ok, but Microsoft is the premier office application builder. The Ribbon was the best idea ever, and they continue to amaze me with just how refined their applications are. Also, you don't need security or utility software on a Mac, so there is no need to even worry about all that.. The only other things I've added are chat programs, Skype and Google Chrome/Firefox for those pesky pages that just don't like Safari.
Reply to this comment
by WinOSXBuntu (557 comments ) July 12, 2011 6:32 AM PDT
True that, but iCloud is OS X only, if you want to use cloud storage with Mac, Windows and Linux then Dropbox all the way (Although I've recently switched over to LaCie's Wuala service with Dropbox changing its ToS). I would quite happily like to see some of the apps I use on Windows available on OS X at the least. In the last at the moment: +
+
" Internet Explorer 9 - I like Chrome however, IE9 on OS X would be just perfect for me. +
" Digsby - It only seems to have a Windows version available and it is a brilliant utility to help me keep track of Facebook messages and Gmail messages. +
" Expression Web 4 - I absolutely hate Adobe's bug ridden software and have boycotted them and if I could see this utility in particular on OS X, I would be able to move back and forward between my Windows and OS X computers with ease. +
" Paint.NET - Hell of a utility, and I'm positive it might be able to run on OS X using Mono however I've never tried it. +
+
There is of course a few other minor applications like Visual Studio 2010, PeerBlock and WinRAR which I'd also like to see on OS X, plus having avast! 6's web blocking feature would be pretty useful to keep those ads at bay regardless of which browser I use. +
+
If those applications could come to OS X, I would sorted for life really.
1 person likes this comment
+
by ldtiry (76 comments ) July 11, 2011 12:38 PM PDT
I'm pretty sure OpenOffice and LibreOffice both support the Microsoft OFFICE XML file formats. I have pushed my coworkers to use the LibreOffice instead of our older Office suite because of this fact.
Reply to this comment 1 person likes this comment
by graham.tapper (2 comments ) July 11, 2011 2:21 PM PDT
As far as freeware security software is concerned, in my opinion the combination of Avast! Antivirus and Malwarebytes' AntiMalware gives me reliable security. I've been using them for years without a single problem. For reliable, easy to use and flexible backup I rely on ASCOMP's Synchredible and wouldn't swap it for anything else. And, like ckm5, I'm a big fan of Oracle's free virtualisation - VirtualBox - which I use to test out new operating systems such as Ubuntu Linux, and to provide a totally secure, sandboxed environment for anything that might possibly have security issues. You can reboot to a secure system image every time. No need for VMWare!
Reply to this comment 1 person likes this comment
by Emprovision (145 comments ) July 11, 2011 2:28 PM PDT
Personally, I don't mind paying for Windows and Office, but I've switched entirely to free antivirus-- both Microsoft Security Essentials and AVG have both served me well. As for image processing, I can get away with free software for my simple needs.
Reply to this comment 2 people like this comment
by Vegasexcitement (3 comments ) July 11, 2011 8:23 PM PDT
I use quality and virus checked Freeware on four websites and all my forums. Like with all things, I check it first for function and viruses. I have yet to be burned. I may get it next Monday :), but over the years I have had not a single problem. I always check it for problems.
Anyone that would pay so much money for expensive programs when a fully functional option is available is just nuts. Freeware is good, I always look for a fully functional freeware option.
Those that disagree, ....send me 600 dollars via PP. hahahaha

Don, in Vegas
1 person likes this comment
+
by Silmarunya (173 comments ) July 12, 2011 8:27 AM PDT
It's probably been said here before, but why on earth recommend OpenOffice rather than LibreOffice? Most OO.org devs quit the project and moved to LibreOffice, which is now the 'main' fork. +
+
Apart from the faster update cycle and more active support, LibreOffice also has some marked improvements over OO.org. First of all, the code is optimized so that it is smaller and faster than OO.org (well, optimized... pruned is a better term really). Second, handling of Office 2007/2010 files is vastly improved (oh yes, where on earth does the writer of this article get the idea from that docx/pptx/xlsx files aren't supported?)
Reply to this comment 1 person likes this comment
by satcomer (188 comments ) July 12, 2011 8:54 AM PDT
Why no love for the free Mac virtualization program VirtualBox? This free program is well worth a look.
Reply to this comment
by Silmarunya (173 comments ) July 12, 2011 11:06 AM PDT
It's not for Mac alone: it works on Linux and Windows as well (actually, if I recall correctly, Mac support was the last to be added). But indeed, a great piece of software that can compete with the closed source but also free VMWare Player easily.
+
by m_chan1 (103 comments ) July 12, 2011 10:55 AM PDT
Thank you for the suggestions!

TOO many people (general public) think that MS Office should be the "norm" [think: BUSINESSES] "just because everyone has it and uses it"!!
Um... IF everyone jumps off a bridge, would YOU jump off a bridge?? Sound familiar?!
It's THIS type of thinking that MS Office that it's dominant in many places!

I still use WordPerfect as it's SO much easier to use than MS Word (despite how reviews from Cnet and other PC mags give it less than stellar reviews >:( ) though I like Excel but am looking for alternatives. OpenOffice is OK but nothing special.
Reply to this comment
(25 Comments)
  • prev
  • next
 
advertisement
Iframe/JavaScript: +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

Ask a Question

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

Share a tip, question, or comment below:

+ +
+
+ + + + + + + + + + + +
+ + + + + + + + + + +
+ +
+ + + + + + +
+ +
+ +
+ +
+

Sorry, there was a problem submitting your post. Please try again.

+

Sorry, there was a problem generating the preview. Please try again.

+

Duplicate posts are not allowed in the forums. Please edit your post and submit again.

+ Submit + Preview + Cancel +
+ +
+

Thank you, , your post has been submitted and will appear on our site shortly.

+
+ +
+

Thank you, , your post has been submitted.

+ + > Click here to view your post. + > Manage your tracked discussions. + > Track this discussion. + +
+
+
+ + + + + + + + + + + + + + + + + + + +
+

Recent Questions

+

See all

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

How To topics

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

Saved Items

+

See all

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + + + + + + + + + + + + + + + + + + +
  • +

    Please log in or sign up to start saving items.

    +
  • + + + + + + +
+ +
+
+
+ + + + + + + +
+
+
+ + \ No newline at end of file diff --git a/regression_test_data/cnet-000-rdbl.html b/regression_test_data/cnet-000-rdbl.html new file mode 100644 index 0000000..1d3e15f --- /dev/null +++ b/regression_test_data/cnet-000-rdbl.html @@ -0,0 +1,9 @@ +
+

Later this month I'll be canceling my subscription to a leading security suite that runs on two of my home-office PCs. I'll replace it with Microsoft's free Security Essentials, which I've been using on my notebook since I bought it two years ago. I realized several months ago that I simply no longer needed to spend money for the convenience of an all-in-one security app.

That got me thinking: Is there any software that the average PC user needs to pay for? Most of us bought our current operating system--usually Windows or Mac OS X--as part of the purchase of the computer itself. Do-it-yourselfers have Linux as a free-OS alternative.

The programs we use for work, such as +Microsoft Office and specialty apps like Adobe Photoshop or Intuit's Quicken/QuickBooks accounting software, are likely provided by our employer. (People who work from home and/or for themselves have to buy their own software, but they can at least write off the cost of the programs they use in their work.)

What about all those commercial security suites and system utilities? I ran down Amazon's list of the 20 best-selling software titles to find those for which no viable free alternative is available. Granted, my criteria are pretty broad: the freebie has to offer only the basic functionality of the fee-based product and an interface that won't stymie the average user.

Excluding +Mac OS X Snow Leopard (number 8 on the list) and two +Windows 7 Home Premium upgrades (standard and three-user family pack at 10 and 11, respectively), only two titles on Amazon's top 20 have no free equivalent that I'm aware of: Honest Technology's VHS to DVD Deluxe, which tops the Amazon list, and Nuance Communications' Dragon Naturally Speaking, which comes in at number 18.

Here's a rundown of Amazon's top-selling programs and their free counterparts.

Keep Office on the shelf
It's no surprise that Microsoft Office 2010 takes four of the top 20 spots on software-sales list: Office 2010 Home & Student is number 2, Office 2010 Home & Business is 12th, Office for Mac 2011 Home & Student Family Pack is 13th, and Office for Mac 2011 Home & Student 1 Pack comes in 17th.

In September 2009, I described several free alternatives to the Office suite and to the individual apps bundled in Office. My favorites remain OpenOffice.org--despite its bulk--and the Jarte word processor, which is based on the WordPad app that's bundled in Windows.

OpenOffice.org and other alternative suites support the standard Office file formats, although not Office 2007/2010 XML file types (.docx, .xlsx, and .pptx). An advantage for many people is the programs' use of the old-style menus rather than the Office ribbons. Personally, I'm accustomed to the ribbon look and have no problem switching between the new and old interfaces.

Since the introduction of the free Google Cloud Connect add-on for Office earlier this year, I've come to depend on the ability to sync Word documents, Excel spreadsheets, and PowerPoint presentations with Google Docs automatically. There's no version of Cloud Connect for OpenOffice.org, but a rough equivalent is to use the free DropBox service, which lets you save up to 2GB of data online (pay versions support up to 100GB for $20 a month or $200 a year).

The service adds a DropBox folder to your PC that you access in Windows Explorer just like any other folder. It's easy to share whole folders or individual files by sending people links via e-mail. The files are accessible from any Internet-connected device, including iPads and smart phones, using an Explorer-like directory.

+DropBox file list

The free DropBox service lets you access and share files easily from any Web-enabled device (the PC interface is shown).

+(Credit: +screenshot by Dennis O'Reilly/CNET) +

Using the default OpenOffice.org file formats can cause problems when you share files with people who don't use the suite, so it's safest to stick with the more-universal .doc, .xls, and .ppt formats when creating files in OpenOffice.org or other Office alternatives. Documents, spreadsheets, and presentations created in OpenOffice.org that you save as Office files work without a hitch in Word, Excel, and PowerPoint. Only your accountant will know the difference.

Take the freeware approach to security
Security programs take 6 of the top 20 spots on Amazon's software-sales list: Norton 360 (No. 3), Kaspersky Internet Security (No. 6), Norton Internet Security 1 user-3 PCs (No. 9), Norton AntiVirus (No. 14), McAfee Total Protection (No. 15), and Norton Internet Security for a single PC (No. 19).

As I mentioned above, I will soon replace the commercial security suite I've been using on the PCs in my home office with Microsoft's free alternative, Security Essentials. Vendors of commercial security apps are quick to point out the many other benefits their products provide, including backups and other system-maintenance tools. There may be a convenience benefit in taking the all-in-one approach, but the fact is, you can keep your PC safe and running smoothly without spending a penny for extra software.

Getting by with free maintenance tools
Last fall I wrote about a commercial utility program I thought was worth its $40 price. Several readers commented that the software caused them more problems than it solved, and since then I've heard from one reader who blames the program for wiping out her laptop PC entirely.

That experience helped convince me that most PC users have no need to pay for any application or online service that promises to fix their machine or improve its performance. It was heartening to see that no special-purpose system utility made Amazon's list of the 20 best-selling titles. That's not to say the tools that come with Windows are necessarily best of breed.

In particular, I prefer the free Easeus Todo Backup to the backup utility built into Windows, as I explained in March 2010. Back in February 2009 I compared several free Windows system tools, including the popular CCleaner. And in April 2010, I compared CCleaner with the free version of IObit's Advanced SystemCare.

Other free alternatives to popular commercial apps
You can't expect a free program to provide the range of features and functionality offered by such programs as Adobe Photoshop and QuickBooks Pro. But if you can get by with less, you may find freeware that meets your needs--with the two noteworthy exceptions I mentioned above (VHS TO DVD Deluxe and Dragon Naturally Speaking).

For example, No. 4 on Amazon's software-sales list is Adobe's Photoshop Elements image-processing application. Yet the free, open-source GIMP image editor provides all the photo-editing and touch-up tools many amateur photographers require.

The same may not be true for the $299 Adobe Photoshop Lightroom, which is No. 7 on the Amazon software-sales list. Professional photographers and graphic artists use Lightroom to finish, organize, and manage their images. Many of Lightroom's most powerful and useful tools simply aren't available in GIMP or any other freebie.

Similarly, people who keep a business' books may require the professional accounting tools in QuickBooks Pro (No. 5 on the Amazon sales list) and Quicken Deluxe (No. 20).

If your accounting needs are simpler, Express Accounts Free Accounting Software from NCH Software may accommodate your bookkeeping requirements. Note that I haven't tried the program, and the vendor offers the free version to entice you into upgrading to the Pro version as well as to the company's other commercial apps. As many users of "free" software have found, you often pay a price for using the programs that isn't obvious from the outset.

I was surprised to see Parallels Software's Parallels Desktop for Mac at No. 16 on the list. I always assumed most Mac users who want to run Windows or Linux on their systems would use Apple's free Boot Camp utility. But Parallels Desktop lets you switch between OSes without having to reboot, according to CNET editor Jason Parker's review.

Jason points out several other useful Parallels features: the program automatically recognizes file types associated with a particular OS and switches to that system when you open the file. And gamers will appreciate the enhanced video playback of Windows games run on Macs under Parallels. The program also lets you manage Windows apps via Mac OS's Spaces, Expose, and other features, according to Jason.

Sometimes, there's just no substitute for the convenience and functionality of commercial software. But more and more frequently PC users can find a free alternative to programs they once paid for. The next time you go shopping for software, consider whether you can save some dough by going the freeware route.

\ No newline at end of file diff --git a/regression_test_data/cnet-000.yaml b/regression_test_data/cnet-000.yaml new file mode 100644 index 0000000..d49ff45 --- /dev/null +++ b/regression_test_data/cnet-000.yaml @@ -0,0 +1,2 @@ +test_description: cnet article +url: http://howto.cnet.com/8301-11310_39-20078249-285/best-free-alternatives-to-top-selling-software/?tag=epicStories diff --git a/regression_test_data/deadspin-000-orig.html b/regression_test_data/deadspin-000-orig.html new file mode 100644 index 0000000..ddd3c2d --- /dev/null +++ b/regression_test_data/deadspin-000-orig.html @@ -0,0 +1,1011 @@ + + + + + + Would You Kill A Stranger To Save Football? + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ Newer Stories + + + + More Stories + + + + +
+ + +
+ +
+ + + +
+
+ + +
+ +
+ + + + + diff --git a/regression_test_data/deadspin-000-rdbl.html b/regression_test_data/deadspin-000-rdbl.html new file mode 100644 index 0000000..58ece10 --- /dev/null +++ b/regression_test_data/deadspin-000-rdbl.html @@ -0,0 +1,9 @@ +
+

+ + Avatar for Drew Magary + + —Would You Kill A Stranger To Save Football?Time for your weekly edition of the Deadspin Funbag. Find more of Drew's stuff at KSK or on Twitter. Preorder Drew's new book, The Postmortal, right here. Email the Funbag here. Today, we're covering poop, "Chopped", threesomes, and more.

Last week, I went to the gym and went inside the locker room to change before I worked out (POWER LIFTING GRRR STRONG!), and when I went to pull my gym clothes out of the bag a pair of my kid's underwear went flying out of the bag onto the floor. They had gotten there by accident during laundry folding (I fold laundry with virtually no care or interest), but now I was completely freaked out because I'm a 34-year-old man and everyone just saw a six-year-old's panties go flying out of my bag. Sure, buddy. Sure, it just got "mixed up" in the laundry. Sure, you don't run to your car to go sniffing a child's underwear at the first available opportunity. FUCKING BABYRAPER!

So I grabbed that shit and stuffed it back in my bag. I thought about making an aside to the strangers around me, like, "Goddamn kid's shit gets everywhere, AMIRITE?!" But I didn't say anything because that would have just made it more suspicious. I just jammed it back in fast as I could and got the hell out of there. Then, when I came back to the locker room afterwards, I made sure the undies were under the towel in the bag and not in plain sight. The moral of the story is this: Before leaving your house, ALWAYS CHECK YOURSELF FOR CHILDREN'S UNDERPANTS. You don't want to be caught with them on you. Your letters:

Chunk:

I just watched that movie "The Box", where a person is told that someone will die every time they push a button. This got me thinking: what would happen if the NFL were told that someone would die if the Super Bowl were played? Would they still play the game? Would it take a certain number of people to die before the NFL would consider canceling the game? Would the NFL be more open-minded if it were just a Monday night game or the Pro Bowl instead of the Super Bowl?

I think Jerry Richardson would push the button regardless of whether or not it endangered the Super Bowl. I think he'd be the kind of fellow that enjoyed having a death button on hand at all times. Because when you're a billionaire, the thrill of normal things like money and love tends to wear off. After a while, you probably develop a taste for blood. FETCH THE BOX, CONCORD.

But seriously, if all of this were done in confidence, and someone asked Roger Goodell to do it to save the Super Bowl, and that he would be killed if he ever leaked it to the press, I think he probably pushes the button. I sure hope he pushes the button. I don't give a shit about some asshole dying. I want the Super Bowl to happen. If someone told me I could end the lockout simply by drinking a cup of urine that was not my own, I'd probably do it. That's how warped my priorities are. I would never donate a kidney to my cousin, but I'd drink pee to end the lockout.

Joe:

Is Casey Anthony hot? Or is it just the bad girl image she has because she killed someone?

I don't think she's hot, but she definitely seems like she's a good time. Unless you're a helpless baby, in which case she's probably not as fun to hang out with.

I was not well-versed in the Casey Anthony trial when the verdict came down. The name kept popping up on Twitter every now and then and I was like, "What the fuck is that?" Then the verdict came down and everyone went batshit, so then I had to go back and retroactively build up my outrage for the case, which is annoying. I'd much prefer to come about my anger organically.

Woody:

Found in the cafeteria of a hospital.

Would You Kill A Stranger To Save Football?

CHICKEN TENDER DONG! I'd still eat it. Greedily.

Tom:

Simple question, which musical act, if resurrected in their primes, with all the original members, would fetch the most money if they went on tour today? In my mind, there is only one real answer: Elvis.

It can't be anyone else, not the Beatles, not Zeppelin, not anyone. No person still has that fanatical a following after they have been dead for over 30 years. They could charge $1000 for the cheapest tickets, and people would kill each other to buy them, including me. But that's not my dream scenario for a resurrected musical act; that would have to either be AC/DC with Bon Scott as the lead singer, or Queen with Freddie Mercury at the height of his powers. What would be your dream, resurrected band scenario?

I think Michael Jackson would probably be able to rival Elvis in terms of what he could command from concertgoers if he came back to life and went out on tour. I mean, you've seen Michael Jackson fans. They're all batshit nuts. I think Frank Sinatra would also command a huge ticket price. Then you can go back in time and dig up older musical figures like Mozart. Or you could venture into hip hop and pick Biggie Smalls or whoever. And don't forget Kurt Cobain. All of those artists have a following devoted enough to charge whatever they deem fit, and then you'd have Ticketmaster tacking on a 43% resurrection fee even though they didn't help at all in the reanimation process. I fucking hate Ticketmaster. But yeah, Elvis seems like the best bet for highest price, with the Beatles close behind.

As for me, if they found a way to bring Cliff Burton back and threw him on tour with Metallica, you couldn't drain my bank account fast enough. If the evil suits at Ticketmaster went up to me and were like, "We're gonna bring Cliff Burton back from the dead, but to do it, you have to pay us $4,000 and give Ian O'Connor a handjob" (and that's precisely what they would ask), well then gimme Ian's dick this instant. I got some heavy petting to do.

Brett:

Several months back I was in need of a wifi connection in a restaurant to download a work file. There happened to be an apartment building next door and little did I know the worst person in the world inhabited it as evidenced by the name of their router.

Would You Kill A Stranger To Save Football?

JEEZUS! THASS RAYCESS! He didn't even have to password-protect his network, he knew you'd never click on it with a name that ghastly.

I do get a sordid thrill out of piggybacking onto a complete stranger's wifi signal. Most of them are protected now, but once in a while you'll be in some friend's apartment or something and you'll get a signal you can use, and God, I feel like the most powerful hacker in Norway when I do that. They don't know I'm in there, BUT I AM! And I'm lookin' at boobs! HOT DOG!

Just once, I'd like to guess a stranger's wifi password and get it right. I'd feel like Theo in Die Hard.

Phatty:

So I'm not even sure how the conversation came up, but I recently got into a discussion about pooping and pissing at the same time. I'm not talking about having explosive diarrhea where anything is possible, but just a normal, relaxing, solid-poo bathroom visit. I've always been able to do both at once and just assumed this was normal. The person I was talking to acted like I was a freak. Do I have a special ability to evacuate from both sides at once or is my friend the weird one? I guess he takes turns?

I don't think there's any set biological rule. I too can evacuate both my bladder and my bowels simultaneously (boy, talking about that makes me hungry!). In both cases, you're bearing down in the pelvic floor region, so there's no reason you can't do both at the same time, unless you have very fat thighs which restrict the flow of urine while you're in a seated position (I know the feeling, I assure you). There's no wrong way to go about it.

Matt:

I served four years in the US Air Force and every few months we'd get a call from our Chief Master Sergeant's office to report to the Armory for a drug test. There would be around 200 Airmen all crammed in a room waiting to use the latrine, and Air Force policy was that someone had to witness the urine entering the container. Rather than subjecting their own to this, the government would hire 5 dudes to stand in the stall and verify the test. There was something like 5,000 Airmen assigned to our base and they rotated these drug tests so that everyone was tested every few months - 200 or so at a time. My math says that's about 20,000 dicks in a one year period...this has to be the worst job ever right?

Could be worse. You could be forced to look at people's feet. All dicks are pretty much the same, but people's feet can vary in all sorts of new and horrifying ways.

By the way, how does anyone piss while that's going on? That's a whole world of stagefright right there. Not only do you have a bunch of rowdy soldiers outside waiting for you to finish up so that they can go and get on with their lives, but then you've got the monitor staring straight at your dick, and you've got to worry about failing the test (even if you're clean, which would be the cruelest trick of all) and you're envisioning getting kicked out of the Air Force for doing the yamyam… How the fuck do you piss with that going on? I can barely piss at an airport. If I had to do that, the urine would rush up to my brain and I'd die of a fucking stroke. Our men and women of the armed forces are brave in ways you don't even think about!

Bob:

I work nights and my wife works a normal schedule, so I spend a lot of time during the day with my one and a half year old daughter. Since I do most of the shopping I have to take my daughter along with me to the store, usually big box retail and grocery stores. Mid way through one of our recent trips, I found myself having to drop a deuce. Now the urgency this time wasn't so pressing that I couldn't comfortably make it home, however, it really got me thinking.

What if I found myself in the middle of a store with my daughter sitting there in the shopping cart and I really had to take a dump? I'm talking no waiting, walking funny, bubble guts kind of dump where making it home is totally off the table. What could I do? Obviously, the shopping cart isn't coming in the stall with me and my daughter runs around like crazy when taken out of cart. She is still too young to have any grasp on simple commands such as "No, don't touch that, stay put, don't belly crawl into the adjacent stall while daddy shits out his soul."

Have you ever formulated a viable contingency plan for such a difficult situation?

I've had to do it before. It's horrible. You have to take the kid out of the cart and bring them into the bathroom with you. Then you have to spend the entire shit begging your kid to not touch anything or pull out the entire TP roll, and they never listen to either command. Sometimes, you have to forcefully hold the kid's hand, and they're trying to squirm away from your grip while you sit there trying to push out a big brown koala bear and if you let go of their hand they'll go flying back into the wall because they don't understand the principles of inertia yet. It's a terrible way to spend what ought to be a peaceful, pleasant part of your day. And it's even worse once multiple kids get involved. I mean, they're always one second away from drinking out of the toilet and getting Hep C.

I get mad at my bowels now if I have to shit at inopportune times. Like, a shit will well up inside the Giant store, and I'll yell out, "God dammit," and no one knows I'm yelling at my own rectum but that's precisely what I'm doing. No one should ever have to shit angry. But that's parenting. It's just year after year of angry shitting.

Kids will also pull this trick too: It's morning in the house and everything is quiet. They're playing quietly or watching TV or whatever, and you think to yourself, "Hey, this would be a good time to poop!" So then you go sit down to poop and the SECOND the poop is halfway out of your asshole, one of the kids screams. Every. Fucking. Time. It's like magic. If you ever need your child's attention, just start shitting. They'll come running.

HALFTIME!

Would You Kill A Stranger To Save Football?

Dan:

I ride a bus to and from work every day. I, of course, see a wide spectrum of human life on this bus but the one that appalls me the most is the white, hippy looking dude with dreads and big ass gauges in his ears. What the hell would possess someone to stretch out his ear lobes so a truck can drive through them?

I don't know. I find ear gauges terrifying to behold. I look at one and I get phantom pain in my earlobes. It's even worse when you go to the dentist's office and there's a copy of National Geographic in there and there's some guy on the cover with gauges fifty times the size of a hipster's. I'm talking about a dude who put a fucking ping pong paddle inside his bottom lip. I can't handle it. How did he stretch it out that far? Was it a gradual process? Did he expand the hole year after year? Does he ever take the gauge out and go jump-roping with his own lip? I don't care if someone elects to do it to themselves. Live free and all that. But it HURTS to look at. It really does.

Justin:

This was truly horrifying. I was driving along on the interstate and suddenly there is blood on my windshield. Not bug blood but red, human blood. Took me a while to realize it had come from the jackass in front of me with his window down. But I guess it isn't all bad, I at least have the comfort of knowing he will probably succumb to whatever head cancer/ebola virus/AIDS that is causing him to spit bloody loogies out his window.

Would You Kill A Stranger To Save Football?

GAHHHHHHHHH!!!!

Cal C:

When you're jacking it to the thought of banging two chicks at once, do you ever stumble over the fact that the girls don't know each other? For whatever reason, when I tried picturing a three-way among me, a chick I know from work, and a chick I knew in school, it just wouldn't work. I kept thinking, "They don't even know each other, why would they double up on me?"

In short: Does masturbation fantasy require plausibility, or should the girl-on-girl action have been enough?

I know I do suffer from a similar imaginary threesome anxiety wherein I have a hard time picturing what to do to ensure both girls are happy while I'm doing my thing. Is everyone taken care of here? Are everyone's needs being met? Do we need to switch out every thrust? Could one of you orally pleasure the other so that the daisy chain is complete? I think people who actually do have threesomes are far too cool to worry about any of these things, which is why I've never had one and never will. My other question about a threesome is… where does everyone sleep afterwards? Someone's gotta leave that bed. In fact, everyone should really go their separate ways after that. Has anyone ever had a threesome and then had to spend the night in the same bed with the two other people? That would be horrible.

Anyway, the second part of Cal's question is a whole other animal. No masturbation fantasy requires plausibility. But, as a seasoned masturbator, I have found that the closer the fantasy is to being reality, the better the jerk session. For example, let's say there's a girl in class that you really like and therefore would like to masturbate to. If she doesn't know who you are, you're still gonna masturbate to her. Ah, but if she FLIRTS with you one day, and suddenly the whole idea of having real live person sex with her goes from a nonstarter to a legit possibility, well then that's a special time you get to have with your penis. In many ways, actually hooking up with her will RUIN the goodness of that jerk. In the real hookup, she never takes out a horse crop midway through. Total letdown.

Banks:

When I was a kid, I was allergic to milk. Not deathly allergic, but enough to where I didn't drink it. Instead of milk, I grew up putting orange juice in my Golden Grahams, Lucky Charms, Fruity Pebbles, etc. This wasn't a big deal at home, but whenever I had a friend stay over or ate breakfast somewhere else, I would get these horrified looks like I was pouring pure leprosy into my morning bowl. Nobody would ever try it, saying that it looks completely disgusting.

Now I can only enjoy cereal if I have orange juice in it. The milk just doesn't cut it. The question still lingers, though: Would a normal milk-drinker think a bowl of cereal with orange juice is at least palatable?

There have been a handful of occasions where I have been absent-minded and poured the wrong liquid into my cereal instead of the cup it was originally destined for. I've done it with water and with orange juice. Both times, I wondered if maybe I had stumbled onto an entirely new way to enjoy cereal and tried one bite. The waterlogged cereal was predictably horrible. But the orange juice cereal was halfway decent. I think I had it with Honey Nut Cheerios. I didn't mind it, but I was so used to milk with my cereal that my brain instinctively resisted the change. THE FUCK IS THIS SHIT? So I stopped after a bite or two and went back to milk, because I preferred it that way.

It's odd what we consider proper food combinations and meal occasions. A Pop Tart, if you think about it, is just as legitimate as a dessert after dinner as it is a breakfast choice. In fact, it's probably more appropriate as a dessert. But no one eats Pop Tarts for dessert (except very smart obese people) because it doesn't seem like the right time to eat one. You eat muffins for breakfast. You eat cupcakes for dessert. Why? What difference does it make? All my life I've been repulsed by people who eat cold pizza for breakfast only because I am prejudiced against pizza's ability to be a breakfast food, when really it's no better or worse a food for that occasion than an Egg McMuffin. I am like a breakfast racist.

Randy:

Have you ever watched the show Chopped?

Oh, God yes. Scott Conant… what an asshole. Since when are red onions the placenta of Satan? Anyway, go on.

I was wondering how you think you would do on a cooking show like that. My friends and I all think we would be screwed if they pulled out any kind of that rare crap that no one has even heard of because we already can't really cook. I think I could do fairly well though if I got a basket of normal food even if it all was really random. I would probably just use a ton of butter or bacon to ensure that my concoction tastes good. I think I could do fairly well on Chopped if it was against normal men who don't really know how to cook.

I think I'd cut myself at least nine times with the kitchen knife. That happens all the time on the show, when a chef cuts themselves, then doesn't put on a glove and bleeds all over the goddamn food and expects the judges to eat and the judges are like, "Fuck that. I'm not eating your herpes burger."

I watch that show all the time. In case you aren't familiar, "Chopped" gives its contests between 20 and 30 minutes (which is nothing) to make a gourmet dish out of four surprise ingredients, and the ingredients are almost always something weird and, in the case of dessert, completely repulsive for dessert (brie, soy sauce, celery, etc.). And it's impossible to watch the show without instantly formulating a plan in your own mind for a dish (A NAPOLEON! OF COURSE!) and then yelling at the chefs for not making the dish you envisioned. I also wonder how I'd do with the time constraints, and there's just no fucking way I'd be able to put actual food on the plate within that time frame. I cook dinner for my family pretty much every night, and dinner always ends up being served late because I fail to RESPECT THE CLOCK. Who knew short ribs took longer the three minutes to braise? NOT I.

Also, and apologies for dwelling on this show, but Aaron Sanchez is a real prick when it comes to people mispronouncing Mexican ingredients. "It hurts me when you mispronounce toe-mah-tee-yos, for Latin cuisine is so close to my heart." Relax, dickhead. You're not President of Mexico. You'll fucking live.

Edward:

Then move to Cali you d-bag.

Would You Kill A Stranger To Save Football?

No shit.

Ben:

Has anyone ever pooped in the Stanley Cup?

A simple Google search show that Kris Draper's daughter pooped in the Cup (1 girl 1 cup), which is bit different from a grown person getting loaded and taking a full on growler into the thing, but still. Pooping in the Cup is pooping in the cup. I'm sure it's been masturbated into and people have been bent over it and assfucked. It's the Stanley Cup. It's meant to be enjoyed.

Time for your email of the week.

John:

I used to work in a hospital as a clerk (ordering bloodwork, organizing charts) but I always ended up getting snaked into patient care. This included 'delivering' patients to the morgue after they passed as a favor. Which was more anti-climactic than it sounds. (Disclaimer: We always took great care of the patients post-mortem).

I usually went down alone with the body and the morgue tech would help me get him/her into the fridge. So one time I was bringing someone's grandma down. The elevator rides with dead bodies are the tits. I wouldn't talk to them or anything. But I imagined them awakening and me lighting their zombified corpse on fire. I shit-talked them on the way. As a warning.

Anyway, I got down there, and the morgue tech was being a dick and wouldn't help me load the body into the fridge. It's not easy doing this alone because you have to pull out the 'tray' and transfer the body from the stretcher to the tray. And a minimum of 2 people is usually required. I called my supervisor and they told me it would be an hour before they sent someone down. Fuck that noise.

I went to go do it myself. I cradled someone else's grandma, slipped on some unidentified substance, and dropped her on the floor. She BROKE. A lot. I picked her up awkwardly and got her onto the tray and slammed the fridge. I always wondered if the funeral director who picked her up asked "Jesus, did she get hit by a bus?" Oh well. Just remember: There is a time after your elderly loved ones have passed and before they head to the funeral home that someone brings them to the hospital morgue. This is one of those stories.

Remind me to never die.

+
+ +
\ No newline at end of file diff --git a/regression_test_data/deadspin-000.yaml b/regression_test_data/deadspin-000.yaml new file mode 100644 index 0000000..2961978 --- /dev/null +++ b/regression_test_data/deadspin-000.yaml @@ -0,0 +1,2 @@ +test_description: deadspin article +url: http://deadspin.com/5820463/would-you-kill-a-stranger-to-save-football diff --git a/regression_test_data/espn-000-orig.html b/regression_test_data/espn-000-orig.html new file mode 100644 index 0000000..a03db26 --- /dev/null +++ b/regression_test_data/espn-000-orig.html @@ -0,0 +1,993 @@ + + + + +Roger Clemens trial -- defense to challenge validity of investigation - ESPN + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + +
+ +
+
+ + + +
+
+ + +
+
+ +
+
+
    +
  • +
+
Updated: July 12, 2011, 4:52 PM ET
+
+
+

Roger Clemens' defense sets strategy

+
+
+ + + +ESPN.com news services
+
+
+ + +

+WASHINGTON -- Roger Clemens' attorney revealed Tuesday that the ex-baseball star plans to begin his defense against charges of lying to Congress by questioning if the lawmakers' investigation into whether he used performance-enhancing drugs was proper.

Clemens attorney Michael Attanasio said in court that the hearing the House Oversight and Government Reform Committee held in February 2008 had nothing to do with Congress' responsibility for legislation. He said the hearing was only concerned with airing a "credibility contest" between Clemens and his longtime trainer, Brian McNamee, who said he injected the pitcher with steroids and human growth hormone.

+

Clemens denied those allegations and has been charged with perjury, false statements and obstruction of Congress. The obstruction count charges Clemens with making 15 false or misleading statements to the committee, including his repeated denials he didn't take performance-enhancing drugs during his 24-season career and even whether he attended a 1998 pool party at then-Toronto Blue Jays teammate Jose Canseco's home in Miami.

McNamee says he saw Clemens and admitted steroids user Canseco talking at the party with another man and that after they returned to Canada, Clemens asked McNamee to inject him with steroids for the first time.

+Clemens and Canseco say Clemens was never at the party but was golfing at the time. Attanasio said that dispute suggests how improper the whole inquiry was and that jurors should be able to determine whether a "he said, he said debate" between Clemens and McNamee was a legitimate congressional concern.

"We're going to have a mini-trial on whether Roger Clemens went swimming," Attanasio said. "We're going to have a trial in U.S. District Court, Congress is going to have a hearing on these things? That's our point."

Assistant U.S. attorney Daniel Butler responded that the committee has responsibility for oversight that is broad and goes beyond legislation. He said steroids in baseball is a drug matter and pointed out that a 2005 hearing into the issue led to legislation to regulate steroids and triggered Major League Baseball to commission a report by former Sen. George Mitchell into the extent of the problem in the league.

The Mitchell report was released in December 2007 and named Clemens and 85 other current and former ballplayers as using drugs. Clemens denied the allegations and Butler pointed out that leaders of the House committee said they needed to investigate Clemens' denials to determine what weight to give the Mitchell report and its recommendations.

Attanasio argued that if the committee's purpose was to come full circle on the Mitchell report, it had done so with a January 2008 hearing featuring testimony by Mitchell, baseball commissioner Bud Selig and former players union director Donald Fehr.

"That ship had left. That work was done. And now it becomes a question between Mr. Clemens and Mr. McNamee," Attanasio said.

But U.S. District Judge Reggie Walton said if "one of the icons of baseball" was taking exception to the Mitchell report, "it seems to me that Congress has the authority to hold hearings to determine which view is correct."

Attanasio said the issue will be addressed in testimony from the first two witnesses prosecutors plan to call after opening arguments Wednesday morning. He said the first will be retired House Parliamentarian Charles Johnson, followed by Phil Barnett, who was chief counsel for the committee at the time it investigated Clemens.

The dispute over the committee's proper role came as Walton considered what preliminary instructions to give the jury, which was seated Tuesday afternoon after 3½ days of screening potential members.

The jury of 10 women and two men includes a woman whose cousin, former outfielder Al Bumbry, was a coach for the Boston Red Sox when Clemens played for the team. Another woman on the jury said she believes Philadelphia Eagles quarterback Michael Vick was "done wrong" in his criminal conviction in connection with dogfighting.

Four other people were seated as alternate jurors in case any of the 12 can't serve.

Prosecutors and Clemens' defense team removed 20 people from the pool of 36 jurors, offering no public explanation for their decisions.

Clemens' attorney pressed potential jurors not to hold it against Clemens if he chooses not to testify, his strongest hint yet that the ex-pitcher might not take the stand.

Walton also said he was upset to read a New York Daily News item that members of Clemens' family have been criticizing McNamee and other government witnesses on Twitter and elsewhere online. The judge has a gag order on parties involved in the case, but he said he doesn't have any authority over anyone who isn't before him and hopes that those that are were not involved.

Clemens' attorney Rusty Hardin said he would look into it but that it's been "extremely difficult" for Clemens' family to see harsh criticisms of the baseball star online and in the media and not be able to respond.

Information from The Associated Press was used in this report. +

+ +
+
+ + + + + + + + + + +
+ +
+
+ +
+ + + + + + +
+ +
+
    +
  • + + + + +
  • +
  • @TJQuinnESPN Lesson 1 learned after five days: this is Judge Reggie B. Walton's courtroom. Make no mistake. 3 minutes ago
  • +
  • @TJQuinnESPN Judge telling jurors story about juror he had who was telling family about case until the daughter called the judge and ratted him out. 4 minutes ago
  • +
  • @TJQuinnESPN Judge elling jurors story about juror he had who was telling family about case until the daughter called the judge and ratted him out. 5 minutes ago
  • +
  • @TJQuinnESPN Judge warning jury about avoiding newspaper and tv reports and perils of social media for the modern courtroom. No "twitting." 8 minutes ago
  • +
  • @TJQuinnESPN Interesting to those who follow jury demographics: Bonds jury mostly white, Clemens jury mostly African-American. 14 minutes ago
  • +
  • @TJQuinnESPN 10 women, 2 men, nine African American, three white. 19 minutes ago
  • +
  • @TJQuinnESPN Things seem to be ok. Four days to pick a jury, but can't say it was haphazard. 25 minutes ago
  • +
  • @TJQuinnESPN But wait! More pre-trial drama. As soon as judge declares jury selected, one has an issue. Conferencing now. 26 minutes ago
  • +
  • @TJQuinnESPN The jury is seated. 28 minutes ago
  • +
  • @TJQuinnESPN This hour with its hushed whispers & paper shuffling is prob the quietest the courtroom will be, but the case might be won or lost right now 31 minutes ago
  • +
  • @TJQuinnESPN Still in process of striking jurors, having new ones fill seats, evaluating & striking again. Only voice is clerk calling in more jurors. 31 minutes ago
  • +
  • @TJQuinnESPN And... selecting... selecting... selecting... huddling... conferring... whispering... about an hour ago
  • +
  • @TJQuinnESPN #Clemens Judge dismisses her. Has a couple of spares handy. about an hour ago
  • +
  • @TJQuinnESPN Ready to begin strikes process. Judge asks if any jurors had media contact about case since this started - there's one. Discussing. about an hour ago
  • +
  • @TJQuinnESPN Attys huddling now as they prepare for strikes. Will avoid obvious pitching reference. about an hour ago
  • +
  • @TJQuinnESPN Judge makes joke about Hardin "misremembering" something. Hardin notes it's in the dictionary. Note the year: http://t.co/mPkpmA5 about an hour ago
  • +
  • @TJQuinnESPN AUSA Durham Q's man, he's OK with him. Judge says no cause to strike the man. about an hour ago
  • +
  • @TJQuinnESPN Man says he understands now why Cong. investigates. This matters. Defense will make the case Cong had no business holding hearings. about an hour ago
  • +
  • @TJQuinnESPN Man explains if Cong pass legislation, OK. If investigating PED use, should be left up to law enforcement. Judge explaining not the case. about an hour ago
  • +
  • @TJQuinnESPN #Clemens judge brings back penultimate prosp juror who said would hold against gov't if no legislation. about an hour ago
  • +
  • + + + + +
  • +
+
+ +
+ + + + + + + + + + + + +
+ + + + + +
+ +
+ + + + + + +
+ +
+ +
+ + + + + + + +
+ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/regression_test_data/espn-000-rdbl.html b/regression_test_data/espn-000-rdbl.html new file mode 100644 index 0000000..487ca63 --- /dev/null +++ b/regression_test_data/espn-000-rdbl.html @@ -0,0 +1,31 @@ +
+
+

Updated: July 12, 2011, 4:52 PM ET

+
+

+

Roger Clemens' defense sets strategy

+ + +
+ + +

+WASHINGTON -- Roger Clemens' attorney revealed Tuesday that the ex-baseball star plans to begin his defense against charges of lying to Congress by questioning if the lawmakers' investigation into whether he used performance-enhancing drugs was proper.

Clemens attorney Michael Attanasio said in court that the hearing the House Oversight and Government Reform Committee held in February 2008 had nothing to do with Congress' responsibility for legislation. He said the hearing was only concerned with airing a "credibility contest" between Clemens and his longtime trainer, Brian McNamee, who said he injected the pitcher with steroids and human growth hormone.

+

Clemens denied those allegations and has been charged with perjury, false statements and obstruction of Congress. The obstruction count charges Clemens with making 15 false or misleading statements to the committee, including his repeated denials he didn't take performance-enhancing drugs during his 24-season career and even whether he attended a 1998 pool party at then-Toronto Blue Jays teammate Jose Canseco's home in Miami.

McNamee says he saw Clemens and admitted steroids user Canseco talking at the party with another man and that after they returned to Canada, Clemens asked McNamee to inject him with steroids for the first time.

+Clemens and Canseco say Clemens was never at the party but was golfing at the time. Attanasio said that dispute suggests how improper the whole inquiry was and that jurors should be able to determine whether a "he said, he said debate" between Clemens and McNamee was a legitimate congressional concern.

"We're going to have a mini-trial on whether Roger Clemens went swimming," Attanasio said. "We're going to have a trial in U.S. District Court, Congress is going to have a hearing on these things? That's our point."

Assistant U.S. attorney Daniel Butler responded that the committee has responsibility for oversight that is broad and goes beyond legislation. He said steroids in baseball is a drug matter and pointed out that a 2005 hearing into the issue led to legislation to regulate steroids and triggered Major League Baseball to commission a report by former Sen. George Mitchell into the extent of the problem in the league.

The Mitchell report was released in December 2007 and named Clemens and 85 other current and former ballplayers as using drugs. Clemens denied the allegations and Butler pointed out that leaders of the House committee said they needed to investigate Clemens' denials to determine what weight to give the Mitchell report and its recommendations.

Attanasio argued that if the committee's purpose was to come full circle on the Mitchell report, it had done so with a January 2008 hearing featuring testimony by Mitchell, baseball commissioner Bud Selig and former players union director Donald Fehr.

"That ship had left. That work was done. And now it becomes a question between Mr. Clemens and Mr. McNamee," Attanasio said.

But U.S. District Judge Reggie Walton said if "one of the icons of baseball" was taking exception to the Mitchell report, "it seems to me that Congress has the authority to hold hearings to determine which view is correct."

Attanasio said the issue will be addressed in testimony from the first two witnesses prosecutors plan to call after opening arguments Wednesday morning. He said the first will be retired House Parliamentarian Charles Johnson, followed by Phil Barnett, who was chief counsel for the committee at the time it investigated Clemens.

The dispute over the committee's proper role came as Walton considered what preliminary instructions to give the jury, which was seated Tuesday afternoon after 3½ days of screening potential members.

The jury of 10 women and two men includes a woman whose cousin, former outfielder Al Bumbry, was a coach for the Boston Red Sox when Clemens played for the team. Another woman on the jury said she believes Philadelphia Eagles quarterback Michael Vick was "done wrong" in his criminal conviction in connection with dogfighting.

Four other people were seated as alternate jurors in case any of the 12 can't serve.

Prosecutors and Clemens' defense team removed 20 people from the pool of 36 jurors, offering no public explanation for their decisions.

Clemens' attorney pressed potential jurors not to hold it against Clemens if he chooses not to testify, his strongest hint yet that the ex-pitcher might not take the stand.

Walton also said he was upset to read a New York Daily News item that members of Clemens' family have been criticizing McNamee and other government witnesses on Twitter and elsewhere online. The judge has a gag order on parties involved in the case, but he said he doesn't have any authority over anyone who isn't before him and hopes that those that are were not involved.

Clemens' attorney Rusty Hardin said he would look into it but that it's been "extremely difficult" for Clemens' family to see harsh criticisms of the baseball star online and in the media and not be able to respond.

Information from The Associated Press was used in this report. +

+ + + + +
+ +
\ No newline at end of file diff --git a/regression_test_data/espn-000.yaml b/regression_test_data/espn-000.yaml new file mode 100644 index 0000000..d79f54a --- /dev/null +++ b/regression_test_data/espn-000.yaml @@ -0,0 +1,2 @@ +test_description: espn article +url: http://sports.espn.go.com/mlb/news/story?id=6760720 diff --git a/regression_test_data/mit-000-orig.html b/regression_test_data/mit-000-orig.html new file mode 100644 index 0000000..fc25e21 --- /dev/null +++ b/regression_test_data/mit-000-orig.html @@ -0,0 +1,246 @@ + + + + + + + + + + + Improving recommendation systems - MIT News Office + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
massachusetts institute of technology
+ + + + +
+

Improving recommendation systems

+
Researchers believe that comparing products, rather than rating them on an absolute scale, will lead to algorithms that better predict customers’ preferences.
+
+
+
+
+

today's news

Computer learns language by playing games


"Civilization" is a strategy game in which players build empires by, among +other things, deciding where to found cities and deploy armies. +
Image courtesy of Sid Meier's Civilization V

By basing its strategies on the text of a manual, a computer infers the meanings of words without human supervision.

Recreating human livers, in mice

July 12, 2011

While you’re up, print me a solar cell

July 11, 2011
+
+

similar stories

Drug discovery, Netflix style?

April 13, 2010
+
+
+
+
+
+ + +
+ Graphic: Christine Daniloff +
+ + +
+
+ +
+
+ + +
+ comment +
+ +
+ + + print +
+ +
+
+
+
+
+
+ +Recommendation algorithms are a vital part of today’s Web, the basis of the targeted advertisements that account for most commercial sites’ revenues and of services such as Pandora, the Internet radio site that tailors song selections to listeners’ declared preferences. The DVD rental site Netflix deemed its recommendation algorithms important enough that it offered a million-dollar prize to anyone who could improve their predictions by 10 percent.

But Devavrat Shah, the Jamieson Career Development Associate Professor of Electrical Engineering and Computer Science in MIT’s Laboratory of Information and Decisions Systems, thinks that the most common approach to recommendation systems is fundamentally flawed. Shah believes that, instead of asking users to rate products on, say, a five-star scale, as Netflix and Amazon do, recommendation systems should ask users to compare products in pairs. Stitching the pairwise rankings into a master list, Shah argues, will offer a more accurate representation of consumers’ preferences.

In a series of papers (paper 1 | paper 2 | paper 3) published over the last few years, Shah, his students Ammar Ammar and Srikanth Jagabathula, and Vivek Farias, an associate professor at the MIT Sloan School of Management, have demonstrated algorithms that put that theory into practice. Besides showing how the algorithms can tailor product recommendations to customers, they’ve also built a website that uses the algorithms to help large groups make collective decisions. And at an Institute for Operations Research and Management Sciences conference in June, they presented a version of their algorithm that had been tested on detailed data about car sales collected over the span of a year by auto dealers around the country. Their algorithm predicted car buyers’ preferences with 20 percent greater accuracy than existing algorithms.

Calibration conundrum

One of the problems with basing recommendations on ratings, Shah explains, is that an individual’s rating scale will tend to fluctuate. “If my mood is bad today, I might give four stars, but tomorrow I’d give five stars,” he says. “But if you ask me to compare two movies, most likely I will remain true to that for a while.”

Similarly, ratings scales may vary between people. “Your three stars might be my five stars, or vice versa,” Shah says. “For that reason, I strongly believe that comparison is the right way to capture this.”

Moreover, Shah explains, anyone who walks into a store and selects one product from among the three displayed on a shelf is making an implicit comparison. So in many contexts, comparison data is actually easier to come by than ratings.

Shah believes that the advantages of using comparison as the basis for recommendation systems are obvious but that the computational complexity of the approach has prevented its wide adoption. The results of thousands — or millions — of pairwise comparisons could, of course, be contradictory: Some people may like "Citizen Kane" better than "The Godfather," but others may like "The Godfather" better than "Citizen Kane." The only sensible way to interpret conflicting comparisons is statistically. But there are more than three million ways to order a ranking of only 10 movies, and every one of them may have some probability, no matter how slight, of representing the ideal ordering of at least one ranker. Increase the number of movies to 20, and there are more ways to order the list than there are atoms in the universe.

Ordering out

So Shah and his colleagues make some assumptions that drastically reduce the number of possible orderings they have to consider. The first is simply to throw out the outliers. For example, Netflix’s movie-rental data assigns the Robin Williams vehicle "Patch Adams" the worst reviews, on average, of any film with a statistically significant number of ratings. So the MIT algorithm would simply disregard all the possible orderings in which "Patch Adams" ranked highly.

Even with the outliers eliminated, however, a large number of plausible orderings might remain. From that group, the MIT algorithm selects a subset: the smallest group of orderings that fit the available data. This approach can winnow an astronomically large number of orderings down to one that’s within the computational purview of a modern computer.

Finally, when the algorithm has arrived at a reduced number of orderings, it uses a movie’s rank in each of the orderings, combined with the probability of that ordering, to assign the movie an overall score. Those scores determine the final ordering.

Paat Rusmevichientong, an associate professor of information and operations management at the University of Southern California, thinks that the most interesting aspect of Shah’s work is the alternative it provides to so-called parametric models, which are more restrictive. These, he says, were “the state of the art up until 2008, when Professor Shah’s paper first came out.”

“They’ve really, substantially enlarged the class of choice models that you can work with,” Rusmevichientong says. “Before, people never thought that it was possible to have rich, complex choice models like this.”

The next step, Rusmevichientong says, is to test that type of model selection against real-world data. The analysis of car sales is an early example of that kind of testing, and the MIT researchers are currently working up a version of their conference paper for journal publication. “I’ve been waiting to see the paper,” Rusmevichientong says. “That sounds really exciting.”

+ + + + + + + + + +
Comments + + + + +
+ + + + +
+
+ + + + + + +
+
+
+ +
+ + +
+ +
+
+
+
+ +

tags

+ +
+ + + +
+ +
+ + + + + + + + + \ No newline at end of file diff --git a/regression_test_data/mit-000-rdbl.html b/regression_test_data/mit-000-rdbl.html new file mode 100644 index 0000000..f6d8d48 --- /dev/null +++ b/regression_test_data/mit-000-rdbl.html @@ -0,0 +1,5 @@ +

+ +Recommendation algorithms are a vital part of today’s Web, the basis of the targeted advertisements that account for most commercial sites’ revenues and of services such as Pandora, the Internet radio site that tailors song selections to listeners’ declared preferences. The DVD rental site Netflix deemed its recommendation algorithms important enough that it offered a million-dollar prize to anyone who could improve their predictions by 10 percent.

But Devavrat Shah, the Jamieson Career Development Associate Professor of Electrical Engineering and Computer Science in MIT’s Laboratory of Information and Decisions Systems, thinks that the most common approach to recommendation systems is fundamentally flawed. Shah believes that, instead of asking users to rate products on, say, a five-star scale, as Netflix and Amazon do, recommendation systems should ask users to compare products in pairs. Stitching the pairwise rankings into a master list, Shah argues, will offer a more accurate representation of consumers’ preferences.

In a series of papers (

paper 1

|

paper 2

|

paper 3

) published over the last few years, Shah, his students Ammar Ammar and Srikanth Jagabathula, and Vivek Farias, an associate professor at the MIT Sloan School of Management, have demonstrated algorithms that put that theory into practice. Besides showing how the algorithms can tailor product recommendations to customers, they’ve also built a

website

that uses the algorithms to help large groups make collective decisions. And at an Institute for Operations Research and Management Sciences conference in June, they presented a version of their algorithm that had been tested on detailed data about car sales collected over the span of a year by auto dealers around the country. Their algorithm predicted car buyers’ preferences with 20 percent greater accuracy than existing algorithms.

Calibration conundrum

One of the problems with basing recommendations on ratings, Shah explains, is that an individual’s rating scale will tend to fluctuate. “If my mood is bad today, I might give four stars, but tomorrow I’d give five stars,” he says. “But if you ask me to compare two movies, most likely I will remain true to that for a while.”

Similarly, ratings scales may vary between people. “Your three stars might be my five stars, or vice versa,” Shah says. “For that reason, I strongly believe that comparison is the right way to capture this.”

Moreover, Shah explains, anyone who walks into a store and selects one product from among the three displayed on a shelf is making an implicit comparison. So in many contexts, comparison data is actually easier to come by than ratings.

Shah believes that the advantages of using comparison as the basis for recommendation systems are obvious but that the computational complexity of the approach has prevented its wide adoption. The results of thousands — or millions — of pairwise comparisons could, of course, be contradictory: Some people may like "Citizen Kane" better than "The Godfather," but others may like "The Godfather" better than "Citizen Kane." The only sensible way to interpret conflicting comparisons is statistically. But there are more than three million ways to order a ranking of only 10 movies, and every one of them may have some probability, no matter how slight, of representing the ideal ordering of at least one ranker. Increase the number of movies to 20, and there are more ways to order the list than there are atoms in the universe.

Ordering out

So Shah and his colleagues make some assumptions that drastically reduce the number of possible orderings they have to consider. The first is simply to throw out the outliers. For example, Netflix’s movie-rental data assigns the Robin Williams vehicle "Patch Adams" the worst reviews, on average, of any film with a statistically significant number of ratings. So the MIT algorithm would simply disregard all the possible orderings in which "Patch Adams" ranked highly.

Even with the outliers eliminated, however, a large number of plausible orderings might remain. From that group, the MIT algorithm selects a subset: the smallest group of orderings that fit the available data. This approach can winnow an astronomically large number of orderings down to one that’s within the computational purview of a modern computer.

Finally, when the algorithm has arrived at a reduced number of orderings, it uses a movie’s rank in each of the orderings, combined with the probability of that ordering, to assign the movie an overall score. Those scores determine the final ordering.

Paat Rusmevichientong, an associate professor of information and operations management at the University of Southern California, thinks that the most interesting aspect of Shah’s work is the alternative it provides to so-called parametric models, which are more restrictive. These, he says, were “the state of the art up until 2008, when Professor Shah’s paper first came out.”

“They’ve really, substantially enlarged the class of choice models that you can work with,” Rusmevichientong says. “Before, people never thought that it was possible to have rich, complex choice models like this.”

The next step, Rusmevichientong says, is to test that type of model selection against real-world data. The analysis of car sales is an early example of that kind of testing, and the MIT researchers are currently working up a version of their conference paper for journal publication. “I’ve been waiting to see the paper,” Rusmevichientong says. “That sounds really exciting.”

+
+
\ No newline at end of file diff --git a/regression_test_data/mit-000.yaml b/regression_test_data/mit-000.yaml new file mode 100644 index 0000000..bb8baa6 --- /dev/null +++ b/regression_test_data/mit-000.yaml @@ -0,0 +1,3 @@ +test_description: mit news article +notes: links are broken out into paragraph divs +url: http://web.mit.edu/newsoffice/2011/compare-recommendation-systems-0708.html diff --git a/regression_test_data/nytimes-000-orig.html b/regression_test_data/nytimes-000-orig.html new file mode 100644 index 0000000..36c824e --- /dev/null +++ b/regression_test_data/nytimes-000-orig.html @@ -0,0 +1,115 @@ + McConnell Proposal Gives Obama Power to Increase Debt Limit - NYTimes.com +

Politics



July 12, 2011, 4:13 pm

McConnell Proposal Gives Obama Power to Increase Debt Limit

Mitch McConnell of Kentucky, the Senate Republican leader, left, and Jon Kyl, the Republican whip, unveiled a proposal Tuesday that would allow an increase in the debt ceiling.Susan Walsh/Associated PressMitch McConnell of Kentucky, the Senate Republican leader, left, and Jon Kyl, the Republican whip, unveiled a proposal Tuesday that would allow an increase in the debt ceiling.

The Senate Republican leader, Mitch McConnell of Kentucky, said Tuesday that a bipartisan budget deal with President Obama was probably out of reach, and he proposed a plan under which the president could increase the federal debt limit without prior Congressional approval for offsetting spending cuts.

Mr. McConnell’s proposal reflected a growing sense of pessimism on Capitol Hill about the prospects that Mr. Obama and Congressional leaders could come to terms on a budget deal before the government’s borrowing authority hit its limit on Aug. 2. The negotiators sat down for another round of talks at the White House on Tuesday afternoon.

In an interview with CBS News, Mr. Obama said he “cannot guarantee” that the government can pay benefits next month to Social Security recipients, veterans and the disabled if Congress does not increase the federal debt limit, raising the political stakes even as Republicans hardened their opposition to him.

Mr. McConnell’s proposal would give Mr. Obama sweeping power to increase the government’s borrowing authority, in increments, by up to $2.4 trillion — enough, it is estimated, to cover federal obligations through next year — only if Mr. Obama specifies spending cuts of equal amounts. But Congress would not have to approve the spending cuts prior to the debt-limit increase.

It is not clear whether House Republicans would sign on to such a measure, given their drive to extract deep spending cuts in return for any debt-limit increase.

Mr. McConnell, who after the midterm elections last November said Republicans’ goal would be to make Mr. Obama a one-term president, said in his Senate speech, “After years of discussions and months of negotiations, I have little question that as long as this president is in the Oval Office, a real solution is probably unattainable.”

At the White House, press secretary Jay Carney responded, “This president’s going to be in office for at least another 18 months, and I think the American people expect Congress to work with him.”

With the Aug. 2 deadline for raising the government’s $14.3 trillion debt limit just three weeks away, Tuesday seemed to mark a new low in the summer’s maneuvering between the White House and Congressional Republicans to agree to a debt-reduction package that would clear the way for a vote on the debt limit. The back-and-forth had people in both parties nervously eyeing the financial markets, fretful that the political fighting would unnerve them; to date, Wall Street has been generally complacent that the White House and Congress ultimately would come to an agreement, if only because failing to do so would hold such economic peril.

Representatives of major business groups, including the U.S. Chamber of Commerce and the Business Roundtable, put out public statements urging a bipartisan accord on raising the debt limit. They had met with the Treasury secretary, Timothy F. Geithner, on Monday and other business groups and individuals similarly have contacted the White House to volunteer help in reaching a compromise.

New York Mayor Michael R. Bloomberg opened an unrelated event on Tuesday morning by saying, “If America, for the first time in its history, defaults on its obligations, it would have a catastrophic effect on our financial system and on our credibility around the world. It would also take a serious toll on our economy, and that at a time when the nation is still trying to recover from the deep recession.”

“America’s good name and credit are just too important to be held hostage to Washington gridlock, and I hope that in the end cooler heads will prevail and an agreement will be reached quickly,” Mr. Bloomberg added.

In advance of what have become daily White House meetings, Congressional Republican leaders stood firm against raising taxes on the wealthy and businesses after 2013, as Mr. Obama demands as part of a “balanced package” with deep spending cuts, including in Medicare and Medicaid.

Mr. Obama, responding to a question in an interview with CBS News, said, “I cannot guarantee that those checks go out on August 3rd if we haven’t resolved this issue. Because there may simply not be the money in the coffers to do it.”

He added, “This is not just a matter of Social Security checks. These are veterans checks, these are folks on disability and their checks. There are about 70 million checks that go out.”


The Caucus BlackBerry Reader
Download the New Caucus App for BlackBerry »

Get up-to-the-minute political news throughout the day.

TXT CAUCUS to 698698

Featured

  • Ron Paul Will Not Seek Re-Election
  • Who’s the Real Leader in Debt Talks?
  • In Debt Ceiling Fight, Obama Has the Edge
  • Best Political Quotes of the Weekend

McConnell Proposal Gives Obama Power to Increase Debt Limit

Mitch McConnell said Tuesday that a bipartisan budget deal with President Obama is probably out of reach, and he proposed a plan under which the president could increase the federal debt limit without prior Congressional approval for offsetting spending cuts.

Who’s the Real Leader in Debt Talks?

It would seem that the real fight in Washington may be about something simple: who’s the real leader?

More From Congress »

Ron Paul Will Not Seek Re-Election

Ron Paul, the Texas Republican credited with paving the way for the Tea Party movement, will focus on his presidential bid.

Pawlenty Aims to Slow Bachmann in Iowa

Michele Bachmann is emerging as the new front-runner in Iowa, where polls show her at the top of the pack among Republican hopefuls, and she has become a target almost as quickly as she became a contender.

More From Elections »

The Caucus Click: Debt-Weary Daley

No rest for the White House chief of staff as the president heaps pressure on Republicans.

Caucus Video: Debt Negotiations Stalled; A Cautionary Tale From Minnesota

Jeff Zeleny on the latest from the stalled debt negotiations and Monica Davey reports on the government shutdown in Minnesota.

More From The White House »

Justice Scalia Goes to Capitol Hill

Justice Antonin Scalia speaks at a “constitutional seminar” for House members organized by Representative Michele Bachmann.

Justice Scalia to Speak to Tea Party Caucus on Separation of Powers

The closed-door session with a group of conservative lawmakers has drawn scrutiny for its ideological tone.

More From Supreme Court »

Justice Scalia Goes to Capitol Hill

Justice Antonin Scalia speaks at a “constitutional seminar” for House members organized by Representative Michele Bachmann.

Justice Scalia to Speak to Tea Party Caucus on Separation of Powers

The closed-door session with a group of conservative lawmakers has drawn scrutiny for its ideological tone.

More From Supreme Court »

FiveThirtyEight

Nate Silver’s Political Calculus
Nate Silver’s Political Calculus

FiveThirtyEight aims to help New York Times readers cut through the clutter of our data-rich world.

More Politics News

Obama Grasping Centrist Banner in Debt Impasse
By JACKIE CALMES

President Obama has been casting himself as a pragmatic centrist as negotiators try to reach a deal on the budget.

Budget Talks Beginning to Take On a Testy Air
By MARK LANDLER and CARL HULSE

President Obama challenged lawmakers to reconsider supporting a long-term fiscal deal.

Wisconsin’s Summer of Recall Elections Begins
By MONICA DAVEY

Voters went to the polls as part of the broadest recall effort in state history.

Effort in Senate to Close Offshore Tax Havens
By DAVID KOCIENIEWSKI

Two senior Democratic senators are pushing to help reduce the federal deficit by tightening rules that allow hedge funds, derivatives traders and corporations to skirt federal taxes.

Family of Robert F. Kennedy Rethinks His Place at Library
By ADAM CLYMER and DON VAN NATTA Jr.

As archivists prepare to make public 63 boxes of Robert F. Kennedy’s papers at the John F. Kennedy Library, his family members are having second thoughts about where they should be housed.

Archive

Recent Posts

July 12

McConnell Proposal Gives Obama Power to Increase Debt Limit

Mitch McConnell said Tuesday that a bipartisan budget deal with President Obama is probably out of reach, and he proposed a plan under which the president could increase the federal debt limit without prior Congressional approval for offsetting spending cuts.

July 12

Fiscal Puzzle: Trying to Balance a City Budget

By at least one rather unscientific measure, armchair budget-makers seem to find it easier to close shortfalls by raising taxes than do elected officials who must bear the political brunt of their actions.

July 12

Ron Paul Will Not Seek Re-Election

Ron Paul, the Texas Republican credited with paving the way for the Tea Party movement, will focus on his presidential bid.

July 12

Young Executives to Meet With Capital’s Old Guard

Seeking to attract a new generation of voters, the White House and U.S. Chamber of Commerce plan to meet with 125 young executives.

July 12

Herman Cain Can Sing!

Turns out, Herman Cain has a pretty good singing voice.

About The Caucus

The latest on President Obama, Congress and other political news from Washington and around the nation from the staff of The New York Times.

+ + \ No newline at end of file diff --git a/regression_test_data/nytimes-000-rdbl.html b/regression_test_data/nytimes-000-rdbl.html new file mode 100644 index 0000000..e53e435 --- /dev/null +++ b/regression_test_data/nytimes-000-rdbl.html @@ -0,0 +1 @@ +
Mitch McConnell of Kentucky, the Senate Republican leader, left, and Jon Kyl, the Republican whip, unveiled a proposal Tuesday that would allow an increase in the debt ceiling.Susan Walsh/Associated PressMitch McConnell of Kentucky, the Senate Republican leader, left, and Jon Kyl, the Republican whip, unveiled a proposal Tuesday that would allow an increase in the debt ceiling.

The Senate Republican leader, Mitch McConnell of Kentucky, said Tuesday that a bipartisan budget deal with President Obama was probably out of reach, and he proposed a plan under which the president could increase the federal debt limit without prior Congressional approval for offsetting spending cuts.

Mr. McConnell’s proposal reflected a growing sense of pessimism on Capitol Hill about the prospects that Mr. Obama and Congressional leaders could come to terms on a budget deal before the government’s borrowing authority hit its limit on Aug. 2. The negotiators sat down for another round of talks at the White House on Tuesday afternoon.

In an interview with CBS News, Mr. Obama said he “cannot guarantee” that the government can pay benefits next month to Social Security recipients, veterans and the disabled if Congress does not increase the federal debt limit, raising the political stakes even as Republicans hardened their opposition to him.

Mr. McConnell’s proposal would give Mr. Obama sweeping power to increase the government’s borrowing authority, in increments, by up to $2.4 trillion — enough, it is estimated, to cover federal obligations through next year — only if Mr. Obama specifies spending cuts of equal amounts. But Congress would not have to approve the spending cuts prior to the debt-limit increase.

It is not clear whether House Republicans would sign on to such a measure, given their drive to extract deep spending cuts in return for any debt-limit increase.

Mr. McConnell, who after the midterm elections last November said Republicans’ goal would be to make Mr. Obama a one-term president, said in his Senate speech, “After years of discussions and months of negotiations, I have little question that as long as this president is in the Oval Office, a real solution is probably unattainable.”

At the White House, press secretary Jay Carney responded, “This president’s going to be in office for at least another 18 months, and I think the American people expect Congress to work with him.”

With the Aug. 2 deadline for raising the government’s $14.3 trillion debt limit just three weeks away, Tuesday seemed to mark a new low in the summer’s maneuvering between the White House and Congressional Republicans to agree to a debt-reduction package that would clear the way for a vote on the debt limit. The back-and-forth had people in both parties nervously eyeing the financial markets, fretful that the political fighting would unnerve them; to date, Wall Street has been generally complacent that the White House and Congress ultimately would come to an agreement, if only because failing to do so would hold such economic peril.

Representatives of major business groups, including the U.S. Chamber of Commerce and the Business Roundtable, put out public statements urging a bipartisan accord on raising the debt limit. They had met with the Treasury secretary, Timothy F. Geithner, on Monday and other business groups and individuals similarly have contacted the White House to volunteer help in reaching a compromise.

New York Mayor Michael R. Bloomberg opened an unrelated event on Tuesday morning by saying, “If America, for the first time in its history, defaults on its obligations, it would have a catastrophic effect on our financial system and on our credibility around the world. It would also take a serious toll on our economy, and that at a time when the nation is still trying to recover from the deep recession.”

“America’s good name and credit are just too important to be held hostage to Washington gridlock, and I hope that in the end cooler heads will prevail and an agreement will be reached quickly,” Mr. Bloomberg added.

In advance of what have become daily White House meetings, Congressional Republican leaders stood firm against raising taxes on the wealthy and businesses after 2013, as Mr. Obama demands as part of a “balanced package” with deep spending cuts, including in Medicare and Medicaid.

Mr. Obama, responding to a question in an interview with CBS News, said, “I cannot guarantee that those checks go out on August 3rd if we haven’t resolved this issue. Because there may simply not be the money in the coffers to do it.”

He added, “This is not just a matter of Social Security checks. These are veterans checks, these are folks on disability and their checks. There are about 70 million checks that go out.”

\ No newline at end of file diff --git a/regression_test_data/nytimes-000.yaml b/regression_test_data/nytimes-000.yaml new file mode 100644 index 0000000..2d907e7 --- /dev/null +++ b/regression_test_data/nytimes-000.yaml @@ -0,0 +1,2 @@ +test_description: nytimes article +url: http://thecaucus.blogs.nytimes.com/2011/07/12/mcconnell-proposal-gives-obama-power-to-increase-debt-limit/?hp diff --git a/regression_test_data/nytimes-001.yaml b/regression_test_data/nytimes-001.yaml new file mode 100644 index 0000000..adcfd9f --- /dev/null +++ b/regression_test_data/nytimes-001.yaml @@ -0,0 +1,9 @@ +test_description: multi-page article from nytimes +enabled: false +notes: multi-page not yet implemented +url: http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html +url_map: + http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=2: nytimes-000-orig-2.html + http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=3: nytimes-000-orig-3.html + http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=4: nytimes-000-orig-4.html + http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=5: nytimes-000-orig-5.html diff --git a/regression_test_data/washingtonpost-000-orig.html b/regression_test_data/washingtonpost-000-orig.html new file mode 100644 index 0000000..9b5e0bc --- /dev/null +++ b/regression_test_data/washingtonpost-000-orig.html @@ -0,0 +1,1802 @@ + + + + +Don’t blame ‘both sides’ for debt impasse - The Washington Post + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + +
+ + + + +
+ + + + + + + + +
+
+ In the News + +
    + +
+
+
+
+ +
+ + +
+ +
+
+
+
+
+ Eugene Robinson +
+
+
+
Eugene Robinson
+
+Opinion Writer +
+
+
+
+
+
+

Don’t blame ‘both sides’ for debt impasse

+
+ +
+ + + +
+
+
+ + +
+
+
+
+

Washington has many lazy habits, and one of the worst is a reflexive tendency to see equivalence where none exists. Hence the nonsense, being peddled by politicians and commentators who should know better, that “both sides” are equally at fault in the deadlocked talks over the debt ceiling.

+

This is patently false. The truth is that Democrats have made clear they are open to a compromise deal on budget cuts and revenue increases. Republicans have made clear they are not.

+
+ +
+
+

Loading...

+
+

Comments

+
+ +
+
+
+
+
+ +

+

+
+

+
+
+

Eugene Robinson

+

+Writes about politics and culture in twice-a-week columns and on the PostPartisan blog. +

+

Archive

+ +
+ +
+

Gallery

+ + +
+
+

Video

+
+ Speaker John Boehner says he agrees with President Barack Obama that the nation's borrowing limit must be raised to avoid a government default but insists that House Republicans won't back any deal with tax increases. (July 11) + +
+

Speaker John Boehner says he agrees with President Barack Obama that the nation's borrowing limit must be raised to avoid a government default but insists that House Republicans won't back any deal with tax increases. (July 11)

+
+
+

More on this Topic

+ +

View all Items in this Story

+ +
+ +
+ Live Q&A, 12:30 p.m. ET

Live Q&A, 12:30 p.m. ET

On the ethics of negotiation

+

+Ask Now +

+
+
+ + + + + +
+
+
+

Put another way, Democrats reacted to the “grand bargain” proposed by President Obama and House Speaker John Boehner by squawking, complaining and highlighting elements they didn’t like. This is known throughout the world as the way to begin a process of negotiation.

Republicans, by contrast, answered with a definitive “no” and then covered their ears. Given the looming Aug. 2 deadline for default if the debt ceiling is not raised, the proper term for this approach is blackmail.

Yet the “both sides are to blame” narrative somehow gained currency after Boehner announced Saturday that House Republicans would not support any increase in revenue, period. A false equivalence was drawn between the absolute Republican rejection of “revenue-positive” tax reform and the less-than-absolute Democratic opposition to “benefit cuts” in Medicare and Social Security.

The bogus story line is that the radical right-wing base of the GOP and the radical left-wing base of the Democratic Party are equally to blame for sinking the deal.

Leave aside, for the moment, the fact that in the Obama-Boehner proposal, there would be roughly three dollars’ worth of budget cuts for every dollar of new revenue. Don’t pause to ask whether it makes sense to slash government spending when the economy is still sputtering out of the worst recession in decades. Instead, focus narrowly on the politics of the deal.

It is true that House Minority Leader Nancy Pelosi howled like a blindsided politician when she learned that entitlement programs were on the table. But her objections — and those of Democrats in general — are philosophical and tactical, not absolute.

Progressives understand that Medicare and Social Security are not sustainable on their current trajectories; in the long term, both must have their revenue and costs brought into balance. Pelosi’s position is that each program should be addressed with an eye toward sustainability — not as a part of a last-minute deal for a hike in the debt ceiling that covers us for two or three years.

It’s also true that Democrats believe they can win back a passel of House seats next year by highlighting the GOP plan to convert Medicare into a voucher program. They don’t want Republicans to be able to point and say, “See, the Democrats want to cut Medicare, too.”

There’s nothing in these Democratic objections, however, that couldn’t be creatively finessed. You can claim you haven’t actually “cut” a benefit, for example, if what you’ve done is restrained the rate at which its cost will grow. You can offset spending with new revenue, and you can do so in a way that gives low-income taxpayers a break. Democrats left the door open and these options could have been explored.

The story on the Republican side is entirely different. There are ways to finesse a “no new taxes” pledge, too. Instead of raising tax rates, you close loopholes in the name of reform; you add an enhancement here, a “user fee” there, and you can manage to get the revenue you need and still claim you haven’t voted to raise taxes.

But Republicans are taking the position that not a cent of new revenue can be raised, no matter the euphemism. Some Democrats, yes, are being scratchy and cantankerous. But Republicans are refusing to negotiate at all. That’s not the same thing.

I understand why President Obama, in his news conference Monday, chided “each side” for taking a “maximalist position.” For political and practical reasons, it’s advantageous for him to be seen as an honest broker.

Meanwhile, though, the clock ticks toward Aug. 2 and the possibility of a catastrophic default becomes more real. And no one should be confused about what the president confronts: On one side, grousing and grumbling. On the other, a brick wall.

+ + +Eugene Robinson will be online to chat with readers at 1 p.m. Eastern time Tuesday. Submit your questions before or during the discussion. + +

+
+ + + +
+
+ +
+ + +
+
+
+
+

+
+
+
+ +

+

+
+
+ +
+ + +
+ + + + +
+ +
+ Weigh In + + Discussion Policy + + + About Discussions, Badges + +
+ + +
 
+
+ +
+
  • + + +
+ + +
+ + +
+
+
+
+ + + + +
+ +
+ + + +
+ + + + + +
+
+
+
+
+ + + +
+
+ Trove link goes here +
+
+

The Post Most: Opinions — Most-viewed stories, videos and galleries in the past two hours +

+ +
+
+

Today's Opinions Poll

+
+
+ + +
+ + + + + + + + + + + + + + +
+
+
+ + +
+ +
+ + + +
+ + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/regression_test_data/washingtonpost-000-rdbl.html b/regression_test_data/washingtonpost-000-rdbl.html new file mode 100644 index 0000000..04e84d0 --- /dev/null +++ b/regression_test_data/washingtonpost-000-rdbl.html @@ -0,0 +1,6 @@ +

Put another way, Democrats reacted to the “grand bargain” proposed by President Obama and House Speaker John Boehner by squawking, complaining and highlighting elements they didn’t like. This is known throughout the world as the way to begin a process of negotiation.

Republicans, by contrast, answered with a definitive “no” and then covered their ears. Given the looming Aug. 2 deadline for default if the debt ceiling is not raised, the proper term for this approach is blackmail.

Yet the “both sides are to blame” narrative somehow gained currency after Boehner announced Saturday that House Republicans would not support any increase in revenue, period. A false equivalence was drawn between the absolute Republican rejection of “revenue-positive” tax reform and the less-than-absolute Democratic opposition to “benefit cuts” in Medicare and Social Security.

The bogus story line is that the radical right-wing base of the GOP and the radical left-wing base of the Democratic Party are equally to blame for sinking the deal.

Leave aside, for the moment, the fact that in the Obama-Boehner proposal, there would be roughly three dollars’ worth of budget cuts for every dollar of new revenue. Don’t pause to ask whether it makes sense to slash government spending when the economy is still sputtering out of the worst recession in decades. Instead, focus narrowly on the politics of the deal.

It is true that House Minority Leader Nancy Pelosi howled like a blindsided politician when she learned that entitlement programs were on the table. But her objections — and those of Democrats in general — are philosophical and tactical, not absolute.

Progressives understand that Medicare and Social Security are not sustainable on their current trajectories; in the long term, both must have their revenue and costs brought into balance. Pelosi’s position is that each program should be addressed with an eye toward sustainability — not as a part of a last-minute deal for a hike in the debt ceiling that covers us for two or three years.

It’s also true that Democrats believe they can win back a passel of House seats next year by highlighting the GOP plan to convert Medicare into a voucher program. They don’t want Republicans to be able to point and say, “See, the Democrats want to cut Medicare, too.”

There’s nothing in these Democratic objections, however, that couldn’t be creatively finessed. You can claim you haven’t actually “cut” a benefit, for example, if what you’ve done is restrained the rate at which its cost will grow. You can offset spending with new revenue, and you can do so in a way that gives low-income taxpayers a break. Democrats left the door open and these options could have been explored.

The story on the Republican side is entirely different. There are ways to finesse a “no new taxes” pledge, too. Instead of raising tax rates, you close loopholes in the name of reform; you add an enhancement here, a “user fee” there, and you can manage to get the revenue you need and still claim you haven’t voted to raise taxes.

But Republicans are taking the position that not a cent of new revenue can be raised, no matter the euphemism. Some Democrats, yes, are being scratchy and cantankerous. But Republicans are refusing to negotiate at all. That’s not the same thing.

I understand why President Obama, in his news conference Monday, chided “each side” for taking a “maximalist position.” For political and practical reasons, it’s advantageous for him to be seen as an honest broker.

Meanwhile, though, the clock ticks toward Aug. 2 and the possibility of a catastrophic default becomes more real. And no one should be confused about what the president confronts: On one side, grousing and grumbling. On the other, a brick wall.

+ + +Eugene Robinson will be online to chat with readers at 1 p.m. Eastern time Tuesday. Submit your questions before or during the discussion. + +

\ No newline at end of file diff --git a/regression_test_data/washingtonpost-000.yaml b/regression_test_data/washingtonpost-000.yaml new file mode 100644 index 0000000..8b158e9 --- /dev/null +++ b/regression_test_data/washingtonpost-000.yaml @@ -0,0 +1,2 @@ +test_description: washingtonpost.com op-ed +url: http://www.washingtonpost.com/opinions/dont-blame-both-sides-for-debt-impasse/2011/07/11/gIQA0XDg9H_story.html?hpid=z1 diff --git a/regression_test_output/.gitignore b/regression_test_output/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/regression_test_output/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/src/readability_lxml/readability.py b/src/readability_lxml/readability.py index b4d1e10..64fbbb1 100755 --- a/src/readability_lxml/readability.py +++ b/src/readability_lxml/readability.py @@ -39,6 +39,11 @@ REGEXES = { 'tool|widget'), re.I), 'divToPElementsRe': re.compile( '<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I), + # Match: next, continue, >, >>, but not >|, as those usually mean last. 
+ 'nextLink': re.compile(r'(next|weiter|continue|>([^\|]|$))', re.I), + 'prevLink': re.compile(r'(prev|earl|old|new|<)', re.I), + 'page': re.compile(r'pag(e|ing|inat)', re.I), + 'firstLast': re.compile(r'(first|last)', re.I) #'replaceBrsRe': re.compile('(]*>[ \n\r\t]*){2,}',re.I), #'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I), #'trimRe': re.compile('^\s+|\s+$/'), @@ -94,6 +99,12 @@ def text_length(i): return len(clean(i.text_content() or "")) +def tags(node, *tag_names): + for tag_name in tag_names: + for e in node.findall('.//%s' % tag_name): + yield e + + def clean_segment_extension(segments, index, segment): if segment.find('.') == -1: return segment @@ -206,6 +217,120 @@ def find_base_url(url): return urlparse.urlunsplit(new_parts) +class CandidatePage(): + + def __init__(self, link_text, href): + self.link_text = link_text + self.href = href + self.score = 0 + +def same_domain(lhs, rhs): + split_lhs = urlparse.urlsplit(lhs) + split_rhs = urlparse.urlsplit(rhs) + if split_lhs.netloc == '' or split_rhs.netloc == '': + return True + else: + return split_lhs.netloc == split_rhs.netloc + +def strip_trailing_slash(s): + return re.sub(r'/$', '', s) + +def eval_possible_next_page_link( + parsed_urls, + url, + base_url, + candidates, + link + ): + raw_href = link.get('href') + + # An anchor without an href cannot be followed; ignore it. + if raw_href is None: + return + + href = strip_trailing_slash(raw_href) + logging.debug('evaluating next page link: %s' % href) + + if href == base_url or href == url or href in parsed_urls: + return + + # If it's on a different domain, skip it. 
+ if not same_domain(url, href): + logging.debug('rejecting %s: different domain' % href) + return + + link_text = clean(link.text_content() or '') + + if REGEXES['extraneous'].search(link_text) or len(link_text) > 25: + return + + href_leftover = href.replace(base_url, '') + if not re.search(r'\d', href_leftover): + return + + if href in candidates: + candidates[href].link_text += ' | ' + link_text + else: + candidates[href] = CandidatePage(link_text, href) + + candidate = candidates[href] + + if href.find(base_url) != 0: + candidate.score -= 25 + + link_class_name = link.get('class') or '' + link_id = link.get('id') or '' + link_data = ' '.join([link_text, link_class_name, link_id]) + + if REGEXES['nextLink'].search(link_data): + candidate.score += 50 + + if REGEXES['page'].search(link_data): + candidate.score += 25 + + if REGEXES['firstLast'].search(link_data): + if not REGEXES['nextLink'].search(candidate.link_text): + candidate.score -= 65 + + neg_re = REGEXES['negativeRe'] + ext_re = REGEXES['extraneous'] + if neg_re.search(link_data) or ext_re.search(link_data): + candidate.score -= 50 + + if REGEXES['prevLink'].search(link_data): + candidate.score -= 200 + + # TODO: Score ancestry. + # TODO: Score a bunch of other stuff. + +def find_next_page_link(parsed_urls, url, elem): + links = tags(elem, 'a') + base_url = find_base_url(url) + # candidates is a mapping from URLs to CandidatePage objects that represent + # information used to determine if a URL points to the next page in the + # article. 
+ candidates = {} + for link in links: + eval_possible_next_page_link( + parsed_urls, + url, + base_url, + candidates, + link + ) + top_page = None + for url, page in candidates.items(): + logging.debug('next page score of %s: %s' % (url, page.score)) + if 50 <= page.score and (not top_page or top_page.score < page.score): + top_page = page + + if top_page: + parsed_urls.add(top_page.href) + return top_page.href + else: + return None + + class Document: """Class to build a etree document out of html.""" TEXT_LENGTH_THRESHOLD = 25 @@ -292,9 +417,9 @@ class Document: while True: self.html = self._parse(self.input_doc) - for i in self.tags(self.html, 'script', 'style'): + for i in tags(self.html, 'script', 'style'): i.drop_tree() - for i in self.tags(self.html, 'body'): + for i in tags(self.html, 'body'): i.set('id', 'readabilityBody') if ruthless: self.remove_unlikely_candidates() @@ -434,8 +559,10 @@ class Document: 'min_text_length', self.TEXT_LENGTH_THRESHOLD) candidates = {} + #self.debug(str([describe(node) for node in tags(self.html, "div")])) + ordered = [] - for elem in self.tags(self.html, "p", "pre", "td"): + for elem in tags(self.html, "p", "pre", "td"): self.debug('Scoring %s' % describe(elem)) parent_node = elem.getparent() if parent_node is None: @@ -540,14 +667,14 @@ class Document: elem.drop_tree() def transform_misused_divs_into_paragraphs(self): - for elem in self.tags(self.html, 'div'): + for elem in tags(self.html, 'div'): # transform
s that do not contain other block elements into

s if not REGEXES['divToPElementsRe'].search(unicode(''.join(map(tostring, list(elem))))): self.debug("Altering %s to p" % (describe(elem))) elem.tag = "p" #print "Fixed element "+describe(elem) - for elem in self.tags(self.html, 'div'): + for elem in tags(self.html, 'div'): if elem.text and elem.text.strip(): p = fragment_fromstring('

') p.text = elem.text @@ -568,15 +695,6 @@ class Document: #print 'Dropped
at '+describe(elem) child.drop_tree() - def findNextPageLink(self, elem): - allLinks = self.tags(elem, ['a']) - baseUrl = self.find_base_url(self.options['url']) - - def tags(self, node, *tag_names): - for tag_name in tag_names: - for e in node.findall('.//%s' % tag_name): - yield e - def reverse_tags(self, node, *tag_names): for tag_name in tag_names: for e in reversed(node.findall('.//%s' % tag_name)): @@ -585,13 +703,13 @@ class Document: def sanitize(self, node, candidates): MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD) - for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"): + for header in tags(node, "h1", "h2", "h3", "h4", "h5", "h6"): class_weight = self.class_weight(header) link_density = self.get_link_density(header) if class_weight < 0 or link_density > 0.33: header.drop_tree() - for elem in self.tags(node, "form", "iframe", "textarea"): + for elem in tags(node, "form", "iframe", "textarea"): elem.drop_tree() allowed = {} # Conditionally clean s,
    s, and
    s @@ -663,6 +781,26 @@ class Document: ' many s') to_remove = True + +# if el.tag == 'div' and counts['img'] >= 1 and to_remove: +# imgs = el.findall('.//img') +# valid_img = False +# self.debug(tounicode(el)) +# for img in imgs: +# +# height = img.get('height') +# text_length = img.get('text_length') +# self.debug ("height %s text_length %s" %(repr(height), repr(text_length))) +# if to_int(height) >= 100 or to_int(text_length) >= 100: +# valid_img = True +# self.debug("valid image" + tounicode(img)) +# break +# if valid_img: +# to_remove = False +# self.debug("Allowing %s" %el.text_content()) +# for desnode in tags(el, "table", "ul", "div"): +# allowed[desnode] = True + # don't really understand what this is doing. Originally # the i/j were =+ which sets the value to 1. I think that # was supposed to be += which would increment. But then @@ -670,6 +808,8 @@ class Document: # ever do one loop in each iteration and don't understand # it. Will have to investigate when we get to testing more # pages. 
+ + #find x non empty preceding and succeeding siblings i, j = 0, 0 x = 1 @@ -694,7 +834,7 @@ class Document: if siblings and sum(siblings) > 1000: to_remove = False self.debug("Allowing %s" % describe(el)) - for desnode in self.tags(el, "table", "ul", "div"): + for desnode in tags(el, "table", "ul", "div"): allowed[desnode] = True if to_remove: diff --git a/src/tests/helpers.py b/src/tests/helpers.py new file mode 100644 index 0000000..029c892 --- /dev/null +++ b/src/tests/helpers.py @@ -0,0 +1,15 @@ +import os + + +SAMPLES = os.path.join(os.path.dirname(__file__), 'samples') +REGRESSION_DATA = os.path.join(os.path.dirname(__file__), 'test_data') + + +def load_sample(filename): + """Helper to get the content out of the sample files""" + return open(os.path.join(SAMPLES, filename)).read() + + +def load_regression_data(filename): + """Get the content of a test_data regression file""" + return open(os.path.join(REGRESSION_DATA, filename)).read() diff --git a/src/tests/test_data/nytimes-next-page.html b/src/tests/test_data/nytimes-next-page.html new file mode 100644 index 0000000..c938ed1 --- /dev/null +++ b/src/tests/test_data/nytimes-next-page.html @@ -0,0 +1,975 @@ + + + + + + + + + + +The Dark Art of ‘Breaking Bad’ - NYTimes.com + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + +
    +
    + +
    + +Click Here +
    + +
    +
    +
    + + + + + + +
    +
    +
    + + +
    + +

    + +Magazine +

    + +
    + + + + +
    +
    + + + +
    + +
    + + +
    +
    +
    +
    + +
    +

    The Dark Art of ‘Breaking Bad’

    +
    +
    Robert Yager for The New York Times
    +

    Gilligan on the set with the actors Bryan Cranston and Aaron Paul.

    +
    + + + + +
    +
    +
    +
      +
    • comments
    • + +
    • +Print +
    • +
    • + Single Page +
    • + + + +
    • + + + + + + + + + +Reprints +
    • +
      +
    +
    + +
    +
    +
    +
    + + + + + + + + + +

    +In the first three seasons of the AMC series “Breaking Bad,” Aaron Paul — or rather, his meth-dealing character, Jesse Pinkman — has been slapped, mauled and beaten purple by, respectively, a hit man, a sociopath and a federal drug-enforcement agent. If he were a piñata, the candy would have poured out of this guy long ago. And apparently there is little mercy for Paul in the new season on the way. For there Paul was, one day in late May, standing on Tijeras Avenue in downtown Albuquerque, being tasered by a brawny man in sunglasses.

    +
    +
    + + +
    +
    Multimedia
    +
    +
    +
    + +
    +
    + +
    +
    +
    + +
    + +
    Robert Yager for The New York Times
    +

    The goal, Gilligan says, was to turn "Mr. Chips into Scarface."

    +
    + +
    +
    +
    +

    +The street had been blocked off, and a crew of dozens waited as the actors rehearsed the assault with Vince Gilligan, the creator, head writer and show runner, who was also directing the episode.

    +“Maybe we play this moment just a little longer, so we know for sure he got zapped,” Gilligan said. “Otherwise, Jesse would fight back more.”

    +“Yeah, I like that,” Paul said.

    +“And let’s go back to the brass-knuckle-looking taser,” Gilligan said.

    +“Fly in the brass-knuckle taser!” a nearby crew member shouted into a walkie-talkie.

    +As the cameras were moved into place, Gilligan, who is 44 and speaks in a lyrical Southern drawl, reminisced fondly about some of the torments he has inflicted on Jesse Pinkman. One of the most gruesome was a plunge through the roof of a Port-a-Potty in a junkyard in Season 2.

    +“The original version was that he was going to get bit by a guard dog,” Gilligan said, leaning up against a rail and squinting against the New Mexico sun. “But the guard dog would have cost us $25,000, and we didn’t have the money. So we came up with the $5,000 outhouse gag. Which is quite a bit more memorable.”

    +Mordantly amusing ordeals are a specialty on “Breaking Bad,” which begins its fourth season on July 17. Credit the show’s forbiddingly grim premise: A 50-year-old high-school chemistry teacher named Walter White (played by Bryan Cranston) finds out he has terminal lung cancer and starts making crystal meth, hoping to leave behind a nest egg for his son and pregnant wife. Walter, it emerges, is a chemistry wizard, and after teaming up with Pinkman, a burnout student he once flunked, the pair drive a ramshackle R.V. into the desert and confect the purest, most coveted meth that local dealers have ever known. With the death penalty of his diagnosis looming, Walt wakes from the slumber of an unfulfilling life, evolving from feckless drudge to reluctant part-time criminal, then gradually to something worse.

    +In its first season, “Breaking Bad” seemed like the story of the nuttiest midlife crisis ever, told with elements that felt vaguely familiar. The structure — felonious dad copes with stress of work and family; complications ensue — owed an obvious debt to “The Sopranos,” and the collision of regular people and colorfully violent thugs nodded to Tarantino. The story and setting were an update of the spaghetti Western, minus the cowboys and set in the present.

    +But it was soon clear that “Breaking Bad” was something much more satisfying and complex: a revolutionary take on the serial drama. What sets the show apart from its small-screen peers is a subtle metaphysical layer all its own. As Walter inches toward damnation, Gilligan and his writers have posed some large questions about good and evil, questions with implications for every kind of malefactor you can imagine, from Ponzi schemers to terrorists. Questions like: Do we live in a world where terrible people go unpunished for their misdeeds? Or do the wicked ultimately suffer for their sins?

    +Gilligan has the nerve to provide his own hopeful answer. “Breaking Bad” takes place in a universe where nobody gets away with anything and karma is the great uncredited player in the cast. This moral dimension might explain why “Breaking Bad” has yet to achieve pop cultural breakthrough status, at least on the scale of other cable hits set in decidedly amoral universes, like “True Blood” or “Mad Men,” AMC’s far-more-buzzed-about series that takes place in an ad agency in the ’60s. The total audience for “Breaking Bad” is only slightly smaller than that of “Mad Men” — 19.5 million versus 22.4 million cumulative viewers in their respective third seasons — but the top three markets for “Breaking Bad” are Albuquerque/Santa Fe, Kansas City and Memphis; neither New York nor Los Angeles are in its top 10. The show, in other words, doesn’t play on the coasts. It gets chatter, just not among what has long been considered the chattering class.

    +

    David Segal is a reporter for the Business section of The New York Times. His most recent article in the magazine was about the New York Cosmos. Editor: Adam Sternbergh (a.sternbergh-MagGroup@nytimes.com)

    +
    +
    +
    + + +
    + +
    +
    + +
    +
    +
    +
    +
    +
    +
      +
    • comments
    • + +
    • +Print +
    • +
    • + Single Page +
    • + + + +
    • + + + + + + + + + +Reprints +
    • +
      +
    +
    +
    +
    + +
    +
    + +
    +
+ + + + +
+

+The New York Times and the Bay Area -- save 50% on home delivery plus FREE All Digital Access. + +

+
+

+ +
+ +
+
+
+
+
+
+
+
Get Free E-mail Alerts on These Topics
+
+ + + + + +
+
+
+
+ +
+ + + + + +
+ + +
+ +
+ +
+
+ +
+
+ +
+
+ + + +
Advertise on NYTimes.com
+ +
+ +
+
+ +
+ + +
+ +
+
+ +
+
+ +
+
+
+ +
+
+ +
+
+ + + +
+
+ +
+

+Advertisements

+
+
+
+ +
+
+
+
+
+
+
+
+ +
+
+ +
+ + + +
+ +
+
+ +
+
+ + +
+
+ +
+
+ +
+ + +
+ + + + + + +
+ + + +
+ +
+ +

+ Inside NYTimes.com

+
+ + +
+ + + + + + + + + + + + + + + + + +
+
+
+ Music » +
+
+ City Opera’s Troubled Vision Quest +
+
City Opera’s Troubled Vision Quest
+
+
+
+
+ Fashion & Style » +
+
+ Pumpkin Pie: Provocative or Just Tasty? +
+
Pumpkin Pie: Provocative or Just Tasty?
+
+
+
+
Opinion »
+

Bloggingheads: Libya and Impeachment

+

Glenn Greenwald of Salon.com and Ilya Somin of George Mason University School of Law debate options on Libya.

+
+
+
+
+ Fashion & Style » +
+
+ Model and Front-Runner +
+
Model and Front-Runner
+
+
+
+
+ Opinion » +
+
+ Op-Ed: Vive la Similarité +
+
Op-Ed: Vive la Similarité
+
+
+
+
+ N.Y. / Region » +
+
+ A Mormon Spectacle, Way Off Broadway +
+
A Mormon Spectacle, Way Off Broadway
+
+
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/tests/test_readability.py b/src/tests/test_readability.py index 7c89579..4a6c884 100644 --- a/src/tests/test_readability.py +++ b/src/tests/test_readability.py @@ -1,5 +1,6 @@ import unittest +from helpers import load_regression_data from readability_lxml.readability import Document from readability_lxml import readability as r @@ -139,3 +140,18 @@ class TestFindBaseUrl(unittest.TestCase): ) ] self._run_urls(specs) + + +class TestFindNextPageLink(unittest.TestCase): + + def test_nytimes(self): + # This better work for the New York Times. + html = load_regression_data('nytimes-next-page.html') + expected = '/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=2&_r=1' + + doc = r.document_fromstring(html) + url = 'http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html' + parsed_urls = {url} + actual = r.find_next_page_link(parsed_urls, url, doc) + logging.debug('next page link: ' + str(actual)) + diff --git a/src/tests/test_sample_articles.py b/src/tests/test_sample_articles.py index a52f5bd..fdb2c89 100644 --- a/src/tests/test_sample_articles.py +++ b/src/tests/test_sample_articles.py @@ -2,22 +2,15 @@ import os import unittest +from helpers import load_sample from readability_lxml.readability import Document - -SAMPLES = os.path.join(os.path.dirname(__file__), 'samples') - sample_list = [ 'nyt.sample.html', 'si-game.sample.html', ] -def load_sample(filename): - """Helper to get the content out of the sample files""" - return open(os.path.join(SAMPLES, filename)).read() - - def test_processes(): for article in sample_list: yield process_article, article