Compare commits
85 Commits
Author | SHA1 | Date |
---|---|---|
Richard Harding | d708744822 | 12 years ago |
Jerry Charumilind | eefb8e1125 | 12 years ago |
Richard Harding | c931a80ba8 | 12 years ago |
Jerry Charumilind | 883a02ad5d | 12 years ago |
Richard Harding | cfc6f94634 | 12 years ago |
Jerry Charumilind | 816c66482e | 12 years ago |
Richard Harding | 99d5fc0a87 | 12 years ago |
Jerry Charumilind | f02fe79840 | 12 years ago |
Richard Harding | 5cb4b8b8c0 | 12 years ago |
Jerry Charumilind | f8315d011c | 12 years ago |
Richard Harding | 99efa5c10b | 12 years ago |
Richard Harding | a012fd2362 | 12 years ago |
Jerry Charumilind | 3fe416a5d1 | 12 years ago |
Richard Harding | 8cadc4a958 | 12 years ago |
Richard Harding | 9765d13e90 | 12 years ago |
Jerry Charumilind | 32d1764e83 | 12 years ago |
Richard Harding | 0951647c8e | 12 years ago |
Richard Harding | ace51a6819 | 12 years ago |
Jerry Charumilind | 2505c78e5b | 12 years ago |
Richard Harding | edc0e4d4c6 | 12 years ago |
Jerry Charumilind | 6abc6f7ef2 | 12 years ago |
Jerry Charumilind | 1e30e33302 | 12 years ago |
Richard Harding | e8a6250605 | 12 years ago |
Jerry Charumilind | 62df35570d | 12 years ago |
Richard Harding | 29fceeb4b1 | 12 years ago |
Richard Harding | 6f8184be27 | 12 years ago |
Richard Harding | 9aef5e36b7 | 12 years ago |
Jerry Charumilind | 8988b6b767 | 12 years ago |
Jerry Charumilind | 7d097d5f11 | 12 years ago |
Jerry Charumilind | b04f75239c | 12 years ago |
Jerry Charumilind | c21f00b1ee | 12 years ago |
Richard Harding | 9fec245ae4 | 12 years ago |
Jerry Charumilind | 6af808bc14 | 12 years ago |
Jerry Charumilind | 7980ca84c9 | 12 years ago |
Richard Harding | a700bb8bd4 | 12 years ago |
Jerry Charumilind | bf203b5a4b | 12 years ago |
Jerry Charumilind | 65989b538a | 12 years ago |
Jerry Charumilind | 9b7e5bb327 | 12 years ago |
Jerry Charumilind | 068eba19ae | 12 years ago |
Richard Harding | 6d3ad559f6 | 12 years ago |
Jerry Charumilind | 5222ed0628 | 12 years ago |
Richard Harding | 6454fb3f37 | 12 years ago |
Richard Harding | 9366436861 | 12 years ago |
Richard Harding | 7dc373e9c5 | 12 years ago |
Richard Harding | b1966df1c3 | 12 years ago |
Richard Harding | 57694cb352 | 12 years ago |
Jerry Charumilind | b78d7e8501 | 12 years ago |
Richard Harding | a2b17e757c | 12 years ago |
Richard Harding | 3347f16d93 | 12 years ago |
Richard Harding | 93ac1111a1 | 12 years ago |
Richard Harding | 08660f6f0c | 12 years ago |
Richard Harding | 35792e7a59 | 12 years ago |
Richard Harding | aa51283dff | 12 years ago |
Richard Harding | a4b6957be2 | 12 years ago |
Richard Harding | b0063ffb3c | 12 years ago |
Richard Harding | 8091a75f00 | 12 years ago |
Richard Harding | 8f420bd950 | 12 years ago |
Richard Harding | 58c69651d3 | 12 years ago |
Richard Harding | 8b0210c4dc | 12 years ago |
Richard Harding | 0f9da8ace4 | 12 years ago |
Richard Harding | dc86283d83 | 12 years ago |
Richard Harding | 2ee2fe9536 | 12 years ago |
Richard Harding | ac5ef73e71 | 12 years ago |
Richard Harding | f5451356ee | 12 years ago |
Richard Harding | 509aed0d9f | 12 years ago |
Richard Harding | 273878214f | 12 years ago |
Richard Harding | 674e5f9ef2 | 12 years ago |
Richard Harding | 1c1cbaefa5 | 12 years ago |
Richard Harding | 7e57767070 | 12 years ago |
Richard Harding | 62e153eaf8 | 12 years ago |
Richard Harding | d11b928504 | 12 years ago |
Richard Harding | a6361854a9 | 12 years ago |
Richard Harding | b498df200b | 12 years ago |
Richard Harding | bbb60ed077 | 12 years ago |
Jerry Charumilind | cc0af7a105 | 13 years ago |
Jerry Charumilind | 82eabfc6b1 | 13 years ago |
Jerry Charumilind | cba19f209b | 13 years ago |
Jerry Charumilind | 18fa6b5146 | 13 years ago |
Jerry Charumilind | cdd30f625e | 13 years ago |
Jerry Charumilind | 7aac0f0855 | 13 years ago |
Jerry Charumilind | ac517834e6 | 13 years ago |
Jerry Charumilind | 01247903b8 | 13 years ago |
Jerry Charumilind | 33f935e39a | 13 years ago |
Jerry Charumilind | 7ceb8e6d7b | 13 years ago |
Jerry Charumilind | 8877754d7e | 13 years ago |
@ -0,0 +1,10 @@
|
||||
Yuri Baburov
|
||||
facundo
|
||||
gfxmonk
|
||||
Jan Weiß
|
||||
Jerry Charumilind
|
||||
Laurent Peuch
|
||||
Lee Semel
|
||||
Rick Harding
|
||||
Sean Brant
|
||||
Tim Cuthbertson
|
@ -0,0 +1,201 @@
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
@ -1,50 +0,0 @@
|
||||
This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
This is a python port of a ruby port of arc90's readability project
|
||||
|
||||
http://lab.arc90.com/experiments/readability/
|
||||
|
||||
In few words,
|
||||
Given a html document, it pulls out the main body text and cleans it up.
|
||||
It also can clean up title based on latest readability.js code.
|
||||
|
||||
Based on:
|
||||
- Latest readability.js ( https://github.com/MHordecki/readability-redux/blob/master/readability/readability.js )
|
||||
- Ruby port by starrhorne and iterationlabs
|
||||
- Python port by gfxmonk ( https://github.com/gfxmonk/python-readability , based on BeautifulSoup )
|
||||
- Decruft effort to move to lxml ( http://www.minvolai.com/blog/decruft-arc90s-readability-in-python/ )
|
||||
- "BR to P" fix from readability.js which improves quality for smaller texts.
|
||||
- Github users contributions.
|
||||
|
||||
Installation::
|
||||
|
||||
easy_install readability-lxml
|
||||
or
|
||||
pip install readability-lxml
|
||||
|
||||
Usage::
|
||||
|
||||
from readability.readability import Document
|
||||
import urllib
|
||||
html = urllib.urlopen(url).read()
|
||||
readable_article = Document(html).summary()
|
||||
readable_title = Document(html).short_title()
|
||||
|
||||
Command-line usage::
|
||||
|
||||
python -m readability.readability -u http://pypi.python.org/pypi/readability-lxml
|
||||
|
||||
|
||||
Document() kwarg options:
|
||||
|
||||
- attributes:
|
||||
- debug: output debug messages
|
||||
- min_text_length:
|
||||
- retry_length:
|
||||
- url: will allow adjusting links to be absolute
|
||||
|
||||
|
||||
Updates
|
||||
|
||||
- 0.2.5 Update setup.py for uploading .tar.gz to pypi
|
||||
|
@ -0,0 +1,92 @@
|
||||
readability_lxml
|
||||
================
|
||||
|
||||
This is a python port of a ruby port of `arc90's readability`_ project
|
||||
|
||||
Given an HTML document, it pulls out the main body text and cleans it up.
|
||||
It can also clean up the title, based on the latest readability.js code.
|
||||
|
||||
|
||||
Inspiration
|
||||
-----------
|
||||
- Latest readability.js ( https://github.com/MHordecki/readability-redux/blob/master/readability/readability.js )
|
||||
- Ruby port by starrhorne and iterationlabs
|
||||
- Python port by gfxmonk ( https://github.com/gfxmonk/python-readability , based on BeautifulSoup )
|
||||
- Decruft effort to move to lxml ( http://www.minvolai.com/blog/decruft-arc90s-readability-in-python/ )
|
||||
- "BR to P" fix from readability.js which improves quality for smaller texts.
|
||||
- Github users contributions.
|
||||
|
||||
|
||||
Try it out!
|
||||
-----------
|
||||
You can try out the parser by entering your test urls on the following test
|
||||
service.
|
||||
|
||||
http://readable.bmark.us
|
||||
|
||||
|
||||
Installation
|
||||
-------------
|
||||
::
|
||||
|
||||
$ easy_install readability-lxml
|
||||
# or
|
||||
$ pip install readability-lxml
|
||||
|
||||
|
||||
Usage
|
||||
------
|
||||
|
||||
Command Line Client
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
::
|
||||
|
||||
$ readability http://pypi.python.org/pypi/readability-lxml
|
||||
$ readability /home/rharding/sampledoc.html
|
||||
|
||||
As a Library
|
||||
~~~~~~~~~~~~
|
||||
::
|
||||
|
||||
from readability.readability import Document
|
||||
import urllib
|
||||
html = urllib.urlopen(url).read()
|
||||
readable_article = Document(html).summary()
|
||||
readable_title = Document(html).short_title()
|
||||
|
||||
You can also use the `summary_with_metadata` method to get back other
|
||||
metadata such as the confidence score found while processing the input.
|
||||
|
||||
::
|
||||
|
||||
doc = Document(html).summary_with_metadata()
|
||||
print doc.html
|
||||
print doc.confidence
|
||||
|
||||
|
||||
Optional `Document` keyword arguments:
|
||||
|
||||
- attributes:
|
||||
- debug: output debug messages
|
||||
- min_text_length:
|
||||
- multipage: should we try to parse and combine multiple page articles?
|
||||
- retry_length:
|
||||
- url: will allow adjusting links to be absolute
|
||||
|
||||
|
||||
Test and Build Status
|
||||
---------------------
|
||||
Tests are run against the package at:
|
||||
|
||||
http://build.bmark.us/job/readability-lxml/
|
||||
|
||||
You can view it for build history and test status.
|
||||
|
||||
|
||||
History
|
||||
-------
|
||||
|
||||
- `0.2.5` Update setup.py for uploading .tar.gz to pypi
|
||||
|
||||
|
||||
.. _arc90's readability: http://lab.arc90.com/experiments/readability/
|
@ -1 +0,0 @@
|
||||
from .readability import Document
|
@ -1,589 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
|
||||
from collections import defaultdict
|
||||
from lxml.etree import tostring
|
||||
from lxml.etree import tounicode
|
||||
from lxml.html import document_fromstring
|
||||
from lxml.html import fragment_fromstring
|
||||
|
||||
from cleaners import clean_attributes
|
||||
from cleaners import html_cleaner
|
||||
from htmls import build_doc
|
||||
from htmls import get_body
|
||||
from htmls import get_title
|
||||
from htmls import shorten_title
|
||||
|
||||
|
||||
# Import-time side effect: installs a default handler on the root logger at
# INFO level (logging.basicConfig is a no-op if the root logger is already
# configured).
logging.basicConfig(level=logging.INFO)
# getLogger() without a name returns the root logger.
log = logging.getLogger()


# Case-insensitive patterns matched against element "class"/"id" values or
# serialized child markup.
REGEXES = {
    # class/id fragments that mark an element for removal in
    # Document.remove_unlikely_candidates ...
    'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
    # ... unless the same class/id string also matches this pattern.
    'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow', re.I),
    # class/id fragments worth +25 in Document.class_weight.
    'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story', re.I),
    # class/id fragments worth -25 in Document.class_weight.
    'negativeRe': re.compile('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget', re.I),
    # Opening tags that stop a <div> from being retagged as <p> in
    # Document.transform_misused_divs_into_paragraphs.
    'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
    # The entries below are disabled; kept for reference only.
    #'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
    #'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
    #'trimRe': re.compile('^\s+|\s+$/'),
    #'normalizeRe': re.compile('\s{2,}/'),
    #'killBreaksRe': re.compile('(<br\s*\/?>(\s| ?)*){1,}/'),
    #'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
    #skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
}
|
||||
|
||||
|
||||
class Unparseable(ValueError):
    """Raised by ``Document.summary`` when summarising the input fails."""
|
||||
|
||||
|
||||
def describe(node, depth=1):
    """Build a short, selector-like label for *node*, used in debug output.

    The label is ``tag#id.class1.class2``; a leading ``div`` is dropped when
    an id or class follows it.  While *depth* is non-zero and the node has a
    parent, the parent's label is appended after ``' - '``.

    :param node: an element exposing ``tag``, ``get()`` and ``getparent()``;
        anything without a ``tag`` attribute is rendered as ``[<type>]``.
    :param depth: how many ancestor levels to append.
    """
    if not hasattr(node, 'tag'):
        return "[%s]" % type(node)

    label = node.tag
    node_id = node.get('id', '')
    if node_id:
        label += '#' + node_id
    css_classes = node.get('class', '')
    if css_classes:
        label += '.' + css_classes.replace(' ', '.')
    if label[:4] in ['div#', 'div.']:
        # "div#main" reads just as well as "#main".
        label = label[3:]

    if not depth:
        return label
    parent = node.getparent()
    if parent is None:
        return label
    return label + ' - ' + describe(parent, depth - 1)
|
||||
|
||||
|
||||
def to_int(x):
    """Convert a size string such as ``"12"``, ``"12px"`` or ``"2em"`` to an int.

    Falsy input yields ``None``.  Surrounding whitespace is ignored, a ``px``
    suffix is dropped and an ``em`` value is multiplied by 12.  Anything
    ``int()`` cannot parse raises ``ValueError``.
    """
    if not x:
        return None
    value = x.strip()
    for unit, scale in (('px', 1), ('em', 12)):
        if value.endswith(unit):
            return int(value[:-2]) * scale
    return int(value)
|
||||
|
||||
|
||||
def clean(text):
    """Normalise whitespace in *text*.

    Whitespace around each newline collapses to a single ``"\\n"``, runs of two
    or more spaces/tabs collapse to one space, and the result is stripped.

    :param text: the string to normalise.
    :returns: the normalised string.
    """
    # Raw strings: "\s" is not a valid string escape and only worked because
    # unknown escapes were passed through unchanged.
    text = re.sub(r'\s*\n\s*', '\n', text)
    text = re.sub(r'[ \t]{2,}', ' ', text)
    return text.strip()
|
||||
|
||||
|
||||
def text_length(i):
    """Return the length of element *i*'s text content after ``clean()``."""
    content = i.text_content() or ""
    return len(clean(content))
|
||||
|
||||
|
||||
class Document:
|
||||
"""Class to build a etree document out of html."""
|
||||
TEXT_LENGTH_THRESHOLD = 25
|
||||
RETRY_LENGTH = 250
|
||||
|
||||
def __init__(self, input, **options):
|
||||
"""Generate the document
|
||||
|
||||
:param input: string of the html content.
|
||||
|
||||
kwargs:
|
||||
- attributes:
|
||||
- debug: output debug messages
|
||||
- min_text_length:
|
||||
- retry_length:
|
||||
- url: will allow adjusting links to be absolute
|
||||
|
||||
"""
|
||||
self.input = input
|
||||
self.options = options
|
||||
self.html = None
|
||||
|
||||
def _html(self, force=False):
|
||||
if force or self.html is None:
|
||||
self.html = self._parse(self.input)
|
||||
return self.html
|
||||
|
||||
def _parse(self, input):
    """Build a cleaned document from *input* and resolve its links.

    With a ``url`` option all links are made absolute against it; otherwise
    only the document's own base href is resolved.
    """
    tree = html_cleaner.clean_html(build_doc(input))
    url = self.options.get('url', None)
    if not url:
        tree.resolve_base_href()
        return tree
    tree.make_links_absolute(url, resolve_base_href=True)
    return tree
|
||||
|
||||
def content(self):
    """Return ``get_body()`` of a freshly re-parsed document."""
    fresh_doc = self._html(True)
    return get_body(fresh_doc)
|
||||
|
||||
def title(self):
    """Return ``get_title()`` of a freshly re-parsed document."""
    fresh_doc = self._html(True)
    return get_title(fresh_doc)
|
||||
|
||||
def short_title(self):
    """Return ``shorten_title()`` of a freshly re-parsed document."""
    fresh_doc = self._html(True)
    return shorten_title(fresh_doc)
|
||||
|
||||
def summary(self, html_partial=False):
    """Generate the summary of the html document

    A first, "ruthless" pass removes unlikely candidates before scoring.
    If that pass finds no candidate, or its cleaned result is shorter than
    the ``retry_length`` option (default ``RETRY_LENGTH``), one lenient pass
    is made that skips the removal step.

    :param html_partial: return only the div of the document, don't wrap
    in html and body tags.

    :raises Unparseable: wrapping any StandardError raised while processing.

    """
    try:
        ruthless = True
        while True:
            # Every attempt starts from a fresh parse, because the steps
            # below modify self.html in place.
            self._html(True)
            for i in self.tags(self.html, 'script', 'style'):
                i.drop_tree()
            for i in self.tags(self.html, 'body'):
                i.set('id', 'readabilityBody')
            if ruthless:
                self.remove_unlikely_candidates()
            self.transform_misused_divs_into_paragraphs()
            candidates = self.score_paragraphs()

            best_candidate = self.select_best_candidate(candidates)

            if best_candidate:
                article = self.get_article(candidates, best_candidate,
                        html_partial=html_partial)
            else:
                if ruthless:
                    log.debug("ruthless removal did not work. ")
                    ruthless = False
                    self.debug(
                        ("ended up stripping too much - "
                         "going for a safer _parse"))
                    # try again
                    continue
                else:
                    # Both passes found nothing: fall back to <body>, or to
                    # the whole document when there is no <body>.
                    log.debug(
                        ("Ruthless and lenient parsing did not work. "
                         "Returning raw html"))
                    article = self.html.find('body')
                    if article is None:
                        article = self.html
            # NOTE(review): sanitize() is defined outside this view; the
            # len() below implies it returns a sized value or None - confirm.
            cleaned_article = self.sanitize(article, candidates)
            article_length = len(cleaned_article or '')
            retry_length = self.options.get(
                'retry_length',
                self.RETRY_LENGTH)
            of_acceptable_length = article_length >= retry_length
            if ruthless and not of_acceptable_length:
                ruthless = False
                # Loop through and try again.
                continue
            else:
                return cleaned_article
    # NOTE: Python 2-only syntax below (StandardError, comma "except" form
    # and three-argument raise, which re-raises with the original traceback).
    except StandardError, e:
        log.exception('error getting summary: ')
        raise Unparseable(str(e)), None, sys.exc_info()[2]
|
||||
|
||||
def get_article(self, candidates, best_candidate, html_partial=False):
    """Collect the best candidate and its related siblings into a new tree.

    A sibling of the best element is kept when it is the best element
    itself, when it is a scored candidate reaching the sibling threshold
    (the larger of 10 and 20% of the best score), or when it is a ``<p>``
    whose own leading text looks like prose:

    - more than 80 characters with a link density under 0.25, or
    - at most 80 characters, no links, and a sentence-ending period.

    Kept siblings are moved out of the source document into the result.

    :param candidates: mapping of element -> ``{'content_score', 'elem'}``.
    :param best_candidate: the entry of *candidates* to build around.
    :param html_partial: return a bare ``<div>`` fragment instead of a
        full html->body->div document.
    :returns: the new fragment or document.
    """
    # Now that we have the top candidate, look through its siblings for
    # content that might also be related.
    # Things like preambles, content split by ads that we removed, etc.
    sibling_score_threshold = max([
        10,
        best_candidate['content_score'] * 0.2])

    # create a new html document with a html->body->div
    if html_partial:
        output = fragment_fromstring('<div/>')
        container = output
    else:
        output = document_fromstring('<div/>')
        # We don't want to append directly to output, but to the div
        # in html->body->div.
        container = output.getchildren()[0].getchildren()[0]

    best_elem = best_candidate['elem']
    parent = best_elem.getparent()
    # The best candidate may be the root element, which has no parent and
    # therefore no siblings; fall back to the element alone.
    if parent is not None:
        siblings = parent.getchildren()
    else:
        siblings = [best_elem]

    for sibling in siblings:
        append = sibling is best_elem

        if sibling in candidates and \
                candidates[sibling]['content_score'] >= sibling_score_threshold:
            append = True

        if sibling.tag == "p":
            link_density = self.get_link_density(sibling)
            node_content = sibling.text or ""
            node_length = len(node_content)

            if node_length > 80 and link_density < 0.25:
                append = True
            elif node_length <= 80 \
                    and link_density == 0 \
                    and re.search(r'\.( |$)', node_content):
                append = True

        if append:
            container.append(sibling)
    return output
|
||||
|
||||
def select_best_candidate(self, candidates):
    """Return the highest-scoring candidate, or ``None`` if there are none.

    The five best candidates are reported through ``self.debug``.

    :param candidates: mapping of element -> ``{'content_score', 'elem'}``.
    """
    def by_score(entry):
        return entry['content_score']

    ranked = sorted(candidates.values(), key=by_score, reverse=True)
    for entry in ranked[:5]:
        self.debug("Top 5 : %6.3f %s" % (
            entry['content_score'],
            describe(entry['elem'])))

    if not ranked:
        return None
    return ranked[0]
|
||||
|
||||
def get_link_density(self, elem):
    """Return the share of *elem*'s cleaned text that sits inside ``<a>`` tags.

    The divisor is at least 1, so an element without text yields ``0.0``.
    """
    anchor_chars = 0
    for anchor in elem.findall(".//a"):
        anchor_chars = anchor_chars + text_length(anchor)
    all_chars = max(text_length(elem), 1)
    return float(anchor_chars) / all_chars
|
||||
|
||||
def score_paragraphs(self):
    """Score the parents and grandparents of every text-bearing element.

    Each ``<p>``, ``<pre>`` and ``<td>`` whose cleaned text is at least
    ``min_text_length`` characters (default ``TEXT_LENGTH_THRESHOLD``) earns
    points: 1, plus one per comma-separated chunk, plus one per full
    100 characters up to 3.  The parent receives the full amount and the
    grandparent half.  Each candidate's total is then scaled by
    ``1 - link_density``.

    :returns: dict mapping element -> ``{'content_score', 'elem'}``.
    """
    MIN_LEN = self.options.get(
        'min_text_length',
        self.TEXT_LENGTH_THRESHOLD)
    candidates = {}
    ordered = []
    for elem in self.tags(self._html(), "p", "pre", "td"):
        parent_node = elem.getparent()
        if parent_node is None:
            continue
        grand_parent_node = parent_node.getparent()

        inner_text = clean(elem.text_content() or "")
        inner_text_len = len(inner_text)

        # Paragraphs shorter than the configured minimum are not counted.
        if inner_text_len < MIN_LEN:
            continue

        if parent_node not in candidates:
            candidates[parent_node] = self.score_node(parent_node)
            ordered.append(parent_node)

        if grand_parent_node is not None and grand_parent_node not in candidates:
            candidates[grand_parent_node] = self.score_node(
                grand_parent_node)
            ordered.append(grand_parent_node)

        content_score = 1
        content_score += len(inner_text.split(','))
        # Explicit floor division: one whole point per 100 characters,
        # regardless of whether "/" means integer or true division.
        content_score += min(inner_text_len // 100, 3)

        candidates[parent_node]['content_score'] += content_score
        if grand_parent_node is not None:
            candidates[grand_parent_node]['content_score'] += content_score / 2.0

    # Scale the final candidates score based on link density. Good content
    # should have a relatively small link density (5% or less) and be
    # mostly unaffected by this operation.
    for elem in ordered:
        candidate = candidates[elem]
        ld = self.get_link_density(elem)
        score = candidate['content_score']
        self.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (
            score,
            describe(elem),
            ld,
            score * (1 - ld)))
        candidate['content_score'] *= (1 - ld)

    return candidates
|
||||
|
||||
def class_weight(self, e):
    """Return a score adjustment derived from *e*'s ``class`` and ``id``.

    For each of the two attributes that is non-empty, a match against
    ``negativeRe`` subtracts 25 and a match against ``positiveRe`` adds 25;
    both can apply to the same value.
    """
    weight = 0
    for attribute in ('class', 'id'):
        value = e.get(attribute, None)
        if not value:
            continue
        if REGEXES['negativeRe'].search(value):
            weight -= 25
        if REGEXES['positiveRe'].search(value):
            weight += 25
    return weight
|
||||
|
||||
def score_node(self, elem):
|
||||
content_score = self.class_weight(elem)
|
||||
name = elem.tag.lower()
|
||||
if name == "div":
|
||||
content_score += 5
|
||||
elif name in ["pre", "td", "blockquote"]:
|
||||
content_score += 3
|
||||
elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form"]:
|
||||
content_score -= 3
|
||||
elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th"]:
|
||||
content_score -= 5
|
||||
return {
|
||||
'content_score': content_score,
|
||||
'elem': elem
|
||||
}
|
||||
|
||||
def debug(self, *a):
    """Forward *a* to ``log.debug`` when the 'debug' option is truthy."""
    if not self.options.get('debug', False):
        return
    log.debug(*a)
|
||||
|
||||
def remove_unlikely_candidates(self):
    """Drop elements whose class/id marks them as unlikely content.

    An element is removed when its "class id" string matches
    REGEXES['unlikelyCandidatesRe'], does not match
    REGEXES['okMaybeItsACandidateRe'], and its tag is not 'html' or 'body'.
    """
    # Snapshot the elements first: drop_tree() below mutates the tree, and
    # continuing a live iter() from a detached element can end the walk
    # early and skip the rest of the document.
    for elem in list(self.html.iter()):
        s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
        if len(s) < 2:
            continue
        if (REGEXES['unlikelyCandidatesRe'].search(s)
                and not REGEXES['okMaybeItsACandidateRe'].search(s)
                and elem.tag not in ['html', 'body']):
            self.debug("Removing unlikely candidate - %s" % describe(elem))
            elem.drop_tree()
|
||||
|
||||
def transform_misused_divs_into_paragraphs(self):
    """Turn text-only <div>s into <p>s and wrap loose text in <p>s.

    First pass: a <div> whose serialized direct children do not match
    REGEXES['divToPElementsRe'] is retagged as <p>.
    Second pass: for every remaining <div>, non-blank leading text and
    non-blank child tails are moved into new <p> elements, and <br>
    children are dropped.
    """
    block_re = REGEXES['divToPElementsRe']
    for div in self.tags(self.html, 'div'):
        # FIXME: only the direct children of the div are serialized here,
        # so e.g. an <img> buried within an <a> gives incorrect results.
        children_markup = unicode(''.join(map(tostring, list(div))))
        if not block_re.search(children_markup):
            div.tag = "p"

    for div in self.tags(self.html, 'div'):
        leading = div.text
        if leading and leading.strip():
            wrapper = fragment_fromstring('<p/>')
            wrapper.text = leading
            div.text = None
            div.insert(0, wrapper)

        # Walk the children last-to-first so that inserting after a child
        # does not shift the positions still to be visited.
        indexed_children = list(enumerate(div))
        indexed_children.reverse()
        for index, kid in indexed_children:
            trailing = kid.tail
            if trailing and trailing.strip():
                wrapper = fragment_fromstring('<p/>')
                wrapper.text = trailing
                kid.tail = None
                div.insert(index + 1, wrapper)
            if kid.tag == 'br':
                kid.drop_tree()
|
||||
|
||||
def tags(self, node, *tag_names):
    """Yield every descendant of *node* matching each tag name in turn."""
    for wanted in tag_names:
        matches = node.findall('.//%s' % wanted)
        for match in matches:
            yield match
|
||||
|
||||
def reverse_tags(self, node, *tag_names):
    """Like ``tags`` but yields the matches for each tag name in reverse."""
    for wanted in tag_names:
        matches = node.findall('.//%s' % wanted)
        for match in reversed(matches):
            yield match
|
||||
|
||||
def sanitize(self, node, candidates):
    """Prune low-value elements from *node* and return the cleaned markup.

    Headers with a negative class weight or a link density above 0.33 are
    dropped, as are all <form>, <iframe> and <textarea> elements.  Every
    <table>, <ul> and <div> yielded by ``reverse_tags`` is then dropped
    when its class weight plus candidate score is negative, or when it has
    fewer than 10 commas and trips one of the heuristics below.

    ``candidates`` maps elements to dicts with a 'content_score' entry.
    Returns ``clean_attributes(tounicode(node))``.
    """
    MIN_LEN = self.options.get('min_text_length',
                               self.TEXT_LENGTH_THRESHOLD)
    for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
        if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
            header.drop_tree()

    for elem in self.tags(node, "form", "iframe", "textarea"):
        elem.drop_tree()
    allowed = {}
    # Conditionally clean <table>s, <ul>s, and <div>s
    for el in self.reverse_tags(node, "table", "ul", "div"):
        if el in allowed:
            continue
        weight = self.class_weight(el)
        if el in candidates:
            content_score = candidates[el]['content_score']
        else:
            content_score = 0
        tag = el.tag

        if weight + content_score < 0:
            self.debug("Cleaned %s with score %6.3f and weight %-3s" %
                       (describe(el), content_score, weight, ))
            el.drop_tree()
        elif el.text_content().count(",") < 10:
            counts = {}
            for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
                counts[kind] = len(el.findall('.//%s' % kind))
            counts["li"] -= 100

            # Count the text length excluding any surrounding whitespace
            content_length = text_length(el)
            link_density = self.get_link_density(el)
            parent_node = el.getparent()
            if parent_node is not None:
                # From here on content_score is the parent's score; it is
                # only used in the debug message below.
                if parent_node in candidates:
                    content_score = candidates[parent_node]['content_score']
                else:
                    content_score = 0
            to_remove = False
            reason = ""

            if counts["p"] and counts["img"] > counts["p"]:
                reason = "too many images (%s)" % counts["img"]
                to_remove = True
            elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
                reason = "more <li>s than <p>s"
                to_remove = True
            elif counts["input"] > (counts["p"] / 3):
                reason = "less than 3x <p>s than <input>s"
                to_remove = True
            elif content_length < (MIN_LEN) and (counts["img"] == 0 or counts["img"] > 2):
                reason = "too short content length %s without a single image" % content_length
                to_remove = True
            elif weight < 25 and link_density > 0.2:
                reason = "too many links %.3f for its weight %s" % (
                    link_density, weight)
                to_remove = True
            elif weight >= 25 and link_density > 0.5:
                reason = "too many links %.3f for its weight %s" % (
                    link_density, weight)
                to_remove = True
            elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
                reason = "<embed>s with too short content length, or too many <embed>s"
                to_remove = True
                # NOTE(review): the indentation of the sibling check below
                # was reconstructed; confirm that it belongs to this branch
                # rather than to every removal reason.

                # find x non empty preceding and succeeding siblings
                i, j = 0, 0
                x = 1
                siblings = []
                for sib in el.itersiblings():
                    sib_content_length = text_length(sib)
                    if sib_content_length:
                        # was "i =+ 1", which assigns +1 instead of counting
                        i += 1
                        siblings.append(sib_content_length)
                        if i == x:
                            break
                for sib in el.itersiblings(preceding=True):
                    sib_content_length = text_length(sib)
                    if sib_content_length:
                        # was "j =+ 1", which assigns +1 instead of counting
                        j += 1
                        siblings.append(sib_content_length)
                        if j == x:
                            break
                if siblings and sum(siblings) > 1000:
                    to_remove = False
                    self.debug("Allowing %s" % describe(el))
                    for desnode in self.tags(el, "table", "ul", "div"):
                        allowed[desnode] = True

            if to_remove:
                self.debug("Cleaned %6.3f %s with weight %s cause it has %s." %
                           (content_score, describe(el), weight, reason))
                el.drop_tree()

    # FIXME: attribute stripping (el.attrib = {} when the 'attributes'
    # option is unset) is disabled; check the effects of that.

    return clean_attributes(tounicode(node))
|
||||
|
||||
|
||||
class HashableElement():
    """Wrap a node so it hashes and compares by its path to the root.

    The path is a tuple of (tag, attribute items, text) triples, starting
    at the wrapped node and following getparent() until it returns None.
    Unknown attributes are looked up on the wrapped node.
    """

    def __init__(self, node):
        self.node = node
        self._path = None  # computed on first access to ``path``

    def _get_path(self):
        if self._path is not None:
            return self._path
        steps = []
        current = self.node
        while current is not None:
            steps.append(
                (current.tag, tuple(current.attrib.items()), current.text))
            current = current.getparent()
        self._path = tuple(steps)
        return self._path
    path = property(_get_path)

    def __hash__(self):
        return hash(self.path)

    def __eq__(self, other):
        return self.path == other.path

    def __getattr__(self, tag):
        # Only reached for names not found on the wrapper itself.
        return getattr(self.node, tag)
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: print the readable summary of a document.

    The document is read from the URL given with -u/--url, otherwise from
    the single file argument.  Prints usage and exits with status 1 when
    neither is supplied.
    """
    from optparse import OptionParser
    parser = OptionParser(usage="%prog: [options] [file]")
    parser.add_option('-v', '--verbose', action='store_true')
    parser.add_option('-u', '--url', default=None,
                      help="use URL instead of a local file")
    (options, args) = parser.parse_args()

    if len(args) != 1 and not options.url:
        parser.print_help()
        sys.exit(1)

    if options.url:
        import urllib
        source = urllib.urlopen(options.url)
    else:
        source = open(args[0], 'rt')
    enc = sys.__stdout__.encoding or 'utf-8'
    try:
        document = Document(source.read(),
                            debug=options.verbose,
                            url=options.url)
        print(document.summary().encode(enc, 'replace'))
    finally:
        source.close()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
@ -1,26 +1,45 @@
|
||||
#!/usr/bin/env python
|
||||
from setuptools import setup, find_packages
|
||||
|
||||
version = "0.3.0"
|
||||
install_requires = [
|
||||
"chardet",
|
||||
"lxml",
|
||||
]
|
||||
tests_require = [
|
||||
'coverage',
|
||||
'nose',
|
||||
'pep8',
|
||||
'PyYaml',
|
||||
]
|
||||
|
||||
|
||||
setup(
|
||||
name="readability-lxml",
|
||||
version="0.2.5",
|
||||
version=version,
|
||||
author="Yuri Baburov",
|
||||
author_email="burchik@gmail.com",
|
||||
description="fast python port of arc90's readability tool",
|
||||
test_suite = "tests.test_article_only",
|
||||
long_description=open("README").read(),
|
||||
keywords='readable read parse html document readability',
|
||||
long_description=open("README.rst").read(),
|
||||
license="Apache License 2.0",
|
||||
url="http://github.com/buriy/python-readability",
|
||||
package_dir={'': 'readability'},
|
||||
packages=find_packages('readability', exclude=["*.tests", "*.tests.*"]),
|
||||
install_requires=[
|
||||
"chardet",
|
||||
"lxml"
|
||||
],
|
||||
classifiers=[
|
||||
"Environment :: Web Environment",
|
||||
"Intended Audience :: Developers",
|
||||
"Operating System :: OS Independent",
|
||||
"Programming Language :: Python",
|
||||
],
|
||||
],
|
||||
url="http://github.com/buriy/python-readability",
|
||||
packages=find_packages('src', exclude=["*.tests", "*.tests.*"]),
|
||||
package_dir = {'': 'src'},
|
||||
include_package_data=True,
|
||||
zip_safe=False,
|
||||
install_requires=install_requires,
|
||||
tests_require=tests_require,
|
||||
extras_require={'test': tests_require},
|
||||
test_suite = "nose.collector",
|
||||
entry_points={
|
||||
'console_scripts':
|
||||
['readability=readability_lxml:client.main']
|
||||
},
|
||||
)
|
||||
|
@ -0,0 +1,3 @@
|
||||
VERSION = '0.2.5'
|
||||
|
||||
import client
|
@ -1,32 +1,38 @@
|
||||
# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
|
||||
# strip out a set of nuisance html attributes that can mess up rendering in
|
||||
# RSS feeds
|
||||
import re
|
||||
from lxml.html.clean import Cleaner
|
||||
|
||||
bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on*']
|
||||
bad_attrs = ['width', 'height', 'style', '[-a-z]*color',
|
||||
'background[-a-z]*', 'on*']
|
||||
single_quoted = "'[^']+'"
|
||||
double_quoted = '"[^"]+"'
|
||||
non_space = '[^ "\'>]+'
|
||||
htmlstrip = re.compile("<" # open
|
||||
"([^>]+) " # prefix
|
||||
"(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
|
||||
'= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
|
||||
htmlstrip = re.compile("<" # open
|
||||
"([^>]+) " # prefix
|
||||
"(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
|
||||
'= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
|
||||
"([^>]*)" # postfix
|
||||
">" # end
|
||||
, re.I)
|
||||
">", # end
|
||||
re.I)
|
||||
|
||||
|
||||
def clean_attributes(html):
    """Repeatedly apply ``htmlstrip`` until it no longer matches *html*."""
    while True:
        html, replaced = htmlstrip.subn('<\\1\\2>', html)
        if not replaced:
            return html
|
||||
|
||||
|
||||
def normalize_spaces(s):
    """Replace any sequence of whitespace characters with a single space.

    Leading and trailing whitespace is removed.  A falsy *s* (``None`` or
    the empty string) yields ``''``.
    """
    if not s:
        return ''
    return ' '.join(s.split())
|
||||
|
||||
|
||||
html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
|
||||
style=True, links=True, meta=False, add_nofollow=False,
|
||||
page_structure=False, processing_instructions=True, embedded=False,
|
||||
frames=False, forms=False, annoying_tags=False, remove_tags=None,
|
||||
page_structure=False, processing_instructions=True,
|
||||
embedded=False, frames=False, forms=False,
|
||||
annoying_tags=False, remove_tags=None,
|
||||
remove_unknown_tags=False, safe_attrs_only=False)
|
@ -0,0 +1,69 @@
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
from readability_lxml import VERSION
|
||||
from readability_lxml.readability import Document
|
||||
|
||||
|
||||
def parse_args():
    """Parse the command line and return the argparse namespace.

    Options: --version, -v/--verbose, -m/--metadata, and one positional
    ``path`` (stored as a one-element list because of nargs=1).
    """
    parser = argparse.ArgumentParser(
        description="fast python port of arc90's readability tool")
    parser.add_argument('--version', action='version', version=VERSION)

    boolean_flags = (
        (('-v', '--verbose'), 'Increase logging verbosity to DEBUG.'),
        (('-m', '--metadata'),
         'print all metadata as well as content for the content'),
    )
    for names, help_text in boolean_flags:
        parser.add_argument(*names, action='store_true', default=False,
                            help=help_text)

    parser.add_argument(
        'path', metavar='P', type=str, nargs=1,
        help="The url or file path to process in readable form.")

    return parser.parse_args()
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: print the readable form of a URL or file.

    A target starting with 'http' or 'www' is fetched as a URL; anything
    else is opened as a local file.  With --metadata the title, short
    title and confidence are printed before the HTML.
    """
    args = parse_args()
    target = args.path[0]

    if target.startswith(('http', 'www')):
        # urllib.urlopen opens a scheme-less string as a local path, so a
        # bare "www..." target needs an explicit scheme.
        url = target if target.startswith('http') else 'http://' + target
        import urllib
        source = urllib.urlopen(url)
    else:
        url = None
        source = open(target, 'rt')

    enc = sys.__stdout__.encoding or 'utf-8'

    try:
        doc = Document(source.read(),
                       debug=args.verbose,
                       url=url)
        if args.metadata:
            m = doc.summary_with_metadata()
            print(m.title())
            print(m.short_title())
            print(m.confidence)
            print(m.html.encode(enc, 'replace'))
        else:
            print(doc.summary().encode(enc, 'replace'))
    finally:
        source.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
@ -1,25 +1,32 @@
|
||||
uids = {}
|
||||
|
||||
|
||||
def save_to_file(text, filename):
    """Write *text* to *filename* as UTF-8, preceded by a charset meta tag.

    The meta tag is written exactly once, and the file is closed even if a
    write fails.
    """
    with open(filename, 'wt') as f:
        f.write('<meta http-equiv="Content-Type" '
                'content="text/html; charset=UTF-8" />')
        f.write(text.encode('utf-8'))
|
||||
|
||||
uids = {}
|
||||
|
||||
def describe(node, depth=2):
    """Return a short label for *node*, used in debug messages.

    The label is the tag followed by '#id' and '.class' parts (spaces in
    the class become dots); a leading 'div' is dropped when an id or class
    follows.  Bare 'tr', 'td', 'div' and 'p' labels get a two-digit number
    taken from the module-level ``uids`` registry.  Up to *depth*
    ancestors are appended, separated by ' - '.  Objects without a ``tag``
    attribute are rendered as "[<type>]".
    """
    if not hasattr(node, 'tag'):
        return "[%s]" % type(node)
    name = node.tag
    if node.get('id', ''):
        name += '#' + node.get('id')
    if node.get('class', ''):
        name += '.' + node.get('class').replace(' ', '.')
    if name[:4] in ['div#', 'div.']:
        name = name[3:]
    if name in ['tr', 'td', 'div', 'p']:
        # Reuse the node's number, or assign the next free one.
        uid = uids.setdefault(node, len(uids) + 1)
        name += "%02d" % (uid)
    parent = node.getparent()
    if depth and parent is not None:
        return name + ' - ' + describe(parent, depth - 1)
    return name
|
@ -1,21 +1,27 @@
|
||||
import logging
|
||||
import re
|
||||
import chardet
|
||||
|
||||
|
||||
LOG = logging.getLogger()
|
||||
|
||||
|
||||
def get_encoding(page):
    """Guess the character encoding of the byte string *page*.

    Tags are stripped first.  'utf-8' is returned when fewer than 10
    characters of text remain, or when decoding as UTF-8 with errors
    ignored loses less than 1% of the data.  Otherwise the encoding
    reported by chardet is returned, with 'MacCyrillic' mapped to
    'cp1251'.
    """
    text = re.sub(r'</?[^>]*>\s*', ' ', page)
    enc = 'utf-8'
    if not text.strip() or len(text) < 10:
        return enc  # can't guess
    try:
        diff = text.decode(enc, 'ignore').encode(enc)
        sizes = len(diff), len(text)
        # 99% of utf-8
        if abs(len(text) - len(diff)) < max(sizes) * 0.01:
            return enc
    except UnicodeDecodeError:
        pass
    res = chardet.detect(text)
    enc = res['encoding']
    if enc == 'MacCyrillic':
        enc = 'cp1251'
    return enc
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,22 @@
|
||||
import urllib2
|
||||
|
||||
|
||||
class UrlFetch():
    """
    A class for fetching URLs.  This provides a layer of abstraction that can
    be easily replaced for testing.
    """

    def urlread(self, url):
        """Return the body fetched from *url*, closing the response."""
        response = urllib2.urlopen(url)
        try:
            return response.read()
        finally:
            response.close()
|
||||
|
||||
|
||||
class MockUrlFetch(UrlFetch):
    """UrlFetch stand-in that serves URLs from local files.

    ``urldict`` maps each URL to the path of the file holding its content;
    an unknown URL raises KeyError.
    """

    def __init__(self, urldict):
        self._urldict = urldict

    def urlread(self, url):
        local_path = self._urldict[url]
        handle = open(local_path, 'r')
        try:
            return handle.read()
        finally:
            handle.close()
|
@ -0,0 +1,174 @@
|
||||
"""
|
||||
This program facilitates the creation of a regression test case as used by the
|
||||
test module. It uses the current readability algorithm to capture a benchmark
|
||||
and construct a new test case.
|
||||
|
||||
"""
|
||||
import argparse
|
||||
import errno
|
||||
import os
|
||||
import os.path
|
||||
import urllib2
|
||||
import yaml
|
||||
|
||||
from readability_lxml import readability
|
||||
from readability_lxml import urlfetch
|
||||
|
||||
from regression import (
|
||||
TEST_DATA_PATH,
|
||||
ORIGINAL_SUFFIX,
|
||||
READABLE_SUFFIX,
|
||||
YAML_EXTENSION,
|
||||
adjust_url_map,
|
||||
read_yaml
|
||||
)
|
||||
|
||||
|
||||
OVERWRITE_QUESTION = '%s exists; overwrite and continue (y/n)? '
|
||||
|
||||
|
||||
def y_or_n(question):
    """Prompt until a non-blank answer is given; True iff it starts with y/Y."""
    answer = ''
    while len(answer) == 0:
        answer = raw_input(question).strip()
    return answer[0] in ('y', 'Y')
|
||||
|
||||
|
||||
def write_file(test_name, suffix, data):
    """Write *data* to TEST_DATA_PATH/<test_name><suffix>.

    The file is created exclusively; if it already exists the user is
    asked whether to overwrite it.  Returns True when the data was
    written and False when the user declined.  Any OSError other than
    EEXIST is re-raised.
    """
    path = os.path.join(TEST_DATA_PATH, test_name + suffix)
    mode = 0o644
    try:
        fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_EXCL, mode)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise  # bare raise keeps the original traceback
        if not y_or_n(OVERWRITE_QUESTION % path):
            return False
        fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, mode)
    # Close (and therefore flush) the file before reporting success.
    with os.fdopen(fd, 'w') as f:
        f.write(data)
    return True
|
||||
|
||||
|
||||
def write_original(test_name, orig):
    """Store the original HTML of a test; returns write_file's result."""
    was_written = write_file(test_name, ORIGINAL_SUFFIX, orig)
    return was_written
|
||||
|
||||
|
||||
def write_readable(test_name, orig, options=None):
    """Run readability on *orig* and store the result as the benchmark.

    *options* are passed to ``readability.Document`` as keyword arguments
    (none when omitted).  Returns write_file's result.
    """
    rdbl_doc = readability.Document(orig, **(options or {}))
    # The .html attribute is read from summary_with_metadata() elsewhere
    # in this project; summary() is used there as a plain string.
    summary = rdbl_doc.summary_with_metadata()
    return write_file(test_name, READABLE_SUFFIX, summary.html)
|
||||
|
||||
|
||||
def read_spec(test_name):
    """Load the YAML spec stored for *test_name* under TEST_DATA_PATH."""
    spec_file = test_name + YAML_EXTENSION
    return read_yaml(os.path.join(TEST_DATA_PATH, spec_file))
|
||||
|
||||
def read_orig(test_name, url = None):
    """
    Reads the original HTML for a given test.  If a url is provided, the HTML
    is fetched from it.  Otherwise, we look for an existing local copy.  This
    returns a pair: (HTML string, True iff the HTML has been or is already
    stored in a local copy).
    """
    if url:
        response = urllib2.urlopen(url)
        try:
            orig = response.read()
        finally:
            response.close()
        write_result = write_file(test_name, ORIGINAL_SUFFIX, orig)
        return orig, write_result

    orig_path = os.path.join(
        TEST_DATA_PATH,
        test_name + ORIGINAL_SUFFIX
    )
    with open(orig_path) as f:
        return f.read(), True
|
||||
|
||||
def create(args):
    """Create a new test: its spec, the original HTML and the benchmark.

    Reads ``args.url``, ``args.test_name`` and ``args.test_description``.
    Returns True when all three files were written, False as soon as one
    write is declined.
    """
    # TODO: Make this work for multi-page articles.
    spec_dict = {'url': args.url, 'test_description': args.test_description}
    spec = yaml.dump(spec_dict, default_flow_style = False)
    if not write_file(args.test_name, YAML_EXTENSION, spec):
        return False
    # The URL comes from the parsed arguments; there is no global `url`.
    response = urllib2.urlopen(args.url)
    try:
        orig = response.read()
    finally:
        response.close()
    if not write_original(args.test_name, orig):
        return False
    # write_readable needs the Document options; pass the page URL.
    if not write_readable(args.test_name, orig, {'url': args.url}):
        return False
    return True
|
||||
|
||||
def genbench(args):
    """Regenerate the benchmark for the existing test ``args.test_name``.

    With ``args.refetch`` set, the original HTML is downloaded again from
    the URL in the test's spec; otherwise the stored copy is used.
    Returns True when the benchmark was written, False otherwise.
    """
    spec_dict = read_spec(args.test_name)
    if args.refetch:
        url = spec_dict['url']
    else:
        url = None
    url_map = adjust_url_map(spec_dict.get('url_map', dict()))
    fetcher = urlfetch.MockUrlFetch(url_map)
    options = {'url': spec_dict['url'], 'urlfetch': fetcher}
    orig, success = read_orig(args.test_name, url)
    if not success:
        return False
    rdbl_doc = readability.Document(orig, **options)
    # The .html attribute is read from summary_with_metadata() elsewhere
    # in this project; summary() is used there as a plain string.
    summary = rdbl_doc.summary_with_metadata()
    if not write_file(args.test_name, READABLE_SUFFIX, summary.html):
        return False
    return True
|
||||
|
||||
DESCRIPTION = 'Create a readability regression test case.'
|
||||
|
||||
def main():
    """Parse the command line and run the chosen subcommand.

    'create' takes url, test-name and test-description; 'genbench' takes
    test-name and an optional --refetch flag.  A notice is printed when
    the subcommand reports failure.
    """
    parser = argparse.ArgumentParser(description=DESCRIPTION)
    commands = parser.add_subparsers(help='available subcommands')

    create_cmd = commands.add_parser(
        'create', help='create an entirely new test')
    create_positionals = (
        ('url', 'the url for which to generate a test'),
        ('test_name', 'the name of the test'),
        ('test_description', 'the description of the test'),
    )
    for dest, help_text in create_positionals:
        create_cmd.add_argument(
            dest, metavar=dest.replace('_', '-'), help=help_text)
    create_cmd.set_defaults(func=create)

    genbench_cmd = commands.add_parser(
        'genbench', help='regenerate the benchmark for an existing test')
    genbench_cmd.add_argument(
        'test_name', metavar='test-name', help='the name of the test')
    genbench_cmd.add_argument(
        '--refetch',
        dest='refetch',
        action='store_const',
        const=True,
        default=False,
        help='if set, original html is refetched from the url')
    genbench_cmd.set_defaults(func=genbench)

    args = parser.parse_args()
    succeeded = args.func(args)
    if not succeeded:
        print('test was not fully generated')
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
@ -0,0 +1,15 @@
|
||||
import os
|
||||
|
||||
|
||||
SAMPLES = os.path.join(os.path.dirname(__file__), 'samples')
|
||||
REGRESSION_DATA = os.path.join(os.path.dirname(__file__), 'regression_test_data')
|
||||
|
||||
|
||||
def load_sample(filename):
    """Helper to get the content out of the sample files"""
    with open(os.path.join(SAMPLES, filename)) as f:
        return f.read()
|
||||
|
||||
|
||||
def load_regression_data(filename):
    """Get the content of a test_data regression file"""
    with open(os.path.join(REGRESSION_DATA, filename)) as f:
        return f.read()
|
@ -0,0 +1,354 @@
|
||||
"""
|
||||
This module provides a regression test for results of running the readability
|
||||
algorithm on a variety of different real-world examples. For each page in the
|
||||
test suite, a benchmark was captured that represents the current readability
|
||||
results. Note that these are not necessarily ideal results, just the ones used
|
||||
as a benchmark.
|
||||
|
||||
This allows you to tweak and change the readability algorithm and see how it
|
||||
changes existing results, hopefully for the better.
|
||||
|
||||
"""
|
||||
import logging
|
||||
import lxml.html
|
||||
import lxml.html.diff
|
||||
import os
|
||||
import os.path
|
||||
import re
|
||||
import sys
|
||||
import unittest
|
||||
import yaml
|
||||
|
||||
from lxml.html import builder as B
|
||||
from readability_lxml import readability
|
||||
from readability_lxml import urlfetch
|
||||
|
||||
|
||||
DIFF_SUFFIX = '-diff.html'
|
||||
ORIGINAL_SUFFIX = '-orig.html'
|
||||
READABLE_SUFFIX = '-rdbl.html'
|
||||
RESULT_SUFFIX = '-result.html'
|
||||
YAML_EXTENSION = '.yaml'
|
||||
|
||||
TESTDIR = os.path.dirname(__file__)
|
||||
TEST_DATA_PATH = os.path.join(TESTDIR, 'regression_test_data')
|
||||
TEST_OUTPUT_PATH = os.path.join(TESTDIR, 'regression_test_output')
|
||||
TEST_SUMMARY_PATH = os.path.join(TEST_OUTPUT_PATH, 'index.html')
|
||||
|
||||
SUMMARY_CSS = '''
|
||||
table, th, td {
|
||||
border: 1px solid black;
|
||||
border-collapse: collapse;
|
||||
font-family: Georgia, 'Times New Roman', serif;
|
||||
}
|
||||
table {
|
||||
margin: auto;
|
||||
}
|
||||
.skipped {
|
||||
color: gray;
|
||||
}
|
||||
td, th {
|
||||
font-size: 1.2em;
|
||||
border: 1px solid black;
|
||||
padding: 3px 7px 2px 7px;
|
||||
}
|
||||
th {
|
||||
font-size: 16px;
|
||||
text-align: left;
|
||||
padding-top: 5px;
|
||||
padding-bottom: 4px;
|
||||
}
|
||||
'''
|
||||
|
||||
READABILITY_CSS = '''
|
||||
#article {
|
||||
margin: 0 auto;
|
||||
max-width: 705px;
|
||||
min-width: 225px;
|
||||
font-family: Georgia, 'Times New Roman', serif;
|
||||
font-size: 19px;
|
||||
line-height: 29px;
|
||||
}
|
||||
|
||||
#article p {
|
||||
font-size: 19px;
|
||||
line-height: 29px;
|
||||
margin: 19px 0px 19px 0px;
|
||||
}
|
||||
|
||||
ins {
|
||||
background-color: #C6F7C3;
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
ins img {
|
||||
border-width: 3px;
|
||||
border-style: dotted;
|
||||
border-color: #51B548;
|
||||
}
|
||||
|
||||
del {
|
||||
background-color: #F7C3C3;
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
del img {
|
||||
border-width: 3px;
|
||||
border-style: dotted;
|
||||
border-color: #D12626;
|
||||
}
|
||||
'''
|
||||
|
||||
class ReadabilityTest:
    """Plain record describing one regression test case."""

    def __init__(self, dir_path, enabled, name, url, desc, notes, url_map,
                 orig_path, rdbl_path):
        # Identity and description.
        self.name = name
        self.desc = desc
        self.notes = notes
        self.enabled = enabled
        # Where the page came from and how its URLs are mocked.
        self.url = url
        self.url_map = url_map
        # Locations on disk.
        self.dir_path = dir_path
        self.orig_path = orig_path
        self.rdbl_path = rdbl_path
|
||||
|
||||
|
||||
class ReadabilityTestData:
    """A test together with its original and benchmark HTML."""

    def __init__(self, test, orig_html, rdbl_html):
        self.test, self.orig_html, self.rdbl_html = test, orig_html, rdbl_html
|
||||
|
||||
|
||||
class ReadabilityTestResult:
    """The outcome of one test: its data, the new result and the diff."""

    def __init__(self, test_data, result_html, diff_html):
        self.test_data, self.result_html, self.diff_html = (
            test_data, result_html, diff_html)
|
||||
|
||||
|
||||
def read_yaml(path):
    """Parse the YAML file at *path* and return the loaded object."""
    # NOTE(review): yaml.load can construct arbitrary objects; that is
    # only acceptable while these files are project-controlled test data.
    handle = open(path, 'r')
    try:
        return yaml.load(handle)
    finally:
        handle.close()
|
||||
|
||||
|
||||
def make_path(dir_path, name, suffix):
    """Return dir_path joined with the file name ``name + suffix``."""
    file_name = ''.join((name, suffix))
    return os.path.join(dir_path, file_name)
|
||||
|
||||
|
||||
def adjust_url_map(url_map):
    """Return a copy of *url_map* with each value joined onto TEST_DATA_PATH."""
    return dict(
        (url, os.path.join(TEST_DATA_PATH, relative))
        for url, relative in url_map.items()
    )
|
||||
|
||||
|
||||
def make_readability_test(dir_path, name, spec_dict):
    """Build a ReadabilityTest from a parsed YAML spec.

    'url' and 'test_description' are required keys; 'enabled' defaults to
    True, 'notes' to '' and 'url_map' to an empty dict.
    """
    enabled = spec_dict.get('enabled', True)
    notes = spec_dict.get('notes', '')
    url_map = adjust_url_map(spec_dict.get('url_map', dict()))
    return ReadabilityTest(
        dir_path=dir_path,
        enabled=enabled,
        name=name,
        url=spec_dict['url'],
        desc=spec_dict['test_description'],
        notes=notes,
        url_map=url_map,
        orig_path=make_path(dir_path, name, ORIGINAL_SUFFIX),
        rdbl_path=make_path(dir_path, name, READABLE_SUFFIX),
    )
|
||||
|
||||
|
||||
def load_test_data(test):
    """Read the original and benchmark HTML for an enabled *test*.

    Returns a ReadabilityTestData, or None when the test is disabled.
    """
    if not test.enabled:
        return None
    with open(test.orig_path, 'r') as f:
        orig = f.read()
    with open(test.rdbl_path, 'r') as f:
        rdbl = f.read()
    return ReadabilityTestData(test, orig, rdbl)
|
||||
|
||||
|
||||
def load_readability_tests(dir_path, files):
    """Build a ReadabilityTest for every YAML spec among *files*.

    *files* are file names relative to *dir_path*; names that do not end
    with YAML_EXTENSION are ignored.  The test name is the file name with
    that extension removed.
    """
    tests = []
    for file_name in files:
        if not file_name.endswith(YAML_EXTENSION):
            continue
        # Slice the extension off rather than using a regex with an
        # unescaped dot and a hard-coded '.yaml'.
        name = file_name[:-len(YAML_EXTENSION)]
        spec_dict = read_yaml(os.path.join(dir_path, file_name))
        tests.append(make_readability_test(dir_path, name, spec_dict))
    return tests
|
||||
|
||||
|
||||
def execute_test(test_data):
    """Run readability on *test_data* and diff it against the benchmark.

    Returns a ReadabilityTestResult, or None when *test_data* is None.
    """
    if test_data is None:
        return None
    test = test_data.test
    doc = readability.Document(
        test_data.orig_html,
        url=test.url,
        urlfetch=urlfetch.MockUrlFetch(test.url_map)
    )
    result_html = doc.summary_with_metadata().html
    diff_html = lxml.html.diff.htmldiff(test_data.rdbl_html, result_html)
    return ReadabilityTestResult(test_data, result_html, diff_html)
|
||||
|
||||
|
||||
def element_string_lengths(elems):
    """Return the length of each element's XPath string() value, in order."""
    lengths = []
    for elem in elems:
        text = elem.xpath('string()')
        lengths.append(len(text))
    return lengths
|
||||
|
||||
|
||||
class ResultSummary():
    """Insertion and deletion statistics taken from a result's diff HTML.

    ``insertions`` / ``deletions`` are the total text lengths inside <ins>
    / <del> elements; ``insertion_blocks`` / ``deletion_blocks`` count
    those elements.
    """

    def __init__(self, result):
        doc = lxml.html.fragment_fromstring(result.diff_html)

        inserted = doc.xpath('//ins')
        self.insertion_blocks = len(inserted)
        self.insertions = sum(element_string_lengths(inserted))

        removed = doc.xpath('//del')
        self.deletion_blocks = len(removed)
        self.deletions = sum(element_string_lengths(removed))
|
||||
|
||||
|
||||
def make_summary_row(test, result):
    """Build the summary table row (<tr>) for one test.

    Disabled tests get a row with class 'skipped' and 'N/A' cells; enabled
    tests show inserted/deleted counts and links to the related files.
    """
    if not test.enabled:
        return B.TR(
            B.CLASS('skipped'),
            B.TD('%s (SKIPPED)' % test.name),
            B.TD('N/A'),
            B.TD('N/A'),
            B.TD('N/A'),
            B.TD(test.notes)
        )

    stats = ResultSummary(result)
    # The original page is linked by absolute path in the data directory;
    # the other files are linked by bare file name.
    original_href = os.path.abspath(
        os.path.join(TEST_DATA_PATH, test.name + ORIGINAL_SUFFIX))
    links = B.TD(
        B.A('original', href=original_href),
        ' ',
        B.A('benchmark', href=test.name + READABLE_SUFFIX),
        ' ',
        B.A('result', href=test.name + RESULT_SUFFIX),
        ' ',
        B.A('diff', href=test.name + DIFF_SUFFIX)
    )
    return B.TR(
        B.TD(test.name),
        B.TD('%d (%d)' % (stats.insertions, stats.insertion_blocks)),
        B.TD('%d (%d)' % (stats.deletions, stats.deletion_blocks)),
        links,
        B.TD(test.notes)
    )
|
||||
|
||||
|
||||
def make_summary_doc(tests_w_results):
    """Build the summary HTML document from (test, result) pairs."""
    column_titles = (
        'Test Name',
        'Inserted (in # of blocks)',
        'Deleted (in # of blocks)',
        'Links',
        'Notes',
    )
    header_row = B.TR(*[B.TH(title) for title in column_titles])
    tbody = B.TBODY(header_row)
    for test, result in tests_w_results:
        tbody.append(make_summary_row(test, result))

    head = B.HEAD(
        B.TITLE('Readability Test Summary'),
        B.STYLE(SUMMARY_CSS, type='text/css')
    )
    body = B.BODY(B.TABLE(tbody))
    return B.HTML(head, body)
|
||||
|
||||
|
||||
def write_summary(path, tests_w_results):
    """Serialize the summary document for the given pairs to *path*."""
    summary_doc = make_summary_doc(tests_w_results)
    with open(path, 'w') as out:
        serialized = lxml.html.tostring(summary_doc)
        out.write(serialized)
|
||||
|
||||
|
||||
def add_css(doc):
    """Insert a <head> carrying READABILITY_CSS as the first child of *doc*."""
    doc.insert(
        0,
        B.HEAD(
            B.STYLE(READABILITY_CSS, type='text/css'),
            content='text/html; charset=utf-8'
        )
    )
|
||||
|
||||
|
||||
def write_output_fragment(fragment, output_dir_path, test_name, suffix):
    """Write *fragment* as a styled HTML document.

    The output file is ``<output_dir_path>/<test_name><suffix>``.
    """
    doc = lxml.html.document_fromstring(fragment)
    add_css(doc)
    markup = lxml.html.tostring(doc)
    target = os.path.join(output_dir_path, test_name + suffix)
    with open(target, 'w') as out:
        out.write(markup)
|
||||
|
||||
|
||||
def write_result(output_dir_path, result):
    """Write the readable, diff and result HTML for one test result.

    Each fragment is written through write_output_fragment, under the test's
    name plus the matching suffix constant.
    """
    name = result.test_data.test.name
    # All fragments are gathered before anything is written.
    fragments = (
        (result.test_data.rdbl_html, READABLE_SUFFIX),
        (result.diff_html, DIFF_SUFFIX),
        (result.result_html, RESULT_SUFFIX),
    )
    for fragment, suffix in fragments:
        write_output_fragment(fragment, output_dir_path, name, suffix)
|
||||
|
||||
|
||||
def print_test_info(test):
    """Print a one-line description of test to standard output.

    The name is right-aligned in a 20-character column, followed by the
    description; tests that are not enabled get a ' (SKIPPED)' marker.
    """
    label = '%s' % test.name
    marker = '' if test.enabled else ' (SKIPPED)'
    print('%20s: %s%s' % (label, test.desc, marker))
|
||||
|
||||
def run_readability_tests():
    """Run every readability test under TEST_DATA_PATH and write the outputs.

    All test data is loaded first, then every test is executed.  Each test's
    info line is printed; tests with a truthy result have their HTML written
    to TEST_OUTPUT_PATH.  The summary page is written last, to
    TEST_SUMMARY_PATH.
    """
    file_names = os.listdir(TEST_DATA_PATH)
    tests = load_readability_tests(TEST_DATA_PATH, file_names)

    loaded = []
    for test in tests:
        loaded.append(load_test_data(test))

    results = []
    for data in loaded:
        results.append(execute_test(data))

    for test, result in zip(tests, results):
        print_test_info(test)
        if not result:
            # Falsy results get no per-test output files.
            continue
        write_result(TEST_OUTPUT_PATH, result)

    write_summary(TEST_SUMMARY_PATH, zip(tests, results))
|
||||
|
||||
def main():
    """Script entry point.

    Configures DEBUG-level logging.  If the first command-line argument is
    'unittest', that argument is removed from sys.argv and control passes to
    unittest.main(); otherwise the readability tests are run.
    """
    logging.basicConfig(level=logging.DEBUG)
    argv = sys.argv
    unittest_requested = len(argv) > 1 and argv[1] == 'unittest'
    if not unittest_requested:
        run_readability_tests()
        return None
    # Remove the mode selector before unittest.main() reads sys.argv.
    del argv[1]
    return unittest.main()


if __name__ == '__main__':
    main()
|
@ -0,0 +1,664 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US">
|
||||
<head>
|
||||
<title>June Web browser stats: Rapid Release edition</title>
|
||||
|
||||
<!-- Begin CSS -->
|
||||
<link rel="stylesheet" type="text/css" href="http://static.arstechnica.net//public/v6/styles/light/light.c.css?1309476728" media="screen" />
|
||||
<link rel="stylesheet" type="text/css" href="http://static.arstechnica.net//public/v6/styles/print/print.css?1309476728" media="print" />
|
||||
<!-- End CSS -->
|
||||
|
||||
<link rel="apple-touch-icon" href="http://static.arstechnica.net/apple-touch-icon.png" />
|
||||
<link rel="canonical" href="http://arstechnica.com/web/news/2011/07/june-browser-stats-rapid-release-edition.ars" />
|
||||
<link rel="shorturl" href="http://arst.ch/q4c" />
|
||||
<link rel="shortlink" href="http://arst.ch/q4c" />
|
||||
<link rev="canonical" href="http://arst.ch/q4c" />
|
||||
|
||||
<link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="Ars Technica" />
|
||||
<link rel="shortcut icon" href="http://static.arstechnica.net/favicon.ico" />
|
||||
<link rel="icon" type="image/x-icon" href="http://static.arstechnica.net/favicon.ico" />
|
||||
|
||||
<!-- Begin Feeds -->
|
||||
<link rel="alternate" type="application/rssxml" title="The Web" href="http://feeds.arstechnica.com/arstechnica/web/" />
|
||||
|
||||
<link rel="alternate" type="application/rss+xml" title="All Articles " href="http://feeds.arstechnica.com/arstechnica/everything" />
|
||||
<!-- End Feeds -->
|
||||
|
||||
<!-- C-razy IE9 stuff -->
|
||||
<meta name="application-name" content="Ars Technica"/>
|
||||
<meta name="msapplication-starturl" content="http://arstechnica.com/"/>
|
||||
<meta name="msapplication-tooltip" content="Ars Technica: Serving the technologist for 1.2 decades"/>
|
||||
<meta name="msapplication-task" content="name=News;action-uri=http://arstechnica.com/;icon-uri=http://arstechnica.com/favicon.ico"/>
|
||||
<meta name="msapplication-task" content="name=Features;action-uri=http://arstechnica.com/features/;icon-uri=http://static.arstechnica.net/ie-jump-menu/jump-features.ico"/>
|
||||
<meta name="msapplication-task" content="name=OpenForum;action-uri=http://arstechnica.com/civis/;icon-uri=http://static.arstechnica.net/ie-jump-menu/jump-forum.ico"/>
|
||||
<meta name="msapplication-task" content="name=One Microsoft Way;action-uri=http://arstechnica.com/microsoft/;icon-uri=http://static.arstechnica.net/ie-jump-menu/jump-omw.ico"/>
|
||||
<meta name="msapplication-task" content="name=Subscribe;action-uri=http://arstechnica.com/subscriptions/;icon-uri=http://static.arstechnica.net/ie-jump-menu/jump-subscribe.ico"/>
|
||||
|
||||
|
||||
<!-- Begin Metadata -->
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
||||
<meta name="viewport" content="width=1000" />
|
||||
<meta name="description" content="In our monthly look at the world of Web browser market share statistics, we take a look at the first impact of Mozilla's new Rapid Release policy for Firefox and also consider why some Chrome users aren't aboard Google's update bandwagon." />
|
||||
<meta name="keywords" content="" />
|
||||
<meta name="title" content="June Web browser stats: Rapid Release edition" />
|
||||
<link rel="image_src" href="http://static.arstechnica.net/assets/2011/03/firefox-09-small-thumb-300x169-20442-f.jpg" />
|
||||
<meta name="medium" content="news" />
|
||||
|
||||
<meta name="entry_id" content="51247" />
|
||||
<meta property="og:title" content="June Web browser stats: Rapid Release edition"/>
|
||||
<meta property="og:site_name" content="Ars Technica"/>
|
||||
<meta property="og:image" content="http://static.arstechnica.net/assets/2011/03/firefox-09-small-thumb-300x169-20442-f.jpg"/>
|
||||
|
||||
<meta name="advertising" content="ask" />
|
||||
<meta property="fb:admins" content="13703630" />
|
||||
<!-- End Metadata -->
|
||||
<!-- Entry - itbiz_general_computing -->
|
||||
<style type="text/css" id="resource-styles"> </style>
|
||||
<script type="text/javascript" src="/public/shared/scripts/da-1.5.js"></script>
|
||||
<script type="text/javascript">
|
||||
try {
|
||||
cnp.ad.dart.setSite("ars.dart");
|
||||
cnp.ad.dart.setZone('itbiz_general_computing');
|
||||
//cnp.ad.dart.addParameterString('kw=june-browser-stats-rapid-release-edition;kw=07;kw=2011;kw=news;kw=web;');
|
||||
cnp.ad.dart.addParameterString('mtfIFPath=/mt-static/plugins/ArsTheme/ad-campaigns/doubleclick/');
|
||||
cnp.ad.emptyFrameSrc="/public/shared/scripts/empty.html";
|
||||
cnp.ad.loaderFrameSrc="/public/shared/scripts/ad-loader-frame.html";
|
||||
} catch(e) {}
|
||||
</script>
|
||||
|
||||
<script type="text/javascript" charset="utf-8">
|
||||
// In case someone on a desktop clicks a mobile #! link
|
||||
var l = window.location;
|
||||
if(l.hash.indexOf('#!') !== -1){
|
||||
window.location = l.protocol + '//' + l.host + l.hash.slice(2);
|
||||
}
|
||||
</script>
|
||||
</head>
|
||||
<body class="individual">
|
||||
<div id="page" class="">
|
||||
|
||||
<div id="masthead" class="">
|
||||
<div id="logo"><a href="/"><img src="http://static.arstechnica.net//public/v6/styles/light/images/masthead/logo.png?1309476728" alt="Ars Technica: The Art of Technology" width="110" height="81" /></a></div>
|
||||
<div id="ebc51ce07629d0e14d2fbc4236e44067" >
|
||||
<script type="text/javascript">
|
||||
var pbanner_start = new Date();
|
||||
try {
|
||||
var pbanner = cnp.ad.create(cnp.ad.refreshable, false);
|
||||
//pbanner.addParameter({'dcopt':'ist'});
|
||||
pbanner.addParameterString('kw=june-browser-stats-rapid-release-edition;kw=07;kw=2011;kw=news;kw=web;');
|
||||
pbanner.addParameter({'sz': '728x90' });
|
||||
} catch(e) {}
|
||||
</script>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="search-navigation">
|
||||
<div id="search">
|
||||
<a id="search-link" href="http://www.google.com/cse?cx=011835048811694782689:7zpko-isndo">Search</a>
|
||||
|
||||
<div class="form">
|
||||
<span>Search:</span>
|
||||
<form action="http://www.google.com/cse" id="search-form">
|
||||
<div>
|
||||
<input type="hidden" value="011835048811694782689:7zpko-isndo" name="cx"/>
|
||||
<input type="hidden" value="UTF-8" name="ie"/>
|
||||
<input type="text" id="search-form-text" value="" name="q"/>
|
||||
</div>
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
<div id="navigation">
|
||||
<ul id="primary-navigation">
|
||||
<li class=""><a href="/">All</a></li>
|
||||
<li class="apple"><a href="/apple/">Apple</a></li>
|
||||
<li class="ask-ars"><a href="/ask-ars/">Ask Ars</a></li>
|
||||
<li class="business"><a href="/business/">Business</a></li>
|
||||
<li class="gadgets"><a href="/gadgets/">Gadgets</a></li>
|
||||
<li class="gaming"><a href="/gaming/">Gaming</a></li>
|
||||
<li class="microsoft"><a href="/microsoft/">Microsoft</a></li>
|
||||
<li class="open-source"><a href="/open-source/">Open Source</a></li>
|
||||
<li class="science"><a href="/science/">Science</a></li>
|
||||
<li class="tech-policy"><a href="/tech-policy/">Tech Policy</a></li>
|
||||
<li id="primary-navigation-more" style="display:none;">
|
||||
More
|
||||
<ul >
|
||||
<li><a href="/hardware/">Hardware</a></li>
|
||||
<li><a href="/media/">Media</a></li>
|
||||
<li><a href="/security/">Security</a></li>
|
||||
<li><a href="/software/">Software</a></li>
|
||||
<li><a href="/staff/">Staff</a></li>
|
||||
<li><a href="/telecom/">Telecom</a></li>
|
||||
<li><a href="/web/">Web</a></li>
|
||||
<li style="padding:0;"><span style="display:inline;background-color: #920404; padding: 3px; color:white; -webkit-border-radius: 4px;">New</span> <a style="display:inline;" href="/site/tv.ars" title="Ars Technica TV">Ars.TV</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
<ul id="secondary-navigation" class="web">
|
||||
<li class="news selected"><a href="/web/news/">News</a></li>
|
||||
<li class="guides"><a href="/web/guides/">Guides</a></li>
|
||||
<li class="reviews"><a href="/web/reviews/">Reviews</a></li>
|
||||
</ul>
|
||||
<ul id="auxiliary-navigation">
|
||||
<li class="subscribe"><a href="/subscriptions/">Upgrade to a Premier Subscription</a>
|
||||
|
||||
</li>
|
||||
<li class="customize" style="display:none;">
|
||||
<a href="#">Customize ▾</a>
|
||||
<ul>
|
||||
<li>
|
||||
<p>Site Theme:</p>
|
||||
<label><input type="radio" checked="checked" value="light.css" class="site-style" name="site-style" /> White</label>
|
||||
<label><input type="radio" value="dark.css" class="site-style" name="site-style" /> Black</label>
|
||||
</li>
|
||||
<li>
|
||||
<p>Choose body font:</p>
|
||||
<label><input type="radio" checked="checked" value="arial" class="body_font" name="body_font" /> Arial</label>
|
||||
<label><input type="radio" value="helvetica" class="body_font" name="body_font" /> Helvetica</label>
|
||||
</li>
|
||||
<li>
|
||||
<p>Layout (beta):</p>
|
||||
<label><input type="radio" checked="checked" value="normal" class="fp_layout" name="fp_layout" /> Normal</label>
|
||||
<label><input type="radio" value="compact" class="fp_layout" name="fp_layout" /> Compact</label>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
|
||||
<li class="openforum"><a href="http://arstechnica.com/civis/">OpenForum</a></li>
|
||||
|
||||
<li class="login-join"><a href="/civis/ucp.php?mode=login&return_to=http%3A%2F%2Farstechnica.com%2Fweb%2Fnews%2F2011%2F07%2Fjune-browser-stats-rapid-release-edition.ars">Login/Join</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="main">
|
||||
|
||||
<div id="silo-header" class="">
|
||||
<h1 class="web"><a href="/web/" title="Go to The Web">The Web</a></h1>
|
||||
</div>
|
||||
|
||||
<div id="content" class="normal"> <div id="content-inner">
|
||||
<div id="story">
|
||||
<h2 class="title">June Web browser stats: Rapid Release edition</h2>
|
||||
<div class="byline"><span class="author">By <a rel="author" href="/author/peter-bright/">Peter Bright</a>
|
||||
</span> | <span class="posted"><span class="published updated"><span class="name">Published </span> <abbr class="timeago datetime" title="2011-07-06T16:00:00Z">July 6, 2011 11:00 AM</abbr></span><span class="modified" style="display:none;"><span class="name">Last updated </span> <abbr class="timeago datetime" title="2011-07-06T16:33:33Z">July 6, 2011 11:33 AM</abbr></span></span></div>
|
||||
|
||||
<div class="story-image" style="width:300px;">
|
||||
<img width="300" src="http://static.arstechnica.net/opensource/firefox-09-small.jpg" alt="" />
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<div id="" class="body" style="">
|
||||
<!--body--><p>June brought the first result of Mozilla's new Rapid Release strategy for Firefox. Firefox 4, just three months old, was superceded by the all-new but not-too-different <a href="http://arstechnica.com/open-source/news/2011/06/firefox-5-released-arrives-only-three-months-after-firefox-4.ars">Firefox 5</a>. Firefox's market growth was all but ended by the release of Chrome, and Mozilla is hoping that by adopting a similar release schedule to Google, it will be able to reignite the growth of its user base.</p><!--page 1-->
|
||||
|
||||
<p>Internet Explorer is down 0.59 points at 53.68 percent. Firefox is essentially unchanged, down 0.04 points to 21.67 percent. Chrome is up 0.59 points to 13.11 percent. Safari is also up, gaining 0.2 points to reach 7.48 percent. Opera dropped 0.3 points to 1.73 percent.</p>
|
||||
|
||||
<div style="width: 640px;" class="news-item-figure CenteredImage"><div class="news-item-figure-image" style=""><img src="http://static.arstechnica.com/browsers-june-2011/global-browser-share.png" /></div><div class="news-item-figure-caption"><div class="news-item-figure-caption-byline"><a href="http://netmarketshare.com/">Net Applications</a></div></div></div>
|
||||
<p>The trends established over the last few months are continuing: Firefox is treading water, while Internet Explorer is losing users, which seem to be being picked up by Chrome. In the past two months, Opera has dropped 0.41 points—that's a loss representing 20% of its market share. Our own Ryan Paul <a href="http://arstechnica.com/software/reviews/2011/06/hands-on-opera-1150s-new-featherweight-interface-packs-a-punch.ars">liked Opera 11.50</a>, which was released just a couple of days ago, so perhaps this will help turn around a perilous slide.</p>
|
||||
|
||||
<p>Looking at individual versions, Internet Explorer 6, 7, and 8 are all down, by 0.18, 0.46, and 1.21 points respectively. Internet Explorer 9 made strong gains, of 1.44 points, but not enough to undo the losses. Internet Explorer 9's gains seem to be occurring at the expense of older versions—Internet Explorer 8 on Windows 7, versions 7 and 8 on Windows Vista—rather than making converts of the other browsers.</p>
|
||||
|
||||
<div style="width: 640px;" class="news-item-figure CenteredImage"><div class="news-item-figure-image" style=""><img src="http://static.arstechnica.com/browsers-june-2011/internet-explorer-transition.png" /></div><div class="news-item-figure-caption"><div class="news-item-figure-caption-byline"><a href="http://netmarketshare.com/">Net Applications</a></div></div></div>
|
||||
|
||||
<p>Internet Explorer 9 is of course at something of a disadvantage, as it won't run on Windows XP. While we <a href="http://arstechnica.com/microsoft/news/2010/04/why-microsoft-did-the-right-thing-in-ditching-xp-for-ie9.ars">agree with the decision to cut Windows XP off</a>, one consequence is that not a single Internet Explorer 6 user can upgrade to Internet Explorer 9. Nor can anyone using Internet Explorer 7 or 8 on Windows XP. If the focus is narrowed from all users to just those using Windows 7, the Internet Explorer 9 situation looks a little more promising. Though Internet Explorer 8, which ships with Windows 7, commands the highest market share, at 38.47 percent of Windows 7 users, Internet Explorer 9 takes second place, at 15.61 percent—putting it ahead of Firefox 4 and Chrome 12, at 13.74 and 11.60 percent, respectively.</p>
|
||||
|
||||
<p>Internet Explorer 9 seems, therefore, to be performing well among users of Microsoft's latest and greatest operating system; it's just that only 27 percent of the global audience is running that platform. Windows XP still commands a slim majority, with a global share of 51 percent. As Windows XP declines and Windows 7 grows, we can expect to see Internet Explorer 9 lifted by this transition.</p>
|
||||
|
||||
<div style="width: 640px;" class="news-item-figure CenteredImage"><div class="news-item-figure-image" style=""><img src="http://static.arstechnica.com/browsers-june-2011/firefox-transition.png" /></div><div class="news-item-figure-caption"><div class="news-item-figure-caption-byline"><a href="http://netmarketshare.com/">Net Applications</a></div></div></div>
|
||||
|
||||
<p>Firefox versions 3.5 and 3.6 both saw drops last month, by 2.06 and 0.28 points, respectively, and versions 4 and 5 rose by 0.38 and 2.05 points, respectively. This suggests that the transition from "old" Firefox (3.x) to "modern" Firefox (4 and 5) is slowing down; in May, the 3.x versions dropped by an aggregate of more than 4.5 points, with the then-current Firefox 4 picking up all of those users. This month, only around half as many users made the switch. Though "modern" Firefox versions are now used by a majority of Firefox users, it looks like a hard core of "old" users is going to stick around. Over the next few months, we can expect Firefox 3.5 to decline more heavily, as Mozilla intends to push out a patch that will upgrade users to the newest 3.6 version.</p>
|
||||
|
||||
<div style="width: 640px;" class="news-item-figure CenteredImage"><div class="news-item-figure-image" style=""><img src="http://static.arstechnica.com/browsers-june-2011/chrome-transition.png" /></div><div class="news-item-figure-caption"><div class="news-item-figure-caption-byline"><a href="http://netmarketshare.com/">Net Applications</a></div></div></div>
|
||||
|
||||
<p>Chrome as ever shows rapid migration between versions. Over the course of June, the browser's stable version went from 11 to 12, and the rapid cutover we've grown to expect occurred. However, that transition isn't complete. 1.39 percent of users are on Chrome 10 or older, and it looks like Google's generally seamless automatic upgrades aren't touching these users. The source of these users isn't clear, though there a few plausible explanations. Obviously, some individuals and corporate users may simply have opted to disable the updates. Automatic updating is the default, but it can be turned off. Though this gives these users and enterprises greater control over the browser version they're using, this comes at some risk; Google doesn't have security updates for old versions of Chrome, so these people are using browsers with known exploitable flaws.</p>
|
||||
|
||||
<p>Chrome's automatic updating is also dependent on a system service. Though the browser can be installed by non-administrators, installation of the service requires administrator privileges. Unlike Firefox, which checks for and performs updates within the browser itself, Chrome depends on its service to do this task. If the service doesn't exist, updates don't happen.</p>
|
||||
|
||||
<p>That's probably not enough to account for every legacy Chrome user, however. To do that, we probably have to look towards the East Asian market. A long-standing feature of various markets in the region, most notably China and South Korea, is the entrenchment of Internet Explorer, variously attributed to legal mandates (especially in South Korea, where until last year a specific ActiveX control was required for online banking) and widespread software piracy making users reluctant to use Windows Update (even though Internet Explorer upgrades are available to pirated copies of the operating system).</p>
|
||||
|
||||
<p>To support this market, a range of browsers based on Internet Explorer's rendering engine, but with substantially greater features, sprung up. The <a href="http://data.cnzz.com/main.php?s=brow">most popular</a> of these are <a href="http://se.360.cn/">360 Secure Browser</a> with about 19 percent share of the Chinese market, and <a href="http://ie.sogou.com/">Sogou high speed browser</a>, with a little under 6 percent. Though these browsers originally just used the Trident engine that powers Internet Explorer, recent versions extend this by also embedding Chrome. In so doing, they give their users a choice between a relatively modern Chrome browser engine, and the older Internet Explorer engine needed for compatibility. Conceptually, this is very similar to software like <a href="http://code.google.com/chrome/chromeframe/">Chrome Frame</a>, that allows Internet Explorer users to use Chrome for some browser tabs.</p>
|
||||
|
||||
<div style="width: 640px;" class="news-item-figure CenteredImage"><div class="news-item-figure-image" style=""><a href="http://static.arstechnica.com/browsers-june-2011/sogou-ie.png"><img src="http://static.arstechnica.com/browsers-june-2011/thumb-sogou-ie.png" /></a></div><div class="news-item-figure-caption"><div class="news-item-figure-caption-text">Sogou browser running as Internet Explorer</div><div class="news-item-figure-caption-byline">Thanks to Ars reader WJ</div></div></div>
|
||||
|
||||
<p>These dual-engine browsers tend to modify Chrome in several ways, one of which is that they exclude Google's automatic update service. They also tend to embed stale versions of Chrome; the current Sogou uses Chrome 6. The result is that users of these browsers, who may well prefer using Chrome for day-to-day browsing, will be stuck with obsolete versions of the browser. And because of the way they're using Chrome, they're out of reach of Google's update system.</p>
|
||||
|
||||
<div style="width: 640px;" class="news-item-figure CenteredImage"><div class="news-item-figure-image" style=""><a href="http://static.arstechnica.com/browsers-june-2011/sogou-chrome.png"><img src="http://static.arstechnica.com/browsers-june-2011/thumb-sogou-chrome.png" /></a></div><div class="news-item-figure-caption"><div class="news-item-figure-caption-text">Sogou browser using its embedded Chrome</div><div class="news-item-figure-caption-byline">Thanks to Ars reader WJ</div></div></div>
|
||||
|
||||
<p>The net result of these various usage scenarios is that Chrome's non-upgrading userbase is likely to grow ever larger, with ten percent of Chrome users, and climbing, sticking with versions of the browser that are no longer supported.</p>
|
||||
|
||||
<div style="width: 640px;" class="news-item-figure CenteredImage"><div class="news-item-figure-image" style=""><img src="http://static.arstechnica.com/browsers-june-2011/ars-browser-share.png" /></div><div class="news-item-figure-caption"><div class="news-item-figure-caption-byline">Ars Technica</div></div></div>
|
||||
|
||||
<p>Ars' audience continues to show marked differences from the Internet's norms. Firefox, Safari, Internet Explorer, and Opera all saw drops, of 0.94, 0.37, 0.04, and 0.10 points respectively; Chrome saw gains of 0.88 points, with the remainder of the difference picked up by "other."</p>
|
||||
|
||||
|
||||
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
<!-- Article Pager -->
|
||||
|
||||
|
||||
|
||||
</div>
|
||||
|
||||
<noscript>
|
||||
<img style="position: absolute; bottom: 0px; right: 0px; width: 1px; height: 1px;" src="http://arstechnica.com/dragons/brains.gif?id=51247&1396906973" alt="" />
|
||||
</noscript>
|
||||
<script type="text/javascript">
|
||||
document.write('<img style="position: absolute; bottom: 0px; right: 0px; width: 1px; height: 1px;" src="http://arstechnica.com/dragons/brains.gif?id=51247&' + (parseInt(Math.random()*99999999, 10)).toString() + '" alt="" />');
|
||||
</script>
|
||||
|
||||
|
||||
<!--googleoff: all-->
|
||||
|
||||
<div id="comments-bar" class="with-bubble">
|
||||
<h2>User comments</h2>
|
||||
|
||||
<div class="comments-link">
|
||||
<a name="comments-bar" rel="nofollow" href="/web/news/2011/07/june-browser-stats-rapid-release-edition.ars?comments=1#comments-bar">Click here to view the 81 comments on this story</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="hiddencomment"></div>
|
||||
<!--<div id="alert"><p><img src="http://arstechnica.com/civis/images/smilies/flail.gif" /> We're making some updates to the commenting system. We should have the kinks worked out soon.</p></div>-->
|
||||
<!--googleon: all-->
|
||||
<div id="links-bar">
|
||||
<ul>
|
||||
|
||||
|
||||
<li class="facebook">
|
||||
<iclint src="http://www.facebook.com/plugins/like.php?href=http%3A%2F%2Farstechnica.com%2Fweb%2Fnews%2F2011%2F07%2Fjune-browser-stats-rapid-release-edition.ars&layout=button_count&show_faces=false&width=85&action=like&font=arial&colorscheme=light&height=21" scrolling="no" frameborder="0" style="border:none; overflow:hidden; width:85px; height:21px;" allowTransparency="true"></iclint>
|
||||
</li>
|
||||
|
||||
|
||||
<li><a href="http://twitter.com/share" class="twitter-share-button" data-url="http://arst.ch/q4c" data-counturl="http://arstechnica.com/web/news/2011/07/june-browser-stats-rapid-release-edition.ars" data-count="horizontal" data-via="arstechnica" data-related="drpizza:Peter Bright">Tweet</a></li>
|
||||
|
||||
<li class="reddit">
|
||||
<iclint src="http://www.reddit.com/static/button/button1.html?width=120&url=http%3A%2F%2Farstechnica.com%2Fweb%2Fnews%2F2011%2F07%2Fjune-browser-stats-rapid-release-edition.ars&title=June%20Web%20browser%20stats%3A%20Rapid%20Release%20edition&bgcolor=fff&bordercolor=eee" width="120" height="20" scrolling="no" frameborder="0"></iclint>
|
||||
</li>
|
||||
|
||||
<li class="share">
|
||||
<a class="a2a_dd" href="http://www.addtoany.com/share_save?linkname=June%20Web%20browser%20stats%3A%20Rapid%20Release%20edition&linkurl=http%3A%2F%2Farstechnica.com%2Fweb%2Fnews%2F2011%2F07%2Fjune-browser-stats-rapid-release-edition.ars"><img src="http://static.addtoany.com/buttons/favicon.png" width="16" height="16" border="0" alt="Share/Bookmark" style="display:inline;vertical-align:middle;"/> Share/Email</a>
|
||||
<script type="text/javascript">
|
||||
var a2a_linkname="June Web browser stats: Rapid Release edition",
|
||||
a2a_linkurl="http://arstechnica.com/web/news/2011/07/june-browser-stats-rapid-release-edition.ars",
|
||||
a2a_onclick=1,
|
||||
a2a_show_title=1,
|
||||
a2a_hide_embeds=0,
|
||||
a2a_num_services=8,
|
||||
a2a_color_main="989EA3",
|
||||
a2a_color_border="989EA3",
|
||||
a2a_color_link_text="FF5B00",
|
||||
a2a_color_link_text_hover="ffffff",
|
||||
a2a_track_links='ga',
|
||||
a2a_prioritize= [
|
||||
"digg",
|
||||
"yahoo_buzz",
|
||||
"stumbleupon",
|
||||
"instapaper",
|
||||
"slashdot",
|
||||
"linkedin",
|
||||
"delicious",
|
||||
"google_reader",
|
||||
"tumblr",
|
||||
"posterous"
|
||||
];
|
||||
var a2a_config = a2a_config || {};
|
||||
a2a_config.no_3p = 1;
|
||||
</script>
|
||||
<style type="text/css">#a2apage_BROWSER { display:none !important; }</style>
|
||||
</li>
|
||||
<li class="copypasta copy-pasta-button">Make a correction</li>
|
||||
|
||||
</ul>
|
||||
</div>
|
||||
<!--googleoff: all-->
|
||||
<div id="read-more-stories">
|
||||
<h2>Read more stories</h2>
|
||||
<div class="story-navigation">
|
||||
<a href="/gadgets/news/2011/07/amazon-appstore-game-developer-pulls-app-highlights-problems.ars" title="Read the previously published article">< Older Story</a>
|
||||
|
||||
|
|
||||
<a href="/tech-policy/news/2011/07/copyright-troll-righthaven-now-starts-paying-those-it-sued.ars" title="Read the next newest article">Newer Story ></a>
|
||||
</div>
|
||||
<!--googleoff: all-->
|
||||
<script language='JavaScript'>
|
||||
var OB_langJS = "http://static.arstechnica.net//public/v6/scripts/outbrain.lang_en_ars.js",OBITm = '1306449288604',OB_raterMode = 'singlethumb',OB_recMode = 'strip',OutbrainPermaLink='http://arstechnica.com/web/news/2011/07/june-browser-stats-rapid-release-edition.ars';
|
||||
if (typeof(OB_Script)!='undefined' ){OutbrainStart();}else{var OB_Script = true,str = unescape("%3Cscript src=\'http://widgets.outbrain.com/OutbrainRater.js\' type=\'text/javascript\'%3E%3C/script%3E");document.write(str);}
|
||||
</script>
|
||||
<!--googleon: all-->
|
||||
</div>
|
||||
<!--googleon: all-->
|
||||
|
||||
</div>
|
||||
</div>
|
||||
<!--googleoff: all-->
|
||||
<div id="sidebar">
|
||||
|
||||
<div id="article-links" class="with-divider" style="display:none;">
|
||||
|
||||
<ul>
|
||||
<li class="enlarge-text"><a href="#">Increase text size</a></li>
|
||||
<li class="shrink-text"><a href="#">Reduce text size</a></li>
|
||||
<li class="print"><a href="#">Print this story</a></li>
|
||||
|
||||
<li class="comment"><a href="/web/news/2011/07/june-browser-stats-rapid-release-edition.ars?comments=1#comments-bar#comments-bar">Leave a comment (81)</a></li>
|
||||
<li class="copy-pasta-button edit-suggestion" style="display: none;"><a href="#">Make a correction</a></li>
|
||||
<li class="shorturl"><a rel="nofollow" href="http://arst.ch/q4c">http://arst.ch/q4c</a></li>
|
||||
|
||||
</ul>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
<style type="text/css" media="screen">
|
||||
#gwmdRBfSihEbZa {
|
||||
height: 250px;
|
||||
width: 300px;
|
||||
min-height: 250px;
|
||||
margin-bottom: 10px;
|
||||
padding-bottom: 10px;
|
||||
}
|
||||
#gwmdRBfSihEbZa.tall {
|
||||
height: 600px;
|
||||
}
|
||||
|
||||
body.premium-adset #gwmdRBfSihEbZa {
|
||||
/* height: 600px; */
|
||||
}
|
||||
</style>
|
||||
|
||||
<abbr></abbr>
|
||||
<blah></blah>
|
||||
<abbr></abbr>
|
||||
|
||||
<div id="gwmdRBfSihEbZa" class="">
|
||||
|
||||
<noscript>
|
||||
<div id="help-by-subscribing">
|
||||
<a href="/gadgets/news/2011/07/dual-core-motorola-droid-3-launches-july-14-for-199-on-verizon.ars/2"><img src="/gadgets/news/2011/07/dual-core-motorola-droid-3-launches-july-14-for-199-on-verizon.ars/4" alt="Please subscribe" /></a></div>
|
||||
</noscript>
|
||||
|
||||
<script type="text/javascript">
|
||||
try {
|
||||
var ppanel = cnp.ad.create(cnp.ad.refreshable, false);
|
||||
ppanel.addParameter({'sz':'300x250'});
|
||||
ppanel.addParameterString('kw=top;kw=june-browser-stats-rapid-release-edition;kw=07;kw=2011;kw=news;kw=web;');
|
||||
ppanel.load();
|
||||
} catch(e) {}
|
||||
</script>
|
||||
</div>
|
||||
<div id="journals-box" class="with-divider">
|
||||
<h2 class="title">Latest Top Stories</h2>
|
||||
<ul class="category">
|
||||
<li class="all selected">
|
||||
<span class="tab-inner">
|
||||
<a href="/" title="All">All</a>
|
||||
</span>
|
||||
</li>
|
||||
<li class="apple">
|
||||
<span class="tab-inner">
|
||||
<a href="/apple/" title="Apple">Apple</a>
|
||||
</span>
|
||||
</li>
|
||||
<li class="gaming">
|
||||
<span class="tab-inner">
|
||||
<a href="/gaming/" title="Gaming">Gaming</a>
|
||||
</span>
|
||||
</li>
|
||||
<li class="microsoft">
|
||||
<span class="tab-inner">
|
||||
<a href="/microsoft/" title="Microsoft">Microsoft</a>
|
||||
</span>
|
||||
</li>
|
||||
<li class="gadgets">
|
||||
<span class="tab-inner">
|
||||
<a href="/gadgets/" title="Gadgets">Gadgets</a>
|
||||
</span>
|
||||
</li>
|
||||
<li class="open-source">
|
||||
<span class="tab-inner">
|
||||
<a href="/open-source/" title="Open Source">Open Source</a>
|
||||
</span>
|
||||
</li>
|
||||
<li class="business">
|
||||
<span class="tab-inner">
|
||||
<a href="/business/" title="Business">Business</a>
|
||||
</span>
|
||||
</li>
|
||||
<li class="science">
|
||||
<span class="tab-inner">
|
||||
<a href="/science/" title="Science">Science</a>
|
||||
</span>
|
||||
</li>
|
||||
<li class="tech-policy">
|
||||
<span class="tab-inner">
|
||||
<a href="/tech-policy/" title="Tech Policy">Tech Policy</a>
|
||||
</span>
|
||||
</li>
|
||||
<li class="staff">
|
||||
<span class="tab-inner">
|
||||
<a href="/staff/" title="Staff">Staff</a>
|
||||
</span>
|
||||
</li>
|
||||
</ul>
|
||||
<ul class="stories">
|
||||
<li id="journal-box-0" class="gadgets">
|
||||
<a href="/gadgets/news/2011/07/dual-core-motorola-droid-3-launches-july-14-for-199-on-verizon.ars">Dual-core Motorola Droid 3 launches July 14 for $199 on Verizon</a>
|
||||
</li>
|
||||
<li id="journal-box-1" class="tech-policy">
|
||||
<a href="/tech-policy/news/2011/07/major-isps-agree-to-six-strikes-copyright-enforcement-plan.ars">Major ISPs agree to "six strikes" copyright enforcement plan</a>
|
||||
</li>
|
||||
<li id="journal-box-2" class="gaming">
|
||||
<a href="/gaming/news/2011/07/sony-to-include-mandatory-psn-pass-codes-in-first-party-games.ars">Sony to include one-time use "PSN Pass" code in its games</a>
|
||||
</li>
|
||||
<li id="journal-box-3" class="gaming">
|
||||
<a href="/gaming/news/2011/07/journey-turns-strangers-to-friends-in-odd-desolate-landscape.ars"><em>Journey</em> turns strangers into friends in odd, desolate landscape</a>
|
||||
</li>
|
||||
<li id="journal-box-4" class="science">
|
||||
<a href="/science/news/2011/07/is-science-getting-harder-first-define-easy.ars">Is scientific progress slowing? Depends how you measure it</a>
|
||||
</li>
|
||||
<li id="journal-box-5" class="tech-policy">
|
||||
<a href="/tech-policy/news/2011/07/did-the-titanic-disaster-let-uncle-sam-take-over-the-airwaves.ars">How the <em>Titanic</em> disaster pushed Uncle Sam to "rule the air"</a>
|
||||
</li>
|
||||
<li id="journal-box-6" class="web">
|
||||
<a href="/web/news/2011/07/facebook-video-chatting-handy-definitely-not-awesome.ars">Analysis: Facebook video chatting handy, definitely not "awesome"</a>
|
||||
</li>
|
||||
<li id="journal-box-7" class="tech-policy">
|
||||
<a href="/tech-policy/news/2011/07/dozens-of-law-professors-protect-ip-act-is-unconstitutional.ars">Dozens of law professors: PROTECT IP Act is unconstitutional</a>
|
||||
</li>
|
||||
<li id="journal-box-8" class="tech-policy">
|
||||
<a href="/tech-policy/news/2011/07/should-net-neutrality-protect-third-party-mobile-tethering-apps.ars">Does net neutrality protect mobile tethering apps?</a>
|
||||
</li>
|
||||
<li id="journal-box-9" class="apple">
|
||||
<a href="/apple/news/2011/07/wsj-next-iphone-to-be-thinner-and-lighter-than-iphone-4.ars">WSJ: next iPhone to be "thinner and lighter" than iPhone 4</a>
|
||||
</li>
|
||||
<li id="journal-box-10" class="apple">
|
||||
<a href="/apple/news/2011/07/iphone-users-spend-147-hours-a-month-playing-games.ars">iPhone users spend 14.7 hours a month playing games</a>
|
||||
</li>
|
||||
<li id="journal-box-11" class="tech-policy">
|
||||
<a href="/tech-policy/news/2011/07/copyright-troll-righthaven-now-starts-paying-those-it-sued.ars">Copyright troll Righthaven now starts paying those it sued</a>
|
||||
</li>
|
||||
<li id="journal-box-12" class="web">
|
||||
<a href="/web/news/2011/07/june-browser-stats-rapid-release-edition.ars">June Web browser stats: Rapid Release edition</a>
|
||||
</li>
|
||||
<li id="journal-box-13" class="gadgets">
|
||||
<a href="/gadgets/news/2011/07/amazon-appstore-game-developer-pulls-app-highlights-problems.ars">Amazon Appstore problems: why one developer pulled its game</a>
|
||||
</li>
|
||||
<li id="journal-box-14" class="science">
|
||||
<a href="/science/news/2011/07/ocean-sediment-promising-source-of-rare-earth-metals.ars">Why ocean mud might matter to your future iPhone</a>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
<div class="with-divider" id="fb">
|
||||
<iclint src="http://www.facebook.com/plugins/likebox.php?href=http%3A%2F%2Ffacebook.com%2Farstechnica&width=300&colorscheme=light&show_faces=false&stream=false&header=false&height=62&border_color=%23FFFFFF" scrolling="no" frameborder="0" style="border:none; overflow:hidden; width:300px; height:62px;" allowTransparency="true"></iclint>
|
||||
<iclint src="http://www.facebook.com/plugins/activity.php?site=arstechnica.com&width=300&height=370&header=false&colorscheme=light&recommendations=false&border_color=%23FFFFFF" scrolling="no" frameborder="0" style="border:none; overflow:hidden; width:300px; height:370px;" allowTransparency="true"></iclint>
|
||||
|
||||
<p><a href="#" class="anonymous">Disable Facebook on Ars</a></p>
|
||||
</div>
|
||||
<style type="text/css" media="screen">
|
||||
#mieBfNdjZYK {
|
||||
height: 250px;
|
||||
width: 300px;
|
||||
min-height: 250px;
|
||||
margin-bottom: 10px;
|
||||
padding-bottom: 10px;
|
||||
}
|
||||
#mieBfNdjZYK.tall {
|
||||
height: 600px;
|
||||
}
|
||||
|
||||
body.premium-adset #mieBfNdjZYK {
|
||||
/* height: 600px; */
|
||||
}
|
||||
</style>
|
||||
|
||||
<kjaskjas></kjaskjas>
|
||||
<blah></blah>
|
||||
<sakjasd></sakjasd>
|
||||
<div></div>
|
||||
<kjaskjas></kjaskjas>
|
||||
<div></div>
|
||||
<span></span>
|
||||
<clint></clint>
|
||||
|
||||
<div id="mieBfNdjZYK" class="">
|
||||
|
||||
<noscript>
|
||||
<div id="help-by-subscribing">
|
||||
<a href="/gadgets/news/2011/07/dual-core-motorola-droid-3-launches-july-14-for-199-on-verizon.ars/2"><img src="/gadgets/news/2011/07/dual-core-motorola-droid-3-launches-july-14-for-199-on-verizon.ars/4" alt="Please subscribe" /></a></div>
|
||||
</noscript>
|
||||
|
||||
<script type="text/javascript">
|
||||
try {
|
||||
var ppanel = cnp.ad.create(cnp.ad.refreshable, false);
|
||||
ppanel.addParameter({'sz':'300x250'});
|
||||
ppanel.addParameterString('kw=bottom;kw=june-browser-stats-rapid-release-edition;kw=07;kw=2011;kw=news;kw=web;');
|
||||
ppanel.load();
|
||||
} catch(e) {}
|
||||
</script>
|
||||
</div>
|
||||
<div id="jobs-ars" class="with-divider">
|
||||
<h2 class="title">
|
||||
<span class="title">Job.Ars</span>:
|
||||
<span class="subtitle">looking for a new job?</span>
|
||||
</h2>
|
||||
<div class="body">
|
||||
<ul>
|
||||
<div id="jobs-ars-content">
|
||||
<ul>
|
||||
|
||||
<li>
|
||||
<div class="job-title"><a href="//jobs.arstechnica.com/list/1027/">Software Engineer</a> at minerva-associates.com</div>
|
||||
<div class="job-location">San Diego, CA</div>
|
||||
</li>
|
||||
|
||||
<li>
|
||||
<div class="job-title"><a href="//jobs.arstechnica.com/list/1026/">Software Engineer</a> at minerva-associates.com</div>
|
||||
<div class="job-location">San Diego, CA</div>
|
||||
</li>
|
||||
|
||||
<li>
|
||||
<div class="job-title"><a href="//jobs.arstechnica.com/list/1025/">Senior Java / Scala Developer - Sequencing Informatics </a> at The Broad Institute</div>
|
||||
<div class="job-location">Cambridge, MA</div>
|
||||
</li>
|
||||
|
||||
<li>
|
||||
<div class="job-title"><a href="//jobs.arstechnica.com/list/1024/">Senior Java / Scala Developer - Sequencing Informatics </a> at The Broad Institute</div>
|
||||
<div class="job-location">Cambridge, MA</div>
|
||||
</li>
|
||||
|
||||
<li>
|
||||
<div class="job-title"><a href="//jobs.arstechnica.com/list/1022/">Web Developer for Online Organizing Incubator</a> at Citizen Engagement Laboratory</div>
|
||||
<div class="job-location">San Francisco Bay Area required</div>
|
||||
</li>
|
||||
|
||||
<li>
|
||||
<div class="job-title"><a href="//jobs.arstechnica.com/list/1021/">.NET Developer (Oklahoma City & Salt Lake City) </a> at a la mode, inc.</div>
|
||||
<div class="job-location">Oklahoma City and Salt Lake City</div>
|
||||
</li>
|
||||
|
||||
<li>
|
||||
<div class="job-title"><a href="//jobs.arstechnica.com/list/1019/">Senior Systems Administrator</a> at Synacor</div>
|
||||
<div class="job-location">Buffalo, NY</div>
|
||||
</li>
|
||||
|
||||
<li>
|
||||
<div class="job-title"><a href="//jobs.arstechnica.com/list/1018/">Network Engineer</a> at Box.net</div>
|
||||
<div class="job-location">Palo Alto, CA</div>
|
||||
</li>
|
||||
|
||||
<li>
|
||||
<div class="job-title"><a href="//jobs.arstechnica.com/list/1017/">Software Engineer - Operations</a> at imo</div>
|
||||
<div class="job-location">Palo Alto, CA</div>
|
||||
</li>
|
||||
|
||||
<li>
|
||||
<div class="job-title"><a href="//jobs.arstechnica.com/list/1016/">Software Engineer</a> at imo</div>
|
||||
<div class="job-location">Palo Alto, CA</div>
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
<div id="more-jobs"><a href="//jobs.arstechnica.com">More Job Listings</a></div>
|
||||
</div> </ul>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<!--googleon: all-->
|
||||
</div>
|
||||
<div id="footer">
|
||||
<div id="slogan">Serving the technologist for <span id="decades">1</span> × 10<sup>-1</sup> centuries</div>
|
||||
<iframe src="http://static.arstechnica.net//public/v6/footer.html?1309476727" frameborder="0" scrolling="no" width="1000" height="350"></iframe>
|
||||
</div>
|
||||
</div>
|
||||
<script type="text/javascript">
|
||||
var _gaq = _gaq || [];
|
||||
_gaq.push(['_setAccount', 'UA-31997-1']);
|
||||
_gaq.push(['_trackPageview']);
|
||||
_gaq.push(['_trackPageLoadTime']);
|
||||
(function() {
|
||||
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
|
||||
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
|
||||
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
|
||||
})();
|
||||
|
||||
</script>
|
||||
|
||||
|
||||
<script type="text/javascript">
|
||||
var page_class = 'individual',
|
||||
site_root = "",
|
||||
site_root_rel = '/',
|
||||
discussion_url = "",
|
||||
entry_author = {
|
||||
"peter bright":true,
|
||||
"peter bright":true,
|
||||
"drpizza":true
|
||||
},
|
||||
entry_id = 51247,
|
||||
fp_layout = 'normal',
|
||||
syntaxhighlighter = "http://arstechnica.com/public/full/scripts/syntaxhighlighter.js",
|
||||
new_comments = true,
|
||||
disable_fb = 'false';
|
||||
</script>
|
||||
|
||||
|
||||
<script src="http://static.arstechnica.net//public/v6/scripts/site.min.js?1309476727" type="text/javascript" charset="utf-8"></script>
|
||||
|
||||
<noscript>
|
||||
<img src="http://b.scorecardresearch.com/b?c1=2&c2=6035094&c3=&c4=&c5=&c6=&c15=&cv=1.3&cj=1" style="position:absolute; bottom: 0px; right:0px;"
|
||||
width="1" height="1" alt="" />
|
||||
</noscript>
|
||||
|
||||
<span style="display: none" id="ArsTechnicaNews" class="hslice">
|
||||
<span style="display: none" class="entry-title">Ars Technica News</span>
|
||||
<a style="display: none" href="http://www.ieaddons.com/en/ie8slice/Content.ashx?id=330" rel="entry-content"></a>
|
||||
</span>
|
||||
</body>
|
||||
</html>
|
@ -0,0 +1,53 @@
|
||||
<div id="article"><div id="" class="body">
|
||||
<p>June brought the first result of Mozilla's new Rapid Release strategy for Firefox. Firefox 4, just three months old, was superceded by the all-new but not-too-different <a href="http://arstechnica.com/open-source/news/2011/06/firefox-5-released-arrives-only-three-months-after-firefox-4.ars">Firefox 5</a>. Firefox's market growth was all but ended by the release of Chrome, and Mozilla is hoping that by adopting a similar release schedule to Google, it will be able to reignite the growth of its user base.</p>
|
||||
|
||||
<p>Internet Explorer is down 0.59 points at 53.68 percent. Firefox is essentially unchanged, down 0.04 points to 21.67 percent. Chrome is up 0.59 points to 13.11 percent. Safari is also up, gaining 0.2 points to reach 7.48 percent. Opera dropped 0.3 points to 1.73 percent.</p>
|
||||
|
||||
<div class="news-item-figure CenteredImage"><div class="news-item-figure-image"><img src="http://static.arstechnica.com/browsers-june-2011/global-browser-share.png"/></div></div>
|
||||
<p>The trends established over the last few months are continuing: Firefox is treading water, while Internet Explorer is losing users, which seem to be being picked up by Chrome. In the past two months, Opera has dropped 0.41 points—that's a loss representing 20% of its market share. Our own Ryan Paul <a href="http://arstechnica.com/software/reviews/2011/06/hands-on-opera-1150s-new-featherweight-interface-packs-a-punch.ars">liked Opera 11.50</a>, which was released just a couple of days ago, so perhaps this will help turn around a perilous slide.</p>
|
||||
|
||||
<p>Looking at individual versions, Internet Explorer 6, 7, and 8 are all down, by 0.18, 0.46, and 1.21 points respectively. Internet Explorer 9 made strong gains, of 1.44 points, but not enough to undo the losses. Internet Explorer 9's gains seem to be occurring at the expense of older versions—Internet Explorer 8 on Windows 7, versions 7 and 8 on Windows Vista—rather than making converts of the other browsers.</p>
|
||||
|
||||
<div class="news-item-figure CenteredImage"><div class="news-item-figure-image"><img src="http://static.arstechnica.com/browsers-june-2011/internet-explorer-transition.png"/></div></div>
|
||||
|
||||
<p>Internet Explorer 9 is of course at something of a disadvantage, as it won't run on Windows XP. While we <a href="http://arstechnica.com/microsoft/news/2010/04/why-microsoft-did-the-right-thing-in-ditching-xp-for-ie9.ars">agree with the decision to cut Windows XP off</a>, one consequence is that not a single Internet Explorer 6 user can upgrade to Internet Explorer 9. Nor can anyone using Internet Explorer 7 or 8 on Windows XP. If the focus is narrowed from all users to just those using Windows 7, the Internet Explorer 9 situation looks a little more promising. Though Internet Explorer 8, which ships with Windows 7, commands the highest market share, at 38.47 percent of Windows 7 users, Internet Explorer 9 takes second place, at 15.61 percent—putting it ahead of Firefox 4 and Chrome 12, at 13.74 and 11.60 percent, respectively.</p>
|
||||
|
||||
<p>Internet Explorer 9 seems, therefore, to be performing well among users of Microsoft's latest and greatest operating system; it's just that only 27 percent of the global audience is running that platform. Windows XP still commands a slim majority, with a global share of 51 percent. As Windows XP declines and Windows 7 grows, we can expect to see Internet Explorer 9 lifted by this transition.</p>
|
||||
|
||||
<div class="news-item-figure CenteredImage"><div class="news-item-figure-image"><img src="http://static.arstechnica.com/browsers-june-2011/firefox-transition.png"/></div></div>
|
||||
|
||||
<p>Firefox versions 3.5 and 3.6 both saw drops last month, by 2.06 and 0.28 points, respectively, and versions 4 and 5 rose by 0.38 and 2.05 points, respectively. This suggests that the transition from "old" Firefox (3.x) to "modern" Firefox (4 and 5) is slowing down; in May, the 3.x versions dropped by an aggregate of more than 4.5 points, with the then-current Firefox 4 picking up all of those users. This month, only around half as many users made the switch. Though "modern" Firefox versions are now used by a majority of Firefox users, it looks like a hard core of "old" users is going to stick around. Over the next few months, we can expect Firefox 3.5 to decline more heavily, as Mozilla intends to push out a patch that will upgrade users to the newest 3.6 version.</p>
|
||||
|
||||
<div class="news-item-figure CenteredImage"><div class="news-item-figure-image"><img src="http://static.arstechnica.com/browsers-june-2011/chrome-transition.png"/></div></div>
|
||||
|
||||
<p>Chrome as ever shows rapid migration between versions. Over the course of June, the browser's stable version went from 11 to 12, and the rapid cutover we've grown to expect occurred. However, that transition isn't complete. 1.39 percent of users are on Chrome 10 or older, and it looks like Google's generally seamless automatic upgrades aren't touching these users. The source of these users isn't clear, though there a few plausible explanations. Obviously, some individuals and corporate users may simply have opted to disable the updates. Automatic updating is the default, but it can be turned off. Though this gives these users and enterprises greater control over the browser version they're using, this comes at some risk; Google doesn't have security updates for old versions of Chrome, so these people are using browsers with known exploitable flaws.</p>
|
||||
|
||||
<p>Chrome's automatic updating is also dependent on a system service. Though the browser can be installed by non-administrators, installation of the service requires administrator privileges. Unlike Firefox, which checks for and performs updates within the browser itself, Chrome depends on its service to do this task. If the service doesn't exist, updates don't happen.</p>
|
||||
|
||||
<p>That's probably not enough to account for every legacy Chrome user, however. To do that, we probably have to look towards the East Asian market. A long-standing feature of various markets in the region, most notably China and South Korea, is the entrenchment of Internet Explorer, variously attributed to legal mandates (especially in South Korea, where until last year a specific ActiveX control was required for online banking) and widespread software piracy making users reluctant to use Windows Update (even though Internet Explorer upgrades are available to pirated copies of the operating system).</p>
|
||||
|
||||
<p>To support this market, a range of browsers based on Internet Explorer's rendering engine, but with substantially greater features, sprung up. The <a href="http://data.cnzz.com/main.php?s=brow">most popular</a> of these are <a href="http://se.360.cn/">360 Secure Browser</a> with about 19 percent share of the Chinese market, and <a href="http://ie.sogou.com/">Sogou high speed browser</a>, with a little under 6 percent. Though these browsers originally just used the Trident engine that powers Internet Explorer, recent versions extend this by also embedding Chrome. In so doing, they give their users a choice between a relatively modern Chrome browser engine, and the older Internet Explorer engine needed for compatibility. Conceptually, this is very similar to software like <a href="http://code.google.com/chrome/chromeframe/">Chrome Frame</a>, that allows Internet Explorer users to use Chrome for some browser tabs.</p>
|
||||
|
||||
<div class="news-item-figure CenteredImage"><div class="news-item-figure-image"><a href="http://static.arstechnica.com/browsers-june-2011/sogou-ie.png"><img src="http://static.arstechnica.com/browsers-june-2011/thumb-sogou-ie.png"/></a></div><div class="news-item-figure-caption"><p class="news-item-figure-caption-text">Sogou browser running as Internet Explorer</p><p class="news-item-figure-caption-byline">Thanks to Ars reader WJ</p></div></div>
|
||||
|
||||
<p>These dual-engine browsers tend to modify Chrome in several ways, one of which is that they exclude Google's automatic update service. They also tend to embed stale versions of Chrome; the current Sogou uses Chrome 6. The result is that users of these browsers, who may well prefer using Chrome for day-to-day browsing, will be stuck with obsolete versions of the browser. And because of the way they're using Chrome, they're out of reach of Google's update system.</p>
|
||||
|
||||
<div class="news-item-figure CenteredImage"><div class="news-item-figure-image"><a href="http://static.arstechnica.com/browsers-june-2011/sogou-chrome.png"><img src="http://static.arstechnica.com/browsers-june-2011/thumb-sogou-chrome.png"/></a></div><div class="news-item-figure-caption"><p class="news-item-figure-caption-text">Sogou browser using its embedded Chrome</p><p class="news-item-figure-caption-byline">Thanks to Ars reader WJ</p></div></div>
|
||||
|
||||
<p>The net result of these various usage scenarios is that Chrome's non-upgrading userbase is likely to grow ever larger, with ten percent of Chrome users, and climbing, sticking with versions of the browser that are no longer supported.</p>
|
||||
|
||||
<div class="news-item-figure CenteredImage"><div class="news-item-figure-image"><img src="http://static.arstechnica.com/browsers-june-2011/ars-browser-share.png"/></div></div>
|
||||
|
||||
<p>Ars' audience continues to show marked differences from the Internet's norms. Firefox, Safari, Internet Explorer, and Opera all saw drops, of 0.94, 0.37, 0.04, and 0.10 points respectively; Chrome saw gains of 0.88 points, with the remainder of the difference picked up by "other."</p>
|
||||
|
||||
|
||||
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</div>
|
@ -0,0 +1,2 @@
|
||||
test_description: standard article from arstechnica
|
||||
url: http://arstechnica.com/web/news/2011/07/june-browser-stats-rapid-release-edition.ars
|
@ -0,0 +1,52 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<title>A Simple Multi-Page Article For Testing : Page 2</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>A Simple Multi-Page Article For Testing : Page 2</h1>
|
||||
<p>
|
||||
Nunc non blandit velit. Maecenas suscipit sem sed velit tristique
|
||||
facilisis. Quisque condimentum, nisi vitae dictum euismod, diam
|
||||
risus vehicula nibh, in scelerisque lorem risus et risus. Aliquam
|
||||
erat volutpat. Pellentesque habitant morbi tristique senectus et
|
||||
netus et malesuada fames ac turpis egestas. Donec blandit venenatis
|
||||
feugiat. Ut quis turpis ac urna consectetur sagittis. Vestibulum
|
||||
aliquet eros et orci placerat vitae tempus tellus pretium. Quisque
|
||||
rutrum sapien quis nibh facilisis quis posuere ipsum elementum. In
|
||||
ac pretium justo. Sed egestas luctus mollis. Donec rutrum leo a
|
||||
turpis facilisis commodo. Nam quis quam eget mi malesuada
|
||||
scelerisque. Pellentesque semper condimentum sagittis. Nam
|
||||
lobortis, tortor ut placerat viverra, ante felis vehicula sem,
|
||||
blandit ultricies purus urna eget elit. Pellentesque habitant morbi
|
||||
tristique senectus et netus et malesuada fames ac turpis egestas.
|
||||
Sed vel nulla sollicitudin dolor adipiscing dapibus aliquam vitae
|
||||
leo. Phasellus at turpis tempus lectus pellentesque faucibus.
|
||||
</p>
|
||||
<p>
|
||||
Quisque egestas congue metus quis semper. Integer in ornare nunc.
|
||||
Nunc in est eget risus pulvinar tincidunt. Nullam eu tempus tortor.
|
||||
Suspendisse potenti. Aliquam erat volutpat. Praesent sem leo,
|
||||
molestie a dignissim eget, aliquet sit amet est. Suspendisse sed
|
||||
libero in urna tincidunt viverra. Maecenas posuere risus non elit
|
||||
adipiscing a tristique nibh aliquet. Nullam varius risus vitae
|
||||
turpis lacinia pharetra bibendum magna aliquam. Nam consectetur
|
||||
mattis lectus, vitae hendrerit lectus iaculis ut. Curabitur commodo
|
||||
pharetra nibh mollis pulvinar. Nulla in metus dui, vitae ultrices
|
||||
nibh. Cum sociis natoque penatibus et magnis dis parturient montes,
|
||||
nascetur ridiculus mus. Cras sed condimentum mi. Morbi vitae velit
|
||||
in neque tincidunt imperdiet quis quis orci. Proin molestie, erat
|
||||
convallis vulputate consectetur, diam odio interdum arcu, non
|
||||
semper neque ante a dolor.
|
||||
</p>
|
||||
<ul id="pageNumbers">
|
||||
<li> 1 </li>
|
||||
<li>
|
||||
<a title="Page 1" href="/article.html">1</a>
|
||||
</li>
|
||||
<li>
|
||||
<a title="Page 3" href="/article.html?pagewanted=3">3</a>
|
||||
</li>
|
||||
</ul>
|
||||
</body>
|
||||
</html>
|
@ -0,0 +1,60 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<title>A Simple Multi-Page Article For Testing : Page 3</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>A Simple Multi-Page Article For Testing : Page 3</h1>
|
||||
<p>
|
||||
Nullam laoreet, nibh non faucibus dictum, tellus libero varius
|
||||
erat, lobortis varius est massa quis metus. Donec vitae justo
|
||||
lacus, nec convallis metus. Suspendisse potenti. Nunc et rutrum
|
||||
justo. Maecenas ultrices ipsum in magna fermentum eleifend. Fusce
|
||||
sagittis pretium aliquam. Vestibulum et gravida lorem. Sed turpis
|
||||
quam, placerat ac ultrices eu, tempor sit amet elit. Curabitur eu
|
||||
imperdiet velit. Quisque pharetra ornare nunc, a volutpat metus
|
||||
aliquam quis. Vivamus semper aliquam cursus. Nullam ac nibh nulla,
|
||||
luctus pharetra nunc. Etiam ut sapien sem. Fusce vehicula, sem sit
|
||||
amet viverra pretium, magna tortor suscipit nisi, id interdum lorem
|
||||
orci in tellus. Vivamus vel ipsum eros. Fusce porttitor convallis
|
||||
ultricies. Etiam in risus diam, viverra suscipit felis. Duis vitae
|
||||
imperdiet est.
|
||||
</p>
|
||||
<p>
|
||||
Nunc nunc magna, facilisis blandit venenatis ut, scelerisque ac
|
||||
tortor. Cras condimentum fermentum lectus ac convallis. Suspendisse
|
||||
cursus, lacus sit amet sodales molestie, dui erat varius velit, non
|
||||
tincidunt metus dui sed nulla. Aliquam lacus orci, convallis ut
|
||||
pellentesque ac, molestie et dolor. Ut pretium enim ut nunc auctor
|
||||
eget placerat magna luctus. Duis mollis ligula a orci ultrices in
|
||||
facilisis felis feugiat. Morbi eget odio eget erat pulvinar
|
||||
placerat sed nec erat. Duis dignissim, dolor a lacinia commodo,
|
||||
metus erat laoreet dui, in lacinia felis lacus vitae nulla. Fusce
|
||||
imperdiet condimentum volutpat. Vivamus ut lacus a eros cursus
|
||||
scelerisque non sit amet orci. Phasellus id quam odio. Nulla
|
||||
adipiscing venenatis lorem nec feugiat. Aenean sit amet nisl odio,
|
||||
tincidunt scelerisque nisl. Curabitur ut nisl a dui facilisis
|
||||
vulputate. Mauris eu elit et felis hendrerit blandit. Cras magna
|
||||
dolor, imperdiet eget rutrum tempus, euismod nec augue.
|
||||
</p>
|
||||
<p>
|
||||
Ut in sem sit amet felis scelerisque elementum. Suspendisse vitae
|
||||
neque magna, in laoreet felis. Aenean elit ligula, tempor in
|
||||
vestibulum ac, porttitor nec lacus. Aenean urna mi, dictum feugiat
|
||||
placerat eget, congue nec dolor. Etiam pellentesque dictum nulla id
|
||||
vulputate. Etiam sit amet vehicula purus. Integer quis mi nisl,
|
||||
gravida malesuada enim. Donec malesuada felis nisi. Etiam id magna
|
||||
a libero pulvinar ullamcorper in nec neque. Duis pulvinar massa nec
|
||||
magna scelerisque vitae vulputate ipsum luctus.
|
||||
</p>
|
||||
<ul id="pageNumbers">
|
||||
<li> 1 </li>
|
||||
<li>
|
||||
<a title="Page 1" href="/article.html">1</a>
|
||||
</li>
|
||||
<li>
|
||||
<a title="Page 2" href="/article.html?pagewanted=2">2</a>
|
||||
</li>
|
||||
</ul>
|
||||
</body>
|
||||
</html>
|
@ -0,0 +1,123 @@
|
||||
<div id="article">
|
||||
<h1>A Simple Multi-Page Article For Testing</h1>
|
||||
<p>
|
||||
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla et
|
||||
laoreet ligula. Nulla facilisi. Morbi condimentum molestie enim in
|
||||
fermentum. Phasellus sit amet vehicula turpis. Sed eu dolor tortor,
|
||||
et accumsan purus. Aliquam velit nisl, facilisis quis suscipit in,
|
||||
porttitor at lorem. Ut adipiscing suscipit augue, id interdum arcu
|
||||
ultricies et. Etiam risus sapien, suscipit et ultricies vel,
|
||||
suscipit posuere velit. Proin est orci, sollicitudin at luctus
|
||||
feugiat, consectetur a justo. Etiam nec sem vel massa consectetur
|
||||
vulputate non interdum est. Donec sem dui, ultricies a adipiscing
|
||||
eu, placerat sed sem.
|
||||
</p>
|
||||
<p>
|
||||
Nunc lacinia varius justo, at lacinia felis ultricies vel. Proin
|
||||
vestibulum vehicula eleifend. Ut vitae risus eros. Pellentesque
|
||||
habitant morbi tristique senectus et netus et malesuada fames ac
|
||||
turpis egestas. In hac habitasse platea dictumst. Vivamus magna
|
||||
libero, blandit vitae hendrerit porta, dapibus eget eros. Nunc
|
||||
turpis felis, facilisis eu vestibulum sed, porta a ipsum. Vivamus
|
||||
est velit, molestie sed molestie quis, tincidunt a diam. Quisque et
|
||||
neque a ante fermentum tempus in at nunc. Nunc sit amet egestas
|
||||
nisi.
|
||||
</p>
|
||||
<p>
|
||||
Proin in lacus dolor, sit amet molestie quam. Morbi nisi turpis,
|
||||
pharetra at consequat tristique, convallis nec turpis. Vestibulum
|
||||
sit amet magna vitae sem bibendum tincidunt. Maecenas quis tortor
|
||||
eget velit mollis tempor vel a nisl. Vivamus posuere tristique
|
||||
ante, cursus rhoncus tortor malesuada eu. Praesent faucibus viverra
|
||||
orci ac porttitor. Maecenas dui purus, aliquam sed aliquam nec,
|
||||
dignissim vitae libero. Nunc at mauris et ante accumsan
|
||||
pellentesque. In placerat pretium suscipit. Phasellus tellus est,
|
||||
venenatis eu consectetur non, vehicula vel metus. Curabitur
|
||||
venenatis sem fringilla ante elementum eget faucibus nulla tempus.
|
||||
Aenean convallis sapien et dolor lobortis interdum. Phasellus odio
|
||||
risus, sagittis ut elementum ut, porttitor non libero. Integer
|
||||
fringilla magna quis augue dapibus malesuada. Nulla consectetur
|
||||
nisi mi. Suspendisse faucibus lobortis ornare. Nunc venenatis
|
||||
tortor in urna pulvinar pulvinar. Sed et mi nec justo hendrerit
|
||||
cursus ac nec mauris. Morbi et ante a lorem iaculis rutrum vitae eu
|
||||
massa.
|
||||
</p>
|
||||
<h1>A Simple Multi-Page Article For Testing : Page 2</h1>
|
||||
<p>
|
||||
Nunc non blandit velit. Maecenas suscipit sem sed velit tristique
|
||||
facilisis. Quisque condimentum, nisi vitae dictum euismod, diam
|
||||
risus vehicula nibh, in scelerisque lorem risus et risus. Aliquam
|
||||
erat volutpat. Pellentesque habitant morbi tristique senectus et
|
||||
netus et malesuada fames ac turpis egestas. Donec blandit venenatis
|
||||
feugiat. Ut quis turpis ac urna consectetur sagittis. Vestibulum
|
||||
aliquet eros et orci placerat vitae tempus tellus pretium. Quisque
|
||||
rutrum sapien quis nibh facilisis quis posuere ipsum elementum. In
|
||||
ac pretium justo. Sed egestas luctus mollis. Donec rutrum leo a
|
||||
turpis facilisis commodo. Nam quis quam eget mi malesuada
|
||||
scelerisque. Pellentesque semper condimentum sagittis. Nam
|
||||
lobortis, tortor ut placerat viverra, ante felis vehicula sem,
|
||||
blandit ultricies purus urna eget elit. Pellentesque habitant morbi
|
||||
tristique senectus et netus et malesuada fames ac turpis egestas.
|
||||
Sed vel nulla sollicitudin dolor adipiscing dapibus aliquam vitae
|
||||
leo. Phasellus at turpis tempus lectus pellentesque faucibus.
|
||||
</p>
|
||||
<p>
|
||||
Quisque egestas congue metus quis semper. Integer in ornare nunc.
|
||||
Nunc in est eget risus pulvinar tincidunt. Nullam eu tempus tortor.
|
||||
Suspendisse potenti. Aliquam erat volutpat. Praesent sem leo,
|
||||
molestie a dignissim eget, aliquet sit amet est. Suspendisse sed
|
||||
libero in urna tincidunt viverra. Maecenas posuere risus non elit
|
||||
adipiscing a tristique nibh aliquet. Nullam varius risus vitae
|
||||
turpis lacinia pharetra bibendum magna aliquam. Nam consectetur
|
||||
mattis lectus, vitae hendrerit lectus iaculis ut. Curabitur commodo
|
||||
pharetra nibh mollis pulvinar. Nulla in metus dui, vitae ultrices
|
||||
nibh. Cum sociis natoque penatibus et magnis dis parturient montes,
|
||||
nascetur ridiculus mus. Cras sed condimentum mi. Morbi vitae velit
|
||||
in neque tincidunt imperdiet quis quis orci. Proin molestie, erat
|
||||
convallis vulputate consectetur, diam odio interdum arcu, non
|
||||
semper neque ante a dolor.
|
||||
</p>
|
||||
<h1>A Simple Multi-Page Article For Testing : Page 3</h1>
|
||||
<p>
|
||||
Nullam laoreet, nibh non faucibus dictum, tellus libero varius
|
||||
erat, lobortis varius est massa quis metus. Donec vitae justo
|
||||
lacus, nec convallis metus. Suspendisse potenti. Nunc et rutrum
|
||||
justo. Maecenas ultrices ipsum in magna fermentum eleifend. Fusce
|
||||
sagittis pretium aliquam. Vestibulum et gravida lorem. Sed turpis
|
||||
quam, placerat ac ultrices eu, tempor sit amet elit. Curabitur eu
|
||||
imperdiet velit. Quisque pharetra ornare nunc, a volutpat metus
|
||||
aliquam quis. Vivamus semper aliquam cursus. Nullam ac nibh nulla,
|
||||
luctus pharetra nunc. Etiam ut sapien sem. Fusce vehicula, sem sit
|
||||
amet viverra pretium, magna tortor suscipit nisi, id interdum lorem
|
||||
orci in tellus. Vivamus vel ipsum eros. Fusce porttitor convallis
|
||||
ultricies. Etiam in risus diam, viverra suscipit felis. Duis vitae
|
||||
imperdiet est.
|
||||
</p>
|
||||
<p>
|
||||
Nunc nunc magna, facilisis blandit venenatis ut, scelerisque ac
|
||||
tortor. Cras condimentum fermentum lectus ac convallis. Suspendisse
|
||||
cursus, lacus sit amet sodales molestie, dui erat varius velit, non
|
||||
tincidunt metus dui sed nulla. Aliquam lacus orci, convallis ut
|
||||
pellentesque ac, molestie et dolor. Ut pretium enim ut nunc auctor
|
||||
eget placerat magna luctus. Duis mollis ligula a orci ultrices in
|
||||
facilisis felis feugiat. Morbi eget odio eget erat pulvinar
|
||||
placerat sed nec erat. Duis dignissim, dolor a lacinia commodo,
|
||||
metus erat laoreet dui, in lacinia felis lacus vitae nulla. Fusce
|
||||
imperdiet condimentum volutpat. Vivamus ut lacus a eros cursus
|
||||
scelerisque non sit amet orci. Phasellus id quam odio. Nulla
|
||||
adipiscing venenatis lorem nec feugiat. Aenean sit amet nisl odio,
|
||||
tincidunt scelerisque nisl. Curabitur ut nisl a dui facilisis
|
||||
vulputate. Mauris eu elit et felis hendrerit blandit. Cras magna
|
||||
dolor, imperdiet eget rutrum tempus, euismod nec augue.
|
||||
</p>
|
||||
<p>
|
||||
Ut in sem sit amet felis scelerisque elementum. Suspendisse vitae
|
||||
neque magna, in laoreet felis. Aenean elit ligula, tempor in
|
||||
vestibulum ac, porttitor nec lacus. Aenean urna mi, dictum feugiat
|
||||
placerat eget, congue nec dolor. Etiam pellentesque dictum nulla id
|
||||
vulputate. Etiam sit amet vehicula purus. Integer quis mi nisl,
|
||||
gravida malesuada enim. Donec malesuada felis nisi. Etiam id magna
|
||||
a libero pulvinar ullamcorper in nec neque. Duis pulvinar massa nec
|
||||
magna scelerisque vitae vulputate ipsum luctus.
|
||||
</p>
|
||||
</div>
|
@ -0,0 +1,60 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<title>A Simple Multi-Page Article For Testing</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>A Simple Multi-Page Article For Testing</h1>
|
||||
<p>
|
||||
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla et
|
||||
laoreet ligula. Nulla facilisi. Morbi condimentum molestie enim in
|
||||
fermentum. Phasellus sit amet vehicula turpis. Sed eu dolor tortor,
|
||||
et accumsan purus. Aliquam velit nisl, facilisis quis suscipit in,
|
||||
porttitor at lorem. Ut adipiscing suscipit augue, id interdum arcu
|
||||
ultricies et. Etiam risus sapien, suscipit et ultricies vel,
|
||||
suscipit posuere velit. Proin est orci, sollicitudin at luctus
|
||||
feugiat, consectetur a justo. Etiam nec sem vel massa consectetur
|
||||
vulputate non interdum est. Donec sem dui, ultricies a adipiscing
|
||||
eu, placerat sed sem.
|
||||
</p>
|
||||
<p>
|
||||
Nunc lacinia varius justo, at lacinia felis ultricies vel. Proin
|
||||
vestibulum vehicula eleifend. Ut vitae risus eros. Pellentesque
|
||||
habitant morbi tristique senectus et netus et malesuada fames ac
|
||||
turpis egestas. In hac habitasse platea dictumst. Vivamus magna
|
||||
libero, blandit vitae hendrerit porta, dapibus eget eros. Nunc
|
||||
turpis felis, facilisis eu vestibulum sed, porta a ipsum. Vivamus
|
||||
est velit, molestie sed molestie quis, tincidunt a diam. Quisque et
|
||||
neque a ante fermentum tempus in at nunc. Nunc sit amet egestas
|
||||
nisi.
|
||||
</p>
|
||||
<p>
|
||||
Proin in lacus dolor, sit amet molestie quam. Morbi nisi turpis,
|
||||
pharetra at consequat tristique, convallis nec turpis. Vestibulum
|
||||
sit amet magna vitae sem bibendum tincidunt. Maecenas quis tortor
|
||||
eget velit mollis tempor vel a nisl. Vivamus posuere tristique
|
||||
ante, cursus rhoncus tortor malesuada eu. Praesent faucibus viverra
|
||||
orci ac porttitor. Maecenas dui purus, aliquam sed aliquam nec,
|
||||
dignissim vitae libero. Nunc at mauris et ante accumsan
|
||||
pellentesque. In placerat pretium suscipit. Phasellus tellus est,
|
||||
venenatis eu consectetur non, vehicula vel metus. Curabitur
|
||||
venenatis sem fringilla ante elementum eget faucibus nulla tempus.
|
||||
Aenean convallis sapien et dolor lobortis interdum. Phasellus odio
|
||||
risus, sagittis ut elementum ut, porttitor non libero. Integer
|
||||
fringilla magna quis augue dapibus malesuada. Nulla consectetur
|
||||
nisi mi. Suspendisse faucibus lobortis ornare. Nunc venenatis
|
||||
tortor in urna pulvinar pulvinar. Sed et mi nec justo hendrerit
|
||||
cursus ac nec mauris. Morbi et ante a lorem iaculis rutrum vitae eu
|
||||
massa.
|
||||
</p>
|
||||
<ul id="pageNumbers">
|
||||
<li> 1 </li>
|
||||
<li>
|
||||
<a title="Page 2" href="/article.html?pagewanted=2">2</a>
|
||||
</li>
|
||||
<li>
|
||||
<a title="Page 3" href="/article.html?pagewanted=3">3</a>
|
||||
</li>
|
||||
</ul>
|
||||
</body>
|
||||
</html>
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,11 @@
|
||||
<div id="article"><div class="comment-content" id="comment-content-4e141229cadcbbb33f050000">
|
||||
|
||||
|
||||
<p class="comment-text">
|
||||
Yep, you gotta love that almost 90% market share failure. Like I said before, if that's failure than sign me up for some of that. I'm pretty sure the good people over at Apple, Google, etc. would like to be signed up for some of that failure too.<br/><br/>
|
||||
For the, "If this, if that, (insert scenario)" people, enjoy your new OS and whatever other new software you may choose to use. However, don't be surprised when those metro ui interface imitations start to land on those products too. Did you really think that static grid-icons on a screen was going to last forever? I think 20+ years is enough, it's time for new innovation in design and don't be surprised when the copycats jump on board. That's the way the industry works. One group comes up with a new design or concept and the others tend to follow suit and you don't have to be a market leader to get that following. Just ask the Opera/Chrome developers. That's just one of many, many examples that could be pointed out. The metro ui is a very suitable design for the touch screen world that we're migrating to. Sure, there will be changes and enhancements as time goes on and everyone will put their own spin on it, but I'd get used to similar offerings from MSFT's competitors if I were you.<br/><br/>
|
||||
Also, for those who like to comment, but seem to have little info about what's expected in things like Windows 8, let me fill you in a bit. The info. out right now is that Windows 8 will let you choose to use the new ui or to use the more, "Windows past" icon ui. I think anyone with some modicum of common sense can see how that would be a wise move from MSFT. For instance: The metro ui may not appeal to the corporate world as much as the consumer world. Plus, it give long-time Window's users the option to stick with what they know, but still gain the newest features and security measures that new OS's tend to bring. So, if your going to use another product, but all means, have fun with it, but don't try to justify it to yourself with reasons that are unlikely to exist. Just say you want to move on and anyone else can respect that, but when you seem to have little knowledge of what your options will be, it just makes you look like the typical sheep some people can be.<br/><br/>
|
||||
Personally, I love the new direction MSFT is going in and for the first time in years, they seem to be thinking more and more consumer friendly. That's not an easy task for a company who has to appeal to business the way MSFT does and I commend the effort. Believe me, or don't, but Apple, Google and any other group would suffer the same balancing act if they dominated the corporate world the way Microsoft does. Corporate and consumers are very different beasts and it's not always easy to appeal to both, yet Microsoft has kept a large following in both sectors and anyone who doesn't see the skill it takes to do that, has a lot to learn my friends. </p>
|
||||
|
||||
</div>
|
||||
</div>
|
@ -0,0 +1,3 @@
|
||||
test_description: businessinsider article
|
||||
notes: missed the article completely; got a long comment instead
|
||||
url: http://www.businessinsider.com/where-windows-8-came-from-microsoft-ui-ideas-that-never-took-off-2011-7
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -0,0 +1,2 @@
|
||||
test_description: cnet article
|
||||
url: http://howto.cnet.com/8301-11310_39-20078249-285/best-free-alternatives-to-top-selling-software/?tag=epicStories
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -0,0 +1,2 @@
|
||||
test_description: deadspin article
|
||||
url: http://deadspin.com/5820463/would-you-kill-a-stranger-to-save-football
|
@ -0,0 +1,48 @@
|
||||
<div id="article">
|
||||
<div id="page-1" class="article-page">
|
||||
<p>
|
||||
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla et
|
||||
laoreet ligula. Nulla facilisi. Morbi condimentum molestie enim in
|
||||
fermentum. Phasellus sit amet vehicula turpis. Sed eu dolor tortor,
|
||||
et accumsan purus. Aliquam velit nisl, facilisis quis suscipit in,
|
||||
porttitor at lorem. Ut adipiscing suscipit augue, id interdum arcu
|
||||
ultricies et. Etiam risus sapien, suscipit et ultricies vel,
|
||||
suscipit posuere velit. Proin est orci, sollicitudin at luctus
|
||||
feugiat, consectetur a justo. Etiam nec sem vel massa consectetur
|
||||
vulputate non interdum est. Donec sem dui, ultricies a adipiscing
|
||||
eu, placerat sed sem.
|
||||
</p>
|
||||
<p>
|
||||
Nunc lacinia varius justo, at lacinia felis ultricies vel. Proin
|
||||
vestibulum vehicula eleifend. Ut vitae risus eros. Pellentesque
|
||||
habitant morbi tristique senectus et netus et malesuada fames ac
|
||||
turpis egestas. In hac habitasse platea dictumst. Vivamus magna
|
||||
libero, blandit vitae hendrerit porta, dapibus eget eros. Nunc
|
||||
turpis felis, facilisis eu vestibulum sed, porta a ipsum. Vivamus
|
||||
est velit, molestie sed molestie quis, tincidunt a diam. Quisque et
|
||||
neque a ante fermentum tempus in at nunc. Nunc sit amet egestas
|
||||
nisi.
|
||||
</p>
|
||||
</div>
|
||||
<div id="page-2" class="article-page">
|
||||
<p>
|
||||
Proin in lacus dolor, sit amet molestie quam. Morbi nisi turpis,
|
||||
pharetra at consequat tristique, convallis nec turpis. Vestibulum
|
||||
sit amet magna vitae sem bibendum tincidunt. Maecenas quis tortor
|
||||
eget velit mollis tempor vel a nisl. Vivamus posuere tristique
|
||||
ante, cursus rhoncus tortor malesuada eu. Praesent faucibus viverra
|
||||
orci ac porttitor. Maecenas dui purus, aliquam sed aliquam nec,
|
||||
dignissim vitae libero. Nunc at mauris et ante accumsan
|
||||
pellentesque. In placerat pretium suscipit. Phasellus tellus est,
|
||||
venenatis eu consectetur non, vehicula vel metus. Curabitur
|
||||
venenatis sem fringilla ante elementum eget faucibus nulla tempus.
|
||||
Aenean convallis sapien et dolor lobortis interdum. Phasellus odio
|
||||
risus, sagittis ut elementum ut, porttitor non libero. Integer
|
||||
fringilla magna quis augue dapibus malesuada. Nulla consectetur
|
||||
nisi mi. Suspendisse faucibus lobortis ornare. Nunc venenatis
|
||||
tortor in urna pulvinar pulvinar. Sed et mi nec justo hendrerit
|
||||
cursus ac nec mauris. Morbi et ante a lorem iaculis rutrum vitae eu
|
||||
massa.
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
@ -0,0 +1,25 @@
|
||||
<div id="page-1" class="article-page">
|
||||
<p>
|
||||
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla et
|
||||
laoreet ligula. Nulla facilisi. Morbi condimentum molestie enim in
|
||||
fermentum. Phasellus sit amet vehicula turpis. Sed eu dolor tortor,
|
||||
et accumsan purus. Aliquam velit nisl, facilisis quis suscipit in,
|
||||
porttitor at lorem. Ut adipiscing suscipit augue, id interdum arcu
|
||||
ultricies et. Etiam risus sapien, suscipit et ultricies vel,
|
||||
suscipit posuere velit. Proin est orci, sollicitudin at luctus
|
||||
feugiat, consectetur a justo. Etiam nec sem vel massa consectetur
|
||||
vulputate non interdum est. Donec sem dui, ultricies a adipiscing
|
||||
eu, placerat sed sem.
|
||||
</p>
|
||||
<p>
|
||||
Nunc lacinia varius justo, at lacinia felis ultricies vel. Proin
|
||||
vestibulum vehicula eleifend. Ut vitae risus eros. Pellentesque
|
||||
habitant morbi tristique senectus et netus et malesuada fames ac
|
||||
turpis egestas. In hac habitasse platea dictumst. Vivamus magna
|
||||
libero, blandit vitae hendrerit porta, dapibus eget eros. Nunc
|
||||
turpis felis, facilisis eu vestibulum sed, porta a ipsum. Vivamus
|
||||
est velit, molestie sed molestie quis, tincidunt a diam. Quisque et
|
||||
neque a ante fermentum tempus in at nunc. Nunc sit amet egestas
|
||||
nisi.
|
||||
</p>
|
||||
</div>
|
@ -0,0 +1,20 @@
|
||||
<div id="page-3" class="article-page">
|
||||
<p>
|
||||
Nunc non blandit velit. Maecenas suscipit sem sed velit tristique
|
||||
facilisis. Quisque condimentum, nisi vitae dictum euismod, diam risus
|
||||
vehicula nibh, in scelerisque lorem risus et risus. Aliquam erat
|
||||
volutpat. Pellentesque habitant morbi tristique senectus et netus et
|
||||
malesuada fames ac turpis egestas. Donec blandit venenatis feugiat. Ut
|
||||
quis turpis ac urna consectetur sagittis. Vestibulum aliquet eros et
|
||||
orci placerat vitae tempus tellus pretium. Quisque rutrum sapien quis
|
||||
nibh facilisis quis posuere ipsum elementum. In ac pretium justo. Sed
|
||||
egestas luctus mollis. Donec rutrum leo a turpis facilisis commodo. Nam
|
||||
quis quam eget mi malesuada scelerisque. Pellentesque semper
|
||||
condimentum sagittis. Nam lobortis, tortor ut placerat viverra, ante
|
||||
felis vehicula sem, blandit ultricies purus urna eget elit.
|
||||
Pellentesque habitant morbi tristique senectus et netus et malesuada
|
||||
fames ac turpis egestas. Sed vel nulla sollicitudin dolor adipiscing
|
||||
dapibus aliquam vitae leo. Phasellus at turpis tempus lectus
|
||||
pellentesque faucibus.
|
||||
</p>
|
||||
</div>
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,31 @@
|
||||
<div id="article"><div class="mod-article-title">
|
||||
<div class="datehead"><span class="page-actions">
|
||||
<p id="fb-root"/><p class="date"><span>Updated: </span>July 12, 2011, 4:52 PM ET</p>
|
||||
</span></div>
|
||||
<p class="headline">
|
||||
</p><h1 class="h2">Roger Clemens' defense sets strategy</h1>
|
||||
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
<div><p>
|
||||
WASHINGTON -- <a href="http://espn.go.com/mlb/player/_/id/1427/roger-clemens">Roger Clemens</a>' attorney revealed Tuesday that the ex-baseball star plans to begin his defense against charges of lying to Congress by questioning if the lawmakers' investigation into whether he used performance-enhancing drugs was proper.</p><p>Clemens attorney Michael Attanasio said in court that the hearing the House Oversight and Government Reform Committee held in February 2008 had nothing to do with Congress' responsibility for legislation. He said the hearing was only concerned with airing a "credibility contest" between Clemens and his longtime trainer, Brian McNamee, who said he injected the pitcher with steroids and human growth hormone.</p><p/><div class="mod-container mod-inline content-box mod-podcast floatright mod-no-header-footer">
|
||||
<div class="mod-content"><h4>Mike and Mike in the Morning</h4><p class="podcast-player"/>
|
||||
<p>ESPN legal analyst Roger Cossack explains what is going on with the Roger Clemens trial.</p>
|
||||
<p class="footer clear"><a href="http://espn.go.com/espnradio/podcast/"> More Podcasts »</a></p></div></div>
|
||||
<p>Clemens denied those allegations and has been charged with perjury, false statements and obstruction of Congress. The obstruction count charges Clemens with making 15 false or misleading statements to the committee, including his repeated denials he didn't take performance-enhancing drugs during his 24-season career and even whether he attended a 1998 pool party at then-<a href="http://espn.go.com/mlb/team/_/name/tor/toronto-blue-jays">Toronto Blue Jays</a> teammate Jose Canseco's home in Miami.</p><p>McNamee says he saw Clemens and admitted steroids user Canseco talking at the party with another man and that after they returned to Canada, Clemens asked McNamee to inject him with steroids for the first time. </p><p>
|
||||
Clemens and Canseco say Clemens was never at the party but was golfing at the time. Attanasio said that dispute suggests how improper the whole inquiry was and that jurors should be able to determine whether a "he said, he said debate" between Clemens and McNamee was a legitimate congressional concern.</p><p>"We're going to have a mini-trial on whether Roger Clemens went swimming," Attanasio said. "We're going to have a trial in U.S. District Court, Congress is going to have a hearing on these things? That's our point."</p><p>Assistant U.S. attorney Daniel Butler responded that the committee has responsibility for oversight that is broad and goes beyond legislation. He said steroids in baseball is a drug matter and pointed out that a 2005 hearing into the issue led to legislation to regulate steroids and triggered Major League Baseball to commission a report by former Sen. George Mitchell into the extent of the problem in the league.</p><p/><div class="mod-container mod-no-footer mod-inline content-box floatright mod-no-header-footer">
|
||||
<div class="mod-content"><h4>Follow the trial</h4>
|
||||
<img class="io-img" src="http://a.espncdn.com/photo/2010/0116/quinn_tj_m.jpg" border="0"/><p>ESPN's T.J. Quinn will provide live coverage from the courtroom during the Clemens trial. Follow along with our up-to-the-minute <a href="http://twitter.com/#!/TJQuinnESPN" target="_blank"><b>Twitter coverage</b></a>.<br/>
|
||||
•  <b><a href="http://espn.go.com/photo/preview/!pdfs/espn_voir_dire_questions.pdf">Voir dire questions</a></b>
|
||||
</p></div>
|
||||
</div><p>The Mitchell report was released in December 2007 and named Clemens and 85 other current and former ballplayers as using drugs. Clemens denied the allegations and Butler pointed out that leaders of the House committee said they needed to investigate Clemens' denials to determine what weight to give the Mitchell report and its recommendations.</p><p>Attanasio argued that if the committee's purpose was to come full circle on the Mitchell report, it had done so with a January 2008 hearing featuring testimony by Mitchell, baseball commissioner Bud Selig and former players union director Donald Fehr.</p><p>"That ship had left. That work was done. And now it becomes a question between Mr. Clemens and Mr. McNamee," Attanasio said.</p><p>But U.S. District Judge Reggie Walton said if "one of the icons of baseball" was taking exception to the Mitchell report, "it seems to me that Congress has the authority to hold hearings to determine which view is correct."</p><p>Attanasio said the issue will be addressed in testimony from the first two witnesses prosecutors plan to call after opening arguments Wednesday morning. He said the first will be retired House Parliamentarian Charles Johnson, followed by Phil Barnett, who was chief counsel for the committee at the time it investigated Clemens.</p><p>The dispute over the committee's proper role came as Walton considered what preliminary instructions to give the jury, which was seated Tuesday afternoon after 3½ days of screening potential members.</p><p>The jury of 10 women and two men includes a woman whose cousin, former outfielder Al Bumbry, was a coach for the <a href="http://espn.go.com/mlb/team/_/name/bos/boston-red-sox">Boston Red Sox</a> when Clemens played for the team. 
Another woman on the jury said she believes <a href="http://espn.go.com/nfl/team/_/name/phi/philadelphia-eagles">Philadelphia Eagles</a> quarterback <a href="http://sports.espn.go.com/nfl/players/profile?playerId=2549">Michael Vick</a> was "done wrong" in his criminal conviction in connection with dogfighting.</p><p>Four other people were seated as alternate jurors in case any of the 12 can't serve.</p><p>Prosecutors and Clemens' defense team removed 20 people from the pool of 36 jurors, offering no public explanation for their decisions.</p><p>Clemens' attorney pressed potential jurors not to hold it against Clemens if he chooses not to testify, his strongest hint yet that the ex-pitcher might not take the stand.</p><p>Walton also said he was upset to read a New York Daily News item that members of Clemens' family have been criticizing McNamee and other government witnesses on Twitter and elsewhere online. The judge has a gag order on parties involved in the case, but he said he doesn't have any authority over anyone who isn't before him and hopes that those that are were not involved. </p><p>Clemens' attorney Rusty Hardin said he would look into it but that it's been "extremely difficult" for Clemens' family to see harsh criticisms of the baseball star online and in the media and not be able to respond.</p><p><i>Information from The Associated Press was used in this report.</i>
|
||||
</p>
|
||||
|
||||
|
||||
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
@ -0,0 +1,2 @@
|
||||
test_description: espn article
|
||||
url: http://sports.espn.go.com/mlb/news/story?id=6760720
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -0,0 +1,3 @@
|
||||
test_description: mit news article
|
||||
notes: links are broken out into paragraph divs
|
||||
url: http://web.mit.edu/newsoffice/2011/compare-recommendation-systems-0708.html
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -0,0 +1,2 @@
|
||||
test_description: nytimes article
|
||||
url: http://thecaucus.blogs.nytimes.com/2011/07/12/mcconnell-proposal-gives-obama-power-to-increase-debt-limit/?hp
|
@ -0,0 +1,134 @@
|
||||
<div id="article"><div id="page-1" class="article-page"><div class="articleSpanImage"><img src="http://graphics8.nytimes.com/images/2011/07/10/magazine/10bad_span/10bad_span-articleLarge.jpg" alt="" border="0"/><p class="credit">Robert Yager for The New York Times</p>
|
||||
<p class="caption"><strong/>Gilligan on the set with the actors Bryan Cranston and Aaron Paul. </p>
|
||||
</div>
|
||||
<div class="articleBody">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<nyt_text><nyt_correction_top/><p>
|
||||
In the first three seasons of the AMC series “Breaking Bad,” Aaron Paul — or rather, his meth-dealing character, Jesse Pinkman — has been slapped, mauled and beaten purple by, respectively, a hit man, a sociopath and a federal drug-enforcement agent. If he were a piñata, the candy would have poured out of this guy long ago. And apparently there is little mercy for Paul in the new season on the way. For there Paul was, one day in late May, standing on Tijeras Avenue in downtown Albuquerque, being tasered by a brawny man in sunglasses. </p>
|
||||
</nyt_text></div>
|
||||
<div class="articleInline runaroundLeft"><p class="articleInline runaroundLeft"/>
|
||||
|
||||
<div class="inlineImage module">
|
||||
<div class="image">
|
||||
|
||||
<a href="http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html">
|
||||
<img src="http://graphics8.nytimes.com/images/2011/07/10/magazine/10bad1/mag-10Bad-t_CA1-articleInline.jpg" alt=""/></a>
|
||||
</div>
|
||||
<h6 class="credit">Robert Yager for The New York Times</h6>
|
||||
<p class="caption">The goal, Gilligan says, was to turn "Mr. Chips into Scarface." </p>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<div class="articleBody">
|
||||
<p>
|
||||
The street had been blocked off, and a crew of dozens waited as the actors rehearsed the assault with Vince Gilligan, the creator, head writer and show runner, who was also directing the episode. </p><p>
|
||||
“Maybe we play this moment just a little longer, so we know for sure he got zapped,” Gilligan said. “Otherwise, Jesse would fight back more.” </p><p>
|
||||
“Yeah, I like that,” Paul said. </p><p>
|
||||
“And let’s go back to the brass-knuckle-looking taser,” Gilligan said. </p><p>
|
||||
“Fly in the brass-knuckle taser!” a nearby crew member shouted into a walkie-talkie. </p><p>
|
||||
As the cameras were moved into place, Gilligan, who is 44 and speaks in a lyrical Southern drawl, reminisced fondly about some of the torments he has inflicted on Jesse Pinkman. One of the most gruesome was a plunge through the roof of a Port-a-Potty in a junkyard in Season 2. </p><p>
|
||||
“The original version was that he was going to get bit by a guard dog,” Gilligan said, leaning up against a rail and squinting against the New Mexico sun. “But the guard dog would have cost us $25,000, and we didn’t have the money. So we came up with the $5,000 outhouse gag. Which is quite a bit more memorable.” </p><p>
|
||||
Mordantly amusing ordeals are a specialty on “Breaking Bad,” which begins its fourth season on July 17. Credit the show’s forbiddingly grim premise: A 50-year-old high-school chemistry teacher named Walter White (played by Bryan Cranston) finds out he has terminal lung cancer and starts making crystal meth, hoping to leave behind a nest egg for his son and pregnant wife. Walter, it emerges, is a chemistry wizard, and after teaming up with Pinkman, a burnout student he once flunked, the pair drive a ramshackle R.V. into the desert and confect the purest, most coveted meth that local dealers have ever known. With the death penalty of his diagnosis looming, Walt wakes from the slumber of an unfulfilling life, evolving from feckless drudge to reluctant part-time criminal, then gradually to something worse. </p><p>
|
||||
In its first season, “Breaking Bad” seemed like the story of the nuttiest midlife crisis ever, told with elements that felt vaguely familiar. The structure — felonious dad copes with stress of work and family; complications ensue — owed an obvious debt to “The Sopranos,” and the collision of regular people and colorfully violent thugs nodded to Tarantino. The story and setting were an update of the spaghetti Western, minus the cowboys and set in the present. </p><p>
|
||||
But it was soon clear that “Breaking Bad” was something much more satisfying and complex: a revolutionary take on the serial drama. What sets the show apart from its small-screen peers is a subtle metaphysical layer all its own. As Walter inches toward damnation, Gilligan and his writers have posed some large questions about good and evil, questions with implications for every kind of malefactor you can imagine, from Ponzi schemers to terrorists. Questions like: Do we live in a world where terrible people go unpunished for their misdeeds? Or do the wicked ultimately suffer for their sins? </p><p>
|
||||
Gilligan has the nerve to provide his own hopeful answer. “Breaking Bad” takes place in a universe where nobody gets away with anything and karma is the great uncredited player in the cast. This moral dimension might explain why “Breaking Bad” has yet to achieve pop cultural breakthrough status, at least on the scale of other cable hits set in decidedly amoral universes, like “True Blood” or “Mad Men,” AMC’s far-more-buzzed-about series that takes place in an ad agency in the ’60s. The total audience for “Breaking Bad” is only slightly smaller than that of “Mad Men” — 19.5 million versus 22.4 million cumulative viewers in their respective third seasons — but the top three markets for “Breaking Bad” are Albuquerque/Santa Fe, Kansas City and Memphis; neither New York nor Los Angeles are in its top 10. The show, in other words, doesn’t play on the coasts. It gets chatter, just not among what has long been considered the chattering class. </p><nyt_author_id><div class="authorIdentification">
|
||||
<p>David Segal is a reporter for the Business section of The New York Times. His most recent article in the magazine was about the New York Cosmos. Editor: Adam Sternbergh (a.sternbergh-MagGroup@nytimes.com)</p> </div>
|
||||
</nyt_author_id><nyt_correction_bottom><p class="articleCorrection">
|
||||
</p>
|
||||
</nyt_correction_bottom><nyt_update_bottom/></div> </div><div id="page-2" class="article-page"><div class="articleBody">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<nyt_text><nyt_correction_top/><p><font size="-1">(Page 2 of 5)</font></p><p/><p/><p>
|
||||
Which might make Gilligan TV’s first true red-state auteur. His characters lead middle-American lives in a middle-American place, and they are beset with middle-American problems. They speak like middle Americans too, and they inhabit a realm of moral ambiguities that’s overseen by a man with both a wicked sense of humor and a highly refined sense of right and wrong. </p>
|
||||
</nyt_text></div>
|
||||
|
||||
<div class="articleBody">
|
||||
<p>
|
||||
“If there’s a larger lesson to ‘Breaking Bad,’ it’s that actions have consequences,” Gilligan said during lunch one day in his trailer. “If religion is a reaction of man, and nothing more, it seems to me that it represents a human desire for wrongdoers to be punished. I hate the idea of Idi Amin living in Saudi Arabia for the last 25 years of his life. That galls me to no end.” </p><p>
|
||||
He paused for a moment and speared a few tater tots in a white plastic-foam tray perched on his lap. </p><p>
|
||||
“I feel some sort of need for biblical atonement, or justice, or something,” he said between chews. “I like to believe there is some comeuppance, that karma kicks in at some point, even if it takes years or decades to happen,” he went on. “My girlfriend says this great thing that’s become my philosophy as well. ‘I want to believe there’s a heaven. But I can’t not believe there’s a hell.’ ” </p><p>
|
||||
‘Breaking Bad” was born out of a conversation in 2004 between Gilligan and a friend named Thomas Schnauz, who is now a writer on the show. Schnauz had just read a story about a man cooking meth in an apartment complex, which had sickened kids in apartments above. Saddam Hussein’s putative mobile chemical-weapons labs came up in the conversation, too. </p><p>
|
||||
“Neither of us were working,” Schnauz says, “and we were like two 70-year-old men who like to complain about the world. And somehow we spun off into the idea of driving around in a mobile lab, cooking meth. It was a joke and not something I would have ever thought about again. But a couple days later Vince called back and said: ‘Remember we were talking about that mobile lab and meth? Do you mind if I run with that?’ ” </p><p>
|
||||
A show about a very smart middle-aged guy who hadn’t quite achieved his dreams had a faintly autobiographical whiff for Gilligan at the time. He grew up in Farmville, Va., a town of roughly 6,000 people, not far from Appomattox, the site of the South’s surrender in the Civil War. His father was an insurance claims adjuster, and his mother was a grade-school teacher who had a brief career as a wing walker. “Vince was an acolyte in the Catholic Church,” Gail Gilligan says, though she notes that he also played Dungeons and Dragons. “There was certainly a lot of evil in that game, but it never seemed to affect him adversely.” </p><p>
|
||||
Gilligan earned a partial scholarship to attend New York University’s film program, where his instructors included Jesse Kornbluth, who remembers a polite kid who was so good at drawing bent, violent characters that Kornbluth initially pegged him as the “go postal” type. “In the end, he turned us all into his audience,” Kornbluth said to me. “We were all just mesmerized. Attendance was unnaturally high on days when he was reading his scenes.” </p><p>
|
||||
After graduating, Gilligan won a screenplay contest in 1989, and one of the judges, a producer named Mark Johnson (now an executive producer on “Breaking Bad”), helped him find an agent and sell scripts to Hollywood. Two of them, “Home Fries,” starring Drew Barrymore, and “Wilder Napalm,” starring Debra Winger and Dennis Quaid, were turned into films. It was a promising start. Gilligan bought a house outside Richmond, assuming that he would keep lobbing movie scripts to Los Angeles, which would keep lobbing money back. That did not happen. By 1994, the money dried up and he lost his writer’s guild health insurance. That year, his agent got Gilligan a meeting with Chris Carter, the creator of “The X-Files.” </p><nyt_author_id><div class="authorIdentification">
|
||||
<p>David Segal is a reporter for the Business section of The New York Times. His most recent article in the magazine was about the New York Cosmos. Editor: Adam Sternbergh (a.sternbergh-MagGroup@nytimes.com)</p> </div>
|
||||
</nyt_author_id><nyt_correction_bottom><p class="articleCorrection">
|
||||
</p>
|
||||
</nyt_correction_bottom><nyt_update_bottom/></div> </div><div id="page-3" class="article-page"><div class="articleBody">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<nyt_text><nyt_correction_top/><p><font size="-1">(Page 3 of 5)</font></p><p/><p/><p>
|
||||
“I pitched them an idea about a guy whose shadow comes to life and sucks people in like a black hole and kills them,” he recalls. “They bought that as a freelance episode, and then I moved to California.” He spent seven years as a writer and producer on “The X-Files,” his first full-time TV job. The gig died with the show in 2002, and what followed was another succession of false starts and disappointments. There was “Lone Gunman,” a show for Fox, which expired after one year, and one for CBS called “Battle Creek,” which failed to ignite. </p>
|
||||
</nyt_text></div>
|
||||
|
||||
<div class="articleBody">
|
||||
<p>
|
||||
“I’ve had two fallow periods in my life,” Gilligan said. The first one was after his two movies were made. “The second was the five years after ‘X-Files.’ Money wasn’t as big an issue as it was the first time, but as a writer you always want to be working on something that has a hope in hell of being made.” </p><p>
|
||||
In its basic outline, “Breaking Bad” — the title is a Southern phrase for going wild — also seemed destined for rejection. Its concept sounded a lot like that of “Weeds,” Showtime’s suburban pot-dealer series. Plus, its lead character is given a diagnosis of cancer within the first 20 minutes, and the action centers on one of the most destructive (and unglamorous) drugs known to man. Not to mention that the show ditches Rule No. 1 of series TV: the personality of the main character must stay the same. </p><p>
|
||||
“Television is really good at protecting the franchise,” Gilligan said. “It’s good at keeping the Korean War going for 11 seasons, like ‘M*A*S*H.’ It’s good at keeping Marshal Dillon policing his little town for 20 years. By their very nature TV shows are open-ended. So I thought, Wouldn’t it be interesting to have a show that takes the protagonist and transforms him into the antagonist?” </p><p>
|
||||
That was the pitch to AMC executives in 2007. The network was searching for a second original series, to go along with “Mad Men,” which made its debut that year. The goal was to find something set in the present, so that AMC wasn’t pigeonholed as the home of period television. And management wanted a conceit that would skew male and complement the network’s library of antihero action movies, the kind that star Clint Eastwood and Charles Bronson. Sitting in his Manhattan office, Charlie Collier, the president of AMC, recalls his introduction to Gilligan’s work: “Our development team put the pilot script on my desk and said, ‘Just read this.’ ” </p><p>
|
||||
At the time that Gilligan conceived “Breaking Bad,” his past success, plus all the hackwork offers that could have kept him busy for years, fortified his sense that only a show built to his iconoclastic sensibility was worth doing. He wanted a show devoid of snappy banter (of the kind that Aaron Sorkin writes), and one that doesn’t flatter you for getting its winking references (as Matthew Weiner does in “Mad Men,” with his chain-smoking doctors and kids playing with dry-cleaning bags). And he wanted a leading man who would not only change over the course of the series but also suffer crushing reversals with lasting impact. </p><p>
|
||||
That is something new. The depravities of leading men in TV dramas traditionally don’t leave permanent scars. Don Draper of “Mad Men” is still pretty much the tippling rake he has been from the start, despite a flirtation or two with confession and reform. Tony Soprano tried, through therapy, to improve as a human being, but he didn’t get very far. Dr. House of “House” will always be a brilliant cuss. Walter White progresses from unassuming savant to opportunistic gangster — and as he does so, the show dares you to excuse him, or find a moral line that you deem a point of no return. </p><nyt_author_id><div class="authorIdentification">
|
||||
<p>David Segal is a reporter for the Business section of The New York Times. His most recent article in the magazine was about the New York Cosmos. Editor: Adam Sternbergh (a.sternbergh-MagGroup@nytimes.com)</p> </div>
|
||||
</nyt_author_id><nyt_correction_bottom><p class="articleCorrection">
|
||||
</p>
|
||||
</nyt_correction_bottom><nyt_update_bottom/></div> </div><div id="page-4" class="article-page"><div class="articleBody">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<nyt_text><nyt_correction_top/><p><font size="-1">(Page 4 of 5)</font></p><p/><p/><p>
|
||||
In 2007, if you needed an actor to dramatize so profound a transformation, Bryan Cranston would have seemed an unlikely choice. Before “Breaking Bad,” he was known as the dad in “Malcolm in the Middle,” a broadly comic role. When Gilligan told AMC executives that he wanted Cranston to play Walter, they initially were baffled. Then Gilligan explained that years earlier, he cast Cranston in an episode of “The X-Files.” “We had this villain, and we needed the audience to feel bad for him when he died,” Gilligan said. “Bryan alone was the only actor who could do that, who could pull off that trick. And it is a trick. I have no idea how he does it.” </p>
|
||||
</nyt_text></div>
|
||||
|
||||
<div class="articleBody">
|
||||
<p>
|
||||
Meeting Bryan Cranston only deepens the mystery. He is Walter’s opposite. The character is coiled and burdened, while Cranston in person is buoyant. Walter’s default facial expression is a rictus of angst, while Cranston’s is a mischievous smile. Cranston looks at least five years younger than the character, and his co-stars say, he often behaves like a 10-year-old. Aaron Paul described Cranston as “a kid trapped in a man’s body.” Anna Gunn, who plays Skyler, Walter’s wife, says that she has never seen an adult more amused by stuffing fruit down his pants. But Cranston’s performance as Walter White has made history, winning three Emmys in a row for outstanding lead in a drama series, the first actor to do so since Bill Cosby in “I Spy” in the mid-’60s. </p><p>
|
||||
“Physically, to create Walter White, I use my dad,” he said one night over dinner. “My dad is 87 years old. I’m not going to dodder, but Walter is always a little hunched over, never erect. The message to the audience is that the weight of the world is on this man’s shoulders.” </p><p>
|
||||
Cranston is from the total-commitment school of acting, and he once famously did a scene in “Malcolm in the Middle” while covered head to toe with bees. When Gilligan declined to fill in large holes in Walter’s back story, Cranston sat down and wrote out one of his own. On a handful of occasions, he has flagged lines in the script that felt false to him. Cranston reads each episode about a week in advance so that these bumps can be smoothed over before it’s time to start shooting. When he can’t resolve the issue with the writer on the set that week, a call is placed to Gilligan, who is usually in the writer’s room in Burbank. “It’s up to them, but I won’t bend unless I’m convinced it’s the right thing to do,” Cranston says. “Convince me and I’ll do it. I have a theory — our job isn’t to lie to the audience, our job is to find the truth in the character. If we lie, we’re giving the audience a little pinch of poison. They won’t even know they ingested it. But if you lie again and again and again, all of a sudden, your audience is going, ‘This isn’t working for me.’ They just feel sick, and they turn you off.” </p><p>
|
||||
Cranston has found many nuanced ways to enact Walt’s many miseries, the most wrenching of which was the loss of his wife’s love. There is a long history in art of foisting suffering on characters who sin, but it seems to have fallen out of favor. As awful as Tony Soprano was, it’s left purposefully unclear at the end of “The Sopranos” whether he paid the ultimate price. Or consider the “simple chaos” take on the universe as represented in movies by Woody Allen, a director whom Gilligan admires. “And Woody Allen may be right,” Gilligan says. “I’m pretty much agnostic at this point in my life. But I find <a href="http://topics.nytimes.com/top/reference/timestopics/subjects/a/atheism/index.html?inline=nyt-classifier" title="More articles about atheism." class="meta-classifier">atheism</a> just as hard to get my head around as I find fundamental Christianity. Because if there is no such thing as cosmic justice, what is the point of being good? That’s the one thing that no one has ever explained to me. Why shouldn’t I go rob a bank, especially if I’m smart enough to get away with it? What’s stopping me?” </p><p>
|
||||
On a cloudless day in May, five members of the cast and a scrum of crew members were shooting in what is referred to as “the Schrader house,” the home of Walter’s in-laws, Hank and Marie Schrader. It’s rented from a local couple and sits in the shadows of the type of steep, reddish mountains that Wile E. Coyote tumbled off chasing the Road Runner. Gilligan was the ringmaster of this circus, standing on the balcony and sipping a jumbo-size and constantly refilled McDonald’s container of unsweetened iced tea, which he calls brain juice. He was wearing what turned out to be his first pair of designer jeans. They were acquired during a recent shopping spree urged upon him by his girlfriend of 20 years, Holly Rice. His go-to pants have been $12 Wal-Mart jeans, he said, which is what he wore the following day. </p><p>
|
||||
He watched as a crew member put a series of sunglasses on the face of a 20ish Latino man with a nonspeaking background role. </p><p>
|
||||
“I like that one,” he said when the first pair of dark wraparounds were put on the actor’s face. </p><p>
|
||||
On went the second. “Not as good as the first,” Gilligan said. </p><p>
|
||||
Then the third. “Not as good as the first,” Gilligan repeated. </p><p>
|
||||
A fourth. “Let’s go with the first.” </p><p>
|
||||
This, it turns out, is an abbreviated version of a process that Gilligan goes through with virtually every article of clothing, every choice of color, every prop and every extra who appears in “Breaking Bad.” “You see this shirt?” said Dean Norris, who plays Hank Schrader, as he sat on the veranda between takes. He spoke in a stage whisper, out of the side of his mouth, like an inmate describing a warden who has gone insane. “Vince had to see five versions of it before he chose it. <em>Five different shades of a gray T-shirt.</em> That’s unique,” he said, heading into the house. “That’s beyond.” </p><nyt_author_id><div class="authorIdentification">
|
||||
<p>David Segal is a reporter for the Business section of The New York Times. His most recent article in the magazine was about the New York Cosmos. Editor: Adam Sternbergh (a.sternbergh-MagGroup@nytimes.com)</p> </div>
|
||||
</nyt_author_id><nyt_correction_bottom><p class="articleCorrection">
|
||||
</p>
|
||||
</nyt_correction_bottom><nyt_update_bottom/></div> </div><div id="page-5" class="article-page"><div class="articleBody">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<nyt_text><nyt_correction_top/><p><font size="-1">(Page 5 of 5)</font></p><p/><p/><p>
|
||||
Perfectionists often don’t play well with others, but Gilligan seems eager to accommodate everyone with an idea. It’s a running joke in the cast, the disconnect between Gilligan the person and Gilligan the writer. The former is sweet-tempered and polite; the latter strapped a character’s severed head to a tortoise, which was then rigged with explosives and blown up as D.E.A. agents swarmed around it. </p>
|
||||
</nyt_text></div>
|
||||
|
||||
<div class="articleBody">
|
||||
<p>
|
||||
During a break in the shooting, I asked Gilligan if, now four seasons into his show, he could explain the gulf between his manners and his material. </p><p>
|
||||
“I’m not the happiest person,” he said. “But I respect this crew and these actors. I try to be as cheerful as possible. I fake it pretty well.” </p><p>
|
||||
Well, a lot of people can fake cheerful. But how does such a benign-seeming person come up with such malign tales? Gilligan thought for a moment, then quoted Flaubert. “I’m not going to get this exactly right, but it’s something like, ‘You should be neat and orderly in your life so you can be violent and original in your work,’ and there’s something to that,” he said. “It’s fun to explore that darkness and that criminal behavior on the page, but I’m too timid to do it in real life.” </p><p>
|
||||
The pilot of the show opened, memorably, with just such a burst of darkness and violence: Walt driving that R.V. through a desert in a crazed dash, wearing nothing but tighty-whitey briefs and a gas mask. Two male bodies roll in a soup of liquid, broken beakers and cash in the cabin. Cut to three weeks earlier. Walt is a regular schlub, in an unremarkable house, on his way to a mundane job. Gilligan slyly signals his overarching theme when Walter stands before his class and tells his students, “Chemistry is . . . well, technically it’s the study of matter, but I prefer to see it as the study of <em>change</em>.” </p><p>
|
||||
When you give your lead character a terminal illness, usher him into the underworld and embroil him in ever bolder and more ambitious criminal plans, you create a man who is rushing toward the ultimate change — from being alive to being dead. Walter White is surely the most doomed character on television, meaning that, just as “Breaking Bad” is finally winning acclaim, the end of the series is in sight. Which is just fine with Gilligan. He can imagine a fifth season of “Breaking Bad,” but that’s it. </p><p>
|
||||
Driving to the set after lunch one day, he told me that Walter White had started off as a person he could imagine chatting with over a beer. </p><p>
|
||||
“Now he’s not quite at the point where I’d cross the street if I saw him coming,” he said, with a smile. “But I wouldn’t want to be stuck in an elevator with him too long.” Plotting Walt’s transgressions has proved wearying enough. “It’s hard to write a character that dark and morally ambiguous,” he said. “I’m going to miss the show when it’s over, but on some level, it’ll be a relief to not have Walt in my head anymore.” </p><nyt_author_id><div class="authorIdentification">
|
||||
<p>David Segal is a reporter for the Business section of The New York Times. His most recent article in the magazine was about the New York Cosmos. Editor: Adam Sternbergh (a.sternbergh-MagGroup@nytimes.com)</p> </div>
|
||||
</nyt_author_id><nyt_correction_bottom><p class="articleCorrection">
|
||||
</p>
|
||||
</nyt_correction_bottom><nyt_update_bottom/></div> </div></div>
|
@ -0,0 +1,9 @@
|
||||
test_description: multi-page article from nytimes
|
||||
notes: wrongly includes author identification from each page
|
||||
url: http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html
|
||||
url_map:
|
||||
http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=2&_r=1: nytimes-001-orig-2.html
|
||||
http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=2: nytimes-001-orig-2.html
|
||||
http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=3: nytimes-001-orig-3.html
|
||||
http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=4: nytimes-001-orig-4.html
|
||||
http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=5: nytimes-001-orig-5.html
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,6 @@
|
||||
<div id="article"><article><p>Put another way, Democrats reacted to the “grand bargain” proposed by President Obama and House Speaker John Boehner by squawking, complaining and highlighting elements they didn’t like. This is known throughout the world as the way to begin a process of negotiation.</p><p>Republicans, by contrast, answered with a definitive “no” and then covered their ears. Given the looming Aug. 2 deadline for default if the debt ceiling is not raised, the proper term for this approach is blackmail.</p><p>Yet the “both sides are to blame” narrative somehow gained currency after <a href="http://www.washingtonpost.com/business/economy/boehner-abandons-efforts-to-reach-comprehensive-debt-reduction-deal/2011/07/09/gIQARUJ55H_story.html">Boehner announced Saturday</a> that House Republicans would not support any increase in revenue, period. A false equivalence was drawn between the absolute Republican rejection of “revenue-positive” tax reform and the less-than-absolute Democratic opposition to “benefit cuts” in Medicare and Social Security.</p><p>The bogus story line is that the radical right-wing base of the GOP and the radical left-wing base of the Democratic Party are equally to blame for sinking the deal. </p><p>Leave aside, for the moment, the fact that in the Obama-Boehner proposal, there would be roughly three dollars’ worth of budget cuts for every dollar of new revenue. Don’t pause to ask whether it makes sense to slash government spending when the economy is still sputtering out of the worst recession in decades. Instead, focus narrowly on the politics of the deal.</p><p>It is true that House Minority Leader Nancy Pelosi howled like a blindsided politician when she learned that entitlement programs were on the table. 
But her objections — and those of Democrats in general — are philosophical and tactical, not absolute.</p><p>Progressives understand that Medicare and Social Security are not sustainable on their current trajectories; in the long term, both must have their revenue and costs brought into balance. Pelosi’s position is that each program should be addressed with an eye toward sustainability — not as a part of a last-minute deal for a hike in the debt ceiling that covers us for two or three years.</p><p>It’s also true that Democrats believe they can win back a passel of House seats next year by highlighting the GOP plan to convert Medicare into a voucher program. They don’t want Republicans to be able to point and say, “See, the Democrats want to cut Medicare, too.”</p><p>There’s nothing in these Democratic objections, however, that couldn’t be creatively finessed. You can claim you haven’t actually “cut” a benefit, for example, if what you’ve done is restrained the rate at which its cost will grow. You can offset spending with new revenue, and you can do so in a way that gives low-income taxpayers a break. Democrats left the door open and these options could have been explored.</p><p>The story on the Republican side is entirely different. There are ways to finesse a “no new taxes” pledge, too. Instead of raising tax rates, you close loopholes in the name of reform; you add an enhancement here, a “user fee” there, and you can manage to get the revenue you need and still claim you haven’t voted to raise taxes.</p><p>But Republicans are taking the position that not a cent of new revenue can be raised, no matter the euphemism. Some Democrats, yes, are being scratchy and cantankerous. But Republicans are refusing to negotiate at all. 
That’s not the same thing.</p><p>I understand why President Obama, <a href="http://projects.washingtonpost.com/obama-speeches/speech/736/">in his news conference Monday</a>, chided “each side” for taking a “maximalist position.” For political and practical reasons, it’s advantageous for him to be seen as an honest broker.</p><p>Meanwhile, though, the clock ticks toward Aug. 2 and the possibility of a catastrophic default becomes more real. And no one should be confused about what the president confronts: On one side, grousing and grumbling. On the other, a brick wall. </p><p>
|
||||
|
||||
<i>
|
||||
<a href="http://live.washingtonpost.com/eugene-robinson-07-12-11.html">Eugene Robinson will be online</a> to chat with readers at 1 p.m. Eastern time Tuesday. <a href="http://live.washingtonpost.com/eugene-robinson-07-12-11.html">Submit your questions</a> before or during the discussion.</i>
|
||||
|
||||
</p></article></div>
|
@ -0,0 +1,2 @@
|
||||
test_description: washingtonpost.com op-ed
|
||||
url: http://www.washingtonpost.com/opinions/dont-blame-both-sides-for-debt-impasse/2011/07/11/gIQA0XDg9H_story.html?hpid=z1
|
@ -0,0 +1,2 @@
|
||||
*
|
||||
!.gitignore
|
@ -0,0 +1,55 @@
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from readability_lxml.readability import Document
|
||||
|
||||
|
||||
# Directory of sample documents, resolved relative to this test module.
SAMPLES = os.path.join(os.path.dirname(__file__), 'samples')


def load_sample(filename):
    """Helper to get the content out of the sample files.

    :param filename: name of a file located inside the ``SAMPLES`` directory.
    :returns: the complete content of that file as read by ``read()``.

    """
    # A context manager closes the handle deterministically instead of
    # leaving it open until the garbage collector gets around to it.
    with open(os.path.join(SAMPLES, filename)) as sample_file:
        return sample_file.read()
|
||||
|
||||
|
||||
class TestArticleOnly(unittest.TestCase):
    """The option to not get back a full html doc should work

    Given a full html document, the call can request just divs of processed
    content. In this way the developer can then wrap the article however they
    want in their own view or application.

    """

    def test_si_sample(self):
        """Using the si sample, load article with only opening body element"""
        sample = load_sample('si-game.sample.html')
        doc = Document(
            sample,
            url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
        res = doc.summary()
        self.assertEqual('<html><body id="page"><div><div class', res[0:37])

    def test_si_sample_html_partial(self):
        """Using the si sample, make sure we can get the article alone."""
        sample = load_sample('si-game.sample.html')
        doc = Document(
            sample,
            url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
        res = doc.summary(enclose_with_html_tag=False)
        self.assertEqual('<div id="page"><div class="', res[0:27])

    def test_si_sample_full_summary(self):
        """We should parse the doc and get a full summary with confidence"""
        sample = load_sample('si-game.sample.html')
        doc = Document(
            sample,
            url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
        res = doc.summary_with_metadata(enclose_with_html_tag=False)

        # Each message names the attribute actually being checked so a
        # failure points at the right field.
        self.assertTrue(hasattr(res, 'html'),
                        'res should have an html attrib')
        self.assertTrue(hasattr(res, 'confidence'),
                        'res should have a confidence attrib')
        self.assertTrue(hasattr(res, 'title'),
                        'res should have a title attrib')
        self.assertTrue(hasattr(res, 'short_title'),
                        'res should have a short_title attrib')

        self.assertEqual('<div id="page"><div class="', res.html[0:27])
        # assertGreater reports both operands on failure, unlike assertTrue.
        self.assertGreater(
            res.confidence, 50,
            'The confidence score should be larger than 50: ' + str(res.confidence))
|
@ -0,0 +1,253 @@
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from lxml.html import document_fromstring
|
||||
from lxml.html.diff import htmldiff
|
||||
|
||||
from helpers import load_regression_data
|
||||
from helpers import REGRESSION_DATA
|
||||
from readability_lxml.readability import Document
|
||||
from readability_lxml import readability as r
|
||||
from readability_lxml import urlfetch
|
||||
|
||||
|
||||
class TestReadabilityDocument(unittest.TestCase):
    """Checks on constructing the Document parser."""

    def test_none_input_raises_exception(self):
        """Building a Document from None is expected to raise ValueError."""
        with self.assertRaises(ValueError):
            Document(None)
|
||||
|
||||
|
||||
class TestFindBaseUrl(unittest.TestCase):
    """Table-driven checks of ``r.find_base_url`` for several URL shapes."""

    def setUp(self):
        # Keep unittest's own diff in the output next to any custom message.
        self.longMessage = True

    def _assert_url(self, url, expected_base_url, msg=None):
        """Assert that find_base_url(url) equals expected_base_url."""
        self.assertEqual(expected_base_url, r.find_base_url(url), msg)

    def _run_urls(self, specs):
        """
        Asserts expected results on a sequence of specs, where each spec is a
        pair: (URL, expected base URL), optionally followed by a failure
        message as a third item.
        """
        for spec in specs:
            source_url, wanted = spec[0], spec[1]
            note = spec[2] if len(spec) > 2 else None
            self._assert_url(source_url, wanted, note)

    def test_none(self):
        """A None URL is expected to yield None."""
        self._assert_url(None, None)

    def test_no_change(self):
        """A URL without extension or page marker comes back unchanged."""
        plain = 'http://foo.com/article'
        self._assert_url(plain, plain)

    def test_extension_stripping(self):
        """'.html' is dropped while a '.123not' suffix is kept."""
        stripped = 'extension should be stripped'
        kept = '123not is not extension'
        self._run_urls([
            ('http://foo.com/article.html',
             'http://foo.com/article',
             stripped),
            ('http://foo.com/path/to/article.html',
             'http://foo.com/path/to/article',
             stripped),
            ('http://foo.com/article.123not',
             'http://foo.com/article.123not',
             kept),
            ('http://foo.com/path/to/article.123not',
             'http://foo.com/path/to/article.123not',
             kept),
        ])

    def test_ewcms(self):
        """The trailing ',00.html' of an ew.com style URL is removed."""
        self._assert_url(
            'http://www.ew.com/ew/article/0,,20313460_20369436,00.html',
            'http://www.ew.com/ew/article/0,,20313460_20369436')

    def test_page_numbers(self):
        """Page-number segments and suffixes are removed from the base URL."""
        reason = 'page number should be stripped'
        self._run_urls([
            ('http://foo.com/page5.html',
             'http://foo.com',
             reason),
            ('http://foo.com/path/to/page5.html',
             'http://foo.com/path/to',
             reason),
            ('http://foo.com/article-5.html',
             'http://foo.com/article',
             reason),
        ])

    def test_numbers(self):
        """A purely numeric final segment is removed from the base URL."""
        reason = 'number should be stripped'
        self._run_urls([
            ('http://foo.com/5.html',
             'http://foo.com',
             reason),
            ('http://foo.com/path/to/5.html',
             'http://foo.com/path/to',
             reason),
        ])

    def test_index(self):
        """A final 'index.html' segment is removed from the base URL."""
        reason = 'index should be stripped'
        self._run_urls([
            ('http://foo.com/index.html',
             'http://foo.com',
             reason),
            ('http://foo.com/path/to/index.html',
             'http://foo.com/path/to',
             reason),
        ])

    def test_short(self):
        """A short segment directly before the last one is dropped."""
        reason = 'short segment should be stripped'
        self._run_urls([
            ('http://foo.com/en/1234567890',
             'http://foo.com/1234567890',
             reason),
            ('http://foo.com/en/de/1234567890',
             'http://foo.com/en/1234567890',
             reason),
        ])
|
||||
|
||||
|
||||
class TestMultiPageHelpers(unittest.TestCase):
    """Checks for the helper functions used by multi-page handling."""

    def test_find_next_page_url(self):
        """Verify we can find a next page url in the html body"""
        html = """
            <html><body><a href="/?page=2">next</a></body></html>
        """
        # document_fromstring is already imported at module level, so no
        # function-local re-import is needed here.
        doc = document_fromstring(html)

        # No URLs parsed so far and no base URL: the link href is expected
        # back exactly as written in the markup.
        res = r.find_next_page_url(set(), None, doc)
        self.assertEqual('/?page=2', res,
                         'Should find out page 2 url in the body.')
|
||||
|
||||
|
||||
class TestFindNextPageLink(unittest.TestCase):
    """Next-page link detection checked against stored regression pages."""

    def _test_page(self, url, html_path, expected):
        """Parse the regression file as ``url`` and compare the next-page URL.

        :param url: URL the stored page is treated as having been fetched from.
        :param html_path: regression data file name holding the page markup.
        :param expected: the next-page URL that should be reported.

        """
        doc = r.parse(load_regression_data(html_path), url)
        # The page under test is the only URL marked as already parsed.
        actual = r.find_next_page_url({url}, url, doc)
        self.assertEqual(expected, actual)

    def test_basic(self):
        """The basic article should link to its ?pagewanted=2 page."""
        self._test_page(
            'http://basic.com/article.html',
            'basic-multi-page.html',
            'http://basic.com/article.html?pagewanted=2')

    def test_nytimes(self):
        """The stored New York Times page should yield its page 2 link."""
        # This better work for the New York Times.
        self._test_page(
            'http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html',
            'nytimes-next-page.html',
            'http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=2&_r=1')
|
||||
|
||||
|
||||
class TestMultiPage(unittest.TestCase):
    """
    Tests the full path of generating a readable page for a multi-page article.
    The test article is very simple, so this test should be resilient to tweaks
    of the algorithm.
    """

    def _make_basic_urldict(self):
        """Map the page 2 and page 3 URLs of the basic article to their
        regression data file paths."""
        url_fmt = 'http://basic.com/article.html?pagewanted=%s'
        file_fmt = 'basic-multi-page-%s.html'

        pairs = [(url_fmt % i, os.path.join(REGRESSION_DATA, file_fmt % i))
                 for i in ['2', '3']]
        return dict(pairs)

    def test_basic(self):
        """Summarise the basic multi-page article and diff it against the
        stored expected output."""
        html = load_regression_data('basic-multi-page.html')
        urldict = self._make_basic_urldict()
        fetcher = urlfetch.MockUrlFetch(urldict)
        options = {
            'url': 'http://basic.com/article.html',
            'multipage': True,
            'urlfetch': fetcher
        }
        doc = Document(html, **options)
        res = doc.summary_with_metadata()

        self.assertIn('Page 2', res.html, 'Should find the page 2 heading')
        self.assertIn('Page 3', res.html, 'Should find the page 3 heading')

        expected_html = load_regression_data('basic-multi-page-expected.html')
        diff_html = htmldiff(expected_html, res.html)
        diff_doc = document_fromstring(diff_html)

        # Gather every difference first: failing on the first kind found would
        # hide deletions whenever insertions are also present, and details put
        # in the failure message survive output capturing.
        problems = ['unexpected insertion: %s' % node.xpath('string()')
                    for node in diff_doc.xpath('//ins')]
        problems.extend('unexpected deletion: %s' % node.xpath('string()')
                        for node in diff_doc.xpath('//del'))

        if problems:
            self.fail('readability result does not match expected\n' +
                      '\n'.join(problems))
|
||||
|
||||
|
||||
class TestIsSuspectedDuplicate(unittest.TestCase):
    """Checks ``r.is_suspected_duplicate`` against stored page fragments."""

    def setUp(self):
        super(TestIsSuspectedDuplicate, self).setUp()
        # Reference article that each candidate page is compared with.
        article_html = load_regression_data('duplicate-page-article.html')
        self._article = r.fragment_fromstring(article_html)

    def test_unique(self):
        """The 'unique' regression page must not be reported as a duplicate."""
        candidate = r.fragment_fromstring(
            load_regression_data('duplicate-page-unique.html'))
        verdict = r.is_suspected_duplicate(self._article, candidate)
        self.assertFalse(verdict)

    def test_duplicate(self):
        """The 'duplicate' regression page must be reported as a duplicate."""
        candidate = r.fragment_fromstring(
            load_regression_data('duplicate-page-duplicate.html'))
        verdict = r.is_suspected_duplicate(self._article, candidate)
        self.assertTrue(verdict)
|
@ -0,0 +1,24 @@
|
||||
"""Process all of the samples and make sure that process without error."""
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from helpers import load_sample
|
||||
from readability_lxml.readability import Document
|
||||
|
||||
# File names of the sample documents fed through the parser below.
sample_list = ['nyt.sample.html', 'si-game.sample.html']


def test_processes():
    """Generate one (check function, sample name) pair per listed sample."""
    for sample_name in sample_list:
        yield process_article, sample_name
|
||||
|
||||
|
||||
def process_article(article):
    """Summarise the named sample and check how the resulting html begins.

    :param article: file name of the sample, as accepted by ``load_sample``.

    """
    expected_opening = '<html><body id="page"><div><div class'
    res = Document(load_sample(article)).summary()
    # The failure message carries the opening that was actually produced.
    failed_msg = "Failed to process the article: " + res[0:37]
    assert expected_opening == res[0:37], failed_msg
|
@ -1,39 +0,0 @@
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from readability import Document
|
||||
|
||||
|
||||
# Directory of sample documents, resolved relative to this test module.
SAMPLES = os.path.join(os.path.dirname(__file__), 'samples')


def load_sample(filename):
    """Helper to get the content out of the sample files.

    :param filename: name of a file located inside the ``SAMPLES`` directory.
    :returns: the complete content of that file as read by ``read()``.

    """
    # A context manager closes the handle deterministically instead of
    # leaving it open until the garbage collector gets around to it.
    with open(os.path.join(SAMPLES, filename)) as sample_file:
        return sample_file.read()
|
||||
|
||||
|
||||
class TestArticleOnly(unittest.TestCase):
    """Requesting only the article markup, without a full html document.

    A caller handing in a complete html page may ask for just the divs of
    processed content, leaving them free to wrap the article in a view or
    application of their own.

    """

    def test_si_sample(self):
        """The default summary of the si sample opens with html and body."""
        page_url = 'http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html'
        doc = Document(load_sample('si-game.sample.html'), url=page_url)
        res = doc.summary()
        self.assertEqual('<html><body><div><div class', res[0:27])

    def test_si_sample_html_partial(self):
        """With html_partial=True the summary opens directly with the divs."""
        page_url = 'http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html'
        doc = Document(load_sample('si-game.sample.html'), url=page_url)
        res = doc.summary(html_partial=True)
        self.assertEqual('<div><div class="', res[0:17])
|
||||
|
Loading…
Reference in New Issue