Compare commits
85 Commits
Author | SHA1 | Date |
---|---|---|
Richard Harding | d708744822 | 12 years ago |
Jerry Charumilind | eefb8e1125 | 12 years ago |
Richard Harding | c931a80ba8 | 12 years ago |
Jerry Charumilind | 883a02ad5d | 12 years ago |
Richard Harding | cfc6f94634 | 12 years ago |
Jerry Charumilind | 816c66482e | 12 years ago |
Richard Harding | 99d5fc0a87 | 12 years ago |
Jerry Charumilind | f02fe79840 | 12 years ago |
Richard Harding | 5cb4b8b8c0 | 12 years ago |
Jerry Charumilind | f8315d011c | 12 years ago |
Richard Harding | 99efa5c10b | 12 years ago |
Richard Harding | a012fd2362 | 12 years ago |
Jerry Charumilind | 3fe416a5d1 | 12 years ago |
Richard Harding | 8cadc4a958 | 12 years ago |
Richard Harding | 9765d13e90 | 12 years ago |
Jerry Charumilind | 32d1764e83 | 12 years ago |
Richard Harding | 0951647c8e | 12 years ago |
Richard Harding | ace51a6819 | 12 years ago |
Jerry Charumilind | 2505c78e5b | 12 years ago |
Richard Harding | edc0e4d4c6 | 12 years ago |
Jerry Charumilind | 6abc6f7ef2 | 12 years ago |
Jerry Charumilind | 1e30e33302 | 12 years ago |
Richard Harding | e8a6250605 | 12 years ago |
Jerry Charumilind | 62df35570d | 12 years ago |
Richard Harding | 29fceeb4b1 | 12 years ago |
Richard Harding | 6f8184be27 | 12 years ago |
Richard Harding | 9aef5e36b7 | 12 years ago |
Jerry Charumilind | 8988b6b767 | 12 years ago |
Jerry Charumilind | 7d097d5f11 | 12 years ago |
Jerry Charumilind | b04f75239c | 12 years ago |
Jerry Charumilind | c21f00b1ee | 12 years ago |
Richard Harding | 9fec245ae4 | 12 years ago |
Jerry Charumilind | 6af808bc14 | 12 years ago |
Jerry Charumilind | 7980ca84c9 | 12 years ago |
Richard Harding | a700bb8bd4 | 12 years ago |
Jerry Charumilind | bf203b5a4b | 12 years ago |
Jerry Charumilind | 65989b538a | 12 years ago |
Jerry Charumilind | 9b7e5bb327 | 12 years ago |
Jerry Charumilind | 068eba19ae | 12 years ago |
Richard Harding | 6d3ad559f6 | 12 years ago |
Jerry Charumilind | 5222ed0628 | 12 years ago |
Richard Harding | 6454fb3f37 | 12 years ago |
Richard Harding | 9366436861 | 12 years ago |
Richard Harding | 7dc373e9c5 | 12 years ago |
Richard Harding | b1966df1c3 | 12 years ago |
Richard Harding | 57694cb352 | 12 years ago |
Jerry Charumilind | b78d7e8501 | 12 years ago |
Richard Harding | a2b17e757c | 12 years ago |
Richard Harding | 3347f16d93 | 12 years ago |
Richard Harding | 93ac1111a1 | 12 years ago |
Richard Harding | 08660f6f0c | 12 years ago |
Richard Harding | 35792e7a59 | 12 years ago |
Richard Harding | aa51283dff | 12 years ago |
Richard Harding | a4b6957be2 | 12 years ago |
Richard Harding | b0063ffb3c | 12 years ago |
Richard Harding | 8091a75f00 | 12 years ago |
Richard Harding | 8f420bd950 | 12 years ago |
Richard Harding | 58c69651d3 | 12 years ago |
Richard Harding | 8b0210c4dc | 12 years ago |
Richard Harding | 0f9da8ace4 | 12 years ago |
Richard Harding | dc86283d83 | 12 years ago |
Richard Harding | 2ee2fe9536 | 12 years ago |
Richard Harding | ac5ef73e71 | 12 years ago |
Richard Harding | f5451356ee | 12 years ago |
Richard Harding | 509aed0d9f | 12 years ago |
Richard Harding | 273878214f | 12 years ago |
Richard Harding | 674e5f9ef2 | 12 years ago |
Richard Harding | 1c1cbaefa5 | 12 years ago |
Richard Harding | 7e57767070 | 12 years ago |
Richard Harding | 62e153eaf8 | 12 years ago |
Richard Harding | d11b928504 | 12 years ago |
Richard Harding | a6361854a9 | 12 years ago |
Richard Harding | b498df200b | 12 years ago |
Richard Harding | bbb60ed077 | 12 years ago |
Jerry Charumilind | cc0af7a105 | 13 years ago |
Jerry Charumilind | 82eabfc6b1 | 13 years ago |
Jerry Charumilind | cba19f209b | 13 years ago |
Jerry Charumilind | 18fa6b5146 | 13 years ago |
Jerry Charumilind | cdd30f625e | 13 years ago |
Jerry Charumilind | 7aac0f0855 | 13 years ago |
Jerry Charumilind | ac517834e6 | 13 years ago |
Jerry Charumilind | 01247903b8 | 13 years ago |
Jerry Charumilind | 33f935e39a | 13 years ago |
Jerry Charumilind | 7ceb8e6d7b | 13 years ago |
Jerry Charumilind | 8877754d7e | 13 years ago |
@ -1,17 +1,12 @@
|
||||
*.pyc
|
||||
__pycache__
|
||||
*.egg-info
|
||||
/build
|
||||
/dist
|
||||
build
|
||||
dist
|
||||
/bin
|
||||
/include
|
||||
/lib
|
||||
/local
|
||||
/man
|
||||
/share
|
||||
nosetests.xml
|
||||
.coverage
|
||||
.tox
|
||||
.idea
|
||||
.cache
|
||||
/.noseids
|
||||
/.venv
|
@ -1,60 +0,0 @@
|
||||
language: python
|
||||
os: linux
|
||||
cache: pip
|
||||
|
||||
matrix:
|
||||
include:
|
||||
- name: "Python 2.7 on Linux"
|
||||
python: 2.7
|
||||
env: PIP=pip
|
||||
- name: "Python 3.5 on Linux"
|
||||
python: 3.5
|
||||
- name: "Python 3.6 on Linux"
|
||||
python: 3.6
|
||||
- name: "Python 3.7 on Linux"
|
||||
python: 3.7
|
||||
- name: "Python 3.8 on Linux"
|
||||
dist: xenial
|
||||
python: 3.8
|
||||
- name: "Python 3.9 Nightly on Linux"
|
||||
dist: bionic
|
||||
python: nightly
|
||||
- name: "Pypy on Linux"
|
||||
python: pypy
|
||||
env: PIP=pip
|
||||
- name: "Pypy 3 on Linux"
|
||||
python: pypy3
|
||||
- name: "Python 3.7 on older macOS"
|
||||
os: osx
|
||||
osx_image: xcode9.4
|
||||
language: shell
|
||||
env: TOXENV=py37
|
||||
before_install:
|
||||
- sw_vers
|
||||
- python3 --version
|
||||
- pip3 --version
|
||||
- name: "Python 3.7 on macOS"
|
||||
os: osx
|
||||
osx_image: xcode11
|
||||
language: shell
|
||||
env: TOXENV=py37
|
||||
before_install:
|
||||
- sw_vers
|
||||
- python3 --version
|
||||
- pip3 --version
|
||||
allow_failures:
|
||||
- python: nightly
|
||||
- python: pypy
|
||||
- python: pypy3
|
||||
- os: osx
|
||||
|
||||
install:
|
||||
- if [ $PIP ]; then true; else PIP=pip3; fi
|
||||
- travis_retry $PIP install -U pip wheel tox-travis pytest-cov codecov
|
||||
- travis_retry $PIP install -U -r requirements.txt -e ".[test]"
|
||||
|
||||
script:
|
||||
- tox
|
||||
|
||||
after_success:
|
||||
- codecov
|
@ -0,0 +1,10 @@
|
||||
Yuri Baburov
|
||||
facundo
|
||||
gfxmonk
|
||||
Jan Weiß
|
||||
Jerry Charumilind
|
||||
Laurent Peuch
|
||||
Lee Semel
|
||||
Rick Harding
|
||||
Sean Brant
|
||||
Tim Cuthbertson
|
@ -1,68 +1,92 @@
|
||||
.. image:: https://travis-ci.org/buriy/python-readability.svg?branch=master
|
||||
:target: https://travis-ci.org/buriy/python-readability
|
||||
readability_lxml
|
||||
================
|
||||
|
||||
|
||||
python-readability
|
||||
==================
|
||||
This is a python port of a ruby port of `arc90's readability`_ project
|
||||
|
||||
Given a html document, it pulls out the main body text and cleans it up.
|
||||
It also can clean up title based on latest readability.js code.
|
||||
|
||||
This is a python port of a ruby port of `arc90's readability
|
||||
project <http://lab.arc90.com/experiments/readability/>`__.
|
||||
|
||||
Installation
|
||||
------------
|
||||
Inspiration
|
||||
-----------
|
||||
- Latest readability.js ( https://github.com/MHordecki/readability-redux/blob/master/readability/readability.js )
|
||||
- Ruby port by starrhorne and iterationlabs
|
||||
- Python port by gfxmonk ( https://github.com/gfxmonk/python-readability , based on BeautifulSoup )
|
||||
- Decruft effort to move to lxml ( http://www.minvolai.com/blog/decruft-arc90s-readability-in-python/ )
|
||||
- "BR to P" fix from readability.js which improves quality for smaller texts.
|
||||
- Github users contributions.
|
||||
|
||||
|
||||
Try it out!
|
||||
-----------
|
||||
You can try out the parser by entering your test urls on the following test
|
||||
service.
|
||||
|
||||
It's easy using ``pip``, just run:
|
||||
http://readable.bmark.us
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
Installation
|
||||
-------------
|
||||
::
|
||||
|
||||
$ easy_install readability-lxml
|
||||
# or
|
||||
$ pip install readability-lxml
|
||||
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
>>> import requests
|
||||
>>> from readability import Document
|
||||
|
||||
>>> response = requests.get('http://example.com')
|
||||
>>> doc = Document(response.text)
|
||||
>>> doc.title()
|
||||
'Example Domain'
|
||||
|
||||
>>> doc.summary()
|
||||
"""<html><body><div><body id="readabilityBody">\n<div>\n <h1>Example Domain</h1>\n
|
||||
<p>This domain is established to be used for illustrative examples in documents. You may
|
||||
use this\n domain in examples without prior coordination or asking for permission.</p>
|
||||
\n <p><a href="http://www.iana.org/domains/example">More information...</a></p>\n</div>
|
||||
\n</body>\n</div></body></html>"""
|
||||
|
||||
Change Log
|
||||
----------
|
||||
|
||||
- 0.8.1 Fixed processing of non-ascii HTMLs via regexps.
|
||||
- 0.8 Replaced XHTML output with HTML5 output in summary() call.
|
||||
- 0.7.1 Support for Python 3.7 . Fixed a slowdown when processing documents with lots of spaces.
|
||||
- 0.7 Improved HTML5 tags handling. Fixed stripping unwanted HTML nodes (only first matching node was removed before).
|
||||
- 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3 - 3.6
|
||||
- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and 3.4
|
||||
- 0.4 Added Videos loading and allowed more images per paragraph
|
||||
- 0.3 Added Document.encoding, positive\_keywords and negative\_keywords
|
||||
|
||||
Licensing
|
||||
---------
|
||||
|
||||
This code is under `the Apache License
|
||||
2.0 <http://www.apache.org/licenses/LICENSE-2.0>`__ license.
|
||||
|
||||
Thanks to
|
||||
---------
|
||||
|
||||
- Latest `readability.js <https://github.com/MHordecki/readability-redux/blob/master/readability/readability.js>`__
|
||||
- Ruby port by starrhorne and iterationlabs
|
||||
- `Python port <https://github.com/gfxmonk/python-readability>`__ by gfxmonk
|
||||
- `Decruft effort <http://www.minvolai.com/blog/decruft-arc90s-readability-in-python/>`__ to move to lxml
|
||||
- "BR to P" fix from readability.js which improves quality for smaller texts
|
||||
- Github users contributions.
|
||||
------
|
||||
|
||||
Command Line Client
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
::
|
||||
|
||||
$ readability http://pypi.python.org/pypi/readability-lxml
|
||||
$ readability /home/rharding/sampledoc.html
|
||||
|
||||
As a Library
|
||||
~~~~~~~~~~~~
|
||||
::
|
||||
|
||||
from readability.readability import Document
|
||||
import urllib
|
||||
html = urllib.urlopen(url).read()
|
||||
readable_article = Document(html).summary()
|
||||
readable_title = Document(html).short_title()
|
||||
|
||||
You can also use the `get_summary_with_metadata` method to get back other
|
||||
metadata such as the confidence score found while processing the input.
|
||||
|
||||
::
|
||||
|
||||
doc = Document(html).summary_with_metadata()
|
||||
print(doc.html)
|
||||
print(doc.confidence)
|
||||
|
||||
|
||||
Optional `Document` keyword argument:
|
||||
|
||||
- attributes:
|
||||
- debug: output debug messages
|
||||
- min_text_length:
|
||||
- multipage: should we try to parse and combine multiple page articles?
|
||||
- retry_length:
|
||||
- url: will allow adjusting links to be absolute
|
||||
|
||||
|
||||
Test and Build Status
|
||||
---------------------
|
||||
Tests are run against the package at:
|
||||
|
||||
http://build.bmark.us/job/readability-lxml/
|
||||
|
||||
You can view it for build history and test status.
|
||||
|
||||
|
||||
History
|
||||
-------
|
||||
|
||||
- `0.2.5` Update setup.py for uploading .tar.gz to pypi
|
||||
|
||||
|
||||
.. _arc90's readability: http://lab.arc90.com/experiments/readability/
|
||||
|
@ -1,30 +0,0 @@
|
||||
Reference
|
||||
=========
|
||||
|
||||
.. automodule:: readability
|
||||
:members:
|
||||
:show-inheritance:
|
||||
|
||||
.. automodule:: readability.browser
|
||||
:members:
|
||||
:show-inheritance:
|
||||
|
||||
.. automodule:: readability.cleaners
|
||||
:members:
|
||||
:show-inheritance:
|
||||
|
||||
.. automodule:: readability.debug
|
||||
:members:
|
||||
:show-inheritance:
|
||||
|
||||
.. automodule:: readability.encoding
|
||||
:members:
|
||||
:show-inheritance:
|
||||
|
||||
.. automodule:: readability.htmls
|
||||
:members:
|
||||
:show-inheritance:
|
||||
|
||||
.. automodule:: readability.readability
|
||||
:members:
|
||||
:show-inheritance:
|
@ -1,164 +0,0 @@
|
||||
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# readability documentation build configuration file, created by
# sphinx-quickstart on Thu Mar 23 16:29:38 2017.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys

sys.path.insert(0, os.path.abspath("../.."))

# Imported after the sys.path tweak above so the in-tree package (two levels
# up from docs/source) is found instead of any installed copy.
import readability

# -- General configuration ------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    "sphinx.ext.autodoc",
    "sphinx.ext.doctest",
    "sphinx.ext.intersphinx",
    "sphinx.ext.todo",
    "recommonmark",
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
source_suffix = [".rst", ".md"]

# The master toctree document.
master_doc = "index"

# General information about the project.
project = "readability"
copyright = "2020, Yuri Baburov"
author = "Yuri Baburov"

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.

# The short X.Y version (taken directly from the package).
version = readability.__version__

# The full version, including alpha/beta/rc tags.
release = readability.__version__

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This patterns also effect to html_static_path and html_extra_path
exclude_patterns = []

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = "sphinx"

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False


# -- Options for HTML output ----------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = "sphinx_rtd_theme"

# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = []  # '_static']


# -- Options for HTMLHelp output ------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = "readabilitydoc"


# -- Options for LaTeX output ---------------------------------------------

latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',
    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',
    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',
    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [(master_doc, "readability.tex", "Readability Documentation", "Yuri Baburov", "manual")]


# -- Options for manual page output ---------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [(master_doc, "readability", "readability Documentation", [author], 1)]


# -- Options for Texinfo output -------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
texinfo_documents = [
    (
        master_doc,
        "readability",
        "Readability Documentation",
        author,
        "readability",
        "One line description of project.",
        "Miscellaneous",
    )
]


# Cross-link references like :class:`str` to the Python standard docs.
intersphinx_mapping = {
    "python": ("https://docs.python.org/3", None),
}
|
@ -1,13 +0,0 @@
|
||||
.. include:: ../../README.rst
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
||||
api
|
||||
|
||||
Indices and tables
|
||||
------------------
|
||||
|
||||
* :ref:`genindex`
|
||||
* :ref:`modindex`
|
||||
* :ref:`search`
|
@ -1,3 +0,0 @@
|
||||
# Package version string; also read by the Sphinx docs build (conf.py).
__version__ = "0.8.1.1"

# Re-export the main entry point so callers can write
# `from readability import Document`.
from .readability import Document
|
@ -1,21 +0,0 @@
|
||||
def open_in_browser(html):
    """
    Write *html* to a temporary file and open that file in a web browser.

    The temporary file is intentionally left on disk so the browser can
    still load it after this function returns; this helper is meant for
    debugging only.  Returns the ``file://`` URL that was opened.
    """
    import os
    import tempfile
    import webbrowser

    handle, path = tempfile.mkstemp(suffix=".html")
    out = os.fdopen(handle, "wb")
    try:
        # Declare UTF-8 up front so the browser decodes the payload correctly.
        out.write(b"<meta charset='UTF-8' />")
        out.write(html.encode("utf-8"))
    finally:
        # The file itself is leaked on purpose, but the handle is closed.
        out.close()
    url = "file://" + path.replace(os.path.sep, "/")
    webbrowser.open(url)
    return url
|
@ -1,52 +0,0 @@
|
||||
# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
|
||||
import re
|
||||
from lxml.html.clean import Cleaner
|
||||
|
||||
# Attribute names (as regex fragments) that should be stripped wholesale:
# presentation-only or scripting-related attributes that can break RSS
# rendering.
bad_attrs = ["width", "height", "style", "[-a-z]*color", "background[-a-z]*", "on*"]
# The three shapes an attribute value can take inside a tag.
single_quoted = "'[^']+'"
double_quoted = '"[^"]+"'
non_space = "[^ \"'>]+"
# Matches a whole tag containing one offending attribute: group 1 captures
# everything before the attribute, group 2 everything after its value.
htmlstrip = re.compile(
    "<([^>]+) (?:%s) *= *(?:%s|%s|%s)([^>]*)>"
    % ("|".join(bad_attrs), non_space, single_quoted, double_quoted),
    re.I,
)


def clean_attributes(html):
    """Repeatedly strip bad attributes from *html* until none remain."""
    # One attribute is removed per substitution pass, hence the loop.
    while htmlstrip.search(html):
        html = htmlstrip.sub("<\\1\\2>", html)
    return html
|
||||
|
||||
|
||||
def normalize_spaces(s):
    """
    Replace any sequence of whitespace characters in *s* with a single
    space.

    Falsy input (None or "") yields the empty string.
    """
    # Fix: the description above previously sat in the middle of the body as
    # a bare string expression, so it was a no-op statement rather than a
    # docstring; it now lives where docstrings belong.
    if not s:
        return ""
    return " ".join(s.split())
|
||||
|
||||
|
||||
# Shared lxml Cleaner configured for readability's needs: remove scripts,
# inline javascript, comments, style sheets, <link> elements and processing
# instructions, while keeping page structure, <meta>, embedded content,
# frames, forms and all attributes intact for the scoring pass.
html_cleaner = Cleaner(
    scripts=True,
    javascript=True,
    comments=True,
    style=True,
    links=True,
    meta=False,
    add_nofollow=False,
    page_structure=False,
    processing_instructions=True,
    embedded=False,
    frames=False,
    forms=False,
    annoying_tags=False,
    remove_tags=None,
    remove_unknown_tags=False,
    safe_attrs_only=False,
)
|
@ -1,20 +0,0 @@
|
||||
"""
|
||||
This module contains compatibility helpers for Python 2/3 interoperability.
|
||||
|
||||
It mainly exists because their are certain incompatibilities in the Python
|
||||
syntax that can only be solved by conditionally importing different functions.
|
||||
"""
|
||||
import sys
|
||||
from lxml.etree import tostring
|
||||
|
||||
if sys.version_info[0] == 2:
|
||||
bytes_ = str
|
||||
str_ = unicode
|
||||
def tostring_(s):
|
||||
return tostring(s, encoding='utf-8').decode('utf-8')
|
||||
|
||||
elif sys.version_info[0] == 3:
|
||||
bytes_ = bytes
|
||||
str_ = str
|
||||
def tostring_(s):
|
||||
return tostring(s, encoding='utf-8')
|
@ -1,6 +0,0 @@
|
||||
def raise_with_traceback(exc_type, traceback, *args, **kwargs):
    """
    Raise a new exception of type `exc_type` carrying an existing
    `traceback`.  All additional (keyword-)arguments are forwarded to the
    `exc_type` constructor.
    """
    exc = exc_type(*args, **kwargs)
    raise exc.with_traceback(traceback)
|
@ -1,6 +0,0 @@
|
||||
def raise_with_traceback(exc_type, traceback, *args, **kwargs):
    """
    Raise a new exception of type `exc_type` with an existing `traceback`. All
    additional (keyword-)arguments are forwarded to `exc_type`
    """
    # Python 2-only three-expression raise form; this is a SyntaxError on
    # Python 3, so this variant must only ever be imported under Python 2.
    raise exc_type(*args, **kwargs), None, traceback
|
@ -1,51 +0,0 @@
|
||||
import re
|
||||
|
||||
|
||||
# FIXME: use with caution, can leak memory
# Maps nodes to small sequential integers so repeated describe() calls print
# stable "{NN}" labels; describe() clears the table whenever it encounters a
# different root document.
uids = {}
uids_document = None
|
||||
|
||||
|
||||
def describe_node(node):
    """Return a short CSS-selector-like label for *node*, for debug output."""
    global uids
    if node is None:
        return ""
    if not hasattr(node, "tag"):
        # Not an element at all (e.g. a comment or plain object): show type.
        return "[%s]" % type(node)

    name = node.tag
    node_id = node.get("id", "")
    if node_id:
        name += "#" + node_id
    classes = node.get("class", "")
    if classes.strip():
        name += "." + ".".join(classes.split())
    # A div with an id/class is unambiguous, so drop the redundant tag name.
    if name[:4] in ("div#", "div."):
        name = name[3:]
    # Generic structural tags get a stable per-document counter appended.
    if name in ("tr", "td", "div", "p"):
        uid = uids.get(node)
        if uid is None:
            uid = uids[node] = len(uids) + 1
        name += "{%02d}" % uid
    return name
|
||||
|
||||
|
||||
def describe(node, depth=1):
    """
    Return a debug label for *node*, prefixed with up to *depth* ancestor
    labels joined by ">".
    """
    global uids, uids_document
    doc = node.getroottree().getroot()
    if doc != uids_document:
        # New root document: restart the per-node uid numbering.
        uids = {}
        uids_document = doc

    prefix = ""
    if depth and node.getparent() is not None:
        prefix = describe(node.getparent(), depth=depth - 1) + ">"
    return prefix + describe_node(node)
|
||||
|
||||
|
||||
# Collapses any run of (unicode) whitespace into a single space.
RE_COLLAPSE_WHITESPACES = re.compile(r"\s+", re.U)


def text_content(elem, length=40):
    """Return *elem*'s text with whitespace collapsed, truncated to *length*."""
    raw = elem.text_content().replace("\r", "")
    content = RE_COLLAPSE_WHITESPACES.sub(" ", raw)
    if len(content) >= length:
        # Add an ellipsis so truncated output is recognizable in logs.
        content = content[:length] + "..."
    return content
|
@ -1,63 +0,0 @@
|
||||
import re
|
||||
import chardet
|
||||
import sys
|
||||
|
||||
|
||||
# Declared-encoding sniffers: HTML5 <meta charset>, HTTP-equiv pragma, and
# the XML prolog, respectively.  All operate on raw bytes.
RE_CHARSET = re.compile(br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
RE_PRAGMA = re.compile(br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
RE_XML = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]')

# Maps commonly mis-declared charsets to a superset codec that can decode
# them; added because of issues with Chinese (and Cyrillic) websites.
CHARSETS = {
    "big5": "big5hkscs",
    "gb2312": "gb18030",
    "ascii": "utf-8",
    "maccyrillic": "cp1251",
    "win1251": "cp1251",
    "win-1251": "cp1251",
    "windows-1251": "cp1251",
}


def fix_charset(encoding):
    """Normalize *encoding* to lowercase and widen it to a known superset."""
    normalized = encoding.lower()
    return CHARSETS.get(normalized, normalized)
|
||||
|
||||
|
||||
def get_encoding(page):
    """
    Guess the character encoding of raw *page* bytes.

    Tries, in order: encodings declared in the XML/HTML markup, then chardet
    detection on the tag-stripped text, falling back to "utf-8".
    """
    # Regex for XML and HTML Meta charset declaration
    declared_encodings = (
        RE_CHARSET.findall(page) + RE_PRAGMA.findall(page) + RE_XML.findall(page)
    )

    # Try any declared encodings
    for declared_encoding in declared_encodings:
        try:
            if sys.version_info[0] == 3:
                # declared_encoding will actually be bytes but .decode() only
                # accepts `str` type. Decode blindly with ascii because no one should
                # ever use non-ascii characters in the name of an encoding.
                declared_encoding = declared_encoding.decode("ascii", "replace")

            encoding = fix_charset(declared_encoding)

            # Now let's decode the page
            page.decode(encoding)
            # It worked!
            return encoding
        except (UnicodeDecodeError, LookupError):
            # Declared encoding was wrong or unknown; try the next candidate.
            pass

    # Fallback to chardet if declared encodings fail
    # Remove all HTML tags, and leave only text for chardet
    text = re.sub(br"(\s*</?[^>]*>)+\s*", b" ", page).strip()
    enc = "utf-8"
    if len(text) < 10:
        return enc  # can't guess
    res = chardet.detect(text)
    enc = res["encoding"] or "utf-8"
    # print '->', enc, "%.2f" % res['confidence']
    enc = fix_charset(enc)
    return enc
|
@ -1,144 +0,0 @@
|
||||
from lxml.html import tostring
|
||||
import lxml.html
|
||||
import re
|
||||
|
||||
from .cleaners import normalize_spaces, clean_attributes
|
||||
from .encoding import get_encoding
|
||||
from .compat import str_
|
||||
|
||||
# Shared HTML parser; pages are always re-encoded to UTF-8 before parsing.
utf8_parser = lxml.html.HTMLParser(encoding="utf-8")
|
||||
|
||||
|
||||
def build_doc(page):
    """
    Parse *page* (text or bytes) into an lxml document.

    Returns ``(doc, encoding)`` where *encoding* is the detected input
    encoding, or None when *page* was already decoded text.
    """
    if isinstance(page, str_):
        # Already text: nothing to detect or decode.
        encoding, decoded_page = None, page
    else:
        encoding = get_encoding(page) or "utf-8"
        decoded_page = page.decode(encoding, "replace")

    # XXX: we have to do .decode and .encode even for utf-8 pages to remove bad characters
    doc = lxml.html.document_fromstring(
        decoded_page.encode("utf-8", "replace"), parser=utf8_parser
    )
    return doc, encoding
|
||||
|
||||
|
||||
def js_re(src, pattern, flags, repl):
    """
    Apply a JavaScript-style regex replacement to *src*.

    *repl* may use JS-style "$1" group references; they are translated to
    Python's "\\1" backreference syntax before substituting.
    """
    # BUG FIX: the original called .sub(src, repl...), i.e. it used the
    # source text as the replacement template and substituted into the
    # replacement string — the two arguments were swapped.
    return re.compile(pattern, flags).sub(repl.replace("$", "\\"), src)
|
||||
|
||||
|
||||
def normalize_entities(cur_title):
    """
    Replace typographic dashes, quotes and non-breaking spaces in a title
    with plain ASCII equivalents.
    """
    entities = {
        u"\u2014": "-",  # em dash
        u"\u2013": "-",  # en dash
        # BUG FIX: these two keys had been HTML-unescaped into literal dash
        # characters, which duplicated the two entries above; they are meant
        # to catch *unexpanded* HTML entities left in the title text.
        u"&mdash;": "-",
        u"&ndash;": "-",
        u"\u00A0": " ",  # non-breaking space
        u"\u00AB": '"',  # left guillemet
        u"\u00BB": '"',  # right guillemet
        u"&quot;": '"',
    }
    for c, r in entities.items():
        if c in cur_title:
            cur_title = cur_title.replace(c, r)

    return cur_title
|
||||
|
||||
|
||||
def norm_title(title):
    """Collapse whitespace and normalize typographic entities in *title*."""
    collapsed = normalize_spaces(title)
    return normalize_entities(collapsed)
|
||||
|
||||
|
||||
def get_title(doc):
    """Return the document's normalized <title> text, or "[no-title]"."""
    title = doc.find(".//title")
    missing = title is None or title.text is None or len(title.text) == 0
    if missing:
        return "[no-title]"
    return norm_title(title.text)
|
||||
|
||||
|
||||
def add_match(collection, text, orig):
    """Add *text* to *collection* if it is a plausible sub-title of *orig*."""
    text = norm_title(text)
    # Require at least two words and 15 characters to skip nav fragments.
    if len(text) >= 15 and len(text.split()) >= 2:
        # Compare with double quotes removed, since normalization may have
        # rewritten quote characters on one side only.
        if text.replace('"', "") in orig.replace('"', ""):
            collection.add(text)
|
||||
|
||||
|
||||
# CSS selectors that commonly mark the on-page headline; shorten_title()
# uses matching elements as extra title candidates beyond <h1>-<h3>.
TITLE_CSS_HEURISTICS = [
    "#title",
    "#head",
    "#heading",
    ".pageTitle",
    ".news_title",
    ".title",
    ".head",
    ".heading",
    ".contentheading",
    ".small_header_red",
]
|
||||
|
||||
|
||||
def shorten_title(doc):
    """
    Return a cleaned-up, shortened version of the document's <title>.

    Prefers the longest heading / CSS-heuristic candidate whose text is
    contained in the <title>; otherwise tries to cut site-name decorations
    off at a delimiter.  Falls back to the full normalized title when the
    shortened result is implausibly short or long.
    """
    title = doc.find(".//title")
    if title is None or title.text is None or len(title.text) == 0:
        return ""

    title = orig = norm_title(title.text)

    candidates = set()

    # Headings whose text is contained in the <title> are good candidates.
    for item in [".//h1", ".//h2", ".//h3"]:
        for e in list(doc.iterfind(item)):
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

    # Likewise elements matching common headline CSS selectors.
    for item in TITLE_CSS_HEURISTICS:
        for e in doc.cssselect(item):
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

    if candidates:
        # Longest candidate wins: it keeps the most of the real headline.
        title = sorted(candidates, key=len)[-1]
    else:
        # No candidates: try to strip a site name around a delimiter,
        # keeping whichever side still has at least four words.
        for delimiter in [" | ", " - ", " :: ", " / "]:
            if delimiter in title:
                parts = orig.split(delimiter)
                if len(parts[0].split()) >= 4:
                    title = parts[0]
                    break
                elif len(parts[-1].split()) >= 4:
                    title = parts[-1]
                    break
        else:
            # Last resort: split on ": " and keep the trailing part.
            if ": " in title:
                parts = orig.split(": ")
                if len(parts[-1].split()) >= 4:
                    title = parts[-1]
                else:
                    title = orig.split(": ", 1)[1]

    # Reject implausibly short or long results in favor of the original.
    if not 15 < len(title) < 150:
        return orig

    return title
|
||||
|
||||
|
||||
# is it necessary? Cleaner from LXML is initialized correctly in cleaners.py
def get_body(doc):
    """
    Return the document body as cleaned HTML text: script/link/style
    elements dropped, then nuisance attributes stripped.
    """
    for elem in doc.xpath(".//script | .//link | .//style"):
        elem.drop_tree()
    # tostring() always return utf-8 encoded string
    # FIXME: isn't better to use tounicode?
    raw_html = str_(tostring(doc.body or doc))
    cleaned = clean_attributes(raw_html)
    try:
        # BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
        # NOTE(review): with the BeautifulSoup call commented out, nothing in
        # this try block can raise, so the except branch below is dead code.
        return cleaned
    except Exception:  # FIXME find the equivalent lxml error
        # logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
        return raw_html
|
@ -1,756 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
from __future__ import print_function
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
|
||||
from lxml.etree import tounicode
|
||||
from lxml.html import document_fromstring
|
||||
from lxml.html import fragment_fromstring
|
||||
|
||||
from .cleaners import clean_attributes
|
||||
from .cleaners import html_cleaner
|
||||
from .htmls import build_doc
|
||||
from .htmls import get_body
|
||||
from .htmls import get_title
|
||||
from .htmls import shorten_title
|
||||
from .compat import str_, bytes_, tostring_
|
||||
from .debug import describe, text_content
|
||||
|
||||
|
||||
log = logging.getLogger("readability.readability")

# Class/id keyword patterns ported from arc90's readability.js; they drive
# the positive/negative scoring of candidate container elements.  The
# commented-out entries preserve the remaining JS regexes for reference.
REGEXES = {
    "unlikelyCandidatesRe": re.compile(
        r"combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter",
        re.I,
    ),
    "okMaybeItsACandidateRe": re.compile(r"and|article|body|column|main|shadow", re.I),
    "positiveRe": re.compile(
        r"article|body|content|entry|hentry|main|page|pagination|post|text|blog|story",
        re.I,
    ),
    "negativeRe": re.compile(
        r"combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget",
        re.I,
    ),
    "divToPElementsRe": re.compile(
        r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)", re.I
    ),
    #'replaceBrsRe': re.compile(r'(<br[^>]*>[ \n\r\t]*){2,}',re.I),
    #'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I),
    #'trimRe': re.compile(r'^\s+|\s+$/'),
    #'normalizeRe': re.compile(r'\s{2,}/'),
    #'killBreaksRe': re.compile(r'(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
    "videoRe": re.compile(r"https?:\/\/(www\.)?(youtube|vimeo)\.com", re.I),
    # skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
}
|
||||
|
||||
|
||||
class Unparseable(ValueError):
    # Raised when the input HTML cannot be processed into a document.
    pass
|
||||
|
||||
|
||||
def to_int(x):
    """Parse a CSS-ish size string ("12", "12px", "2em") into an int.

    "px" values are taken verbatim and "em" values are multiplied by 12
    (a nominal em-to-pixel factor).  Falsy input yields ``None``.
    """
    if not x:
        return None
    value = x.strip()
    suffix = value[-2:]
    if suffix == "px":
        return int(value[:-2])
    if suffix == "em":
        return 12 * int(value[:-2])
    return int(value)
|
||||
|
||||
|
||||
def clean(text):
    """Normalise whitespace in *text* and strip surrounding space."""
    # Huge runs of whitespace can make the regexes below pathologically
    # slow, so cap any run at 255 characters first.
    collapsed = re.sub(r"\s{255,}", " " * 255, text)
    # Squash whitespace around newlines down to a single newline.
    collapsed = re.sub(r"\s*\n\s*", "\n", collapsed)
    # Tabs and multi-space/tab runs become one space.
    collapsed = re.sub(r"\t|[ \t]{2,}", " ", collapsed)
    return collapsed.strip()
|
||||
|
||||
|
||||
def text_length(i):
    """Length of element *i*'s text content after whitespace normalisation."""
    content = i.text_content() or ""
    return len(clean(content))
|
||||
|
||||
|
||||
def compile_pattern(elements):
    """Normalise *elements* into a compiled regular expression.

    Accepts an already-compiled pattern (returned unchanged), a
    comma-separated string or bytes, or a list/tuple of literal strings,
    and returns a regex matching any of the literals.  Returns ``None``
    for falsy input; raises for any other type.
    """
    if not elements:
        return None
    # BUG FIX: ``re._pattern_type`` was removed in Python 3.7, so the old
    # isinstance check raised AttributeError there.  Derive the compiled
    # pattern class portably (``re.Pattern`` exists on 3.8+).
    pattern_type = getattr(re, "Pattern", None) or type(re.compile(u""))
    if isinstance(elements, pattern_type):
        return elements
    if isinstance(elements, (str_, bytes_)):
        if isinstance(elements, bytes_):
            elements = str_(elements, "utf-8")
        elements = elements.split(u",")
    if isinstance(elements, (list, tuple)):
        return re.compile(u"|".join([re.escape(x.strip()) for x in elements]), re.U)
    raise Exception("Unknown type for the pattern: {}".format(type(elements)))
|
||||
|
||||
|
||||
class Document:
|
||||
"""Class to build a etree document out of html."""
|
||||
|
||||
def __init__(
    self,
    input,
    positive_keywords=None,
    negative_keywords=None,
    url=None,
    min_text_length=25,
    retry_length=250,
    xpath=False,
    handle_failures="discard",
):
    """Generate the document

    :param input: string of the html content.
    :param positive_keywords: regex, list or comma-separated string of patterns in classes and ids
    :param negative_keywords: regex, list or comma-separated string in classes and ids
    :param url: base URL used to make links absolute during parsing
    :param min_text_length: Tunable. Set to a higher value for more precise detection of longer texts.
    :param retry_length: Tunable. Set to a lower value for better detection of very small texts.
    :param xpath: If set to True, adds x="..." attribute to each HTML node,
    containing xpath path pointing to original document path (allows to
    reconstruct selected summary in original document).
    :param handle_failures: Parameter passed to `lxml` for handling failure during exception.
    Support options = ["discard", "ignore", None]

    Examples:
        positive_keywords=["news-item", "block"]
        positive_keywords=["news-item, block"]
        positive_keywords=re.compile("news|block")
        negative_keywords=["mysidebar", "related", "ads"]

    The Document class is not re-enterable.
    It is designed to create a new Document() for each HTML file to process it.

    API methods:
    .title() -- full title
    .short_title() -- cleaned up title
    .content() -- full content
    .summary() -- cleaned up content
    """
    self.input = input  # raw HTML as supplied by the caller
    self.html = None  # parsed lxml tree, built lazily by _html()
    self.encoding = None  # detected input encoding, set by _parse()
    # Keyword arguments are normalised to compiled regexes (or None).
    self.positive_keywords = compile_pattern(positive_keywords)
    self.negative_keywords = compile_pattern(negative_keywords)
    self.url = url
    self.min_text_length = min_text_length
    self.retry_length = retry_length
    self.xpath = xpath
    self.handle_failures = handle_failures
|
||||
|
||||
def _html(self, force=False):
    """Return the parsed tree, building it lazily (rebuilding when *force*)."""
    if self.html is None or force:
        self.html = self._parse(self.input)
        if self.xpath:
            # Tag every node with its original xpath so a summary can be
            # traced back to the source document.
            tree = self.html.getroottree()
            for node in self.html.getiterator():
                node.attrib["x"] = tree.getpath(node)
    return self.html
|
||||
|
||||
def _parse(self, input):
    """Parse *input* into a cleaned lxml tree with absolute links.

    Sets ``self.encoding`` as a side effect of build_doc().
    """
    doc, self.encoding = build_doc(input)
    doc = html_cleaner.clean_html(doc)
    base_href = self.url
    if base_href:
        # trying to guard against bad links like <a href="http://[http://...">
        try:
            # handle_failures support was added in lxml 3.3.0
            doc.make_links_absolute(
                base_href,
                resolve_base_href=True,
                handle_failures=self.handle_failures,
            )
        except TypeError:  # make_links_absolute() got an unexpected keyword argument 'handle_failures'
            # lxml < 3.3.0: make_links_absolute() does not accept the
            # handle_failures keyword.  BUG FIX: the old fallback repeated
            # the failing call verbatim (same kwarg), so it could never
            # succeed; retry without the keyword instead.
            doc.make_links_absolute(base_href, resolve_base_href=True)
    else:
        doc.resolve_base_href(handle_failures=self.handle_failures)
    return doc
|
||||
|
||||
def content(self):
    """Returns document body"""
    # force=True: reparse from the raw input so earlier mutations don't leak in
    return get_body(self._html(True))
|
||||
|
||||
def title(self):
    """Returns document title"""
    # force=True: reparse from the raw input so earlier mutations don't leak in
    return get_title(self._html(True))
|
||||
|
||||
def short_title(self):
    """Returns cleaned up document title"""
    # force=True: reparse from the raw input so earlier mutations don't leak in
    return shorten_title(self._html(True))
|
||||
|
||||
def get_clean_html(self):
    """
    An internal method, which can be overridden in subclasses, for example,
    to disable or to improve DOM-to-text conversion in .summary() method
    """
    # Serialize the current tree and strip nuisance attributes (cleaners).
    return clean_attributes(tounicode(self.html, method="html"))
|
||||
|
||||
def summary(self, html_partial=False):
    """
    Given a HTML file, extracts the text of the article.

    :param html_partial: return only the div of the document, don't wrap
                         in html and body tags.

    Warning: It mutates internal DOM representation of the HTML document,
    so it is better to call other API methods before this one.
    """
    try:
        # First pass is "ruthless" (aggressively removes unlikely nodes);
        # if that strips too much content we retry once in lenient mode.
        ruthless = True
        while True:
            self._html(True)
            for i in self.tags(self.html, "script", "style"):
                i.drop_tree()
            for i in self.tags(self.html, "body"):
                i.set("id", "readabilityBody")
            if ruthless:
                self.remove_unlikely_candidates()
            self.transform_misused_divs_into_paragraphs()
            candidates = self.score_paragraphs()

            best_candidate = self.select_best_candidate(candidates)

            if best_candidate:
                article = self.get_article(
                    candidates, best_candidate, html_partial=html_partial
                )
            else:
                if ruthless:
                    log.info("ruthless removal did not work. ")
                    ruthless = False
                    log.debug(
                        (
                            "ended up stripping too much - "
                            "going for a safer _parse"
                        )
                    )
                    # try again
                    continue
                else:
                    log.debug(
                        (
                            "Ruthless and lenient parsing did not work. "
                            "Returning raw html"
                        )
                    )
                    # Last resort: the whole <body> (or whole tree).
                    article = self.html.find("body")
                    if article is None:
                        article = self.html
            cleaned_article = self.sanitize(article, candidates)

            article_length = len(cleaned_article or "")
            retry_length = self.retry_length
            of_acceptable_length = article_length >= retry_length
            if ruthless and not of_acceptable_length:
                # Ruthless mode produced too little text: retry leniently.
                ruthless = False
                # Loop through and try again.
                continue
            else:
                return cleaned_article
    except Exception as e:
        log.exception("error getting summary: ")
        # Re-raise as Unparseable, preserving the original traceback on
        # both Python 2 and Python 3.
        if sys.version_info[0] == 2:
            from .compat.two import raise_with_traceback
        else:
            from .compat.three import raise_with_traceback
        raise_with_traceback(Unparseable, sys.exc_info()[2], str_(e))
|
||||
|
||||
def get_article(self, candidates, best_candidate, html_partial=False):
    """Assemble the output article from the best candidate and its siblings."""
    # Now that we have the top candidate, look through its siblings for
    # content that might also be related.
    # Things like preambles, content split by ads that we removed, etc.
    sibling_score_threshold = max([10, best_candidate["content_score"] * 0.2])
    # create a new html document with a html->body->div
    if html_partial:
        output = fragment_fromstring("<div/>")
    else:
        output = document_fromstring("<div/>")
    best_elem = best_candidate["elem"]
    parent = best_elem.getparent()
    siblings = parent.getchildren() if parent is not None else [best_elem]
    for sibling in siblings:
        # in lxml there no concept of simple text
        # if isinstance(sibling, NavigableString): continue
        append = False
        if sibling is best_elem:
            append = True
        sibling_key = sibling  # HashableElement(sibling)
        # Keep any sibling that scored within 20% of the winner (min 10).
        if (
            sibling_key in candidates
            and candidates[sibling_key]["content_score"] >= sibling_score_threshold
        ):
            append = True

        # Paragraphs are kept when they are long and link-light, or short
        # but sentence-like (end in ". ") with no links at all.
        if sibling.tag == "p":
            link_density = self.get_link_density(sibling)
            node_content = sibling.text or ""
            node_length = len(node_content)

            if node_length > 80 and link_density < 0.25:
                append = True
            elif (
                node_length <= 80
                and link_density == 0
                and re.search(r"\.( |$)", node_content)
            ):
                append = True

        if append:
            # We don't want to append directly to output, but the div
            # in html->body->div
            if html_partial:
                output.append(sibling)
            else:
                output.getchildren()[0].getchildren()[0].append(sibling)
    # if output is not None:
    #    output.append(best_elem)
    return output
|
||||
|
||||
def select_best_candidate(self, candidates):
    """Return the highest-scoring candidate dict, or None when empty."""
    if not candidates:
        return None

    ranked = sorted(
        candidates.values(), key=lambda c: c["content_score"], reverse=True
    )
    # Log the leaders so scoring decisions are traceable in debug output.
    for contender in ranked[:5]:
        elem = contender["elem"]
        log.debug("Top 5 : %6.3f %s" % (contender["content_score"], describe(elem)))

    return ranked[0]
|
||||
|
||||
def get_link_density(self, elem):
    """Fraction (0..1) of *elem*'s text that sits inside <a> descendants."""
    anchor_text = 0
    for anchor in elem.findall(".//a"):
        anchor_text += text_length(anchor)
    total = text_length(elem)
    # max(total, 1) guards against division by zero on empty elements.
    return float(anchor_text) / max(total, 1)
|
||||
|
||||
def score_paragraphs(self):
    """Score every <p>/<pre>/<td> and accumulate onto parents/grandparents.

    Returns a dict mapping candidate elements to
    ``{"content_score": float, "elem": element}``.
    """
    MIN_LEN = self.min_text_length
    candidates = {}
    ordered = []
    for elem in self.tags(self._html(), "p", "pre", "td"):
        parent_node = elem.getparent()
        if parent_node is None:
            continue
        grand_parent_node = parent_node.getparent()

        inner_text = clean(elem.text_content() or "")
        inner_text_len = len(inner_text)

        # If this paragraph is less than 25 characters
        # don't even count it.
        if inner_text_len < MIN_LEN:
            continue

        if parent_node not in candidates:
            candidates[parent_node] = self.score_node(parent_node)
            ordered.append(parent_node)

        if grand_parent_node is not None and grand_parent_node not in candidates:
            candidates[grand_parent_node] = self.score_node(grand_parent_node)
            ordered.append(grand_parent_node)

        # Base score: 1 point, plus one per comma-separated chunk, plus up
        # to 3 points for length (one per 100 characters).
        content_score = 1
        content_score += len(inner_text.split(","))
        content_score += min((inner_text_len / 100), 3)
        # if elem not in candidates:
        #    candidates[elem] = self.score_node(elem)

        # WTF? candidates[elem]['content_score'] += content_score
        candidates[parent_node]["content_score"] += content_score
        if grand_parent_node is not None:
            # Grandparents receive half credit.
            candidates[grand_parent_node]["content_score"] += content_score / 2.0

    # Scale the final candidates score based on link density. Good content
    # should have a relatively small link density (5% or less) and be
    # mostly unaffected by this operation.
    for elem in ordered:
        candidate = candidates[elem]
        ld = self.get_link_density(elem)
        score = candidate["content_score"]
        log.debug(
            "Branch %6.3f %s link density %.3f -> %6.3f"
            % (score, describe(elem), ld, score * (1 - ld))
        )
        candidate["content_score"] *= 1 - ld

    return candidates
|
||||
|
||||
def class_weight(self, e):
    """Score element *e* by its class/id strings and tag-keyword matches.

    Each matching rule adds or subtracts 25 points.
    """
    weight = 0
    for feature in (e.get("class", None), e.get("id", None)):
        if not feature:
            continue
        if REGEXES["negativeRe"].search(feature):
            weight -= 25
        if REGEXES["positiveRe"].search(feature):
            weight += 25
        if self.positive_keywords and self.positive_keywords.search(feature):
            weight += 25
        if self.negative_keywords and self.negative_keywords.search(feature):
            weight -= 25

    # Tag names are matched against user keywords via a synthetic
    # "tag-<name>" token.
    if self.positive_keywords and self.positive_keywords.match("tag-" + e.tag):
        weight += 25
    if self.negative_keywords and self.negative_keywords.match("tag-" + e.tag):
        weight -= 25

    return weight
|
||||
|
||||
def score_node(self, elem):
    """Create the initial candidate entry for *elem*.

    The base score is the class weight adjusted by a per-tag bonus or
    penalty (content containers up, lists/headers/navigation down).
    """
    score = self.class_weight(elem)
    tag = elem.tag.lower()
    if tag in ("div", "article"):
        score += 5
    elif tag in ("pre", "td", "blockquote"):
        score += 3
    elif tag in ("address", "ol", "ul", "dl", "dd", "dt", "li", "form", "aside"):
        score -= 3
    elif tag in ("h1", "h2", "h3", "h4", "h5", "h6", "th", "header", "footer", "nav"):
        score -= 5
    return {"content_score": score, "elem": elem}
|
||||
|
||||
def remove_unlikely_candidates(self):
    """Drop elements whose class/id looks like boilerplate (nav, ads, ...)."""
    for elem in self.html.findall(".//*"):
        s = "%s %s" % (elem.get("class", ""), elem.get("id", ""))
        if len(s) < 2:
            continue
        # Remove if it matches the "unlikely" list, unless it also matches
        # the "maybe a candidate" list; never drop <html>/<body> themselves.
        if (
            REGEXES["unlikelyCandidatesRe"].search(s)
            and (not REGEXES["okMaybeItsACandidateRe"].search(s))
            and elem.tag not in ["html", "body"]
        ):
            log.debug("Removing unlikely candidate - %s" % describe(elem))
            elem.drop_tree()
|
||||
|
||||
def transform_misused_divs_into_paragraphs(self):
    """Turn <div>s that are used as paragraphs into real <p> elements."""
    for elem in self.tags(self.html, "div"):
        # transform <div>s that do not contain other block elements into
        # <p>s
        # FIXME: The current implementation ignores all descendants that
        # are not direct children of elem
        # This results in incorrect results in case there is an <img>
        # buried within an <a> for example
        if not REGEXES["divToPElementsRe"].search(
            str_(b"".join(map(tostring_, list(elem))))
        ):
            # log.debug("Altering %s to p" % (describe(elem)))
            elem.tag = "p"

    for elem in self.tags(self.html, "div"):
        # Wrap the div's own leading text in a <p> of its own...
        if elem.text and elem.text.strip():
            p = fragment_fromstring("<p/>")
            p.text = elem.text
            elem.text = None
            elem.insert(0, p)

        # ...and likewise for the text trailing each child.  Walk
        # backwards so earlier insert positions stay valid.
        for pos, child in reversed(list(enumerate(elem))):
            if child.tail and child.tail.strip():
                p = fragment_fromstring("<p/>")
                p.text = child.tail
                child.tail = None
                elem.insert(pos + 1, p)
            if child.tag == "br":
                # <br>s are redundant once tail text became paragraphs.
                child.drop_tree()
|
||||
|
||||
def tags(self, node, *tag_names):
    """Yield descendants of *node* matching each tag in *tag_names*.

    Results come grouped per tag name, in document order within a tag.
    """
    for name in tag_names:
        for match in node.findall(".//%s" % name):
            yield match
|
||||
|
||||
def reverse_tags(self, node, *tag_names):
    """Yield descendants of *node* per tag in *tag_names*, deepest-last
    document order reversed within each tag."""
    for name in tag_names:
        matches = node.findall(".//%s" % name)
        for match in reversed(matches):
            yield match
|
||||
|
||||
def sanitize(self, node, candidates):
    """Strip low-value elements from *node* and return cleaned HTML text.

    *candidates* is the score map produced by score_paragraphs(); it is
    consulted when deciding whether a container is worth keeping.
    """
    MIN_LEN = self.min_text_length
    # Headers that score negatively or are mostly links are dropped.
    for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
        if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
            header.drop_tree()

    for elem in self.tags(node, "form", "textarea"):
        elem.drop_tree()

    # Keep only video iframes (youtube/vimeo, per REGEXES["videoRe"]).
    for elem in self.tags(node, "iframe"):
        if "src" in elem.attrib and REGEXES["videoRe"].search(elem.attrib["src"]):
            elem.text = "VIDEO"  # ADD content to iframe text node to force <iframe></iframe> proper output
        else:
            elem.drop_tree()

    allowed = {}
    # Conditionally clean <table>s, <ul>s, and <div>s
    for el in self.reverse_tags(
        node, "table", "ul", "div", "aside", "header", "footer", "section"
    ):
        if el in allowed:
            continue
        weight = self.class_weight(el)
        if el in candidates:
            content_score = candidates[el]["content_score"]
        else:
            content_score = 0
        tag = el.tag

        if weight + content_score < 0:
            log.debug(
                "Removed %s with score %6.3f and weight %-3s"
                % (describe(el), content_score, weight,)
            )
            el.drop_tree()
        elif el.text_content().count(",") < 10:
            # Few commas: probably not prose, so apply heuristics below.
            counts = {}
            for kind in ["p", "img", "li", "a", "embed", "input"]:
                counts[kind] = len(el.findall(".//%s" % kind))
            # Large negative offset: only overwhelmingly list-heavy
            # containers can trip the "more <li>s than <p>s" rule.
            counts["li"] -= 100
            counts["input"] -= len(el.findall('.//input[@type="hidden"]'))

            # Count the text length excluding any surrounding whitespace
            content_length = text_length(el)
            link_density = self.get_link_density(el)
            parent_node = el.getparent()
            # Re-read the score from the parent candidate when available.
            if parent_node is not None:
                if parent_node in candidates:
                    content_score = candidates[parent_node]["content_score"]
                else:
                    content_score = 0
            to_remove = False
            reason = ""

            if counts["p"] and counts["img"] > 1 + counts["p"] * 1.3:
                reason = "too many images (%s)" % counts["img"]
                to_remove = True
            elif counts["li"] > counts["p"] and tag not in ("ol", "ul"):
                reason = "more <li>s than <p>s"
                to_remove = True
            elif counts["input"] > (counts["p"] / 3):
                reason = "less than 3x <p>s than <input>s"
                to_remove = True
            elif content_length < MIN_LEN and counts["img"] == 0:
                reason = (
                    "too short content length %s without a single image"
                    % content_length
                )
                to_remove = True
            elif content_length < MIN_LEN and counts["img"] > 2:
                reason = (
                    "too short content length %s and too many images"
                    % content_length
                )
                to_remove = True
            elif weight < 25 and link_density > 0.2:
                reason = "too many links %.3f for its weight %s" % (
                    link_density,
                    weight,
                )
                to_remove = True
            elif weight >= 25 and link_density > 0.5:
                reason = "too many links %.3f for its weight %s" % (
                    link_density,
                    weight,
                )
                to_remove = True
            elif (counts["embed"] == 1 and content_length < 75) or counts[
                "embed"
            ] > 1:
                reason = (
                    "<embed>s with too short content length, or too many <embed>s"
                )
                to_remove = True
            elif not content_length:
                reason = "no content"
                to_remove = True

            # find x non empty preceding and succeeding siblings
            i, j = 0, 0
            x = 1
            siblings = []
            for sib in el.itersiblings():
                sib_content_length = text_length(sib)
                if sib_content_length:
                    # NOTE(review): "i = +1" assigns 1, it does not
                    # increment.  With x == 1 the loop still stops after
                    # the first non-empty sibling, so behavior is
                    # unchanged, but this was presumably meant to be
                    # "i += 1" -- confirm before changing x.
                    i = +1
                    siblings.append(sib_content_length)
                    if i == x:
                        break
            for sib in el.itersiblings(preceding=True):
                sib_content_length = text_length(sib)
                if sib_content_length:
                    j = +1  # NOTE(review): same "=+" vs "+=" quirk as above
                    siblings.append(sib_content_length)
                    if j == x:
                        break
            # Containers framed by substantial sibling text are spared,
            # along with their own sub-containers.
            if siblings and sum(siblings) > 1000:
                to_remove = False
                log.debug("Allowing %s" % describe(el))
                for desnode in self.tags(el, "table", "ul", "div", "section"):
                    allowed[desnode] = True

            if to_remove:
                log.debug(
                    "Removed %6.3f %s with weight %s cause it has %s."
                    % (content_score, describe(el), weight, reason)
                )
                el.drop_tree()
            else:
                log.debug(
                    "Not removing %s of length %s: %s"
                    % (describe(el), content_length, text_content(el))
                )

    self.html = node
    return self.get_clean_html()
|
||||
|
||||
|
||||
def main():
    """CLI entry point: print (or open in a browser) the readable version
    of a local file or a URL."""
    VERBOSITY = {1: logging.WARNING, 2: logging.INFO, 3: logging.DEBUG}

    from optparse import OptionParser

    parser = OptionParser(usage="%prog: [options] [file]")
    parser.add_option("-v", "--verbose", action="count", default=0)
    parser.add_option(
        "-b", "--browser", default=None, action="store_true", help="open in browser"
    )
    parser.add_option(
        "-l", "--log", default=None, help="save logs into file (appended)"
    )
    parser.add_option(
        "-u", "--url", default=None, help="use URL instead of a local file"
    )
    parser.add_option("-x", "--xpath", default=None, help="add original xpath")
    parser.add_option(
        "-p",
        "--positive-keywords",
        default=None,
        help="positive keywords (comma-separated)",
        action="store",
    )
    parser.add_option(
        "-n",
        "--negative-keywords",
        default=None,
        help="negative keywords (comma-separated)",
        action="store",
    )
    (options, args) = parser.parse_args()

    if options.verbose:
        logging.basicConfig(
            # BUG FIX: clamp the -v count so four or more flags don't
            # raise KeyError; 3+ means DEBUG.
            level=VERBOSITY[min(options.verbose, 3)],
            filename=options.log,
            format="%(asctime)s: %(levelname)s: %(message)s (at %(filename)s: %(lineno)d)",
        )

    # Exactly one positional file, or a --url, is required.
    if not (len(args) == 1 or options.url):
        parser.print_help()
        sys.exit(1)

    file = None
    if options.url:
        headers = {"User-Agent": "Mozilla/5.0"}
        if sys.version_info[0] == 3:
            import urllib.request, urllib.parse, urllib.error

            request = urllib.request.Request(options.url, None, headers)
            file = urllib.request.urlopen(request)
        else:
            import urllib2

            request = urllib2.Request(options.url, None, headers)
            file = urllib2.urlopen(request)
    else:
        file = open(args[0], "rt")
    try:
        doc = Document(
            file.read(),
            url=options.url,
            positive_keywords=options.positive_keywords,
            negative_keywords=options.negative_keywords,
        )
        if options.browser:
            from .browser import open_in_browser

            result = "<h2>" + doc.short_title() + "</h2><br/>" + doc.summary()
            open_in_browser(result)
        else:
            enc = (
                sys.__stdout__.encoding or "utf-8"
            )  # XXX: this hack could not always work, better to set PYTHONIOENCODING
            result = "Title:" + doc.short_title() + "\n" + doc.summary()
            if sys.version_info[0] == 3:
                print(result)
            else:
                print(result.encode(enc, "replace"))
    finally:
        file.close()
|
||||
|
||||
|
||||
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|
@ -1,6 +0,0 @@
|
||||
lxml
|
||||
chardet
|
||||
nose
|
||||
pep8
|
||||
coverage
|
||||
timeout_decorator
|
@ -1 +0,0 @@
|
||||
-e .
|
@ -1,78 +1,45 @@
|
||||
#!/usr/bin/env python
|
||||
from setuptools import setup, find_packages
|
||||
|
||||
from __future__ import print_function
|
||||
import codecs
|
||||
import os
|
||||
import re
|
||||
from setuptools import setup
|
||||
import sys
|
||||
|
||||
lxml_requirement = "lxml"
|
||||
if sys.platform == "darwin":
|
||||
import platform
|
||||
|
||||
mac_ver = platform.mac_ver()[0]
|
||||
mac_major, mac_minor = mac_ver.split('.')[:2]
|
||||
if int(mac_major) == 10 and int(mac_minor) < 9:
|
||||
print("Using lxml<2.4")
|
||||
lxml_requirement = "lxml<2.4"
|
||||
|
||||
test_deps = [
|
||||
# Test timeouts
|
||||
"timeout_decorator",
|
||||
version = "0.3.0"
|
||||
install_requires = [
|
||||
"chardet",
|
||||
"lxml",
|
||||
]
|
||||
tests_require = [
|
||||
'coverage',
|
||||
'nose',
|
||||
'pep8',
|
||||
'PyYaml',
|
||||
]
|
||||
|
||||
extras = {
|
||||
"test": test_deps,
|
||||
}
|
||||
|
||||
# Adapted from https://github.com/pypa/pip/blob/master/setup.py
|
||||
def find_version(*file_paths):
|
||||
here = os.path.abspath(os.path.dirname(__file__))
|
||||
|
||||
# Intentionally *not* adding an encoding option to open, See:
|
||||
# https://github.com/pypa/virtualenv/issues/201#issuecomment-3145690
|
||||
with codecs.open(os.path.join(here, *file_paths), "r") as fp:
|
||||
version_file = fp.read()
|
||||
version_match = re.search(
|
||||
r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M,
|
||||
)
|
||||
if version_match:
|
||||
return version_match.group(1)
|
||||
|
||||
raise RuntimeError("Unable to find version string.")
|
||||
|
||||
|
||||
setup(
|
||||
name="readability-lxml",
|
||||
version=find_version("readability", "__init__.py"),
|
||||
version=version,
|
||||
author="Yuri Baburov",
|
||||
author_email="burchik@gmail.com",
|
||||
description="fast html to text parser (article readability tool) with python 3 support",
|
||||
test_suite="tests.test_article_only",
|
||||
description="fast python port of arc90's readability tool",
|
||||
keywords='readable read parse html document readability',
|
||||
long_description=open("README.rst").read(),
|
||||
long_description_content_type='text/x-rst',
|
||||
license="Apache License 2.0",
|
||||
url="http://github.com/buriy/python-readability",
|
||||
packages=["readability", "readability.compat"],
|
||||
install_requires=["chardet", lxml_requirement, "cssselect"],
|
||||
tests_require=test_deps,
|
||||
extras_require=extras,
|
||||
classifiers=[
|
||||
"Environment :: Web Environment",
|
||||
"Intended Audience :: Developers",
|
||||
"Operating System :: OS Independent",
|
||||
"Topic :: Text Processing :: Indexing",
|
||||
"Topic :: Utilities",
|
||||
"Topic :: Internet",
|
||||
"Topic :: Software Development :: Libraries :: Python Modules",
|
||||
"Programming Language :: Python",
|
||||
"Programming Language :: Python :: 2",
|
||||
"Programming Language :: Python :: 2.7",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3.5",
|
||||
"Programming Language :: Python :: 3.6",
|
||||
"Programming Language :: Python :: 3.7",
|
||||
"Programming Language :: Python :: 3.8",
|
||||
],
|
||||
url="http://github.com/buriy/python-readability",
|
||||
packages=find_packages('src', exclude=["*.tests", "*.tests.*"]),
|
||||
package_dir = {'': 'src'},
|
||||
include_package_data=True,
|
||||
zip_safe=False,
|
||||
install_requires=install_requires,
|
||||
tests_require=tests_require,
|
||||
extras_require={'test': tests_require},
|
||||
test_suite = "nose.collector",
|
||||
entry_points={
|
||||
'console_scripts':
|
||||
['readability=readability_lxml:client.main']
|
||||
},
|
||||
)
|
||||
|
@ -0,0 +1,3 @@
|
||||
VERSION = '0.2.5'
|
||||
|
||||
import client
|
@ -0,0 +1,38 @@
|
||||
# strip out a set of nuisance html attributes that can mess up rendering in
|
||||
# RSS feeds
|
||||
import re
|
||||
from lxml.html.clean import Cleaner
|
||||
|
||||
bad_attrs = ['width', 'height', 'style', '[-a-z]*color',
|
||||
'background[-a-z]*', 'on*']
|
||||
single_quoted = "'[^']+'"
|
||||
double_quoted = '"[^"]+"'
|
||||
non_space = '[^ "\'>]+'
|
||||
htmlstrip = re.compile("<" # open
|
||||
"([^>]+) " # prefix
|
||||
"(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
|
||||
'= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
|
||||
"([^>]*)" # postfix
|
||||
">", # end
|
||||
re.I)
|
||||
|
||||
|
||||
def clean_attributes(html):
    """Repeatedly strip blacklisted attributes (htmlstrip) from *html* text.

    Loops until no match remains, since each substitution pass can expose
    further matches in the rewritten tags.
    """
    while htmlstrip.search(html):
        html = htmlstrip.sub('<\\1\\2>', html)
    return html
|
||||
|
||||
|
||||
def normalize_spaces(s):
    """replace any sequence of whitespace characters with a single space"""
    # Falsy input (None, empty string) normalises to the empty string.
    return ' '.join(s.split()) if s else ''
|
||||
|
||||
|
||||
html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
|
||||
style=True, links=True, meta=False, add_nofollow=False,
|
||||
page_structure=False, processing_instructions=True,
|
||||
embedded=False, frames=False, forms=False,
|
||||
annoying_tags=False, remove_tags=None,
|
||||
remove_unknown_tags=False, safe_attrs_only=False)
|
@ -0,0 +1,69 @@
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
from readability_lxml import VERSION
|
||||
from readability_lxml.readability import Document
|
||||
|
||||
|
||||
def parse_args():
    """Build and evaluate the command line for the readability client."""
    desc = "fast python port of arc90's readability tool"
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('--version',
        action='version', version=VERSION)

    parser.add_argument('-v', '--verbose',
        action='store_true',
        default=False,
        help='Increase logging verbosity to DEBUG.')

    parser.add_argument('-m', '--metadata',
        action='store_true',
        default=False,
        help='print all metadata as well as content for the content')

    # Single positional argument: either a URL or a local file path.
    parser.add_argument('path', metavar='P', type=str, nargs=1,
        help="The url or file path to process in readable form.")

    args = parser.parse_args()
    return args
|
||||
|
||||
|
||||
def main():
    # Entry point: print the readable version of the given file or URL.
    # NOTE(review): Python 2 only -- uses print statements and
    # urllib.urlopen; do not expect this residue module to run on Python 3.
    args = parse_args()

    target = args.path[0]

    # Crude URL detection: anything starting with http/www is fetched.
    if target.startswith('http') or target.startswith('www'):
        is_url = True
        url = target
    else:
        is_url = False
        url = None

    if is_url:
        import urllib
        target = urllib.urlopen(target)
    else:
        target = open(target, 'rt')

    enc = sys.__stdout__.encoding or 'utf-8'

    try:
        doc = Document(target.read(),
            debug=args.verbose,
            url=url)
        if args.metadata:
            m = doc.summary_with_metadata()
            print m.title()
            print m.short_title()
            print m.confidence
            print m.html.encode(enc, 'replace')
        else:
            print doc.summary().encode(enc, 'replace')

    finally:
        # Close whichever handle (urllib response or file) was opened.
        target.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
@ -0,0 +1,32 @@
|
||||
uids = {}
|
||||
|
||||
|
||||
def save_to_file(text, filename):
    # Write *text* to *filename* as UTF-8 HTML preceded by a charset meta tag.
    # NOTE(review): Python 2 style -- writes encoded bytes to a text-mode file.
    f = open(filename, 'wt')
    f.write("""
<meta http-equiv="Content-Type"
    content="text/html; charset=UTF-8"
/>""")
    f.write(text.encode('utf-8'))
    f.close()
|
||||
|
||||
|
||||
def describe(node, depth=2):
    # Return a short CSS-selector-like label for *node*, used in log output.
    # Recurses up to *depth* ancestors, joining labels with ' - '.
    if not hasattr(node, 'tag'):
        return "[%s]" % type(node)
    name = node.tag
    if node.get('id', ''):
        name += '#' + node.get('id')
    if node.get('class', ''):
        name += '.' + node.get('class').replace(' ', '.')
    # Drop the redundant "div" prefix when an id/class already identifies it.
    if name[:4] in ['div#', 'div.']:
        name = name[3:]
    # Generic tags get a stable per-node number (module-level `uids` cache)
    # so repeated log lines are distinguishable.
    if name in ['tr', 'td', 'div', 'p']:
        if not node in uids:
            uid = uids[node] = len(uids) + 1
        else:
            uid = uids.get(node)
        name += "%02d" % (uid)
    if depth and node.getparent() is not None:
        return name + ' - ' + describe(node.getparent(), depth - 1)
    return name
|
@ -0,0 +1,27 @@
|
||||
import logging
|
||||
import re
|
||||
import chardet
|
||||
|
||||
|
||||
LOG = logging.getLogger()
|
||||
|
||||
|
||||
def get_encoding(page):
    """Guess the character encoding of raw HTML byte string *page*.

    Tags are stripped first so detection runs on text content only.
    If the text survives a utf-8 decode/encode round trip nearly
    unchanged, utf-8 is assumed; otherwise chardet makes the call.
    Returns the encoding name (defaults to 'utf-8' when there is too
    little text to judge).
    """
    # Fix: raw string -- '\s' in a plain literal is an invalid escape.
    text = re.sub(r'</?[^>]*>\s*', ' ', page)
    enc = 'utf-8'
    if not text.strip() or len(text) < 10:
        return enc  # too little text to guess reliably
    try:
        diff = text.decode(enc, 'ignore').encode(enc)
        sizes = len(diff), len(text)
        # Length within 1% of the original: treat the page as utf-8.
        if abs(len(text) - len(diff)) < max(sizes) * 0.01:
            return enc
    except UnicodeDecodeError:
        pass
    res = chardet.detect(text)
    enc = res['encoding']
    # chardet frequently mislabels cp1251 pages as MacCyrillic.
    if enc == 'MacCyrillic':
        enc = 'cp1251'
    return enc
|
@ -0,0 +1,139 @@
|
||||
import logging
|
||||
import re
|
||||
|
||||
from lxml.html import document_fromstring
|
||||
from lxml.html import HTMLParser
|
||||
from lxml.html import tostring
|
||||
|
||||
from cleaners import clean_attributes
|
||||
from cleaners import normalize_spaces
|
||||
from encoding import get_encoding
|
||||
|
||||
# NOTE(review): raising the ROOT logger level at import time is a side
# effect on any application that imports this module -- confirm intended.
logging.getLogger().setLevel(logging.DEBUG)
# Shared parser instance; build_doc always feeds it utf-8-encoded bytes.
utf8_parser = HTMLParser(encoding='utf-8')


LOG = logging.getLogger()
|
||||
|
||||
|
||||
def build_doc(page):
    """Parse *page* (str or unicode HTML) into an lxml document.

    Bytes input is decoded using the detected encoding, then the text is
    re-encoded to utf-8 to match the shared utf8_parser. Returns '' (not
    a document) when page is None -- callers must tolerate that.
    """
    if page is None:
        LOG.error("Page content is None, can't build_doc")
        # NOTE(review): returns a str, not an lxml doc -- callers beware.
        return ''
    if isinstance(page, unicode):
        page_unicode = page
    else:
        enc = get_encoding(page)
        page_unicode = page.decode(enc, 'replace')
    # Re-encode so the byte stream matches the parser's declared encoding.
    doc = document_fromstring(
        page_unicode.encode('utf-8', 'replace'),
        parser=utf8_parser)
    return doc
|
||||
|
||||
|
||||
def js_re(src, pattern, flags, repl):
    """Apply a JavaScript-style regex replacement to *src*.

    JS '$n' group references in *repl* are translated to Python's '\\n'
    backreference syntax before substituting.

    Bug fix: the original passed the arguments to ``sub`` in the wrong
    order (subject as the replacement and vice versa), so it returned
    the pattern-substituted replacement string instead of *src*.
    """
    return re.compile(pattern, flags).sub(repl.replace('$', '\\'), src)
|
||||
|
||||
|
||||
def normalize_entities(cur_title):
    """Replace typographic dash / space / quote characters with ASCII."""
    replacements = {
        u'\u2014': '-',   # em dash
        u'\u2013': '-',   # en dash
        u'\u00A0': ' ',   # non-breaking space
        u'\u00AB': '"',   # left guillemet
        u'\u00BB': '"',   # right guillemet
        u'"': '"',
    }
    for char, ascii_char in replacements.items():
        cur_title = cur_title.replace(char, ascii_char)
    return cur_title
|
||||
|
||||
|
||||
def norm_title(title):
    """Collapse whitespace and ASCII-fold typographic chars in *title*."""
    return normalize_entities(normalize_spaces(title))
|
||||
|
||||
|
||||
def get_title(doc):
    """Return the normalized <title> text of *doc*, or '[no-title]'.

    Bug fix: lxml elements are falsy when they have no children, so the
    original ``if not title_node`` test wrongly rejected every normal
    (childless) <title> element -- compare against None explicitly.
    Also guards against a present but empty/None title text, which
    previously crashed norm_title.
    """
    title_node = doc.find('.//title')
    if title_node is None or not title_node.text:
        return '[no-title]'
    return norm_title(title_node.text)
|
||||
|
||||
|
||||
def add_match(collection, text, orig):
    """Add *text* to *collection* when it is a plausible title candidate.

    A candidate must have at least two words, be 15+ characters long,
    and (ignoring double quotes) occur verbatim inside *orig*.
    """
    text = norm_title(text)
    plausible = len(text) >= 15 and len(text.split()) >= 2
    if plausible and text.replace('"', '') in orig.replace('"', ''):
        collection.add(text)
|
||||
|
||||
|
||||
def shorten_title(doc):
    """Heuristically produce a short version of the document's title.

    Prefers the longest heading/candidate element whose text matches the
    <title>; otherwise strips site-name segments around common delimiters
    (' | ', ' - ', ' :: ', ' / ', ': '). Falls back to the full title
    when the shortened form is implausibly short or long. Returns ''
    when there is no usable title element.

    Bug fix: lxml elements are falsy when childless, so the original
    ``if not title_node`` rejected normal <title> elements; also guards
    against None title text, which previously crashed norm_title.
    """
    title_node = doc.find('.//title')
    if title_node is None or not title_node.text:
        return ''

    title = orig = norm_title(title_node.text)

    candidates = set()

    # Headings whose text matches the title are strong candidates.
    for item in ['.//h1', './/h2', './/h3']:
        for e in list(doc.iterfind(item)):
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

    # Common title-ish id/class hooks used around the web.
    for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title',
            '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
        for e in doc.cssselect(item):
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

    if candidates:
        # Longest matching candidate wins.
        title = sorted(candidates, key=len)[-1]
    else:
        # No in-page match: try to trim a site name off the <title>.
        for delimiter in [' | ', ' - ', ' :: ', ' / ']:
            if delimiter in title:
                parts = orig.split(delimiter)
                if len(parts[0].split()) >= 4:
                    title = parts[0]
                    break
                elif len(parts[-1].split()) >= 4:
                    title = parts[-1]
                    break
        else:
            if ': ' in title:
                parts = orig.split(': ')
                if len(parts[-1].split()) >= 4:
                    title = parts[-1]
                else:
                    title = orig.split(': ', 1)[1]

    # Implausible length: keep the full original title.
    if not 15 < len(title) < 150:
        return orig

    return title
|
||||
|
||||
|
||||
def get_body(doc):
    """Return the cleaned serialized <body> of *doc*.

    Falls back to serializing the whole document when there is no body
    element; drops script/link/style elements in place first.
    """
    # Side effect: mutates *doc* by removing non-content elements.
    [elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style')]
    raw_html = unicode(tostring(doc.body or doc))
    cleaned = clean_attributes(raw_html)
    try:
        #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
        # NOTE(review): with the validation call commented out, nothing in
        # this try block can raise -- the except branch is currently dead.
        return cleaned
    except Exception:  # FIXME find the equivalent lxml error
        logging.error("cleansing broke html content: %s\n---------\n%s" % (
            raw_html,
            cleaned))
        return raw_html
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,22 @@
|
||||
import urllib2
|
||||
|
||||
|
||||
class UrlFetch():
    """
    A class for fetching URLs. This provides a layer of abstraction that can
    be easily replaced for testing.
    """

    def urlread(self, url):
        """Fetch *url* and return the full response body as a string."""
        # Blocking network read; no timeout is set here.
        return urllib2.urlopen(url).read()
|
||||
|
||||
|
||||
class MockUrlFetch(UrlFetch):
    """UrlFetch stand-in that serves URL content from local files.

    Constructed with a mapping of url -> local file path; urlread()
    returns that file's contents instead of touching the network.
    Raises KeyError for URLs not present in the mapping.
    """

    def __init__(self, urldict):
        self._urldict = urldict

    def urlread(self, url):
        local_path = self._urldict[url]
        with open(local_path, 'r') as infile:
            return infile.read()
|
@ -0,0 +1,174 @@
|
||||
"""
|
||||
This program facilitates the creation of a regression test case as used by the
|
||||
test module. It uses the current readability algorithm to capture a benchmark
|
||||
and construct a new test case.
|
||||
|
||||
"""
|
||||
import argparse
|
||||
import errno
|
||||
import os
|
||||
import os.path
|
||||
import urllib2
|
||||
import yaml
|
||||
|
||||
from readability_lxml import readability
|
||||
from readability_lxml import urlfetch
|
||||
|
||||
from regression import (
|
||||
TEST_DATA_PATH,
|
||||
ORIGINAL_SUFFIX,
|
||||
READABLE_SUFFIX,
|
||||
YAML_EXTENSION,
|
||||
adjust_url_map,
|
||||
read_yaml
|
||||
)
|
||||
|
||||
|
||||
OVERWRITE_QUESTION = '%s exists; overwrite and continue (y/n)? '
|
||||
|
||||
|
||||
def y_or_n(question):
    """Prompt until the user types something; True iff it starts with y/Y."""
    while True:
        response = raw_input(question).strip()
        # Loop again on an empty answer rather than guessing a default.
        if len(response) > 0:
            return response[0] in ['y', 'Y']
|
||||
|
||||
|
||||
def write_file(test_name, suffix, data):
    """Write *data* to the test-data file <test_name><suffix>.

    Creates the file exclusively; if it already exists, asks the user
    whether to overwrite. Returns True when the file was written, False
    when the user declined.
    """
    path = os.path.join(TEST_DATA_PATH, test_name + suffix)
    mode = 0o644  # modern octal literal (valid on python 2.6+ and 3)
    try:
        # O_EXCL makes creation fail if the file already exists.
        fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_EXCL, mode)
    except OSError as e:
        if e.errno == errno.EEXIST:
            if y_or_n(OVERWRITE_QUESTION % path):
                fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, mode)
            else:
                return False
        else:
            # Bare raise preserves the original traceback ('raise e' loses it).
            raise
    # Bug fix: the descriptor was never closed; the context manager
    # guarantees the data is flushed and the fd released.
    with os.fdopen(fd, 'w') as f:
        f.write(data)
    return True
|
||||
|
||||
|
||||
def write_original(test_name, orig):
    """Persist the fetched original HTML for *test_name*; True on success."""
    return write_file(test_name, ORIGINAL_SUFFIX, orig)
|
||||
|
||||
|
||||
def write_readable(test_name, orig, options):
    """Run readability on *orig* and persist the summary as the benchmark.

    *options* is passed straight through to readability.Document.
    Returns True when the benchmark file was written.
    """
    rdbl_doc = readability.Document(orig, **options)
    summary = rdbl_doc.summary()
    return write_file(test_name, READABLE_SUFFIX, summary.html)
|
||||
|
||||
|
||||
def read_spec(test_name):
    """Load and return the YAML spec dict for *test_name*."""
    spec_path = os.path.join(TEST_DATA_PATH, test_name + YAML_EXTENSION)
    return read_yaml(spec_path)
|
||||
|
||||
def read_orig(test_name, url=None):
    """
    Reads the original HTML for a given test. If a url is provided, the HTML
    is fetched from it (and stored locally). Otherwise, we look for an
    existing local copy. Returns a pair: (HTML string, True iff the HTML has
    been or is already stored in a local copy).
    """
    if url:
        orig = urllib2.urlopen(url).read()
        write_result = write_file(test_name, ORIGINAL_SUFFIX, orig)
        return orig, write_result
    orig_path = os.path.join(
        TEST_DATA_PATH,
        test_name + ORIGINAL_SUFFIX
    )
    # Bug fix: the file handle was never closed.
    with open(orig_path) as f:
        return f.read(), True
|
||||
|
||||
def create(args):
    """Create a brand-new regression test from a live URL.

    Writes the YAML spec, the fetched original HTML, and the readable
    benchmark. Returns True on success, False when any write is declined.
    """
    # TODO: Make this work for multi-page articles.
    spec_dict = {'url': args.url, 'test_description': args.test_description}
    spec = yaml.dump(spec_dict, default_flow_style=False)
    if not write_file(args.test_name, YAML_EXTENSION, spec):
        return False
    # Bug fix: referenced an undefined name `url`; the URL lives on args.
    orig = urllib2.urlopen(args.url).read()
    if not write_original(args.test_name, orig):
        return False
    # Bug fix: write_readable() requires an options dict (third argument);
    # the call was previously a guaranteed TypeError.
    if not write_readable(args.test_name, orig, {'url': args.url}):
        return False
    return True
|
||||
|
||||
def genbench(args):
    """Regenerate the readable benchmark for an existing test.

    With --refetch the original HTML is re-downloaded from the spec's
    url; otherwise the stored local copy is used. Sub-resource fetches
    go through a MockUrlFetch built from the spec's url_map.
    Returns True on success.
    """
    spec_dict = read_spec(args.test_name)
    if args.refetch:
        url = spec_dict['url']
    else:
        url = None
    url_map = adjust_url_map(spec_dict.get('url_map', dict()))
    fetcher = urlfetch.MockUrlFetch(url_map)
    options = {'url': spec_dict['url'], 'urlfetch': fetcher}
    orig, success = read_orig(args.test_name, url)
    if not success:
        return False
    rdbl_doc = readability.Document(orig, **options)
    summary = rdbl_doc.summary()
    if not write_file(args.test_name, READABLE_SUFFIX, summary.html):
        return False
    return True
|
||||
|
||||
DESCRIPTION = 'Create a readability regression test case.'
|
||||
|
||||
def main():
    """Parse the command line ('create' / 'genbench' subcommands) and
    dispatch to the handler each subparser registered via set_defaults."""
    parser = argparse.ArgumentParser(description = DESCRIPTION)
    subparsers = parser.add_subparsers(help = 'available subcommands')

    parser_create = subparsers.add_parser(
        'create',
        help = 'create an entirely new test'
    )
    parser_create.add_argument(
        'url',
        metavar = 'url',
        help = 'the url for which to generate a test'
    )
    parser_create.add_argument(
        'test_name',
        metavar = 'test-name',
        help = 'the name of the test'
    )
    parser_create.add_argument(
        'test_description',
        metavar = 'test-description',
        help = 'the description of the test'
    )
    parser_create.set_defaults(func = create)

    parser_genbench = subparsers.add_parser(
        'genbench',
        help = 'regenerate the benchmark for an existing test'
    )
    parser_genbench.add_argument(
        'test_name',
        metavar = 'test-name',
        help = 'the name of the test'
    )
    parser_genbench.add_argument(
        '--refetch',
        dest = 'refetch',
        action = 'store_const',
        const = True,
        default = False,
        help = 'if set, original html is refetched from the url'
    )
    parser_genbench.set_defaults(func = genbench)

    args = parser.parse_args()
    # Handlers return False when any step was declined or failed.
    result = args.func(args)
    if not result:
        print('test was not fully generated')


if __name__ == '__main__':
    main()
|
@ -0,0 +1,15 @@
|
||||
import os
|
||||
|
||||
|
||||
# Directory of raw sample pages used by the unit tests.
SAMPLES = os.path.join(os.path.dirname(__file__), 'samples')
# Directory of captured data used by the regression suite.
REGRESSION_DATA = os.path.join(os.path.dirname(__file__), 'regression_test_data')
|
||||
|
||||
|
||||
def load_sample(filename):
    """Helper to get the content out of the sample files.

    Bug fix: the file handle was never closed; use a context manager.
    """
    with open(os.path.join(SAMPLES, filename)) as f:
        return f.read()
||||
|
||||
|
||||
def load_regression_data(filename):
    """Get the content of a test_data regression file.

    Bug fix: the file handle was never closed; use a context manager.
    """
    with open(os.path.join(REGRESSION_DATA, filename)) as f:
        return f.read()
|
@ -0,0 +1,354 @@
|
||||
"""
|
||||
This module provides a regression test for results of running the readability
|
||||
algorithm on a variety of different real-world examples. For each page in the
|
||||
test suite, a benchmark was captured that represents the current readability
|
||||
results. Note that these are not necessarily ideal results, just the ones used
|
||||
as a benchmark.
|
||||
|
||||
This allows you to tweak and change the readability algorithm and see how it
|
||||
changes existing results, hopefully for the better.
|
||||
|
||||
"""
|
||||
import logging
|
||||
import lxml.html
|
||||
import lxml.html.diff
|
||||
import os
|
||||
import os.path
|
||||
import re
|
||||
import sys
|
||||
import unittest
|
||||
import yaml
|
||||
|
||||
from lxml.html import builder as B
|
||||
from readability_lxml import readability
|
||||
from readability_lxml import urlfetch
|
||||
|
||||
|
||||
# File-name suffixes for the artifacts each regression test produces/uses.
DIFF_SUFFIX = '-diff.html'
ORIGINAL_SUFFIX = '-orig.html'
READABLE_SUFFIX = '-rdbl.html'
RESULT_SUFFIX = '-result.html'
YAML_EXTENSION = '.yaml'

# Test inputs and generated output locations, relative to this file.
TESTDIR = os.path.dirname(__file__)
TEST_DATA_PATH = os.path.join(TESTDIR, 'regression_test_data')
TEST_OUTPUT_PATH = os.path.join(TESTDIR, 'regression_test_output')
TEST_SUMMARY_PATH = os.path.join(TEST_OUTPUT_PATH, 'index.html')
|
||||
|
||||
SUMMARY_CSS = '''
|
||||
table, th, td {
|
||||
border: 1px solid black;
|
||||
border-collapse: collapse;
|
||||
font-family: Georgia, 'Times New Roman', serif;
|
||||
}
|
||||
table {
|
||||
margin: auto;
|
||||
}
|
||||
.skipped {
|
||||
color: gray;
|
||||
}
|
||||
td, th {
|
||||
font-size: 1.2em;
|
||||
border: 1px solid black;
|
||||
padding: 3px 7px 2px 7px;
|
||||
}
|
||||
th {
|
||||
font-size: 16px;
|
||||
text-align: left;
|
||||
padding-top: 5px;
|
||||
padding-bottom: 4px;
|
||||
}
|
||||
'''
|
||||
|
||||
READABILITY_CSS = '''
|
||||
#article {
|
||||
margin: 0 auto;
|
||||
max-width: 705px;
|
||||
min-width: 225px;
|
||||
font-family: Georgia, 'Times New Roman', serif;
|
||||
font-size: 19px;
|
||||
line-height: 29px;
|
||||
}
|
||||
|
||||
#article p {
|
||||
font-size: 19px;
|
||||
line-height: 29px;
|
||||
margin: 19px 0px 19px 0px;
|
||||
}
|
||||
|
||||
ins {
|
||||
background-color: #C6F7C3;
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
ins img {
|
||||
border-width: 3px;
|
||||
border-style: dotted;
|
||||
border-color: #51B548;
|
||||
}
|
||||
|
||||
del {
|
||||
background-color: #F7C3C3;
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
del img {
|
||||
border-width: 3px;
|
||||
border-style: dotted;
|
||||
border-color: #D12626;
|
||||
}
|
||||
'''
|
||||
|
||||
class ReadabilityTest:
    """Static description of one regression test case."""

    def __init__(self, dir_path, enabled, name, url, desc, notes, url_map,
                 orig_path, rdbl_path):
        self.dir_path = dir_path    # directory holding the test's files
        self.enabled = enabled      # the test is skipped entirely when False
        self.name = name
        self.url = url
        self.desc = desc
        self.notes = notes
        self.url_map = url_map      # url -> local file path for sub-fetches
        self.orig_path = orig_path  # captured original HTML
        self.rdbl_path = rdbl_path  # benchmark readable HTML
|
||||
|
||||
|
||||
class ReadabilityTestData:
    """A ReadabilityTest bundled with its loaded original and benchmark HTML."""

    def __init__(self, test, orig_html, rdbl_html):
        self.test = test
        self.orig_html = orig_html
        self.rdbl_html = rdbl_html
|
||||
|
||||
|
||||
class ReadabilityTestResult:
    """Test data plus the freshly-computed result HTML and its diff
    against the stored benchmark."""

    def __init__(self, test_data, result_html, diff_html):
        self.test_data = test_data
        self.result_html = result_html
        self.diff_html = diff_html
|
||||
|
||||
|
||||
def read_yaml(path):
    """Parse and return the YAML document at *path*.

    Uses yaml.safe_load: plain yaml.load can instantiate arbitrary
    Python objects from tagged input, an unnecessary risk for spec
    files (the specs here only contain plain scalars and mappings).
    """
    with open(path, 'r') as f:
        return yaml.safe_load(f)
|
||||
|
||||
|
||||
def make_path(dir_path, name, suffix):
    """Join *dir_path* with the composed filename <name><suffix>."""
    return os.path.join(dir_path, name + suffix)
|
||||
|
||||
|
||||
def adjust_url_map(url_map):
    """Return a copy of *url_map* whose values are resolved relative to
    TEST_DATA_PATH (specs store paths relative to the data directory)."""
    return dict(
        (url, os.path.join(TEST_DATA_PATH, rel_path))
        for url, rel_path in url_map.items()
    )
|
||||
|
||||
|
||||
def make_readability_test(dir_path, name, spec_dict):
    """Build a ReadabilityTest from a parsed YAML spec dict.

    'enabled', 'notes' and 'url_map' are optional in the spec; 'url'
    and 'test_description' are required keys (KeyError otherwise).
    """
    enabled = spec_dict.get('enabled', True)
    notes = spec_dict.get('notes', '')
    url_map = adjust_url_map(spec_dict.get('url_map', dict()))
    return ReadabilityTest(
        dir_path,
        enabled,
        name,
        spec_dict['url'],
        spec_dict['test_description'],
        notes,
        url_map,
        make_path(dir_path, name, ORIGINAL_SUFFIX),
        make_path(dir_path, name, READABLE_SUFFIX)
    )
|
||||
|
||||
|
||||
def load_test_data(test):
    """Load the original and benchmark HTML for an enabled test.

    Returns a ReadabilityTestData, or None when the test is disabled.
    Bug fix: the two file handles were never closed.
    """
    if not test.enabled:
        return None
    with open(test.orig_path, 'r') as f:
        orig = f.read()
    with open(test.rdbl_path, 'r') as f:
        rdbl = f.read()
    return ReadabilityTestData(test, orig, rdbl)
|
||||
|
||||
|
||||
def load_readability_tests(dir_path, files):
    """Build a ReadabilityTest for every YAML spec among *files*."""
    yaml_files = [f for f in files if f.endswith(YAML_EXTENSION)]
    yaml_paths = [os.path.join(dir_path, f) for f in yaml_files]
    # Bug fix: the dot must be escaped -- the pattern '.yaml$' treats '.'
    # as "any character", so e.g. 'xyaml' endings would also be stripped.
    names = [re.sub(r'\.yaml$', '', f) for f in yaml_files]
    spec_dicts = [read_yaml(p) for p in yaml_paths]
    return [
        make_readability_test(dir_path, name, spec_dict)
        for (name, spec_dict) in zip(names, spec_dicts)
    ]
|
||||
|
||||
|
||||
def execute_test(test_data):
    """Run readability on a test's original HTML and diff the output
    against the stored benchmark.

    Returns a ReadabilityTestResult, or None for a skipped test
    (test_data is None when the test is disabled).
    """
    if test_data is None:
        return None
    else:
        url = test_data.test.url
        # Serve sub-resource fetches from local files, never the network.
        fetcher = urlfetch.MockUrlFetch(test_data.test.url_map)
        doc = readability.Document(
            test_data.orig_html,
            url=url,
            urlfetch=fetcher
        )
        summary = doc.summary_with_metadata()
        diff = lxml.html.diff.htmldiff(test_data.rdbl_html, summary.html)
        return ReadabilityTestResult(test_data, summary.html, diff)
|
||||
|
||||
|
||||
def element_string_lengths(elems):
    """Return the length of each element's string() text content."""
    return [len(element.xpath('string()')) for element in elems]
|
||||
|
||||
|
||||
class ResultSummary():
    """Character and block counts of inserted/deleted text in a diff.

    Parses the htmldiff fragment once and exposes:
    insertions/deletions (total characters) and
    insertion_blocks/deletion_blocks (number of <ins>/<del> elements).
    """

    def __init__(self, result):
        doc = lxml.html.fragment_fromstring(result.diff_html)

        insertions = doc.xpath('//ins')
        self.insertions = sum(element_string_lengths(insertions))
        self.insertion_blocks = len(insertions)

        deletions = doc.xpath('//del')
        self.deletions = sum(element_string_lengths(deletions))
        self.deletion_blocks = len(deletions)
|
||||
|
||||
|
||||
def make_summary_row(test, result):
    """Build one summary-table <tr> for *test* (a grayed 'skipped' row
    when the test is disabled)."""
    def data(suffix):
        # Absolute link into the test-data directory (inputs).
        return os.path.abspath(os.path.join(TEST_DATA_PATH, test.name + suffix))
    def output(suffix):
        # Relative link, resolved next to the generated summary page.
        return test.name + suffix
    if test.enabled:
        s = ResultSummary(result)
        return B.TR(
            B.TD(test.name),
            B.TD('%d (%d)' % (s.insertions, s.insertion_blocks)),
            B.TD('%d (%d)' % (s.deletions, s.deletion_blocks)),
            B.TD(
                B.A('original', href = data(ORIGINAL_SUFFIX)),
                ' ',
                B.A('benchmark', href = output(READABLE_SUFFIX)),
                ' ',
                B.A('result', href = output(RESULT_SUFFIX)),
                ' ',
                B.A('diff', href = output(DIFF_SUFFIX))
            ),
            B.TD(test.notes)
        )
    else:
        return B.TR(
            B.CLASS('skipped'),
            B.TD('%s (SKIPPED)' % test.name),
            B.TD('N/A'),
            B.TD('N/A'),
            B.TD('N/A'),
            B.TD(test.notes)
        )
|
||||
|
||||
|
||||
def make_summary_doc(tests_w_results):
    """Build the full summary HTML document from (test, result) pairs."""
    tbody = B.TBODY(
        B.TR(
            B.TH('Test Name'),
            B.TH('Inserted (in # of blocks)'),
            B.TH('Deleted (in # of blocks)'),
            B.TH('Links'),
            B.TH('Notes')
        )
    )
    for (test, result) in tests_w_results:
        row = make_summary_row(test, result)
        tbody.append(row)
    return B.HTML(
        B.HEAD(
            B.TITLE('Readability Test Summary'),
            B.STYLE(SUMMARY_CSS, type = 'text/css')
        ),
        B.BODY(
            B.TABLE(
                tbody
            )
        )
    )
|
||||
|
||||
|
||||
def write_summary(path, tests_w_results):
    """Render and write the index.html summary table to *path*."""
    doc = make_summary_doc(tests_w_results)
    with open(path, 'w') as f:
        f.write(lxml.html.tostring(doc))
|
||||
|
||||
|
||||
def add_css(doc):
    """Prepend a <head> carrying the readability stylesheet to *doc*
    (mutates the document in place)."""
    style = B.STYLE(READABILITY_CSS, type = 'text/css')
    head = B.HEAD(style, content = 'text/html; charset=utf-8')
    doc.insert(0, head)
|
||||
|
||||
|
||||
def write_output_fragment(fragment, output_dir_path, test_name, suffix):
    """Wrap an HTML *fragment* in a styled document and write it to
    <output_dir_path>/<test_name><suffix>."""
    doc = lxml.html.document_fromstring(fragment)
    add_css(doc)
    html = lxml.html.tostring(doc)
    file_name = ''.join([test_name, suffix])
    path = os.path.join(output_dir_path, file_name)
    with open(path, 'w') as f:
        f.write(html)
|
||||
|
||||
|
||||
def write_result(output_dir_path, result):
    """Write the benchmark, diff, and result fragments for one test."""
    test_name = result.test_data.test.name
    fragments = (
        (result.test_data.rdbl_html, READABLE_SUFFIX),
        (result.diff_html, DIFF_SUFFIX),
        (result.result_html, RESULT_SUFFIX),
    )
    for fragment, suffix in fragments:
        write_output_fragment(fragment, output_dir_path, test_name, suffix)
|
||||
|
||||
|
||||
def print_test_info(test):
    """Print a one-line, right-padded name/description summary for *test*."""
    suffix = '' if test.enabled else ' (SKIPPED)'
    print('%20s: %s%s' % ('%s' % test.name, test.desc, suffix))
|
||||
|
||||
def run_readability_tests():
    """Discover, run, and report every regression test under
    TEST_DATA_PATH, then write the HTML summary page."""
    files = os.listdir(TEST_DATA_PATH)
    tests = load_readability_tests(TEST_DATA_PATH, files)
    # Disabled tests yield None data and None results below.
    test_datas = [load_test_data(t) for t in tests]
    results = [execute_test(t) for t in test_datas]
    for (test, result) in zip(tests, results):
        print_test_info(test)
        if result:
            write_result(TEST_OUTPUT_PATH, result)
    write_summary(TEST_SUMMARY_PATH, zip(tests, results))
|
||||
|
||||
def main():
    """Entry point: run the regression suite, or delegate to unittest
    when invoked with a leading 'unittest' argument."""
    logging.basicConfig(level = logging.DEBUG)
    if len(sys.argv) > 1 and sys.argv[1] == 'unittest':
        # Drop our marker so unittest sees its own argv.
        del sys.argv[1]
        return unittest.main()
    run_readability_tests()


if __name__ == '__main__':
    main()
|
@ -0,0 +1,664 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US">
|
||||
<head>
|
||||
<title>June Web browser stats: Rapid Release edition</title>
|
||||
|
||||
<!-- Begin CSS -->
|
||||
<link rel="stylesheet" type="text/css" href="http://static.arstechnica.net//public/v6/styles/light/light.c.css?1309476728" media="screen" />
|
||||
<link rel="stylesheet" type="text/css" href="http://static.arstechnica.net//public/v6/styles/print/print.css?1309476728" media="print" />
|
||||
<!-- End CSS -->
|
||||
|
||||
<link rel="apple-touch-icon" href="http://static.arstechnica.net/apple-touch-icon.png" />
|
||||
<link rel="canonical" href="http://arstechnica.com/web/news/2011/07/june-browser-stats-rapid-release-edition.ars" />
|
||||
<link rel="shorturl" href="http://arst.ch/q4c" />
|
||||
<link rel="shortlink" href="http://arst.ch/q4c" />
|
||||
<link rev="canonical" href="http://arst.ch/q4c" />
|
||||
|
||||
<link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="Ars Technica" />
|
||||
<link rel="shortcut icon" href="http://static.arstechnica.net/favicon.ico" />
|
||||
<link rel="icon" type="image/x-icon" href="http://static.arstechnica.net/favicon.ico" />
|
||||
|
||||
<!-- Begin Feeds -->
|
||||
<link rel="alternate" type="application/rssxml" title="The Web" href="http://feeds.arstechnica.com/arstechnica/web/" />
|
||||
|
||||
<link rel="alternate" type="application/rss+xml" title="All Articles " href="http://feeds.arstechnica.com/arstechnica/everything" />
|
||||
<!-- End Feeds -->
|
||||
|
||||
<!-- C-razy IE9 stuff -->
|
||||
<meta name="application-name" content="Ars Technica"/>
|
||||
<meta name="msapplication-starturl" content="http://arstechnica.com/"/>
|
||||
<meta name="msapplication-tooltip" content="Ars Technica: Serving the technologist for 1.2 decades"/>
|
||||
<meta name="msapplication-task" content="name=News;action-uri=http://arstechnica.com/;icon-uri=http://arstechnica.com/favicon.ico"/>
|
||||
<meta name="msapplication-task" content="name=Features;action-uri=http://arstechnica.com/features/;icon-uri=http://static.arstechnica.net/ie-jump-menu/jump-features.ico"/>
|
||||
<meta name="msapplication-task" content="name=OpenForum;action-uri=http://arstechnica.com/civis/;icon-uri=http://static.arstechnica.net/ie-jump-menu/jump-forum.ico"/>
|
||||
<meta name="msapplication-task" content="name=One Microsoft Way;action-uri=http://arstechnica.com/microsoft/;icon-uri=http://static.arstechnica.net/ie-jump-menu/jump-omw.ico"/>
|
||||
<meta name="msapplication-task" content="name=Subscribe;action-uri=http://arstechnica.com/subscriptions/;icon-uri=http://static.arstechnica.net/ie-jump-menu/jump-subscribe.ico"/>
|
||||
|
||||
|
||||
<!-- Begin Metadata -->
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
||||
<meta name="viewport" content="width=1000" />
|
||||
<meta name="description" content="In our monthly look at the world of Web browser market share statistics, we take a look at the first impact of Mozilla's new Rapid Release policy for Firefox and also consider why some Chrome users aren't aboard Google's update bandwagon." />
|
||||
<meta name="keywords" content="" />
|
||||
<meta name="title" content="June Web browser stats: Rapid Release edition" />
|
||||
<link rel="image_src" href="http://static.arstechnica.net/assets/2011/03/firefox-09-small-thumb-300x169-20442-f.jpg" />
|
||||
<meta name="medium" content="news" />
|
||||
|
||||
<meta name="entry_id" content="51247" />
|
||||
<meta property="og:title" content="June Web browser stats: Rapid Release edition"/>
|
||||
<meta property="og:site_name" content="Ars Technica"/>
|
||||
<meta property="og:image" content="http://static.arstechnica.net/assets/2011/03/firefox-09-small-thumb-300x169-20442-f.jpg"/>
|
||||
|
||||
<meta name="advertising" content="ask" />
|
||||
<meta property="fb:admins" content="13703630" />
|
||||
<!-- End Metadata -->
|
||||
<!-- Entry - itbiz_general_computing -->
|
||||
<style type="text/css" id="resource-styles"> </style>
|
||||
<script type="text/javascript" src="/public/shared/scripts/da-1.5.js"></script>
|
||||
<script type="text/javascript">
|
||||
try {
|
||||
cnp.ad.dart.setSite("ars.dart");
|
||||
cnp.ad.dart.setZone('itbiz_general_computing');
|
||||
//cnp.ad.dart.addParameterString('kw=june-browser-stats-rapid-release-edition;kw=07;kw=2011;kw=news;kw=web;');
|
||||
cnp.ad.dart.addParameterString('mtfIFPath=/mt-static/plugins/ArsTheme/ad-campaigns/doubleclick/');
|
||||
cnp.ad.emptyFrameSrc="/public/shared/scripts/empty.html";
|
||||
cnp.ad.loaderFrameSrc="/public/shared/scripts/ad-loader-frame.html";
|
||||
} catch(e) {}
|
||||
</script>
|
||||
|
||||
<script type="text/javascript" charset="utf-8">
|
||||
// In case someone on a desktop clicks a mobile #! link
|
||||
var l = window.location;
|
||||
if(l.hash.indexOf('#!') !== -1){
|
||||
window.location = l.protocol + '//' + l.host + l.hash.slice(2);
|
||||
}
|
||||
</script>
|
||||
</head>
|
||||
<body class="individual">
|
||||
<div id="page" class="">
|
||||
|
||||
<div id="masthead" class="">
|
||||
<div id="logo"><a href="/"><img src="http://static.arstechnica.net//public/v6/styles/light/images/masthead/logo.png?1309476728" alt="Ars Technica: The Art of Technology" width="110" height="81" /></a></div>
|
||||
<div id="ebc51ce07629d0e14d2fbc4236e44067" >
|
||||
<script type="text/javascript">
|
||||
var pbanner_start = new Date();
|
||||
try {
|
||||
var pbanner = cnp.ad.create(cnp.ad.refreshable, false);
|
||||
//pbanner.addParameter({'dcopt':'ist'});
|
||||
pbanner.addParameterString('kw=june-browser-stats-rapid-release-edition;kw=07;kw=2011;kw=news;kw=web;');
|
||||
pbanner.addParameter({'sz': '728x90' });
|
||||
} catch(e) {}
|
||||
</script>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="search-navigation">
|
||||
<div id="search">
|
||||
<a id="search-link" href="http://www.google.com/cse?cx=011835048811694782689:7zpko-isndo">Search</a>
|
||||
|
||||
<div class="form">
|
||||
<span>Search:</span>
|
||||
<form action="http://www.google.com/cse" id="search-form">
|
||||
<div>
|
||||
<input type="hidden" value="011835048811694782689:7zpko-isndo" name="cx"/>
|
||||
<input type="hidden" value="UTF-8" name="ie"/>
|
||||
<input type="text" id="search-form-text" value="" name="q"/>
|
||||
</div>
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
<div id="navigation">
|
||||
<ul id="primary-navigation">
|
||||
<li class=""><a href="/">All</a></li>
|
||||
<li class="apple"><a href="/apple/">Apple</a></li>
|
||||
<li class="ask-ars"><a href="/ask-ars/">Ask Ars</a></li>
|
||||
<li class="business"><a href="/business/">Business</a></li>
|
||||
<li class="gadgets"><a href="/gadgets/">Gadgets</a></li>
|
||||
<li class="gaming"><a href="/gaming/">Gaming</a></li>
|
||||
<li class="microsoft"><a href="/microsoft/">Microsoft</a></li>
|
||||
<li class="open-source"><a href="/open-source/">Open Source</a></li>
|
||||
<li class="science"><a href="/science/">Science</a></li>
|
||||
<li class="tech-policy"><a href="/tech-policy/">Tech Policy</a></li>
|
||||
<li id="primary-navigation-more" style="display:none;">
|
||||
More
|
||||
<ul >
|
||||
<li><a href="/hardware/">Hardware</a></li>
|
||||
<li><a href="/media/">Media</a></li>
|
||||
<li><a href="/security/">Security</a></li>
|
||||
<li><a href="/software/">Software</a></li>
|
||||
<li><a href="/staff/">Staff</a></li>
|
||||
<li><a href="/telecom/">Telecom</a></li>
|
||||
<li><a href="/web/">Web</a></li>
|
||||
<li style="padding:0;"><span style="display:inline;background-color: #920404; padding: 3px; color:white; -webkit-border-radius: 4px;">New</span> <a style="display:inline;" href="/site/tv.ars" title="Ars Technica TV">Ars.TV</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
<ul id="secondary-navigation" class="web">
|
||||
<li class="news selected"><a href="/web/news/">News</a></li>
|
||||
<li class="guides"><a href="/web/guides/">Guides</a></li>
|
||||
<li class="reviews"><a href="/web/reviews/">Reviews</a></li>
|
||||
</ul>
|
||||
<ul id="auxiliary-navigation">
|
||||
<li class="subscribe"><a href="/subscriptions/">Upgrade to a Premier Subscription</a>
|
||||
|
||||
</li>
|
||||
<li class="customize" style="display:none;">
|
||||
<a href="#">Customize ▾</a>
|
||||
<ul>
|
||||
<li>
|
||||
<p>Site Theme:</p>
|
||||
<label><input type="radio" checked="checked" value="light.css" class="site-style" name="site-style" /> White</label>
|
||||
<label><input type="radio" value="dark.css" class="site-style" name="site-style" /> Black</label>
|
||||
</li>
|
||||
<li>
|
||||
<p>Choose body font:</p>
|
||||
<label><input type="radio" checked="checked" value="arial" class="body_font" name="body_font" /> Arial</label>
|
||||
<label><input type="radio" value="helvetica" class="body_font" name="body_font" /> Helvetica</label>
|
||||
</li>
|
||||
<li>
|
||||
<p>Layout (beta):</p>
|
||||
<label><input type="radio" checked="checked" value="normal" class="fp_layout" name="fp_layout" /> Normal</label>
|
||||
<label><input type="radio" value="compact" class="fp_layout" name="fp_layout" /> Compact</label>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
|
||||
<li class="openforum"><a href="http://arstechnica.com/civis/">OpenForum</a></li>
|
||||
|
||||
<li class="login-join"><a href="/civis/ucp.php?mode=login&return_to=http%3A%2F%2Farstechnica.com%2Fweb%2Fnews%2F2011%2F07%2Fjune-browser-stats-rapid-release-edition.ars">Login/Join</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="main">
|
||||
|
||||
<div id="silo-header" class="">
|
||||
<h1 class="web"><a href="/web/" title="Go to The Web">The Web</a></h1>
|
||||
</div>
|
||||
|
||||
<div id="content" class="normal"> <div id="content-inner">
|
||||
<div id="story">
|
||||
<h2 class="title">June Web browser stats: Rapid Release edition</h2>
|
||||
<div class="byline"><span class="author">By <a rel="author" href="/author/peter-bright/">Peter Bright</a>
|
||||
</span> | <span class="posted"><span class="published updated"><span class="name">Published </span> <abbr class="timeago datetime" title="2011-07-06T16:00:00Z">July 6, 2011 11:00 AM</abbr></span><span class="modified" style="display:none;"><span class="name">Last updated </span> <abbr class="timeago datetime" title="2011-07-06T16:33:33Z">July 6, 2011 11:33 AM</abbr></span></span></div>
|
||||
|
||||
<div class="story-image" style="width:300px;">
|
||||
<img width="300" src="http://static.arstechnica.net/opensource/firefox-09-small.jpg" alt="" />
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<div id="" class="body" style="">
|
||||
<!--body--><p>June brought the first result of Mozilla's new Rapid Release strategy for Firefox. Firefox 4, just three months old, was superceded by the all-new but not-too-different <a href="http://arstechnica.com/open-source/news/2011/06/firefox-5-released-arrives-only-three-months-after-firefox-4.ars">Firefox 5</a>. Firefox's market growth was all but ended by the release of Chrome, and Mozilla is hoping that by adopting a similar release schedule to Google, it will be able to reignite the growth of its user base.</p><!--page 1-->
|
||||
|
||||
<p>Internet Explorer is down 0.59 points at 53.68 percent. Firefox is essentially unchanged, down 0.04 points to 21.67 percent. Chrome is up 0.59 points to 13.11 percent. Safari is also up, gaining 0.2 points to reach 7.48 percent. Opera dropped 0.3 points to 1.73 percent.</p>
|
||||
|
||||
<div style="width: 640px;" class="news-item-figure CenteredImage"><div class="news-item-figure-image" style=""><img src="http://static.arstechnica.com/browsers-june-2011/global-browser-share.png" /></div><div class="news-item-figure-caption"><div class="news-item-figure-caption-byline"><a href="http://netmarketshare.com/">Net Applications</a></div></div></div>
|
||||
<p>The trends established over the last few months are continuing: Firefox is treading water, while Internet Explorer is losing users, which seem to be being picked up by Chrome. In the past two months, Opera has dropped 0.41 points—that's a loss representing 20% of its market share. Our own Ryan Paul <a href="http://arstechnica.com/software/reviews/2011/06/hands-on-opera-1150s-new-featherweight-interface-packs-a-punch.ars">liked Opera 11.50</a>, which was released just a couple of days ago, so perhaps this will help turn around a perilous slide.</p>
|
||||
|
||||
<p>Looking at individual versions, Internet Explorer 6, 7, and 8 are all down, by 0.18, 0.46, and 1.21 points respectively. Internet Explorer 9 made strong gains, of 1.44 points, but not enough to undo the losses. Internet Explorer 9's gains seem to be occurring at the expense of older versions—Internet Explorer 8 on Windows 7, versions 7 and 8 on Windows Vista—rather than making converts of the other browsers.</p>
|
||||
|
||||
<div style="width: 640px;" class="news-item-figure CenteredImage"><div class="news-item-figure-image" style=""><img src="http://static.arstechnica.com/browsers-june-2011/internet-explorer-transition.png" /></div><div class="news-item-figure-caption"><div class="news-item-figure-caption-byline"><a href="http://netmarketshare.com/">Net Applications</a></div></div></div>
|
||||
|
||||
<p>Internet Explorer 9 is of course at something of a disadvantage, as it won't run on Windows XP. While we <a href="http://arstechnica.com/microsoft/news/2010/04/why-microsoft-did-the-right-thing-in-ditching-xp-for-ie9.ars">agree with the decision to cut Windows XP off</a>, one consequence is that not a single Internet Explorer 6 user can upgrade to Internet Explorer 9. Nor can anyone using Internet Explorer 7 or 8 on Windows XP. If the focus is narrowed from all users to just those using Windows 7, the Internet Explorer 9 situation looks a little more promising. Though Internet Explorer 8, which ships with Windows 7, commands the highest market share, at 38.47 percent of Windows 7 users, Internet Explorer 9 takes second place, at 15.61 percent—putting it ahead of Firefox 4 and Chrome 12, at 13.74 and 11.60 percent, respectively.</p>
|
||||
|
||||
<p>Internet Explorer 9 seems, therefore, to be performing well among users of Microsoft's latest and greatest operating system; it's just that only 27 percent of the global audience is running that platform. Windows XP still commands a slim majority, with a global share of 51 percent. As Windows XP declines and Windows 7 grows, we can expect to see Internet Explorer 9 lifted by this transition.</p>
|
||||
|
||||
<div style="width: 640px;" class="news-item-figure CenteredImage"><div class="news-item-figure-image" style=""><img src="http://static.arstechnica.com/browsers-june-2011/firefox-transition.png" /></div><div class="news-item-figure-caption"><div class="news-item-figure-caption-byline"><a href="http://netmarketshare.com/">Net Applications</a></div></div></div>
|
||||
|
||||
<p>Firefox versions 3.5 and 3.6 both saw drops last month, by 2.06 and 0.28 points, respectively, and versions 4 and 5 rose by 0.38 and 2.05 points, respectively. This suggests that the transition from "old" Firefox (3.x) to "modern" Firefox (4 and 5) is slowing down; in May, the 3.x versions dropped by an aggregate of more than 4.5 points, with the then-current Firefox 4 picking up all of those users. This month, only around half as many users made the switch. Though "modern" Firefox versions are now used by a majority of Firefox users, it looks like a hard core of "old" users is going to stick around. Over the next few months, we can expect Firefox 3.5 to decline more heavily, as Mozilla intends to push out a patch that will upgrade users to the newest 3.6 version.</p>
|
||||
|
||||
<div style="width: 640px;" class="news-item-figure CenteredImage"><div class="news-item-figure-image" style=""><img src="http://static.arstechnica.com/browsers-june-2011/chrome-transition.png" /></div><div class="news-item-figure-caption"><div class="news-item-figure-caption-byline"><a href="http://netmarketshare.com/">Net Applications</a></div></div></div>
|
||||
|
||||
<p>Chrome as ever shows rapid migration between versions. Over the course of June, the browser's stable version went from 11 to 12, and the rapid cutover we've grown to expect occurred. However, that transition isn't complete. 1.39 percent of users are on Chrome 10 or older, and it looks like Google's generally seamless automatic upgrades aren't touching these users. The source of these users isn't clear, though there a few plausible explanations. Obviously, some individuals and corporate users may simply have opted to disable the updates. Automatic updating is the default, but it can be turned off. Though this gives these users and enterprises greater control over the browser version they're using, this comes at some risk; Google doesn't have security updates for old versions of Chrome, so these people are using browsers with known exploitable flaws.</p>
|
||||
|
||||
<p>Chrome's automatic updating is also dependent on a system service. Though the browser can be installed by non-administrators, installation of the service requires administrator privileges. Unlike Firefox, which checks for and performs updates within the browser itself, Chrome depends on its service to do this task. If the service doesn't exist, updates don't happen.</p>
|
||||
|
||||
<p>That's probably not enough to account for every legacy Chrome user, however. To do that, we probably have to look towards the East Asian market. A long-standing feature of various markets in the region, most notably China and South Korea, is the entrenchment of Internet Explorer, variously attributed to legal mandates (especially in South Korea, where until last year a specific ActiveX control was required for online banking) and widespread software piracy making users reluctant to use Windows Update (even though Internet Explorer upgrades are available to pirated copies of the operating system).</p>
|
||||
|
||||
<p>To support this market, a range of browsers based on Internet Explorer's rendering engine, but with substantially greater features, sprung up. The <a href="http://data.cnzz.com/main.php?s=brow">most popular</a> of these are <a href="http://se.360.cn/">360 Secure Browser</a> with about 19 percent share of the Chinese market, and <a href="http://ie.sogou.com/">Sogou high speed browser</a>, with a little under 6 percent. Though these browsers originally just used the Trident engine that powers Internet Explorer, recent versions extend this by also embedding Chrome. In so doing, they give their users a choice between a relatively modern Chrome browser engine, and the older Internet Explorer engine needed for compatibility. Conceptually, this is very similar to software like <a href="http://code.google.com/chrome/chromeframe/">Chrome Frame</a>, that allows Internet Explorer users to use Chrome for some browser tabs.</p>
|
||||
|
||||
<div style="width: 640px;" class="news-item-figure CenteredImage"><div class="news-item-figure-image" style=""><a href="http://static.arstechnica.com/browsers-june-2011/sogou-ie.png"><img src="http://static.arstechnica.com/browsers-june-2011/thumb-sogou-ie.png" /></a></div><div class="news-item-figure-caption"><div class="news-item-figure-caption-text">Sogou browser running as Internet Explorer</div><div class="news-item-figure-caption-byline">Thanks to Ars reader WJ</div></div></div>
|
||||
|
||||
<p>These dual-engine browsers tend to modify Chrome in several ways, one of which is that they exclude Google's automatic update service. They also tend to embed stale versions of Chrome; the current Sogou uses Chrome 6. The result is that users of these browsers, who may well prefer using Chrome for day-to-day browsing, will be stuck with obsolete versions of the browser. And because of the way they're using Chrome, they're out of reach of Google's update system.</p>
|
||||
|
||||
<div style="width: 640px;" class="news-item-figure CenteredImage"><div class="news-item-figure-image" style=""><a href="http://static.arstechnica.com/browsers-june-2011/sogou-chrome.png"><img src="http://static.arstechnica.com/browsers-june-2011/thumb-sogou-chrome.png" /></a></div><div class="news-item-figure-caption"><div class="news-item-figure-caption-text">Sogou browser using its embedded Chrome</div><div class="news-item-figure-caption-byline">Thanks to Ars reader WJ</div></div></div>
|
||||
|
||||
<p>The net result of these various usage scenarios is that Chrome's non-upgrading userbase is likely to grow ever larger, with ten percent of Chrome users, and climbing, sticking with versions of the browser that are no longer supported.</p>
|
||||
|
||||
<div style="width: 640px;" class="news-item-figure CenteredImage"><div class="news-item-figure-image" style=""><img src="http://static.arstechnica.com/browsers-june-2011/ars-browser-share.png" /></div><div class="news-item-figure-caption"><div class="news-item-figure-caption-byline">Ars Technica</div></div></div>
|
||||
|
||||
<p>Ars' audience continues to show marked differences from the Internet's norms. Firefox, Safari, Internet Explorer, and Opera all saw drops, of 0.94, 0.37, 0.04, and 0.10 points respectively; Chrome saw gains of 0.88 points, with the remainder of the difference picked up by "other."</p>
|
||||
|
||||
|
||||
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
<!-- Article Pager -->
|
||||
|
||||
|
||||
|
||||
</div>
|
||||
|
||||
<noscript>
|
||||
<img style="position: absolute; bottom: 0px; right: 0px; width: 1px; height: 1px;" src="http://arstechnica.com/dragons/brains.gif?id=51247&1396906973" alt="" />
|
||||
</noscript>
|
||||
<script type="text/javascript">
|
||||
document.write('<img style="position: absolute; bottom: 0px; right: 0px; width: 1px; height: 1px;" src="http://arstechnica.com/dragons/brains.gif?id=51247&' + (parseInt(Math.random()*99999999, 10)).toString() + '" alt="" />');
|
||||
</script>
|
||||
|
||||
|
||||
<!--googleoff: all-->
|
||||
|
||||
<div id="comments-bar" class="with-bubble">
|
||||
<h2>User comments</h2>
|
||||
|
||||
<div class="comments-link">
|
||||
<a name="comments-bar" rel="nofollow" href="/web/news/2011/07/june-browser-stats-rapid-release-edition.ars?comments=1#comments-bar">Click here to view the 81 comments on this story</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="hiddencomment"></div>
|
||||
<!--<div id="alert"><p><img src="http://arstechnica.com/civis/images/smilies/flail.gif" /> We're making some updates to the commenting system. We should have the kinks worked out soon.</p></div>-->
|
||||
<!--googleon: all-->
|
||||
<div id="links-bar">
|
||||
<ul>
|
||||
|
||||
|
||||
<li class="facebook">
|
||||
<iclint src="http://www.facebook.com/plugins/like.php?href=http%3A%2F%2Farstechnica.com%2Fweb%2Fnews%2F2011%2F07%2Fjune-browser-stats-rapid-release-edition.ars&layout=button_count&show_faces=false&width=85&action=like&font=arial&colorscheme=light&height=21" scrolling="no" frameborder="0" style="border:none; overflow:hidden; width:85px; height:21px;" allowTransparency="true"></iclint>
|
||||
</li>
|
||||
|
||||
|
||||
<li><a href="http://twitter.com/share" class="twitter-share-button" data-url="http://arst.ch/q4c" data-counturl="http://arstechnica.com/web/news/2011/07/june-browser-stats-rapid-release-edition.ars" data-count="horizontal" data-via="arstechnica" data-related="drpizza:Peter Bright">Tweet</a></li>
|
||||
|
||||
<li class="reddit">
|
||||
<iclint src="http://www.reddit.com/static/button/button1.html?width=120&url=http%3A%2F%2Farstechnica.com%2Fweb%2Fnews%2F2011%2F07%2Fjune-browser-stats-rapid-release-edition.ars&title=June%20Web%20browser%20stats%3A%20Rapid%20Release%20edition&bgcolor=fff&bordercolor=eee" width="120" height="20" scrolling="no" frameborder="0"></iclint>
|
||||
</li>
|
||||
|
||||
<li class="share">
|
||||
<a class="a2a_dd" href="http://www.addtoany.com/share_save?linkname=June%20Web%20browser%20stats%3A%20Rapid%20Release%20edition&linkurl=http%3A%2F%2Farstechnica.com%2Fweb%2Fnews%2F2011%2F07%2Fjune-browser-stats-rapid-release-edition.ars"><img src="http://static.addtoany.com/buttons/favicon.png" width="16" height="16" border="0" alt="Share/Bookmark" style="display:inline;vertical-align:middle;"/> Share/Email</a>
|
||||
<script type="text/javascript">
|
||||
var a2a_linkname="June Web browser stats: Rapid Release edition",
|
||||
a2a_linkurl="http://arstechnica.com/web/news/2011/07/june-browser-stats-rapid-release-edition.ars",
|
||||
a2a_onclick=1,
|
||||
a2a_show_title=1,
|
||||
a2a_hide_embeds=0,
|
||||
a2a_num_services=8,
|
||||
a2a_color_main="989EA3",
|
||||
a2a_color_border="989EA3",
|
||||
a2a_color_link_text="FF5B00",
|
||||
a2a_color_link_text_hover="ffffff",
|
||||
a2a_track_links='ga',
|
||||
a2a_prioritize= [
|
||||
"digg",
|
||||
"yahoo_buzz",
|
||||
"stumbleupon",
|
||||
"instapaper",
|
||||
"slashdot",
|
||||
"linkedin",
|
||||
"delicious",
|
||||
"google_reader",
|
||||
"tumblr",
|
||||
"posterous"
|
||||
];
|
||||
var a2a_config = a2a_config || {};
|
||||
a2a_config.no_3p = 1;
|
||||
</script>
|
||||
<style type="text/css">#a2apage_BROWSER { display:none !important; }</style>
|
||||
</li>
|
||||
<li class="copypasta copy-pasta-button">Make a correction</li>
|
||||
|
||||
</ul>
|
||||
</div>
|
||||
<!--googleoff: all-->
|
||||
<div id="read-more-stories">
|
||||
<h2>Read more stories</h2>
|
||||
<div class="story-navigation">
|
||||
<a href="/gadgets/news/2011/07/amazon-appstore-game-developer-pulls-app-highlights-problems.ars" title="Read the previously published article">< Older Story</a>
|
||||
|
||||
|
|
||||
<a href="/tech-policy/news/2011/07/copyright-troll-righthaven-now-starts-paying-those-it-sued.ars" title="Read the next newest article">Newer Story ></a>
|
||||
</div>
|
||||
<!--googleoff: all-->
|
||||
<script language='JavaScript'>
|
||||
var OB_langJS = "http://static.arstechnica.net//public/v6/scripts/outbrain.lang_en_ars.js",OBITm = '1306449288604',OB_raterMode = 'singlethumb',OB_recMode = 'strip',OutbrainPermaLink='http://arstechnica.com/web/news/2011/07/june-browser-stats-rapid-release-edition.ars';
|
||||
if (typeof(OB_Script)!='undefined' ){OutbrainStart();}else{var OB_Script = true,str = unescape("%3Cscript src=\'http://widgets.outbrain.com/OutbrainRater.js\' type=\'text/javascript\'%3E%3C/script%3E");document.write(str);}
|
||||
</script>
|
||||
<!--googleon: all-->
|
||||
</div>
|
||||
<!--googleon: all-->
|
||||
|
||||
</div>
|
||||
</div>
|
||||
<!--googleoff: all-->
|
||||
<div id="sidebar">
|
||||
|
||||
<div id="article-links" class="with-divider" style="display:none;">
|
||||
|
||||
<ul>
|
||||
<li class="enlarge-text"><a href="#">Increase text size</a></li>
|
||||
<li class="shrink-text"><a href="#">Reduce text size</a></li>
|
||||
<li class="print"><a href="#">Print this story</a></li>
|
||||
|
||||
<li class="comment"><a href="/web/news/2011/07/june-browser-stats-rapid-release-edition.ars?comments=1#comments-bar#comments-bar">Leave a comment (81)</a></li>
|
||||
<li class="copy-pasta-button edit-suggestion" style="display: none;"><a href="#">Make a correction</a></li>
|
||||
<li class="shorturl"><a rel="nofollow" href="http://arst.ch/q4c">http://arst.ch/q4c</a></li>
|
||||
|
||||
</ul>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
<style type="text/css" media="screen">
|
||||
#gwmdRBfSihEbZa {
|
||||
height: 250px;
|
||||
width: 300px;
|
||||
min-height: 250px;
|
||||
margin-bottom: 10px;
|
||||
padding-bottom: 10px;
|
||||
}
|
||||
#gwmdRBfSihEbZa.tall {
|
||||
height: 600px;
|
||||
}
|
||||
|
||||
body.premium-adset #gwmdRBfSihEbZa {
|
||||
/* height: 600px; */
|
||||
}
|
||||
</style>
|
||||
|
||||
<abbr></abbr>
|
||||
<blah></blah>
|
||||
<abbr></abbr>
|
||||
|
||||
<div id="gwmdRBfSihEbZa" class="">
|
||||
|
||||
<noscript>
|
||||
<div id="help-by-subscribing">
|
||||
<a href="/gadgets/news/2011/07/dual-core-motorola-droid-3-launches-july-14-for-199-on-verizon.ars/2"><img src="/gadgets/news/2011/07/dual-core-motorola-droid-3-launches-july-14-for-199-on-verizon.ars/4" alt="Please subscribe" /></a></div>
|
||||
</noscript>
|
||||
|
||||
<script type="text/javascript">
|
||||
try {
|
||||
var ppanel = cnp.ad.create(cnp.ad.refreshable, false);
|
||||
ppanel.addParameter({'sz':'300x250'});
|
||||
ppanel.addParameterString('kw=top;kw=june-browser-stats-rapid-release-edition;kw=07;kw=2011;kw=news;kw=web;');
|
||||
ppanel.load();
|
||||
} catch(e) {}
|
||||
</script>
|
||||
</div>
|
||||
<div id="journals-box" class="with-divider">
|
||||
<h2 class="title">Latest Top Stories</h2>
|
||||
<ul class="category">
|
||||
<li class="all selected">
|
||||
<span class="tab-inner">
|
||||
<a href="/" title="All">All</a>
|
||||
</span>
|
||||
</li>
|
||||
<li class="apple">
|
||||
<span class="tab-inner">
|
||||
<a href="/apple/" title="Apple">Apple</a>
|
||||
</span>
|
||||
</li>
|
||||
<li class="gaming">
|
||||
<span class="tab-inner">
|
||||
<a href="/gaming/" title="Gaming">Gaming</a>
|
||||
</span>
|
||||
</li>
|
||||
<li class="microsoft">
|
||||
<span class="tab-inner">
|
||||
<a href="/microsoft/" title="Microsoft">Microsoft</a>
|
||||
</span>
|
||||
</li>
|
||||
<li class="gadgets">
|
||||
<span class="tab-inner">
|
||||
<a href="/gadgets/" title="Gadgets">Gadgets</a>
|
||||
</span>
|
||||
</li>
|
||||
<li class="open-source">
|
||||
<span class="tab-inner">
|
||||
<a href="/open-source/" title="Open Source">Open Source</a>
|
||||
</span>
|
||||
</li>
|
||||
<li class="business">
|
||||
<span class="tab-inner">
|
||||
<a href="/business/" title="Business">Business</a>
|
||||
</span>
|
||||
</li>
|
||||
<li class="science">
|
||||
<span class="tab-inner">
|
||||
<a href="/science/" title="Science">Science</a>
|
||||
</span>
|
||||
</li>
|
||||
<li class="tech-policy">
|
||||
<span class="tab-inner">
|
||||
<a href="/tech-policy/" title="Tech Policy">Tech Policy</a>
|
||||
</span>
|
||||
</li>
|
||||
<li class="staff">
|
||||
<span class="tab-inner">
|
||||
<a href="/staff/" title="Staff">Staff</a>
|
||||
</span>
|
||||
</li>
|
||||
</ul>
|
||||
<ul class="stories">
|
||||
<li id="journal-box-0" class="gadgets">
|
||||
<a href="/gadgets/news/2011/07/dual-core-motorola-droid-3-launches-july-14-for-199-on-verizon.ars">Dual-core Motorola Droid 3 launches July 14 for $199 on Verizon</a>
|
||||
</li>
|
||||
<li id="journal-box-1" class="tech-policy">
|
||||
<a href="/tech-policy/news/2011/07/major-isps-agree-to-six-strikes-copyright-enforcement-plan.ars">Major ISPs agree to "six strikes" copyright enforcement plan</a>
|
||||
</li>
|
||||
<li id="journal-box-2" class="gaming">
|
||||
<a href="/gaming/news/2011/07/sony-to-include-mandatory-psn-pass-codes-in-first-party-games.ars">Sony to include one-time use "PSN Pass" code in its games</a>
|
||||
</li>
|
||||
<li id="journal-box-3" class="gaming">
|
||||
<a href="/gaming/news/2011/07/journey-turns-strangers-to-friends-in-odd-desolate-landscape.ars"><em>Journey</em> turns strangers into friends in odd, desolate landscape</a>
|
||||
</li>
|
||||
<li id="journal-box-4" class="science">
|
||||
<a href="/science/news/2011/07/is-science-getting-harder-first-define-easy.ars">Is scientific progress slowing? Depends how you measure it</a>
|
||||
</li>
|
||||
<li id="journal-box-5" class="tech-policy">
|
||||
<a href="/tech-policy/news/2011/07/did-the-titanic-disaster-let-uncle-sam-take-over-the-airwaves.ars">How the <em>Titanic</em> disaster pushed Uncle Sam to "rule the air"</a>
|
||||
</li>
|
||||
<li id="journal-box-6" class="web">
|
||||
<a href="/web/news/2011/07/facebook-video-chatting-handy-definitely-not-awesome.ars">Analysis: Facebook video chatting handy, definitely not "awesome"</a>
|
||||
</li>
|
||||
<li id="journal-box-7" class="tech-policy">
|
||||
<a href="/tech-policy/news/2011/07/dozens-of-law-professors-protect-ip-act-is-unconstitutional.ars">Dozens of law professors: PROTECT IP Act is unconstitutional</a>
|
||||
</li>
|
||||
<li id="journal-box-8" class="tech-policy">
|
||||
<a href="/tech-policy/news/2011/07/should-net-neutrality-protect-third-party-mobile-tethering-apps.ars">Does net neutrality protect mobile tethering apps?</a>
|
||||
</li>
|
||||
<li id="journal-box-9" class="apple">
|
||||
<a href="/apple/news/2011/07/wsj-next-iphone-to-be-thinner-and-lighter-than-iphone-4.ars">WSJ: next iPhone to be "thinner and lighter" than iPhone 4</a>
|
||||
</li>
|
||||
<li id="journal-box-10" class="apple">
|
||||
<a href="/apple/news/2011/07/iphone-users-spend-147-hours-a-month-playing-games.ars">iPhone users spend 14.7 hours a month playing games</a>
|
||||
</li>
|
||||
<li id="journal-box-11" class="tech-policy">
|
||||
<a href="/tech-policy/news/2011/07/copyright-troll-righthaven-now-starts-paying-those-it-sued.ars">Copyright troll Righthaven now starts paying those it sued</a>
|
||||
</li>
|
||||
<li id="journal-box-12" class="web">
|
||||
<a href="/web/news/2011/07/june-browser-stats-rapid-release-edition.ars">June Web browser stats: Rapid Release edition</a>
|
||||
</li>
|
||||
<li id="journal-box-13" class="gadgets">
|
||||
<a href="/gadgets/news/2011/07/amazon-appstore-game-developer-pulls-app-highlights-problems.ars">Amazon Appstore problems: why one developer pulled its game</a>
|
||||
</li>
|
||||
<li id="journal-box-14" class="science">
|
||||
<a href="/science/news/2011/07/ocean-sediment-promising-source-of-rare-earth-metals.ars">Why ocean mud might matter to your future iPhone</a>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
<div class="with-divider" id="fb">
|
||||
<iclint src="http://www.facebook.com/plugins/likebox.php?href=http%3A%2F%2Ffacebook.com%2Farstechnica&width=300&colorscheme=light&show_faces=false&stream=false&header=false&height=62&border_color=%23FFFFFF" scrolling="no" frameborder="0" style="border:none; overflow:hidden; width:300px; height:62px;" allowTransparency="true"></iclint>
|
||||
<iclint src="http://www.facebook.com/plugins/activity.php?site=arstechnica.com&width=300&height=370&header=false&colorscheme=light&recommendations=false&border_color=%23FFFFFF" scrolling="no" frameborder="0" style="border:none; overflow:hidden; width:300px; height:370px;" allowTransparency="true"></iclint>
|
||||
|
||||
<p><a href="#" class="anonymous">Disable Facebook on Ars</a></p>
|
||||
</div>
|
||||
<style type="text/css" media="screen">
|
||||
#mieBfNdjZYK {
|
||||
height: 250px;
|
||||
width: 300px;
|
||||
min-height: 250px;
|
||||
margin-bottom: 10px;
|
||||
padding-bottom: 10px;
|
||||
}
|
||||
#mieBfNdjZYK.tall {
|
||||
height: 600px;
|
||||
}
|
||||
|
||||
body.premium-adset #mieBfNdjZYK {
|
||||
/* height: 600px; */
|
||||
}
|
||||
</style>
|
||||
|
||||
<kjaskjas></kjaskjas>
|
||||
<blah></blah>
|
||||
<sakjasd></sakjasd>
|
||||
<div></div>
|
||||
<kjaskjas></kjaskjas>
|
||||
<div></div>
|
||||
<span></span>
|
||||
<clint></clint>
|
||||
|
||||
<div id="mieBfNdjZYK" class="">
|
||||
|
||||
<noscript>
|
||||
<div id="help-by-subscribing">
|
||||
<a href="/gadgets/news/2011/07/dual-core-motorola-droid-3-launches-july-14-for-199-on-verizon.ars/2"><img src="/gadgets/news/2011/07/dual-core-motorola-droid-3-launches-july-14-for-199-on-verizon.ars/4" alt="Please subscribe" /></a></div>
|
||||
</noscript>
|
||||
|
||||
<script type="text/javascript">
|
||||
try {
|
||||
var ppanel = cnp.ad.create(cnp.ad.refreshable, false);
|
||||
ppanel.addParameter({'sz':'300x250'});
|
||||
ppanel.addParameterString('kw=bottom;kw=june-browser-stats-rapid-release-edition;kw=07;kw=2011;kw=news;kw=web;');
|
||||
ppanel.load();
|
||||
} catch(e) {}
|
||||
</script>
|
||||
</div>
|
||||
<div id="jobs-ars" class="with-divider">
|
||||
<h2 class="title">
|
||||
<span class="title">Job.Ars</span>:
|
||||
<span class="subtitle">looking for a new job?</span>
|
||||
</h2>
|
||||
<div class="body">
|
||||
<ul>
|
||||
<div id="jobs-ars-content">
|
||||
<ul>
|
||||
|
||||
<li>
|
||||
<div class="job-title"><a href="//jobs.arstechnica.com/list/1027/">Software Engineer</a> at minerva-associates.com</div>
|
||||
<div class="job-location">San Diego, CA</div>
|
||||
</li>
|
||||
|
||||
<li>
|
||||
<div class="job-title"><a href="//jobs.arstechnica.com/list/1026/">Software Engineer</a> at minerva-associates.com</div>
|
||||
<div class="job-location">San Diego, CA</div>
|
||||
</li>
|
||||
|
||||
<li>
|
||||
<div class="job-title"><a href="//jobs.arstechnica.com/list/1025/">Senior Java / Scala Developer - Sequencing Informatics </a> at The Broad Institute</div>
|
||||
<div class="job-location">Cambridge, MA</div>
|
||||
</li>
|
||||
|
||||
<li>
|
||||
<div class="job-title"><a href="//jobs.arstechnica.com/list/1024/">Senior Java / Scala Developer - Sequencing Informatics </a> at The Broad Institute</div>
|
||||
<div class="job-location">Cambridge, MA</div>
|
||||
</li>
|
||||
|
||||
<li>
|
||||
<div class="job-title"><a href="//jobs.arstechnica.com/list/1022/">Web Developer for Online Organizing Incubator</a> at Citizen Engagement Laboratory</div>
|
||||
<div class="job-location">San Francisco Bay Area required</div>
|
||||
</li>
|
||||
|
||||
<li>
|
||||
<div class="job-title"><a href="//jobs.arstechnica.com/list/1021/">.NET Developer (Oklahoma City & Salt Lake City) </a> at a la mode, inc.</div>
|
||||
<div class="job-location">Oklahoma City and Salt Lake City</div>
|
||||
</li>
|
||||
|
||||
<li>
|
||||
<div class="job-title"><a href="//jobs.arstechnica.com/list/1019/">Senior Systems Administrator</a> at Synacor</div>
|
||||
<div class="job-location">Buffalo, NY</div>
|
||||
</li>
|
||||
|
||||
<li>
|
||||
<div class="job-title"><a href="//jobs.arstechnica.com/list/1018/">Network Engineer</a> at Box.net</div>
|
||||
<div class="job-location">Palo Alto, CA</div>
|
||||
</li>
|
||||
|
||||
<li>
|
||||
<div class="job-title"><a href="//jobs.arstechnica.com/list/1017/">Software Engineer - Operations</a> at imo</div>
|
||||
<div class="job-location">Palo Alto, CA</div>
|
||||
</li>
|
||||
|
||||
<li>
|
||||
<div class="job-title"><a href="//jobs.arstechnica.com/list/1016/">Software Engineer</a> at imo</div>
|
||||
<div class="job-location">Palo Alto, CA</div>
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
<div id="more-jobs"><a href="//jobs.arstechnica.com">More Job Listings</a></div>
|
||||
</div> </ul>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<!--googleon: all-->
|
||||
</div>
|
||||
<div id="footer">
|
||||
<div id="slogan">Serving the technologist for <span id="decades">1</span> × 10<sup>-1</sup> centuries</div>
|
||||
<iframe src="http://static.arstechnica.net//public/v6/footer.html?1309476727" frameborder="0" scrolling="no" width="1000" height="350"></iframe>
|
||||
</div>
|
||||
</div>
|
||||
<script type="text/javascript">
|
||||
var _gaq = _gaq || [];
|
||||
_gaq.push(['_setAccount', 'UA-31997-1']);
|
||||
_gaq.push(['_trackPageview']);
|
||||
_gaq.push(['_trackPageLoadTime']);
|
||||
(function() {
|
||||
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
|
||||
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
|
||||
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
|
||||
})();
|
||||
|
||||
</script>
|
||||
|
||||
|
||||
<script type="text/javascript">
|
||||
var page_class = 'individual',
|
||||
site_root = "",
|
||||
site_root_rel = '/',
|
||||
discussion_url = "",
|
||||
entry_author = {
|
||||
"peter bright":true,
|
||||
"peter bright":true,
|
||||
"drpizza":true
|
||||
},
|
||||
entry_id = 51247,
|
||||
fp_layout = 'normal',
|
||||
syntaxhighlighter = "http://arstechnica.com/public/full/scripts/syntaxhighlighter.js",
|
||||
new_comments = true,
|
||||
disable_fb = 'false';
|
||||
</script>
|
||||
|
||||
|
||||
<script src="http://static.arstechnica.net//public/v6/scripts/site.min.js?1309476727" type="text/javascript" charset="utf-8"></script>
|
||||
|
||||
<noscript>
|
||||
<img src="http://b.scorecardresearch.com/b?c1=2&c2=6035094&c3=&c4=&c5=&c6=&c15=&cv=1.3&cj=1" style="position:absolute; bottom: 0px; right:0px;"
|
||||
width="1" height="1" alt="" />
|
||||
</noscript>
|
||||
|
||||
<span style="display: none" id="ArsTechnicaNews" class="hslice">
|
||||
<span style="display: none" class="entry-title">Ars Technica News</span>
|
||||
<a style="display: none" href="http://www.ieaddons.com/en/ie8slice/Content.ashx?id=330" rel="entry-content"></a>
|
||||
</span>
|
||||
</body>
|
||||
</html>
|
@ -0,0 +1,53 @@
|
||||
<div id="article"><div id="" class="body">
|
||||
<p>June brought the first result of Mozilla's new Rapid Release strategy for Firefox. Firefox 4, just three months old, was superceded by the all-new but not-too-different <a href="http://arstechnica.com/open-source/news/2011/06/firefox-5-released-arrives-only-three-months-after-firefox-4.ars">Firefox 5</a>. Firefox's market growth was all but ended by the release of Chrome, and Mozilla is hoping that by adopting a similar release schedule to Google, it will be able to reignite the growth of its user base.</p>
|
||||
|
||||
<p>Internet Explorer is down 0.59 points at 53.68 percent. Firefox is essentially unchanged, down 0.04 points to 21.67 percent. Chrome is up 0.59 points to 13.11 percent. Safari is also up, gaining 0.2 points to reach 7.48 percent. Opera dropped 0.3 points to 1.73 percent.</p>
|
||||
|
||||
<div class="news-item-figure CenteredImage"><div class="news-item-figure-image"><img src="http://static.arstechnica.com/browsers-june-2011/global-browser-share.png"/></div></div>
|
||||
<p>The trends established over the last few months are continuing: Firefox is treading water, while Internet Explorer is losing users, which seem to be being picked up by Chrome. In the past two months, Opera has dropped 0.41 points—that's a loss representing 20% of its market share. Our own Ryan Paul <a href="http://arstechnica.com/software/reviews/2011/06/hands-on-opera-1150s-new-featherweight-interface-packs-a-punch.ars">liked Opera 11.50</a>, which was released just a couple of days ago, so perhaps this will help turn around a perilous slide.</p>
|
||||
|
||||
<p>Looking at individual versions, Internet Explorer 6, 7, and 8 are all down, by 0.18, 0.46, and 1.21 points respectively. Internet Explorer 9 made strong gains, of 1.44 points, but not enough to undo the losses. Internet Explorer 9's gains seem to be occurring at the expense of older versions—Internet Explorer 8 on Windows 7, versions 7 and 8 on Windows Vista—rather than making converts of the other browsers.</p>
|
||||
|
||||
<div class="news-item-figure CenteredImage"><div class="news-item-figure-image"><img src="http://static.arstechnica.com/browsers-june-2011/internet-explorer-transition.png"/></div></div>
|
||||
|
||||
<p>Internet Explorer 9 is of course at something of a disadvantage, as it won't run on Windows XP. While we <a href="http://arstechnica.com/microsoft/news/2010/04/why-microsoft-did-the-right-thing-in-ditching-xp-for-ie9.ars">agree with the decision to cut Windows XP off</a>, one consequence is that not a single Internet Explorer 6 user can upgrade to Internet Explorer 9. Nor can anyone using Internet Explorer 7 or 8 on Windows XP. If the focus is narrowed from all users to just those using Windows 7, the Internet Explorer 9 situation looks a little more promising. Though Internet Explorer 8, which ships with Windows 7, commands the highest market share, at 38.47 percent of Windows 7 users, Internet Explorer 9 takes second place, at 15.61 percent—putting it ahead of Firefox 4 and Chrome 12, at 13.74 and 11.60 percent, respectively.</p>
|
||||
|
||||
<p>Internet Explorer 9 seems, therefore, to be performing well among users of Microsoft's latest and greatest operating system; it's just that only 27 percent of the global audience is running that platform. Windows XP still commands a slim majority, with a global share of 51 percent. As Windows XP declines and Windows 7 grows, we can expect to see Internet Explorer 9 lifted by this transition.</p>
|
||||
|
||||
<div class="news-item-figure CenteredImage"><div class="news-item-figure-image"><img src="http://static.arstechnica.com/browsers-june-2011/firefox-transition.png"/></div></div>
|
||||
|
||||
<p>Firefox versions 3.5 and 3.6 both saw drops last month, by 2.06 and 0.28 points, respectively, and versions 4 and 5 rose by 0.38 and 2.05 points, respectively. This suggests that the transition from "old" Firefox (3.x) to "modern" Firefox (4 and 5) is slowing down; in May, the 3.x versions dropped by an aggregate of more than 4.5 points, with the then-current Firefox 4 picking up all of those users. This month, only around half as many users made the switch. Though "modern" Firefox versions are now used by a majority of Firefox users, it looks like a hard core of "old" users is going to stick around. Over the next few months, we can expect Firefox 3.5 to decline more heavily, as Mozilla intends to push out a patch that will upgrade users to the newest 3.6 version.</p>
|
||||
|
||||
<div class="news-item-figure CenteredImage"><div class="news-item-figure-image"><img src="http://static.arstechnica.com/browsers-june-2011/chrome-transition.png"/></div></div>
|
||||
|
||||
<p>Chrome as ever shows rapid migration between versions. Over the course of June, the browser's stable version went from 11 to 12, and the rapid cutover we've grown to expect occurred. However, that transition isn't complete. 1.39 percent of users are on Chrome 10 or older, and it looks like Google's generally seamless automatic upgrades aren't touching these users. The source of these users isn't clear, though there a few plausible explanations. Obviously, some individuals and corporate users may simply have opted to disable the updates. Automatic updating is the default, but it can be turned off. Though this gives these users and enterprises greater control over the browser version they're using, this comes at some risk; Google doesn't have security updates for old versions of Chrome, so these people are using browsers with known exploitable flaws.</p>
|
||||
|
||||
<p>Chrome's automatic updating is also dependent on a system service. Though the browser can be installed by non-administrators, installation of the service requires administrator privileges. Unlike Firefox, which checks for and performs updates within the browser itself, Chrome depends on its service to do this task. If the service doesn't exist, updates don't happen.</p>
|
||||
|
||||
<p>That's probably not enough to account for every legacy Chrome user, however. To do that, we probably have to look towards the East Asian market. A long-standing feature of various markets in the region, most notably China and South Korea, is the entrenchment of Internet Explorer, variously attributed to legal mandates (especially in South Korea, where until last year a specific ActiveX control was required for online banking) and widespread software piracy making users reluctant to use Windows Update (even though Internet Explorer upgrades are available to pirated copies of the operating system).</p>
|
||||
|
||||
<p>To support this market, a range of browsers based on Internet Explorer's rendering engine, but with substantially greater features, sprung up. The <a href="http://data.cnzz.com/main.php?s=brow">most popular</a> of these are <a href="http://se.360.cn/">360 Secure Browser</a> with about 19 percent share of the Chinese market, and <a href="http://ie.sogou.com/">Sogou high speed browser</a>, with a little under 6 percent. Though these browsers originally just used the Trident engine that powers Internet Explorer, recent versions extend this by also embedding Chrome. In so doing, they give their users a choice between a relatively modern Chrome browser engine, and the older Internet Explorer engine needed for compatibility. Conceptually, this is very similar to software like <a href="http://code.google.com/chrome/chromeframe/">Chrome Frame</a>, that allows Internet Explorer users to use Chrome for some browser tabs.</p>
|
||||
|
||||
<div class="news-item-figure CenteredImage"><div class="news-item-figure-image"><a href="http://static.arstechnica.com/browsers-june-2011/sogou-ie.png"><img src="http://static.arstechnica.com/browsers-june-2011/thumb-sogou-ie.png"/></a></div><div class="news-item-figure-caption"><p class="news-item-figure-caption-text">Sogou browser running as Internet Explorer</p><p class="news-item-figure-caption-byline">Thanks to Ars reader WJ</p></div></div>
|
||||
|
||||
<p>These dual-engine browsers tend to modify Chrome in several ways, one of which is that they exclude Google's automatic update service. They also tend to embed stale versions of Chrome; the current Sogou uses Chrome 6. The result is that users of these browsers, who may well prefer using Chrome for day-to-day browsing, will be stuck with obsolete versions of the browser. And because of the way they're using Chrome, they're out of reach of Google's update system.</p>
|
||||
|
||||
<div class="news-item-figure CenteredImage"><div class="news-item-figure-image"><a href="http://static.arstechnica.com/browsers-june-2011/sogou-chrome.png"><img src="http://static.arstechnica.com/browsers-june-2011/thumb-sogou-chrome.png"/></a></div><div class="news-item-figure-caption"><p class="news-item-figure-caption-text">Sogou browser using its embedded Chrome</p><p class="news-item-figure-caption-byline">Thanks to Ars reader WJ</p></div></div>
|
||||
|
||||
<p>The net result of these various usage scenarios is that Chrome's non-upgrading userbase is likely to grow ever larger, with ten percent of Chrome users, and climbing, sticking with versions of the browser that are no longer supported.</p>
|
||||
|
||||
<div class="news-item-figure CenteredImage"><div class="news-item-figure-image"><img src="http://static.arstechnica.com/browsers-june-2011/ars-browser-share.png"/></div></div>
|
||||
|
||||
<p>Ars' audience continues to show marked differences from the Internet's norms. Firefox, Safari, Internet Explorer, and Opera all saw drops, of 0.94, 0.37, 0.04, and 0.10 points respectively; Chrome saw gains of 0.88 points, with the remainder of the difference picked up by "other."</p>
|
||||
|
||||
|
||||
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</div>
|
@ -0,0 +1,2 @@
|
||||
test_description: standard article from arstechnica
|
||||
url: http://arstechnica.com/web/news/2011/07/june-browser-stats-rapid-release-edition.ars
|
@ -0,0 +1,52 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<title>A Simple Multi-Page Article For Testing : Page 2</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>A Simple Multi-Page Article For Testing : Page 2</h1>
|
||||
<p>
|
||||
Nunc non blandit velit. Maecenas suscipit sem sed velit tristique
|
||||
facilisis. Quisque condimentum, nisi vitae dictum euismod, diam
|
||||
risus vehicula nibh, in scelerisque lorem risus et risus. Aliquam
|
||||
erat volutpat. Pellentesque habitant morbi tristique senectus et
|
||||
netus et malesuada fames ac turpis egestas. Donec blandit venenatis
|
||||
feugiat. Ut quis turpis ac urna consectetur sagittis. Vestibulum
|
||||
aliquet eros et orci placerat vitae tempus tellus pretium. Quisque
|
||||
rutrum sapien quis nibh facilisis quis posuere ipsum elementum. In
|
||||
ac pretium justo. Sed egestas luctus mollis. Donec rutrum leo a
|
||||
turpis facilisis commodo. Nam quis quam eget mi malesuada
|
||||
scelerisque. Pellentesque semper condimentum sagittis. Nam
|
||||
lobortis, tortor ut placerat viverra, ante felis vehicula sem,
|
||||
blandit ultricies purus urna eget elit. Pellentesque habitant morbi
|
||||
tristique senectus et netus et malesuada fames ac turpis egestas.
|
||||
Sed vel nulla sollicitudin dolor adipiscing dapibus aliquam vitae
|
||||
leo. Phasellus at turpis tempus lectus pellentesque faucibus.
|
||||
</p>
|
||||
<p>
|
||||
Quisque egestas congue metus quis semper. Integer in ornare nunc.
|
||||
Nunc in est eget risus pulvinar tincidunt. Nullam eu tempus tortor.
|
||||
Suspendisse potenti. Aliquam erat volutpat. Praesent sem leo,
|
||||
molestie a dignissim eget, aliquet sit amet est. Suspendisse sed
|
||||
libero in urna tincidunt viverra. Maecenas posuere risus non elit
|
||||
adipiscing a tristique nibh aliquet. Nullam varius risus vitae
|
||||
turpis lacinia pharetra bibendum magna aliquam. Nam consectetur
|
||||
mattis lectus, vitae hendrerit lectus iaculis ut. Curabitur commodo
|
||||
pharetra nibh mollis pulvinar. Nulla in metus dui, vitae ultrices
|
||||
nibh. Cum sociis natoque penatibus et magnis dis parturient montes,
|
||||
nascetur ridiculus mus. Cras sed condimentum mi. Morbi vitae velit
|
||||
in neque tincidunt imperdiet quis quis orci. Proin molestie, erat
|
||||
convallis vulputate consectetur, diam odio interdum arcu, non
|
||||
semper neque ante a dolor.
|
||||
</p>
|
||||
<ul id="pageNumbers">
|
||||
<li> 1 </li>
|
||||
<li>
|
||||
<a title="Page 1" href="/article.html">1</a>
|
||||
</li>
|
||||
<li>
|
||||
<a title="Page 3" href="/article.html?pagewanted=3">3</a>
|
||||
</li>
|
||||
</ul>
|
||||
</body>
|
||||
</html>
|
@ -0,0 +1,60 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<title>A Simple Multi-Page Article For Testing : Page 3</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>A Simple Multi-Page Article For Testing : Page 3</h1>
|
||||
<p>
|
||||
Nullam laoreet, nibh non faucibus dictum, tellus libero varius
|
||||
erat, lobortis varius est massa quis metus. Donec vitae justo
|
||||
lacus, nec convallis metus. Suspendisse potenti. Nunc et rutrum
|
||||
justo. Maecenas ultrices ipsum in magna fermentum eleifend. Fusce
|
||||
sagittis pretium aliquam. Vestibulum et gravida lorem. Sed turpis
|
||||
quam, placerat ac ultrices eu, tempor sit amet elit. Curabitur eu
|
||||
imperdiet velit. Quisque pharetra ornare nunc, a volutpat metus
|
||||
aliquam quis. Vivamus semper aliquam cursus. Nullam ac nibh nulla,
|
||||
luctus pharetra nunc. Etiam ut sapien sem. Fusce vehicula, sem sit
|
||||
amet viverra pretium, magna tortor suscipit nisi, id interdum lorem
|
||||
orci in tellus. Vivamus vel ipsum eros. Fusce porttitor convallis
|
||||
ultricies. Etiam in risus diam, viverra suscipit felis. Duis vitae
|
||||
imperdiet est.
|
||||
</p>
|
||||
<p>
|
||||
Nunc nunc magna, facilisis blandit venenatis ut, scelerisque ac
|
||||
tortor. Cras condimentum fermentum lectus ac convallis. Suspendisse
|
||||
cursus, lacus sit amet sodales molestie, dui erat varius velit, non
|
||||
tincidunt metus dui sed nulla. Aliquam lacus orci, convallis ut
|
||||
pellentesque ac, molestie et dolor. Ut pretium enim ut nunc auctor
|
||||
eget placerat magna luctus. Duis mollis ligula a orci ultrices in
|
||||
facilisis felis feugiat. Morbi eget odio eget erat pulvinar
|
||||
placerat sed nec erat. Duis dignissim, dolor a lacinia commodo,
|
||||
metus erat laoreet dui, in lacinia felis lacus vitae nulla. Fusce
|
||||
imperdiet condimentum volutpat. Vivamus ut lacus a eros cursus
|
||||
scelerisque non sit amet orci. Phasellus id quam odio. Nulla
|
||||
adipiscing venenatis lorem nec feugiat. Aenean sit amet nisl odio,
|
||||
tincidunt scelerisque nisl. Curabitur ut nisl a dui facilisis
|
||||
vulputate. Mauris eu elit et felis hendrerit blandit. Cras magna
|
||||
dolor, imperdiet eget rutrum tempus, euismod nec augue.
|
||||
</p>
|
||||
<p>
|
||||
Ut in sem sit amet felis scelerisque elementum. Suspendisse vitae
|
||||
neque magna, in laoreet felis. Aenean elit ligula, tempor in
|
||||
vestibulum ac, porttitor nec lacus. Aenean urna mi, dictum feugiat
|
||||
placerat eget, congue nec dolor. Etiam pellentesque dictum nulla id
|
||||
vulputate. Etiam sit amet vehicula purus. Integer quis mi nisl,
|
||||
gravida malesuada enim. Donec malesuada felis nisi. Etiam id magna
|
||||
a libero pulvinar ullamcorper in nec neque. Duis pulvinar massa nec
|
||||
magna scelerisque vitae vulputate ipsum luctus.
|
||||
</p>
|
||||
<ul id="pageNumbers">
|
||||
<li> 1 </li>
|
||||
<li>
|
||||
<a title="Page 1" href="/article.html">1</a>
|
||||
</li>
|
||||
<li>
|
||||
<a title="Page 2" href="/article.html?pagewanted=2">2</a>
|
||||
</li>
|
||||
</ul>
|
||||
</body>
|
||||
</html>
|
@ -0,0 +1,123 @@
|
||||
<div id="article">
|
||||
<h1>A Simple Multi-Page Article For Testing</h1>
|
||||
<p>
|
||||
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla et
|
||||
laoreet ligula. Nulla facilisi. Morbi condimentum molestie enim in
|
||||
fermentum. Phasellus sit amet vehicula turpis. Sed eu dolor tortor,
|
||||
et accumsan purus. Aliquam velit nisl, facilisis quis suscipit in,
|
||||
porttitor at lorem. Ut adipiscing suscipit augue, id interdum arcu
|
||||
ultricies et. Etiam risus sapien, suscipit et ultricies vel,
|
||||
suscipit posuere velit. Proin est orci, sollicitudin at luctus
|
||||
feugiat, consectetur a justo. Etiam nec sem vel massa consectetur
|
||||
vulputate non interdum est. Donec sem dui, ultricies a adipiscing
|
||||
eu, placerat sed sem.
|
||||
</p>
|
||||
<p>
|
||||
Nunc lacinia varius justo, at lacinia felis ultricies vel. Proin
|
||||
vestibulum vehicula eleifend. Ut vitae risus eros. Pellentesque
|
||||
habitant morbi tristique senectus et netus et malesuada fames ac
|
||||
turpis egestas. In hac habitasse platea dictumst. Vivamus magna
|
||||
libero, blandit vitae hendrerit porta, dapibus eget eros. Nunc
|
||||
turpis felis, facilisis eu vestibulum sed, porta a ipsum. Vivamus
|
||||
est velit, molestie sed molestie quis, tincidunt a diam. Quisque et
|
||||
neque a ante fermentum tempus in at nunc. Nunc sit amet egestas
|
||||
nisi.
|
||||
</p>
|
||||
<p>
|
||||
Proin in lacus dolor, sit amet molestie quam. Morbi nisi turpis,
|
||||
pharetra at consequat tristique, convallis nec turpis. Vestibulum
|
||||
sit amet magna vitae sem bibendum tincidunt. Maecenas quis tortor
|
||||
eget velit mollis tempor vel a nisl. Vivamus posuere tristique
|
||||
ante, cursus rhoncus tortor malesuada eu. Praesent faucibus viverra
|
||||
orci ac porttitor. Maecenas dui purus, aliquam sed aliquam nec,
|
||||
dignissim vitae libero. Nunc at mauris et ante accumsan
|
||||
pellentesque. In placerat pretium suscipit. Phasellus tellus est,
|
||||
venenatis eu consectetur non, vehicula vel metus. Curabitur
|
||||
venenatis sem fringilla ante elementum eget faucibus nulla tempus.
|
||||
Aenean convallis sapien et dolor lobortis interdum. Phasellus odio
|
||||
risus, sagittis ut elementum ut, porttitor non libero. Integer
|
||||
fringilla magna quis augue dapibus malesuada. Nulla consectetur
|
||||
nisi mi. Suspendisse faucibus lobortis ornare. Nunc venenatis
|
||||
tortor in urna pulvinar pulvinar. Sed et mi nec justo hendrerit
|
||||
cursus ac nec mauris. Morbi et ante a lorem iaculis rutrum vitae eu
|
||||
massa.
|
||||
</p>
|
||||
<h1>A Simple Multi-Page Article For Testing : Page 2</h1>
|
||||
<p>
|
||||
Nunc non blandit velit. Maecenas suscipit sem sed velit tristique
|
||||
facilisis. Quisque condimentum, nisi vitae dictum euismod, diam
|
||||
risus vehicula nibh, in scelerisque lorem risus et risus. Aliquam
|
||||
erat volutpat. Pellentesque habitant morbi tristique senectus et
|
||||
netus et malesuada fames ac turpis egestas. Donec blandit venenatis
|
||||
feugiat. Ut quis turpis ac urna consectetur sagittis. Vestibulum
|
||||
aliquet eros et orci placerat vitae tempus tellus pretium. Quisque
|
||||
rutrum sapien quis nibh facilisis quis posuere ipsum elementum. In
|
||||
ac pretium justo. Sed egestas luctus mollis. Donec rutrum leo a
|
||||
turpis facilisis commodo. Nam quis quam eget mi malesuada
|
||||
scelerisque. Pellentesque semper condimentum sagittis. Nam
|
||||
lobortis, tortor ut placerat viverra, ante felis vehicula sem,
|
||||
blandit ultricies purus urna eget elit. Pellentesque habitant morbi
|
||||
tristique senectus et netus et malesuada fames ac turpis egestas.
|
||||
Sed vel nulla sollicitudin dolor adipiscing dapibus aliquam vitae
|
||||
leo. Phasellus at turpis tempus lectus pellentesque faucibus.
|
||||
</p>
|
||||
<p>
|
||||
Quisque egestas congue metus quis semper. Integer in ornare nunc.
|
||||
Nunc in est eget risus pulvinar tincidunt. Nullam eu tempus tortor.
|
||||
Suspendisse potenti. Aliquam erat volutpat. Praesent sem leo,
|
||||
molestie a dignissim eget, aliquet sit amet est. Suspendisse sed
|
||||
libero in urna tincidunt viverra. Maecenas posuere risus non elit
|
||||
adipiscing a tristique nibh aliquet. Nullam varius risus vitae
|
||||
turpis lacinia pharetra bibendum magna aliquam. Nam consectetur
|
||||
mattis lectus, vitae hendrerit lectus iaculis ut. Curabitur commodo
|
||||
pharetra nibh mollis pulvinar. Nulla in metus dui, vitae ultrices
|
||||
nibh. Cum sociis natoque penatibus et magnis dis parturient montes,
|
||||
nascetur ridiculus mus. Cras sed condimentum mi. Morbi vitae velit
|
||||
in neque tincidunt imperdiet quis quis orci. Proin molestie, erat
|
||||
convallis vulputate consectetur, diam odio interdum arcu, non
|
||||
semper neque ante a dolor.
|
||||
</p>
|
||||
<h1>A Simple Multi-Page Article For Testing : Page 3</h1>
|
||||
<p>
|
||||
Nullam laoreet, nibh non faucibus dictum, tellus libero varius
|
||||
erat, lobortis varius est massa quis metus. Donec vitae justo
|
||||
lacus, nec convallis metus. Suspendisse potenti. Nunc et rutrum
|
||||
justo. Maecenas ultrices ipsum in magna fermentum eleifend. Fusce
|
||||
sagittis pretium aliquam. Vestibulum et gravida lorem. Sed turpis
|
||||
quam, placerat ac ultrices eu, tempor sit amet elit. Curabitur eu
|
||||
imperdiet velit. Quisque pharetra ornare nunc, a volutpat metus
|
||||
aliquam quis. Vivamus semper aliquam cursus. Nullam ac nibh nulla,
|
||||
luctus pharetra nunc. Etiam ut sapien sem. Fusce vehicula, sem sit
|
||||
amet viverra pretium, magna tortor suscipit nisi, id interdum lorem
|
||||
orci in tellus. Vivamus vel ipsum eros. Fusce porttitor convallis
|
||||
ultricies. Etiam in risus diam, viverra suscipit felis. Duis vitae
|
||||
imperdiet est.
|
||||
</p>
|
||||
<p>
|
||||
Nunc nunc magna, facilisis blandit venenatis ut, scelerisque ac
|
||||
tortor. Cras condimentum fermentum lectus ac convallis. Suspendisse
|
||||
cursus, lacus sit amet sodales molestie, dui erat varius velit, non
|
||||
tincidunt metus dui sed nulla. Aliquam lacus orci, convallis ut
|
||||
pellentesque ac, molestie et dolor. Ut pretium enim ut nunc auctor
|
||||
eget placerat magna luctus. Duis mollis ligula a orci ultrices in
|
||||
facilisis felis feugiat. Morbi eget odio eget erat pulvinar
|
||||
placerat sed nec erat. Duis dignissim, dolor a lacinia commodo,
|
||||
metus erat laoreet dui, in lacinia felis lacus vitae nulla. Fusce
|
||||
imperdiet condimentum volutpat. Vivamus ut lacus a eros cursus
|
||||
scelerisque non sit amet orci. Phasellus id quam odio. Nulla
|
||||
adipiscing venenatis lorem nec feugiat. Aenean sit amet nisl odio,
|
||||
tincidunt scelerisque nisl. Curabitur ut nisl a dui facilisis
|
||||
vulputate. Mauris eu elit et felis hendrerit blandit. Cras magna
|
||||
dolor, imperdiet eget rutrum tempus, euismod nec augue.
|
||||
</p>
|
||||
<p>
|
||||
Ut in sem sit amet felis scelerisque elementum. Suspendisse vitae
|
||||
neque magna, in laoreet felis. Aenean elit ligula, tempor in
|
||||
vestibulum ac, porttitor nec lacus. Aenean urna mi, dictum feugiat
|
||||
placerat eget, congue nec dolor. Etiam pellentesque dictum nulla id
|
||||
vulputate. Etiam sit amet vehicula purus. Integer quis mi nisl,
|
||||
gravida malesuada enim. Donec malesuada felis nisi. Etiam id magna
|
||||
a libero pulvinar ullamcorper in nec neque. Duis pulvinar massa nec
|
||||
magna scelerisque vitae vulputate ipsum luctus.
|
||||
</p>
|
||||
</div>
|
@ -0,0 +1,60 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<title>A Simple Multi-Page Article For Testing</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>A Simple Multi-Page Article For Testing</h1>
|
||||
<p>
|
||||
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla et
|
||||
laoreet ligula. Nulla facilisi. Morbi condimentum molestie enim in
|
||||
fermentum. Phasellus sit amet vehicula turpis. Sed eu dolor tortor,
|
||||
et accumsan purus. Aliquam velit nisl, facilisis quis suscipit in,
|
||||
porttitor at lorem. Ut adipiscing suscipit augue, id interdum arcu
|
||||
ultricies et. Etiam risus sapien, suscipit et ultricies vel,
|
||||
suscipit posuere velit. Proin est orci, sollicitudin at luctus
|
||||
feugiat, consectetur a justo. Etiam nec sem vel massa consectetur
|
||||
vulputate non interdum est. Donec sem dui, ultricies a adipiscing
|
||||
eu, placerat sed sem.
|
||||
</p>
|
||||
<p>
|
||||
Nunc lacinia varius justo, at lacinia felis ultricies vel. Proin
|
||||
vestibulum vehicula eleifend. Ut vitae risus eros. Pellentesque
|
||||
habitant morbi tristique senectus et netus et malesuada fames ac
|
||||
turpis egestas. In hac habitasse platea dictumst. Vivamus magna
|
||||
libero, blandit vitae hendrerit porta, dapibus eget eros. Nunc
|
||||
turpis felis, facilisis eu vestibulum sed, porta a ipsum. Vivamus
|
||||
est velit, molestie sed molestie quis, tincidunt a diam. Quisque et
|
||||
neque a ante fermentum tempus in at nunc. Nunc sit amet egestas
|
||||
nisi.
|
||||
</p>
|
||||
<p>
|
||||
Proin in lacus dolor, sit amet molestie quam. Morbi nisi turpis,
|
||||
pharetra at consequat tristique, convallis nec turpis. Vestibulum
|
||||
sit amet magna vitae sem bibendum tincidunt. Maecenas quis tortor
|
||||
eget velit mollis tempor vel a nisl. Vivamus posuere tristique
|
||||
ante, cursus rhoncus tortor malesuada eu. Praesent faucibus viverra
|
||||
orci ac porttitor. Maecenas dui purus, aliquam sed aliquam nec,
|
||||
dignissim vitae libero. Nunc at mauris et ante accumsan
|
||||
pellentesque. In placerat pretium suscipit. Phasellus tellus est,
|
||||
venenatis eu consectetur non, vehicula vel metus. Curabitur
|
||||
venenatis sem fringilla ante elementum eget faucibus nulla tempus.
|
||||
Aenean convallis sapien et dolor lobortis interdum. Phasellus odio
|
||||
risus, sagittis ut elementum ut, porttitor non libero. Integer
|
||||
fringilla magna quis augue dapibus malesuada. Nulla consectetur
|
||||
nisi mi. Suspendisse faucibus lobortis ornare. Nunc venenatis
|
||||
tortor in urna pulvinar pulvinar. Sed et mi nec justo hendrerit
|
||||
cursus ac nec mauris. Morbi et ante a lorem iaculis rutrum vitae eu
|
||||
massa.
|
||||
</p>
|
||||
<ul id="pageNumbers">
|
||||
<li> 1 </li>
|
||||
<li>
|
||||
<a title="Page 2" href="/article.html?pagewanted=2">2</a>
|
||||
</li>
|
||||
<li>
|
||||
<a title="Page 3" href="/article.html?pagewanted=3">3</a>
|
||||
</li>
|
||||
</ul>
|
||||
</body>
|
||||
</html>
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,11 @@
|
||||
<div id="article"><div class="comment-content" id="comment-content-4e141229cadcbbb33f050000">
|
||||
|
||||
|
||||
<p class="comment-text">
|
||||
Yep, you gotta love that almost 90% market share failure. Like I said before, if that's failure than sign me up for some of that. I'm pretty sure the good people over at Apple, Google, etc. would like to be signed up for some of that failure too.<br/><br/>
|
||||
For the, "If this, if that, (insert scenario)" people, enjoy your new OS and whatever other new software you may choose to use. However, don't be surprised when those metro ui interface imitations start to land on those products too. Did you really think that static grid-icons on a screen was going to last forever? I think 20+ years is enough, it's time for new innovation in design and don't be surprised when the copycats jump on board. That's the way the industry works. One group comes up with a new design or concept and the others tend to follow suit and you don't have to be a market leader to get that following. Just ask the Opera/Chrome developers. That's just one of many, many examples that could be pointed out. The metro ui is a very suitable design for the touch screen world that we're migrating to. Sure, there will be changes and enhancements as time goes on and everyone will put their own spin on it, but I'd get used to similar offerings from MSFT's competitors if I were you.<br/><br/>
|
||||
Also, for those who like to comment, but seem to have little info about what's expected in things like Windows 8, let me fill you in a bit. The info. out right now is that Windows 8 will let you choose to use the new ui or to use the more, "Windows past" icon ui. I think anyone with some modicum of common sense can see how that would be a wise move from MSFT. For instance: The metro ui may not appeal to the corporate world as much as the consumer world. Plus, it give long-time Window's users the option to stick with what they know, but still gain the newest features and security measures that new OS's tend to bring. So, if your going to use another product, but all means, have fun with it, but don't try to justify it to yourself with reasons that are unlikely to exist. Just say you want to move on and anyone else can respect that, but when you seem to have little knowledge of what your options will be, it just makes you look like the typical sheep some people can be.<br/><br/>
|
||||
Personally, I love the new direction MSFT is going in and for the first time in years, they seem to be thinking more and more consumer friendly. That's not an easy task for a company who has to appeal to business the way MSFT does and I commend the effort. Believe me, or don't, but Apple, Google and any other group would suffer the same balancing act if they dominated the corporate world the way Microsoft does. Corporate and consumers are very different beasts and it's not always easy to appeal to both, yet Microsoft has kept a large following in both sectors and anyone who doesn't see the skill it takes to do that, has a lot to learn my friends. </p>
|
||||
|
||||
</div>
|
||||
</div>
|
@ -0,0 +1,3 @@
|
||||
test_description: businessinsider article
|
||||
notes: missed the article completely; got a long comment instead
|
||||
url: http://www.businessinsider.com/where-windows-8-came-from-microsoft-ui-ideas-that-never-took-off-2011-7
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -0,0 +1,2 @@
|
||||
test_description: cnet article
|
||||
url: http://howto.cnet.com/8301-11310_39-20078249-285/best-free-alternatives-to-top-selling-software/?tag=epicStories
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -0,0 +1,2 @@
|
||||
test_description: deadspin article
|
||||
url: http://deadspin.com/5820463/would-you-kill-a-stranger-to-save-football
|
@ -0,0 +1,48 @@
|
||||
<div id="article">
|
||||
<div id="page-1" class="article-page">
|
||||
<p>
|
||||
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla et
|
||||
laoreet ligula. Nulla facilisi. Morbi condimentum molestie enim in
|
||||
fermentum. Phasellus sit amet vehicula turpis. Sed eu dolor tortor,
|
||||
et accumsan purus. Aliquam velit nisl, facilisis quis suscipit in,
|
||||
porttitor at lorem. Ut adipiscing suscipit augue, id interdum arcu
|
||||
ultricies et. Etiam risus sapien, suscipit et ultricies vel,
|
||||
suscipit posuere velit. Proin est orci, sollicitudin at luctus
|
||||
feugiat, consectetur a justo. Etiam nec sem vel massa consectetur
|
||||
vulputate non interdum est. Donec sem dui, ultricies a adipiscing
|
||||
eu, placerat sed sem.
|
||||
</p>
|
||||
<p>
|
||||
Nunc lacinia varius justo, at lacinia felis ultricies vel. Proin
|
||||
vestibulum vehicula eleifend. Ut vitae risus eros. Pellentesque
|
||||
habitant morbi tristique senectus et netus et malesuada fames ac
|
||||
turpis egestas. In hac habitasse platea dictumst. Vivamus magna
|
||||
libero, blandit vitae hendrerit porta, dapibus eget eros. Nunc
|
||||
turpis felis, facilisis eu vestibulum sed, porta a ipsum. Vivamus
|
||||
est velit, molestie sed molestie quis, tincidunt a diam. Quisque et
|
||||
neque a ante fermentum tempus in at nunc. Nunc sit amet egestas
|
||||
nisi.
|
||||
</p>
|
||||
</div>
|
||||
<div id="page-2" class="article-page">
|
||||
<p>
|
||||
Proin in lacus dolor, sit amet molestie quam. Morbi nisi turpis,
|
||||
pharetra at consequat tristique, convallis nec turpis. Vestibulum
|
||||
sit amet magna vitae sem bibendum tincidunt. Maecenas quis tortor
|
||||
eget velit mollis tempor vel a nisl. Vivamus posuere tristique
|
||||
ante, cursus rhoncus tortor malesuada eu. Praesent faucibus viverra
|
||||
orci ac porttitor. Maecenas dui purus, aliquam sed aliquam nec,
|
||||
dignissim vitae libero. Nunc at mauris et ante accumsan
|
||||
pellentesque. In placerat pretium suscipit. Phasellus tellus est,
|
||||
venenatis eu consectetur non, vehicula vel metus. Curabitur
|
||||
venenatis sem fringilla ante elementum eget faucibus nulla tempus.
|
||||
Aenean convallis sapien et dolor lobortis interdum. Phasellus odio
|
||||
risus, sagittis ut elementum ut, porttitor non libero. Integer
|
||||
fringilla magna quis augue dapibus malesuada. Nulla consectetur
|
||||
nisi mi. Suspendisse faucibus lobortis ornare. Nunc venenatis
|
||||
tortor in urna pulvinar pulvinar. Sed et mi nec justo hendrerit
|
||||
cursus ac nec mauris. Morbi et ante a lorem iaculis rutrum vitae eu
|
||||
massa.
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
@ -0,0 +1,25 @@
|
||||
<div id="page-1" class="article-page">
|
||||
<p>
|
||||
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla et
|
||||
laoreet ligula. Nulla facilisi. Morbi condimentum molestie enim in
|
||||
fermentum. Phasellus sit amet vehicula turpis. Sed eu dolor tortor,
|
||||
et accumsan purus. Aliquam velit nisl, facilisis quis suscipit in,
|
||||
porttitor at lorem. Ut adipiscing suscipit augue, id interdum arcu
|
||||
ultricies et. Etiam risus sapien, suscipit et ultricies vel,
|
||||
suscipit posuere velit. Proin est orci, sollicitudin at luctus
|
||||
feugiat, consectetur a justo. Etiam nec sem vel massa consectetur
|
||||
vulputate non interdum est. Donec sem dui, ultricies a adipiscing
|
||||
eu, placerat sed sem.
|
||||
</p>
|
||||
<p>
|
||||
Nunc lacinia varius justo, at lacinia felis ultricies vel. Proin
|
||||
vestibulum vehicula eleifend. Ut vitae risus eros. Pellentesque
|
||||
habitant morbi tristique senectus et netus et malesuada fames ac
|
||||
turpis egestas. In hac habitasse platea dictumst. Vivamus magna
|
||||
libero, blandit vitae hendrerit porta, dapibus eget eros. Nunc
|
||||
turpis felis, facilisis eu vestibulum sed, porta a ipsum. Vivamus
|
||||
est velit, molestie sed molestie quis, tincidunt a diam. Quisque et
|
||||
neque a ante fermentum tempus in at nunc. Nunc sit amet egestas
|
||||
nisi.
|
||||
</p>
|
||||
</div>
|
@ -0,0 +1,20 @@
|
||||
<div id="page-3" class="article-page">
|
||||
<p>
|
||||
Nunc non blandit velit. Maecenas suscipit sem sed velit tristique
|
||||
facilisis. Quisque condimentum, nisi vitae dictum euismod, diam risus
|
||||
vehicula nibh, in scelerisque lorem risus et risus. Aliquam erat
|
||||
volutpat. Pellentesque habitant morbi tristique senectus et netus et
|
||||
malesuada fames ac turpis egestas. Donec blandit venenatis feugiat. Ut
|
||||
quis turpis ac urna consectetur sagittis. Vestibulum aliquet eros et
|
||||
orci placerat vitae tempus tellus pretium. Quisque rutrum sapien quis
|
||||
nibh facilisis quis posuere ipsum elementum. In ac pretium justo. Sed
|
||||
egestas luctus mollis. Donec rutrum leo a turpis facilisis commodo. Nam
|
||||
quis quam eget mi malesuada scelerisque. Pellentesque semper
|
||||
condimentum sagittis. Nam lobortis, tortor ut placerat viverra, ante
|
||||
felis vehicula sem, blandit ultricies purus urna eget elit.
|
||||
Pellentesque habitant morbi tristique senectus et netus et malesuada
|
||||
fames ac turpis egestas. Sed vel nulla sollicitudin dolor adipiscing
|
||||
dapibus aliquam vitae leo. Phasellus at turpis tempus lectus
|
||||
pellentesque faucibus.
|
||||
</p>
|
||||
</div>
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,31 @@
|
||||
<div id="article"><div class="mod-article-title">
|
||||
<div class="datehead"><span class="page-actions">
|
||||
<p id="fb-root"/><p class="date"><span>Updated: </span>July 12, 2011, 4:52 PM ET</p>
|
||||
</span></div>
|
||||
<p class="headline">
|
||||
</p><h1 class="h2">Roger Clemens' defense sets strategy</h1>
|
||||
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
<div><p>
|
||||
WASHINGTON -- <a href="http://espn.go.com/mlb/player/_/id/1427/roger-clemens">Roger Clemens</a>' attorney revealed Tuesday that the ex-baseball star plans to begin his defense against charges of lying to Congress by questioning if the lawmakers' investigation into whether he used performance-enhancing drugs was proper.</p><p>Clemens attorney Michael Attanasio said in court that the hearing the House Oversight and Government Reform Committee held in February 2008 had nothing to do with Congress' responsibility for legislation. He said the hearing was only concerned with airing a "credibility contest" between Clemens and his longtime trainer, Brian McNamee, who said he injected the pitcher with steroids and human growth hormone.</p><p/><div class="mod-container mod-inline content-box mod-podcast floatright mod-no-header-footer">
|
||||
<div class="mod-content"><h4>Mike and Mike in the Morning</h4><p class="podcast-player"/>
|
||||
<p>ESPN legal analyst Roger Cossack explains what is going on with the Roger Clemens trial.</p>
|
||||
<p class="footer clear"><a href="http://espn.go.com/espnradio/podcast/"> More Podcasts »</a></p></div></div>
|
||||
<p>Clemens denied those allegations and has been charged with perjury, false statements and obstruction of Congress. The obstruction count charges Clemens with making 15 false or misleading statements to the committee, including his repeated denials he didn't take performance-enhancing drugs during his 24-season career and even whether he attended a 1998 pool party at then-<a href="http://espn.go.com/mlb/team/_/name/tor/toronto-blue-jays">Toronto Blue Jays</a> teammate Jose Canseco's home in Miami.</p><p>McNamee says he saw Clemens and admitted steroids user Canseco talking at the party with another man and that after they returned to Canada, Clemens asked McNamee to inject him with steroids for the first time. </p><p>
|
||||
Clemens and Canseco say Clemens was never at the party but was golfing at the time. Attanasio said that dispute suggests how improper the whole inquiry was and that jurors should be able to determine whether a "he said, he said debate" between Clemens and McNamee was a legitimate congressional concern.</p><p>"We're going to have a mini-trial on whether Roger Clemens went swimming," Attanasio said. "We're going to have a trial in U.S. District Court, Congress is going to have a hearing on these things? That's our point."</p><p>Assistant U.S. attorney Daniel Butler responded that the committee has responsibility for oversight that is broad and goes beyond legislation. He said steroids in baseball is a drug matter and pointed out that a 2005 hearing into the issue led to legislation to regulate steroids and triggered Major League Baseball to commission a report by former Sen. George Mitchell into the extent of the problem in the league.</p><p/><div class="mod-container mod-no-footer mod-inline content-box floatright mod-no-header-footer">
|
||||
<div class="mod-content"><h4>Follow the trial</h4>
|
||||
<img class="io-img" src="http://a.espncdn.com/photo/2010/0116/quinn_tj_m.jpg" border="0"/><p>ESPN's T.J. Quinn will provide live coverage from the courtroom during the Clemens trial. Follow along with our up-to-the-minute <a href="http://twitter.com/#!/TJQuinnESPN" target="_blank"><b>Twitter coverage</b></a>.<br/>
|
||||
•  <b><a href="http://espn.go.com/photo/preview/!pdfs/espn_voir_dire_questions.pdf">Voir dire questions</a></b>
|
||||
</p></div>
|
||||
</div><p>The Mitchell report was released in December 2007 and named Clemens and 85 other current and former ballplayers as using drugs. Clemens denied the allegations and Butler pointed out that leaders of the House committee said they needed to investigate Clemens' denials to determine what weight to give the Mitchell report and its recommendations.</p><p>Attanasio argued that if the committee's purpose was to come full circle on the Mitchell report, it had done so with a January 2008 hearing featuring testimony by Mitchell, baseball commissioner Bud Selig and former players union director Donald Fehr.</p><p>"That ship had left. That work was done. And now it becomes a question between Mr. Clemens and Mr. McNamee," Attanasio said.</p><p>But U.S. District Judge Reggie Walton said if "one of the icons of baseball" was taking exception to the Mitchell report, "it seems to me that Congress has the authority to hold hearings to determine which view is correct."</p><p>Attanasio said the issue will be addressed in testimony from the first two witnesses prosecutors plan to call after opening arguments Wednesday morning. He said the first will be retired House Parliamentarian Charles Johnson, followed by Phil Barnett, who was chief counsel for the committee at the time it investigated Clemens.</p><p>The dispute over the committee's proper role came as Walton considered what preliminary instructions to give the jury, which was seated Tuesday afternoon after 3½ days of screening potential members.</p><p>The jury of 10 women and two men includes a woman whose cousin, former outfielder Al Bumbry, was a coach for the <a href="http://espn.go.com/mlb/team/_/name/bos/boston-red-sox">Boston Red Sox</a> when Clemens played for the team. 
Another woman on the jury said she believes <a href="http://espn.go.com/nfl/team/_/name/phi/philadelphia-eagles">Philadelphia Eagles</a> quarterback <a href="http://sports.espn.go.com/nfl/players/profile?playerId=2549">Michael Vick</a> was "done wrong" in his criminal conviction in connection with dogfighting.</p><p>Four other people were seated as alternate jurors in case any of the 12 can't serve.</p><p>Prosecutors and Clemens' defense team removed 20 people from the pool of 36 jurors, offering no public explanation for their decisions.</p><p>Clemens' attorney pressed potential jurors not to hold it against Clemens if he chooses not to testify, his strongest hint yet that the ex-pitcher might not take the stand.</p><p>Walton also said he was upset to read a New York Daily News item that members of Clemens' family have been criticizing McNamee and other government witnesses on Twitter and elsewhere online. The judge has a gag order on parties involved in the case, but he said he doesn't have any authority over anyone who isn't before him and hopes that those that are were not involved. </p><p>Clemens' attorney Rusty Hardin said he would look into it but that it's been "extremely difficult" for Clemens' family to see harsh criticisms of the baseball star online and in the media and not be able to respond.</p><p><i>Information from The Associated Press was used in this report.</i>
|
||||
</p>
|
||||
|
||||
|
||||
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
@ -0,0 +1,2 @@
|
||||
test_description: espn article
|
||||
url: http://sports.espn.go.com/mlb/news/story?id=6760720
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -0,0 +1,3 @@
|
||||
test_description: mit news article
|
||||
notes: links are broken out into paragraph divs
|
||||
url: http://web.mit.edu/newsoffice/2011/compare-recommendation-systems-0708.html
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -0,0 +1,2 @@
|
||||
test_description: nytimes article
|
||||
url: http://thecaucus.blogs.nytimes.com/2011/07/12/mcconnell-proposal-gives-obama-power-to-increase-debt-limit/?hp
|
@ -0,0 +1,134 @@
|
||||
<div id="article"><div id="page-1" class="article-page"><div class="articleSpanImage"><img src="http://graphics8.nytimes.com/images/2011/07/10/magazine/10bad_span/10bad_span-articleLarge.jpg" alt="" border="0"/><p class="credit">Robert Yager for The New York Times</p>
|
||||
<p class="caption"><strong/>Gilligan on the set with the actors Bryan Cranston and Aaron Paul. </p>
|
||||
</div>
|
||||
<div class="articleBody">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<nyt_text><nyt_correction_top/><p>
|
||||
In the first three seasons of the AMC series “Breaking Bad,” Aaron Paul — or rather, his meth-dealing character, Jesse Pinkman — has been slapped, mauled and beaten purple by, respectively, a hit man, a sociopath and a federal drug-enforcement agent. If he were a piñata, the candy would have poured out of this guy long ago. And apparently there is little mercy for Paul in the new season on the way. For there Paul was, one day in late May, standing on Tijeras Avenue in downtown Albuquerque, being tasered by a brawny man in sunglasses. </p>
|
||||
</nyt_text></div>
|
||||
<div class="articleInline runaroundLeft"><p class="articleInline runaroundLeft"/>
|
||||
|
||||
<div class="inlineImage module">
|
||||
<div class="image">
|
||||
|
||||
<a href="http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html">
|
||||
<img src="http://graphics8.nytimes.com/images/2011/07/10/magazine/10bad1/mag-10Bad-t_CA1-articleInline.jpg" alt=""/></a>
|
||||
</div>
|
||||
<h6 class="credit">Robert Yager for The New York Times</h6>
|
||||
<p class="caption">The goal, Gilligan says, was to turn "Mr. Chips into Scarface." </p>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<div class="articleBody">
|
||||
<p>
|
||||
The street had been blocked off, and a crew of dozens waited as the actors rehearsed the assault with Vince Gilligan, the creator, head writer and show runner, who was also directing the episode. </p><p>
|
||||
“Maybe we play this moment just a little longer, so we know for sure he got zapped,” Gilligan said. “Otherwise, Jesse would fight back more.” </p><p>
|
||||
“Yeah, I like that,” Paul said. </p><p>
|
||||
“And let’s go back to the brass-knuckle-looking taser,” Gilligan said. </p><p>
|
||||
“Fly in the brass-knuckle taser!” a nearby crew member shouted into a walkie-talkie. </p><p>
|
||||
As the cameras were moved into place, Gilligan, who is 44 and speaks in a lyrical Southern drawl, reminisced fondly about some of the torments he has inflicted on Jesse Pinkman. One of the most gruesome was a plunge through the roof of a Port-a-Potty in a junkyard in Season 2. </p><p>
|
||||
“The original version was that he was going to get bit by a guard dog,” Gilligan said, leaning up against a rail and squinting against the New Mexico sun. “But the guard dog would have cost us $25,000, and we didn’t have the money. So we came up with the $5,000 outhouse gag. Which is quite a bit more memorable.” </p><p>
|
||||
Mordantly amusing ordeals are a specialty on “Breaking Bad,” which begins its fourth season on July 17. Credit the show’s forbiddingly grim premise: A 50-year-old high-school chemistry teacher named Walter White (played by Bryan Cranston) finds out he has terminal lung cancer and starts making crystal meth, hoping to leave behind a nest egg for his son and pregnant wife. Walter, it emerges, is a chemistry wizard, and after teaming up with Pinkman, a burnout student he once flunked, the pair drive a ramshackle R.V. into the desert and confect the purest, most coveted meth that local dealers have ever known. With the death penalty of his diagnosis looming, Walt wakes from the slumber of an unfulfilling life, evolving from feckless drudge to reluctant part-time criminal, then gradually to something worse. </p><p>
|
||||
In its first season, “Breaking Bad” seemed like the story of the nuttiest midlife crisis ever, told with elements that felt vaguely familiar. The structure — felonious dad copes with stress of work and family; complications ensue — owed an obvious debt to “The Sopranos,” and the collision of regular people and colorfully violent thugs nodded to Tarantino. The story and setting were an update of the spaghetti Western, minus the cowboys and set in the present. </p><p>
|
||||
But it was soon clear that “Breaking Bad” was something much more satisfying and complex: a revolutionary take on the serial drama. What sets the show apart from its small-screen peers is a subtle metaphysical layer all its own. As Walter inches toward damnation, Gilligan and his writers have posed some large questions about good and evil, questions with implications for every kind of malefactor you can imagine, from Ponzi schemers to terrorists. Questions like: Do we live in a world where terrible people go unpunished for their misdeeds? Or do the wicked ultimately suffer for their sins? </p><p>
|
||||
Gilligan has the nerve to provide his own hopeful answer. “Breaking Bad” takes place in a universe where nobody gets away with anything and karma is the great uncredited player in the cast. This moral dimension might explain why “Breaking Bad” has yet to achieve pop cultural breakthrough status, at least on the scale of other cable hits set in decidedly amoral universes, like “True Blood” or “Mad Men,” AMC’s far-more-buzzed-about series that takes place in an ad agency in the ’60s. The total audience for “Breaking Bad” is only slightly smaller than that of “Mad Men” — 19.5 million versus 22.4 million cumulative viewers in their respective third seasons — but the top three markets for “Breaking Bad” are Albuquerque/Santa Fe, Kansas City and Memphis; neither New York nor Los Angeles are in its top 10. The show, in other words, doesn’t play on the coasts. It gets chatter, just not among what has long been considered the chattering class. </p><nyt_author_id><div class="authorIdentification">
|
||||
<p>David Segal is a reporter for the Business section of The New York Times. His most recent article in the magazine was about the New York Cosmos. Editor: Adam Sternbergh (a.sternbergh-MagGroup@nytimes.com)</p> </div>
|
||||
</nyt_author_id><nyt_correction_bottom><p class="articleCorrection">
|
||||
</p>
|
||||
</nyt_correction_bottom><nyt_update_bottom/></div> </div><div id="page-2" class="article-page"><div class="articleBody">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<nyt_text><nyt_correction_top/><p><font size="-1">(Page 2 of 5)</font></p><p/><p/><p>
|
||||
Which might make Gilligan TV’s first true red-state auteur. His characters lead middle-American lives in a middle-American place, and they are beset with middle-American problems. They speak like middle Americans too, and they inhabit a realm of moral ambiguities that’s overseen by a man with both a wicked sense of humor and a highly refined sense of right and wrong. </p>
|
||||
</nyt_text></div>
|
||||
|
||||
<div class="articleBody">
|
||||
<p>
|
||||
“If there’s a larger lesson to ‘Breaking Bad,’ it’s that actions have consequences,” Gilligan said during lunch one day in his trailer. “If religion is a reaction of man, and nothing more, it seems to me that it represents a human desire for wrongdoers to be punished. I hate the idea of Idi Amin living in Saudi Arabia for the last 25 years of his life. That galls me to no end.” </p><p>
|
||||
He paused for a moment and speared a few tater tots in a white plastic-foam tray perched on his lap. </p><p>
|
||||
“I feel some sort of need for biblical atonement, or justice, or something,” he said between chews. “I like to believe there is some comeuppance, that karma kicks in at some point, even if it takes years or decades to happen,” he went on. “My girlfriend says this great thing that’s become my philosophy as well. ‘I want to believe there’s a heaven. But I can’t not believe there’s a hell.’ ” </p><p>
|
||||
‘Breaking Bad” was born out of a conversation in 2004 between Gilligan and a friend named Thomas Schnauz, who is now a writer on the show. Schnauz had just read a story about a man cooking meth in an apartment complex, which had sickened kids in apartments above. Saddam Hussein’s putative mobile chemical-weapons labs came up in the conversation, too. </p><p>
|
||||
“Neither of us were working,” Schnauz says, “and we were like two 70-year-old men who like to complain about the world. And somehow we spun off into the idea of driving around in a mobile lab, cooking meth. It was a joke and not something I would have ever thought about again. But a couple days later Vince called back and said: ‘Remember we were talking about that mobile lab and meth? Do you mind if I run with that?’ ” </p><p>
|
||||
A show about a very smart middle-aged guy who hadn’t quite achieved his dreams had a faintly autobiographical whiff for Gilligan at the time. He grew up in Farmville, Va., a town of roughly 6,000 people, not far from Appomattox, the site of the South’s surrender in the Civil War. His father was an insurance claims adjuster, and his mother was a grade-school teacher who had a brief career as a wing walker. “Vince was an acolyte in the Catholic Church,” Gail Gilligan says, though she notes that he also played Dungeons and Dragons. “There was certainly a lot of evil in that game, but it never seemed to affect him adversely.” </p><p>
|
||||
Gilligan earned a partial scholarship to attend New York University’s film program, where his instructors included Jesse Kornbluth, who remembers a polite kid who was so good at drawing bent, violent characters that Kornbluth initially pegged him as the “go postal” type. “In the end, he turned us all into his audience,” Kornbluth said to me. “We were all just mesmerized. Attendance was unnaturally high on days when he was reading his scenes.” </p><p>
|
||||
After graduating, Gilligan won a screenplay contest in 1989, and one of the judges, a producer named Mark Johnson (now an executive producer on “Breaking Bad”), helped him find an agent and sell scripts to Hollywood. Two of them, “Home Fries,” starring Drew Barrymore, and “Wilder Napalm,” starring Debra Winger and Dennis Quaid, were turned into films. It was a promising start. Gilligan bought a house outside Richmond, assuming that he would keep lobbing movie scripts to Los Angeles, which would keep lobbing money back. That did not happen. By 1994, the money dried up and he lost his writer’s guild health insurance. That year, his agent got Gilligan a meeting with Chris Carter, the creator of “The X-Files.” </p><nyt_author_id><div class="authorIdentification">
|
||||
<p>David Segal is a reporter for the Business section of The New York Times. His most recent article in the magazine was about the New York Cosmos. Editor: Adam Sternbergh (a.sternbergh-MagGroup@nytimes.com)</p> </div>
|
||||
</nyt_author_id><nyt_correction_bottom><p class="articleCorrection">
|
||||
</p>
|
||||
</nyt_correction_bottom><nyt_update_bottom/></div> </div><div id="page-3" class="article-page"><div class="articleBody">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<nyt_text><nyt_correction_top/><p><font size="-1">(Page 3 of 5)</font></p><p/><p/><p>
|
||||
“I pitched them an idea about a guy whose shadow comes to life and sucks people in like a black hole and kills them,” he recalls. “They bought that as a freelance episode, and then I moved to California.” He spent seven years as a writer and producer on “The X-Files,” his first full-time TV job. The gig died with the show in 2002, and what followed was another succession of false starts and disappointments. There was “Lone Gunman,” a show for Fox, which expired after one year, and one for CBS called “Battle Creek,” which failed to ignite. </p>
|
||||
</nyt_text></div>
|
||||
|
||||
<div class="articleBody">
|
||||
<p>
|
||||
“I’ve had two fallow periods in my life,” Gilligan said. The first one was after his two movies were made. “The second was the five years after ‘X-Files.’ Money wasn’t as big an issue as it was the first time, but as a writer you always want to be working on something that has a hope in hell of being made.” </p><p>
|
||||
In its basic outline, “Breaking Bad” — the title is a Southern phrase for going wild — also seemed destined for rejection. Its concept sounded a lot like that of “Weeds,” Showtime’s suburban pot-dealer series. Plus, its lead character is given a diagnosis of cancer within the first 20 minutes, and the action centers on one of the most destructive (and unglamorous) drugs known to man. Not to mention that the show ditches Rule No. 1 of series TV: the personality of the main character must stay the same. </p><p>
|
||||
“Television is really good at protecting the franchise,” Gilligan said. “It’s good at keeping the Korean War going for 11 seasons, like ‘M*A*S*H.’ It’s good at keeping Marshal Dillon policing his little town for 20 years. By their very nature TV shows are open-ended. So I thought, Wouldn’t it be interesting to have a show that takes the protagonist and transforms him into the antagonist?” </p><p>
|
||||
That was the pitch to AMC executives in 2007. The network was searching for a second original series, to go along with “Mad Men,” which made its debut that year. The goal was to find something set in the present, so that AMC wasn’t pigeonholed as the home of period television. And management wanted a conceit that would skew male and complement the network’s library of antihero action movies, the kind that star Clint Eastwood and Charles Bronson. Sitting in his Manhattan office, Charlie Collier, the president of AMC, recalls his introduction to Gilligan’s work: “Our development team put the pilot script on my desk and said, ‘Just read this.’ ” </p><p>
|
||||
At the time that Gilligan conceived “Breaking Bad,” his past success, plus all the hackwork offers that could have kept him busy for years, fortified his sense that only a show built to his iconoclastic sensibility was worth doing. He wanted a show devoid of snappy banter (of the kind that Aaron Sorkin writes), and one that doesn’t flatter you for getting its winking references (as Matthew Weiner does in “Mad Men,” with his chain-smoking doctors and kids playing with dry-cleaning bags). And he wanted a leading man who would not only change over the course of the series but also suffer crushing reversals with lasting impact. </p><p>
|
||||
That is something new. The depravities of leading men in TV dramas traditionally don’t leave permanent scars. Don Draper of “Mad Men” is still pretty much the tippling rake he has been from the start, despite a flirtation or two with confession and reform. Tony Soprano tried, through therapy, to improve as a human being, but he didn’t get very far. Dr. House of “House” will always be a brilliant cuss. Walter White progresses from unassuming savant to opportunistic gangster — and as he does so, the show dares you to excuse him, or find a moral line that you deem a point of no return. </p><nyt_author_id><div class="authorIdentification">
|
||||
<p>David Segal is a reporter for the Business section of The New York Times. His most recent article in the magazine was about the New York Cosmos. Editor: Adam Sternbergh (a.sternbergh-MagGroup@nytimes.com)</p> </div>
|
||||
</nyt_author_id><nyt_correction_bottom><p class="articleCorrection">
|
||||
</p>
|
||||
</nyt_correction_bottom><nyt_update_bottom/></div> </div><div id="page-4" class="article-page"><div class="articleBody">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<nyt_text><nyt_correction_top/><p><font size="-1">(Page 4 of 5)</font></p><p/><p/><p>
|
||||
In 2007, if you needed an actor to dramatize so profound a transformation, Bryan Cranston would have seemed an unlikely choice. Before “Breaking Bad,” he was known as the dad in “Malcolm in the Middle,” a broadly comic role. When Gilligan told AMC executives that he wanted Cranston to play Walter, they initially were baffled. Then Gilligan explained that years earlier, he cast Cranston in an episode of “The X-Files.” “We had this villain, and we needed the audience to feel bad for him when he died,” Gilligan said. “Bryan alone was the only actor who could do that, who could pull off that trick. And it is a trick. I have no idea how he does it.” </p>
|
||||
</nyt_text></div>
|
||||
|
||||
<div class="articleBody">
|
||||
<p>
|
||||
Meeting Bryan Cranston only deepens the mystery. He is Walter’s opposite. The character is coiled and burdened, while Cranston in person is buoyant. Walter’s default facial expression is a rictus of angst, while Cranston’s is a mischievous smile. Cranston looks at least five years younger than the character, and his co-stars say, he often behaves like a 10-year-old. Aaron Paul described Cranston as “a kid trapped in a man’s body.” Anna Gunn, who plays Skyler, Walter’s wife, says that she has never seen an adult more amused by stuffing fruit down his pants. But Cranston’s performance as Walter White has made history, winning three Emmys in a row for outstanding lead in a drama series, the first actor to do so since Bill Cosby in “I Spy” in the mid-’60s. </p><p>
|
||||
“Physically, to create Walter White, I use my dad,” he said one night over dinner. “My dad is 87 years old. I’m not going to dodder, but Walter is always a little hunched over, never erect. The message to the audience is that the weight of the world is on this man’s shoulders.” </p><p>
|
||||
Cranston is from the total-commitment school of acting, and he once famously did a scene in “Malcolm in the Middle” while covered head to toe with bees. When Gilligan declined to fill in large holes in Walter’s back story, Cranston sat down and wrote out one of his own. On a handful of occasions, he has flagged lines in the script that felt false to him. Cranston reads each episode about a week in advance so that these bumps can be smoothed over before it’s time to start shooting. When he can’t resolve the issue with the writer on the set that week, a call is placed to Gilligan, who is usually in the writer’s room in Burbank. “It’s up to them, but I won’t bend unless I’m convinced it’s the right thing to do,” Cranston says. “Convince me and I’ll do it. I have a theory — our job isn’t to lie to the audience, our job is to find the truth in the character. If we lie, we’re giving the audience a little pinch of poison. They won’t even know they ingested it. But if you lie again and again and again, all of a sudden, your audience is going, ‘This isn’t working for me.’ They just feel sick, and they turn you off.” </p><p>
|
||||
Cranston has found many nuanced ways to enact Walt’s many miseries, the most wrenching of which was the loss of his wife’s love. There is a long history in art of foisting suffering on characters who sin, but it seems to have fallen out of favor. As awful as Tony Soprano was, it’s left purposefully unclear at the end of “The Sopranos” whether he paid the ultimate price. Or consider the “simple chaos” take on the universe as represented in movies by Woody Allen, a director whom Gilligan admires. “And Woody Allen may be right,” Gilligan says. “I’m pretty much agnostic at this point in my life. But I find <a href="http://topics.nytimes.com/top/reference/timestopics/subjects/a/atheism/index.html?inline=nyt-classifier" title="More articles about atheism." class="meta-classifier">atheism</a> just as hard to get my head around as I find fundamental Christianity. Because if there is no such thing as cosmic justice, what is the point of being good? That’s the one thing that no one has ever explained to me. Why shouldn’t I go rob a bank, especially if I’m smart enough to get away with it? What’s stopping me?” </p><p>
|
||||
On a cloudless day in May, five members of the cast and a scrum of crew members were shooting in what is referred to as “the Schrader house,” the home of Walter’s in-laws, Hank and Marie Schrader. It’s rented from a local couple and sits in the shadows of the type of steep, reddish mountains that Wile E. Coyote tumbled off chasing the Road Runner. Gilligan was the ringmaster of this circus, standing on the balcony and sipping a jumbo-size and constantly refilled McDonald’s container of unsweetened iced tea, which he calls brain juice. He was wearing what turned out to be his first pair of designer jeans. They were acquired during a recent shopping spree urged upon him by his girlfriend of 20 years, Holly Rice. His go-to pants have been $12 Wal-Mart jeans, he said, which is what he wore the following day. </p><p>
|
||||
He watched as a crew member put a series of sunglasses on the face of a 20ish Latino man with a nonspeaking background role. </p><p>
|
||||
“I like that one,” he said when the first pair of dark wraparounds were put on the actor’s face. </p><p>
|
||||
On went the second. “Not as good as the first,” Gilligan said. </p><p>
|
||||
Then the third. “Not as good as the first,” Gilligan repeated. </p><p>
|
||||
A fourth. “Let’s go with the first.” </p><p>
|
||||
This, it turns out, is an abbreviated version of a process that Gilligan goes through with virtually every article of clothing, every choice of color, every prop and every extra who appears in “Breaking Bad.” “You see this shirt?” said Dean Norris, who plays Hank Schrader, as he sat on the veranda between takes. He spoke in a stage whisper, out of the side of his mouth, like an inmate describing a warden who has gone insane. “Vince had to see five versions of it before he chose it. <em>Five different shades of a gray T-shirt.</em> That’s unique,” he said, heading into the house. “That’s beyond.” </p><nyt_author_id><div class="authorIdentification">
|
||||
<p>David Segal is a reporter for the Business section of The New York Times. His most recent article in the magazine was about the New York Cosmos. Editor: Adam Sternbergh (a.sternbergh-MagGroup@nytimes.com)</p> </div>
|
||||
</nyt_author_id><nyt_correction_bottom><p class="articleCorrection">
|
||||
</p>
|
||||
</nyt_correction_bottom><nyt_update_bottom/></div> </div><div id="page-5" class="article-page"><div class="articleBody">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<nyt_text><nyt_correction_top/><p><font size="-1">(Page 5 of 5)</font></p><p/><p/><p>
|
||||
Perfectionists often don’t play well with others, but Gilligan seems eager to accommodate everyone with an idea. It’s a running joke in the cast, the disconnect between Gilligan the person and Gilligan the writer. The former is sweet-tempered and polite; the latter strapped a character’s severed head to a tortoise, which was then rigged with explosives and blown up as D.E.A. agents swarmed around it. </p>
|
||||
</nyt_text></div>
|
||||
|
||||
<div class="articleBody">
|
||||
<p>
|
||||
During a break in the shooting, I asked Gilligan if, now four seasons into his show, he could explain the gulf between his manners and his material. </p><p>
|
||||
“I’m not the happiest person,” he said. “But I respect this crew and these actors. I try to be as cheerful as possible. I fake it pretty well.” </p><p>
|
||||
Well, a lot of people can fake cheerful. But how does such a benign-seeming person come up with such malign tales? Gilligan thought for a moment, then quoted Flaubert. “I’m not going to get this exactly right, but it’s something like, ‘You should be neat and orderly in your life so you can be violent and original in your work,’ and there’s something to that,” he said. “It’s fun to explore that darkness and that criminal behavior on the page, but I’m too timid to do it in real life.” </p><p>
|
||||
The pilot of the show opened, memorably, with just such a burst of darkness and violence: Walt driving that R.V. through a desert in a crazed dash, wearing nothing but tighty-whitey briefs and a gas mask. Two male bodies roll in a soup of liquid, broken beakers and cash in the cabin. Cut to three weeks earlier. Walt is a regular schlub, in an unremarkable house, on his way to a mundane job. Gilligan slyly signals his overarching theme when Walter stands before his class and tells his students, “Chemistry is . . . well, technically it’s the study of matter, but I prefer to see it as the study of <em>change</em>.” </p><p>
|
||||
When you give your lead character a terminal illness, usher him into the underworld and embroil him in ever bolder and more ambitious criminal plans, you create a man who is rushing toward the ultimate change — from being alive to being dead. Walter White is surely the most doomed character on television, meaning that, just as “Breaking Bad” is finally winning acclaim, the end of the series is in sight. Which is just fine with Gilligan. He can imagine a fifth season of “Breaking Bad,” but that’s it. </p><p>
|
||||
Driving to the set after lunch one day, he told me that Walter White had started off as a person he could imagine chatting with over a beer. </p><p>
|
||||
“Now he’s not quite at the point where I’d cross the street if I saw him coming,” he said, with a smile. “But I wouldn’t want to be stuck in an elevator with him too long.” Plotting Walt’s transgressions has proved wearying enough. “It’s hard to write a character that dark and morally ambiguous,” he said. “I’m going to miss the show when it’s over, but on some level, it’ll be a relief to not have Walt in my head anymore.” </p><nyt_author_id><div class="authorIdentification">
|
||||
<p>David Segal is a reporter for the Business section of The New York Times. His most recent article in the magazine was about the New York Cosmos. Editor: Adam Sternbergh (a.sternbergh-MagGroup@nytimes.com)</p> </div>
|
||||
</nyt_author_id><nyt_correction_bottom><p class="articleCorrection">
|
||||
</p>
|
||||
</nyt_correction_bottom><nyt_update_bottom/></div> </div></div>
|
@ -0,0 +1,9 @@
|
||||
test_description: multi-page article from nytimes
|
||||
notes: wrongly includes author identification from each page
|
||||
url: http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html
|
||||
url_map:
|
||||
http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=2&_r=1: nytimes-001-orig-2.html
|
||||
http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=2: nytimes-001-orig-2.html
|
||||
http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=3: nytimes-001-orig-3.html
|
||||
http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=4: nytimes-001-orig-4.html
|
||||
http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=5: nytimes-001-orig-5.html
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,6 @@
|
||||
<div id="article"><article><p>Put another way, Democrats reacted to the “grand bargain” proposed by President Obama and House Speaker John Boehner by squawking, complaining and highlighting elements they didn’t like. This is known throughout the world as the way to begin a process of negotiation.</p><p>Republicans, by contrast, answered with a definitive “no” and then covered their ears. Given the looming Aug. 2 deadline for default if the debt ceiling is not raised, the proper term for this approach is blackmail.</p><p>Yet the “both sides are to blame” narrative somehow gained currency after <a href="http://www.washingtonpost.com/business/economy/boehner-abandons-efforts-to-reach-comprehensive-debt-reduction-deal/2011/07/09/gIQARUJ55H_story.html">Boehner announced Saturday</a> that House Republicans would not support any increase in revenue, period. A false equivalence was drawn between the absolute Republican rejection of “revenue-positive” tax reform and the less-than-absolute Democratic opposition to “benefit cuts” in Medicare and Social Security.</p><p>The bogus story line is that the radical right-wing base of the GOP and the radical left-wing base of the Democratic Party are equally to blame for sinking the deal. </p><p>Leave aside, for the moment, the fact that in the Obama-Boehner proposal, there would be roughly three dollars’ worth of budget cuts for every dollar of new revenue. Don’t pause to ask whether it makes sense to slash government spending when the economy is still sputtering out of the worst recession in decades. Instead, focus narrowly on the politics of the deal.</p><p>It is true that House Minority Leader Nancy Pelosi howled like a blindsided politician when she learned that entitlement programs were on the table. 
But her objections — and those of Democrats in general — are philosophical and tactical, not absolute.</p><p>Progressives understand that Medicare and Social Security are not sustainable on their current trajectories; in the long term, both must have their revenue and costs brought into balance. Pelosi’s position is that each program should be addressed with an eye toward sustainability — not as a part of a last-minute deal for a hike in the debt ceiling that covers us for two or three years.</p><p>It’s also true that Democrats believe they can win back a passel of House seats next year by highlighting the GOP plan to convert Medicare into a voucher program. They don’t want Republicans to be able to point and say, “See, the Democrats want to cut Medicare, too.”</p><p>There’s nothing in these Democratic objections, however, that couldn’t be creatively finessed. You can claim you haven’t actually “cut” a benefit, for example, if what you’ve done is restrained the rate at which its cost will grow. You can offset spending with new revenue, and you can do so in a way that gives low-income taxpayers a break. Democrats left the door open and these options could have been explored.</p><p>The story on the Republican side is entirely different. There are ways to finesse a “no new taxes” pledge, too. Instead of raising tax rates, you close loopholes in the name of reform; you add an enhancement here, a “user fee” there, and you can manage to get the revenue you need and still claim you haven’t voted to raise taxes.</p><p>But Republicans are taking the position that not a cent of new revenue can be raised, no matter the euphemism. Some Democrats, yes, are being scratchy and cantankerous. But Republicans are refusing to negotiate at all. 
That’s not the same thing.</p><p>I understand why President Obama, <a href="http://projects.washingtonpost.com/obama-speeches/speech/736/">in his news conference Monday</a>, chided “each side” for taking a “maximalist position.” For political and practical reasons, it’s advantageous for him to be seen as an honest broker.</p><p>Meanwhile, though, the clock ticks toward Aug. 2 and the possibility of a catastrophic default becomes more real. And no one should be confused about what the president confronts: On one side, grousing and grumbling. On the other, a brick wall. </p><p>
|
||||
|
||||
<i>
|
||||
<a href="http://live.washingtonpost.com/eugene-robinson-07-12-11.html">Eugene Robinson will be online</a> to chat with readers at 1 p.m. Eastern time Tuesday. <a href="http://live.washingtonpost.com/eugene-robinson-07-12-11.html">Submit your questions</a> before or during the discussion.</i>
|
||||
|
||||
</p></article></div>
|
@ -0,0 +1,2 @@
|
||||
test_description: washingtonpost.com op-ed
|
||||
url: http://www.washingtonpost.com/opinions/dont-blame-both-sides-for-debt-impasse/2011/07/11/gIQA0XDg9H_story.html?hpid=z1
|
@ -0,0 +1,2 @@
|
||||
*
|
||||
!.gitignore
|
@ -0,0 +1,55 @@
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from readability_lxml.readability import Document
|
||||
|
||||
|
||||
SAMPLES = os.path.join(os.path.dirname(__file__), 'samples')
|
||||
|
||||
|
||||
def load_sample(filename):
    """Helper to get the content out of the sample files.

    :param filename: name of a file inside the SAMPLES directory.
    :returns: the file's full contents as a string.
    """
    # Use a context manager so the handle is closed deterministically
    # instead of leaking until garbage collection.
    with open(os.path.join(SAMPLES, filename)) as sample_file:
        return sample_file.read()
|
||||
|
||||
|
||||
class TestArticleOnly(unittest.TestCase):
    """The option to not get back a full html doc should work

    Given a full html document, the call can request just divs of processed
    content. In this way the developer can then wrap the article however they
    want in their own view or application.

    """

    def test_si_sample(self):
        """Using the si sample, load article with only opening body element"""
        sample = load_sample('si-game.sample.html')
        doc = Document(
            sample,
            url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
        res = doc.summary()
        # Only the opening markup is checked; it is enough to confirm the
        # full-document wrapper (html/body) is present.
        self.assertEqual('<html><body id="page"><div><div class', res[0:37])

    def test_si_sample_html_partial(self):
        """Using the si sample, make sure we can get the article alone."""
        sample = load_sample('si-game.sample.html')
        doc = Document(sample, url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
        res = doc.summary(enclose_with_html_tag=False)
        # Without the enclosing html tag the result starts at the content div.
        self.assertEqual('<div id="page"><div class="', res[0:27])

    def test_si_sample_full_summary(self):
        """We should parse the doc and get a full summary with confidence"""
        sample = load_sample('si-game.sample.html')
        doc = Document(sample, url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
        res = doc.summary_with_metadata(enclose_with_html_tag=False)
        # Each piece of metadata the summary object carries must be present.
        # (Messages fixed: the confidence check previously reused the html
        # message verbatim, and 'titile'/'an short_title' were typos.)
        self.assertTrue(hasattr(res, 'html'),
            'res should have an html attrib')
        self.assertTrue(hasattr(res, 'confidence'),
            'res should have a confidence attrib')
        self.assertTrue(hasattr(res, 'title'),
            'res should have a title attrib')
        self.assertTrue(hasattr(res, 'short_title'),
            'res should have a short_title attrib')
        self.assertEqual('<div id="page"><div class="', res.html[0:27])
        self.assertTrue(res.confidence > 50,
            'The confidence score should be larger than 50: ' + str(res.confidence))
|
@ -0,0 +1,253 @@
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from lxml.html import document_fromstring
|
||||
from lxml.html.diff import htmldiff
|
||||
|
||||
from helpers import load_regression_data
|
||||
from helpers import REGRESSION_DATA
|
||||
from readability_lxml.readability import Document
|
||||
from readability_lxml import readability as r
|
||||
from readability_lxml import urlfetch
|
||||
|
||||
|
||||
class TestReadabilityDocument(unittest.TestCase):
    """Test the Document parser."""

    def test_none_input_raises_exception(self):
        """Feeding a None input to the document should blow up."""
        doc = None
        # Context-manager form of assertRaises: the ValueError must come
        # from constructing the Document itself.
        with self.assertRaises(ValueError):
            Document(doc)
|
||||
|
||||
|
||||
class TestFindBaseUrl(unittest.TestCase):
    """Exercise r.find_base_url over a range of URL shapes."""

    def setUp(self):
        # Append custom messages to the default assertion output rather
        # than replacing it.
        self.longMessage = True

    def _assert_url(self, url, expected_base_url, msg=None):
        """Assert that find_base_url(url) yields expected_base_url."""
        actual_base_url = r.find_base_url(url)
        self.assertEqual(expected_base_url, actual_base_url, msg)

    def _run_urls(self, specs):
        """
        Asserts expected results on a sequence of specs, where each spec is a
        pair: (URL, expected base URL), optionally followed by a failure
        message as a third element.
        """
        for spec in specs:
            url, expected = spec[0], spec[1]
            # The optional third element is a descriptive failure message.
            msg = spec[2] if len(spec) > 2 else None
            self._assert_url(url, expected, msg)

    def test_none(self):
        self._assert_url(None, None)

    def test_no_change(self):
        url = 'http://foo.com/article'
        self._assert_url(url, url)

    def test_extension_stripping(self):
        specs = [
            (
                'http://foo.com/article.html',
                'http://foo.com/article',
                'extension should be stripped'
            ),
            (
                'http://foo.com/path/to/article.html',
                'http://foo.com/path/to/article',
                'extension should be stripped'
            ),
            (
                'http://foo.com/article.123not',
                'http://foo.com/article.123not',
                '123not is not extension'
            ),
            (
                'http://foo.com/path/to/article.123not',
                'http://foo.com/path/to/article.123not',
                '123not is not extension'
            )
        ]
        self._run_urls(specs)

    def test_ewcms(self):
        self._assert_url(
            'http://www.ew.com/ew/article/0,,20313460_20369436,00.html',
            'http://www.ew.com/ew/article/0,,20313460_20369436'
        )

    def test_page_numbers(self):
        specs = [
            (
                'http://foo.com/page5.html',
                'http://foo.com',
                'page number should be stripped'
            ),
            (
                'http://foo.com/path/to/page5.html',
                'http://foo.com/path/to',
                'page number should be stripped'
            ),
            (
                'http://foo.com/article-5.html',
                'http://foo.com/article',
                'page number should be stripped'
            )
        ]
        self._run_urls(specs)

    def test_numbers(self):
        specs = [
            (
                'http://foo.com/5.html',
                'http://foo.com',
                'number should be stripped'
            ),
            (
                'http://foo.com/path/to/5.html',
                'http://foo.com/path/to',
                'number should be stripped'
            )
        ]
        self._run_urls(specs)

    def test_index(self):
        specs = [
            (
                'http://foo.com/index.html',
                'http://foo.com',
                'index should be stripped'
            ),
            (
                'http://foo.com/path/to/index.html',
                'http://foo.com/path/to',
                'index should be stripped'
            )
        ]
        self._run_urls(specs)

    def test_short(self):
        specs = [
            (
                'http://foo.com/en/1234567890',
                'http://foo.com/1234567890',
                'short segment should be stripped'
            ),
            (
                'http://foo.com/en/de/1234567890',
                'http://foo.com/en/1234567890',
                'short segment should be stripped'
            )
        ]
        self._run_urls(specs)
|
||||
|
||||
|
||||
class TestMultiPageHelpers(unittest.TestCase):

    def test_find_next_page_url(self):
        """Verify we can find a next page url in the html body"""
        html = """
        <html><body><a href="/?page=2">next</a></body></html>
        """
        # document_fromstring is already imported at module scope; the
        # redundant function-level re-import was removed.
        doc = document_fromstring(html)

        res = r.find_next_page_url(set(), None, doc)
        self.assertEqual('/?page=2', res,
            'Should find out page 2 url in the body.')
|
||||
|
||||
|
||||
class TestFindNextPageLink(unittest.TestCase):
    """Check find_next_page_url against stored regression pages."""

    def _test_page(self, url, html_path, expected):
        """Parse one regression page and verify the next-page URL found."""
        page_html = load_regression_data(html_path)
        parsed_doc = r.parse(page_html, url)
        already_seen = {url}
        found = r.find_next_page_url(already_seen, url, parsed_doc)
        self.assertEqual(expected, found)

    def test_basic(self):
        self._test_page(
            'http://basic.com/article.html',
            'basic-multi-page.html',
            'http://basic.com/article.html?pagewanted=2'
        )

    def test_nytimes(self):
        # This better work for the New York Times.
        self._test_page(
            'http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html',
            'nytimes-next-page.html',
            'http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=2&_r=1'
        )
|
||||
|
||||
|
||||
class TestMultiPage(unittest.TestCase):
    """
    Tests the full path of generating a readable page for a multi-page article.
    The test article is very simple, so this test should be resilient to tweaks
    of the algorithm.
    """

    def _make_basic_urldict(self):
        """Map follow-on page URLs to the regression files that serve them."""
        url_fmt = 'http://basic.com/article.html?pagewanted=%s'
        file_fmt = 'basic-multi-page-%s.html'

        pairs = [(url_fmt % i, os.path.join(REGRESSION_DATA, file_fmt % i))
                 for i in ['2', '3']]
        return dict(pairs)

    def _fail_on_diff_nodes(self, nodes, change_kind):
        """Dump any unexpected <ins>/<del> diff nodes and fail the test.

        Extracted helper: the insertion and deletion checks were previously
        duplicated inline.
        """
        if len(nodes) != 0:
            for node in nodes:
                print('unexpected %s: %s' % (change_kind, node.xpath('string()')))
            self.fail('readability result does not match expected')

    def test_basic(self):
        html = load_regression_data('basic-multi-page.html')
        urldict = self._make_basic_urldict()
        fetcher = urlfetch.MockUrlFetch(urldict)
        options = {
            'url': 'http://basic.com/article.html',
            'multipage': True,
            'urlfetch': fetcher
        }
        doc = Document(html, **options)
        res = doc.summary_with_metadata()

        self.assertIn('Page 2', res.html, 'Should find the page 2 heading')
        self.assertIn('Page 3', res.html, 'Should find the page 3 heading')

        expected_html = load_regression_data('basic-multi-page-expected.html')
        # htmldiff marks divergence from the expected output with <ins>/<del>
        # elements; any of either means the summary does not match.
        diff_html = htmldiff(expected_html, res.html)
        diff_doc = document_fromstring(diff_html)

        self._fail_on_diff_nodes(diff_doc.xpath('//ins'), 'insertion')
        self._fail_on_diff_nodes(diff_doc.xpath('//del'), 'deletion')
|
||||
|
||||
|
||||
class TestIsSuspectedDuplicate(unittest.TestCase):
    """Check duplicate-page detection against stored regression pages."""

    def setUp(self):
        # Load the reference article that candidate pages are compared to.
        super(TestIsSuspectedDuplicate, self).setUp()
        article_html = load_regression_data('duplicate-page-article.html')
        self._article = r.fragment_fromstring(article_html)

    def test_unique(self):
        # A genuinely different page must not be flagged as a duplicate.
        candidate = r.fragment_fromstring(
            load_regression_data('duplicate-page-unique.html'))
        self.assertFalse(r.is_suspected_duplicate(self._article, candidate))

    def test_duplicate(self):
        # A near-copy of the article must be flagged as a duplicate.
        candidate = r.fragment_fromstring(
            load_regression_data('duplicate-page-duplicate.html'))
        self.assertTrue(r.is_suspected_duplicate(self._article, candidate))
|
@ -0,0 +1,24 @@
|
||||
"""Process all of the samples and make sure that process without error."""
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from helpers import load_sample
|
||||
from readability_lxml.readability import Document
|
||||
|
||||
sample_list = [
|
||||
'nyt.sample.html',
|
||||
'si-game.sample.html',
|
||||
]
|
||||
|
||||
|
||||
def test_processes():
    """Nose-style test generator: one processing check per sample article."""
    for sample_name in sample_list:
        yield process_article, sample_name
|
||||
|
||||
|
||||
def process_article(article):
    """Process one sample article and verify the summary's opening markup.

    :param article: sample filename understood by load_sample.
    :raises AssertionError: when the summary does not start with the
        expected full-document wrapper markup.
    """
    # Derive the comparison slice from the expected prefix itself instead of
    # repeating the magic length (37) in two places.
    expected_prefix = '<html><body id="page"><div><div class'
    sample = load_sample(article)
    doc = Document(sample)
    res = doc.summary()
    actual_prefix = res[0:len(expected_prefix)]
    failed_msg = "Failed to process the article: " + actual_prefix
    assert expected_prefix == actual_prefix, failed_msg
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
@ -1,126 +0,0 @@
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from readability import Document
|
||||
import timeout_decorator
|
||||
|
||||
|
||||
SAMPLES = os.path.join(os.path.dirname(__file__), "samples")
|
||||
|
||||
|
||||
def load_sample(filename):
    """Helper to get the content out of the sample files"""
    sample_path = os.path.join(SAMPLES, filename)
    # The context manager closes the handle; returning from inside it is safe.
    with open(sample_path) as sample_file:
        return sample_file.read()
|
||||
|
||||
|
||||
class TestArticleOnly(unittest.TestCase):
    """The option to not get back a full html doc should work

    Given a full html document, the call can request just divs of processed
    content. In this way the developer can then wrap the article however they
    want in their own view or application.

    """

    def test_si_sample(self):
        """Using the si sample, load article with only opening body element"""
        sample = load_sample("si-game.sample.html")
        doc = Document(
            sample,
            url="http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html",
        )
        res = doc.summary()
        # Only the prefix is checked: enough to confirm the full-document
        # wrapper (html/body) is present.
        self.assertEqual("<html><body><div><div class", res[0:27])

    def test_si_sample_html_partial(self):
        """Using the si sample, make sure we can get the article alone."""
        sample = load_sample("si-game.sample.html")
        doc = Document(
            sample,
            url="http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html",
        )
        # html_partial=True drops the <html><body> wrapper from the result.
        res = doc.summary(html_partial=True)
        self.assertEqual('<div><div class="', res[0:17])

    def test_too_many_images_sample_html_partial(self):
        """Using the too-many-images sample, make sure we still get the article."""
        sample = load_sample("too-many-images.sample.html")
        doc = Document(sample)
        res = doc.summary(html_partial=True)
        self.assertEqual('<div><div class="post-body', res[0:26])

    def test_wrong_link_issue_49(self):
        """We shouldn't break on bad HTML."""
        sample = load_sample("the-hurricane-rubin-carter-denzel-washington.html")
        doc = Document(sample)
        res = doc.summary(html_partial=True)
        self.assertEqual('<div><div class="content__article-body ', res[0:39])

    def test_best_elem_is_root_and_passing(self):
        # Regression guard: when the best-scoring element is the root itself,
        # summary() must not crash.  No assertion on the output is made.
        sample = (
            '<html class="article" id="body">'
            " <body>"
            " <p>1234567890123456789012345</p>"
            " </body>"
            "</html>"
        )
        doc = Document(sample)
        doc.summary()

    def test_correct_cleanup(self):
        # The article element should win over the various comment/aside
        # siblings, and the advertisement div inside it should be dropped.
        sample = """
        <html>
            <body>
                <section>test section</section>
                <article class="">
                    <p>Lot of text here.</p>
                    <div id="advertisement"><a href="link">Ad</a></div>
                    <p>More text is written here, and contains punctuation and dots.</p>
                </article>
                <aside id="comment1"/>
                <div id="comment2">
                    <a href="asd">spam</a>
                    <a href="asd">spam</a>
                    <a href="asd">spam</a>
                </div>
                <div id="comment3"/>
                <aside id="comment4">A small comment.</aside>
                <div id="comment5"><p>The comment is also helpful, but it's
                still not the correct item to be extracted.</p>
                <p>It's even longer than the article itself!"</p></div>
            </body>
        </html>
        """
        doc = Document(sample)
        s = doc.summary()
        # print(s)
        assert "punctuation" in s
        assert not "comment" in s
        assert not "aside" in s

    # Many spaces make some regexes run forever
    @timeout_decorator.timeout(seconds=3, use_signals=False)
    def test_many_repeated_spaces(self):
        long_space = " " * 1000000
        sample = "<html><body><p>foo" + long_space + "</p></body></html>"

        doc = Document(sample)
        s = doc.summary()

        assert "foo" in s

    def test_not_self_closing(self):
        # An empty anchor must not be serialized as a self-closing tag.
        sample = '<h2><a href="#"></a>foobar</h2>'
        doc = Document(sample)
        assert (
            '<body id="readabilityBody"><h2><a href="#"></a>foobar</h2></body>'
            == doc.summary()
        )

    def test_utf8_kanji(self):
        """Using the UTF-8 kanji sample, load article which is written in kanji"""
        sample = load_sample("utf-8-kanji.sample.html")
        doc = Document(sample)
        res = doc.summary()
        # NOTE(review): no assertion on res here — presumably this only guards
        # against encoding errors while summarizing kanji content; confirm.
|
@ -1,33 +0,0 @@
|
||||
# Tox (http://tox.testrun.org/) is a tool for running tests
|
||||
# in multiple virtualenvs. This configuration file will run the
|
||||
# test suite on all supported python versions. To use it, "pip install tox"
|
||||
# and then run "tox" from this directory.
|
||||
|
||||
[tox]
|
||||
envlist =
|
||||
py{27,35,36,37,38,py,py3}, doc
|
||||
skip_missing_interpreters =
|
||||
True
|
||||
|
||||
[testenv]
|
||||
deps =
|
||||
pytest
|
||||
doc: sphinx
|
||||
doc: sphinx_rtd_theme
|
||||
doc: recommonmark
|
||||
|
||||
# This creates the virtual envs with --site-packages so packages
|
||||
# that are already installed will be reused. This is especially useful on
|
||||
# Windows. Since we use lxml instead of compiling it locally (which in turn
|
||||
# requires a Compiler and the build dependencies), you can download
|
||||
# it from http://www.lfd.uci.edu/~gohlke/pythonlibs/#lxml and install it via
|
||||
# $PYTHONDIR\Scripts\pip.exe install *.whl
|
||||
sitepackages=
|
||||
True
|
||||
commands =
|
||||
pip install -r requirements.txt -e ".[test]"
|
||||
py.test
|
||||
|
||||
[testenv:doc]
|
||||
commands =
|
||||
python setup.py build_sphinx
|
Loading…
Reference in New Issue