Compare commits


7 Commits
master ... dev

Author SHA1 Message Date
Yuri Baburov e8f86bdcf9 Several updates from dev version. 9 years ago
Yuri Baburov 40e430c27d Makefile updates 9 years ago
Yuri Baburov 0a082ff020 Fix for Mac OS X 10.10 9 years ago
Yuri Baburov 8048160d66 WIP: update to support python2 and python3 9 years ago
Yuri Baburov 71294f094f Encoding improvements 10 years ago
Yuri Baburov 5855beb32a WIP; Backported features from stable branch 10 years ago
Yuri Baburov ae1f1adfff Switched to use python logging module. Added xpath option (undocumented yet). 10 years ago

.gitignore

@@ -1,8 +1,7 @@
*.pyc
__pycache__
*.egg-info
/build
/dist
build
dist
/bin
/include
/lib
@@ -10,8 +9,3 @@ __pycache__
/man
nosetests.xml
.coverage
.tox
.idea
.cache
/.noseids
/.venv

@@ -1,60 +0,0 @@
language: python
os: linux
cache: pip
matrix:
include:
- name: "Python 2.7 on Linux"
python: 2.7
env: PIP=pip
- name: "Python 3.5 on Linux"
python: 3.5
- name: "Python 3.6 on Linux"
python: 3.6
- name: "Python 3.7 on Linux"
python: 3.7
- name: "Python 3.8 on Linux"
dist: xenial
python: 3.8
- name: "Python 3.9 Nightly on Linux"
dist: bionic
python: nightly
- name: "Pypy on Linux"
python: pypy
env: PIP=pip
- name: "Pypy 3 on Linux"
python: pypy3
- name: "Python 3.7 on older macOS"
os: osx
osx_image: xcode9.4
language: shell
env: TOXENV=py37
before_install:
- sw_vers
- python3 --version
- pip3 --version
- name: "Python 3.7 on macOS"
os: osx
osx_image: xcode11
language: shell
env: TOXENV=py37
before_install:
- sw_vers
- python3 --version
- pip3 --version
allow_failures:
- python: nightly
- python: pypy
- python: pypy3
- os: osx
install:
- if [ $PIP ]; then true; else PIP=pip3; fi
- travis_retry $PIP install -U pip wheel tox-travis pytest-cov codecov
- travis_retry $PIP install -U -r requirements.txt -e ".[test]"
script:
- tox
after_success:
- codecov

@@ -1,201 +0,0 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

@@ -1,10 +1,10 @@
# Makefile to help automate tasks
WD := $(shell pwd)
PY := .venv/bin/python
PIP := .venv/bin/pip
PEP8 := .venv/bin/pep8
NOSE := .venv/bin/nosetests
TWINE := twine
PY := .env/bin/python
PIP := .env/bin/pip
PEP8 := .env/bin/pep8
NOSE := .env/bin/nosetests
# ###########
# Tests rule!
@@ -13,29 +13,25 @@ TWINE := twine
test: venv develop $(NOSE)
$(NOSE) --with-id -s tests
$(NOSE): setup
$(NOSE):
$(PIP) install nose pep8 coverage
# #######
# INSTALL
# #######
.PHONY: all
all: setup develop
venv: .venv/bin/python
setup: venv
$(PIP) install -r requirements-dev.txt
all: venv develop
.venv/bin/python:
test -d .venv || which python3 && python3 -m venv .venv || virtualenv .venv
venv: bin/python
bin/python:
virtualenv .env
.PHONY: clean
clean:
rm -rf .venv
.PHONY: clean_venv
clean_venv:
rm -rf .env
develop: .venv/lib/python*/site-packages/readability-lxml.egg-link
.venv/lib/python*/site-packages/readability-lxml.egg-link:
develop: .env/lib/python*/site-packages/readability-lxml.egg-link
.env/lib/python*/site-packages/readability-lxml.egg-link:
$(PY) setup.py develop
@@ -45,17 +41,17 @@ develop: .venv/lib/python*/site-packages/readability-lxml.egg-link
.PHONY: clean_all
clean_all: clean_venv
# ###########
# Deploy
# ###########
.PHONY: dist
dist:
$(PY) setup.py sdist bdist_wheel
$(TWINE) check dist/*
$(PY) setup.py sdist
.PHONY: upload
upload:
$(TWINE) upload dist/*
$(PY) setup.py sdist upload
.PHONY: version_update
version_update:

@@ -0,0 +1,59 @@
This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
This is a python port of a ruby port of arc90's readability project
http://lab.arc90.com/experiments/readability/
In a few words:
given an HTML document, it pulls out the main body text and cleans it up.
It can also clean up the title, based on the latest readability.js code.
Based on:
- Latest readability.js ( https://github.com/MHordecki/readability-redux/blob/master/readability/readability.js )
- Ruby port by starrhorne and iterationlabs
- Python port by gfxmonk ( https://github.com/gfxmonk/python-readability , based on BeautifulSoup )
- Decruft effort to move to lxml ( http://www.minvolai.com/blog/decruft-arc90s-readability-in-python/ )
- "BR to P" fix from readability.js which improves quality for smaller texts.
- Github users contributions.
Installation::
easy_install readability-lxml
or
pip install readability-lxml
Usage::
from readability.readability import Document
import urllib
html = urllib.urlopen(url).read()
readable_article = Document(html).summary()
readable_title = Document(html).short_title()
Command-line usage::
python -m readability.readability -u http://pypi.python.org/pypi/readability-lxml
Using positive/negative keywords example::
python -m readability.readability -p intro -n newsindex,homepage-box,news-section -u http://python.org
Document() kwarg options:
- attributes:
- debug: output debug messages
- min_text_length:
- retry_length:
- url: will allow adjusting links to be absolute
- positive_keywords: the list of positive search patterns in classes and ids, for example: ["news-item", "block"]
- negative_keywords: the list of negative search patterns in classes and ids, for example: ["mysidebar", "related", "ads"]
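A minimal keyword-argument sketch based on the options listed above (the URL and HTML strings are illustrative, not from the project)::

    from readability.readability import Document

    html = "<html><body><div class='news-item'><p>Some article text about something interesting.</p></div></body></html>"
    doc = Document(
        html,
        url="http://example.com/article.html",              # used to make links absolute
        positive_keywords=["news-item", "block"],            # boost matching class/id names
        negative_keywords=["mysidebar", "related", "ads"],   # penalize matching class/id names
        min_text_length=25,
        retry_length=250,
    )
    print(doc.short_title())
    print(doc.summary())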
Updates
- 0.2.5 Update setup.py for uploading .tar.gz to pypi
- 0.2.6 Don't crash on documents with no title
- 0.2.6.1 Document.short_title() properly works
- 0.3 Added Document.encoding, positive_keywords and negative_keywords

@@ -1,68 +0,0 @@
.. image:: https://travis-ci.org/buriy/python-readability.svg?branch=master
:target: https://travis-ci.org/buriy/python-readability
python-readability
==================
Given an HTML document, it pulls out the main body text and cleans it up.
This is a python port of a ruby port of `arc90's readability
project <http://lab.arc90.com/experiments/readability/>`__.
Installation
------------
It's easy using ``pip``, just run:
.. code-block:: bash
$ pip install readability-lxml
Usage
-----
.. code-block:: python
>>> import requests
>>> from readability import Document
>>> response = requests.get('http://example.com')
>>> doc = Document(response.text)
>>> doc.title()
'Example Domain'
>>> doc.summary()
"""<html><body><div><body id="readabilityBody">\n<div>\n <h1>Example Domain</h1>\n
<p>This domain is established to be used for illustrative examples in documents. You may
use this\n domain in examples without prior coordination or asking for permission.</p>
\n <p><a href="http://www.iana.org/domains/example">More information...</a></p>\n</div>
\n</body>\n</div></body></html>"""
Change Log
----------
- 0.8.1 Fixed processing of non-ascii HTMLs via regexps.
- 0.8 Replaced XHTML output with HTML5 output in summary() call.
- 0.7.1 Support for Python 3.7. Fixed a slowdown when processing documents with lots of spaces.
- 0.7 Improved HTML5 tags handling. Fixed stripping unwanted HTML nodes (only first matching node was removed before).
- 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3 - 3.6
- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and 3.4
- 0.4 Added Videos loading and allowed more images per paragraph
- 0.3 Added Document.encoding, positive\_keywords and negative\_keywords
Licensing
---------
This code is under `the Apache License
2.0 <http://www.apache.org/licenses/LICENSE-2.0>`__.
Thanks to
---------
- Latest `readability.js <https://github.com/MHordecki/readability-redux/blob/master/readability/readability.js>`__
- Ruby port by starrhorne and iterationlabs
- `Python port <https://github.com/gfxmonk/python-readability>`__ by gfxmonk
- `Decruft effort <http://www.minvolai.com/blog/decruft-arc90s-readability-in-python/>`__ to move to lxml
- "BR to P" fix from readability.js which improves quality for smaller texts
- Github users contributions.

@@ -1,30 +0,0 @@
Reference
=========
.. automodule:: readability
:members:
:show-inheritance:
.. automodule:: readability.browser
:members:
:show-inheritance:
.. automodule:: readability.cleaners
:members:
:show-inheritance:
.. automodule:: readability.debug
:members:
:show-inheritance:
.. automodule:: readability.encoding
:members:
:show-inheritance:
.. automodule:: readability.htmls
:members:
:show-inheritance:
.. automodule:: readability.readability
:members:
:show-inheritance:

@@ -1,164 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# readability documentation build configuration file, created by
# sphinx-quickstart on Thu Mar 23 16:29:38 2017.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
sys.path.insert(0, os.path.abspath("../.."))
import readability
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
"sphinx.ext.autodoc",
"sphinx.ext.doctest",
"sphinx.ext.intersphinx",
"sphinx.ext.todo",
"recommonmark",
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
source_suffix = [".rst", ".md"]
# The master toctree document.
master_doc = "index"
# General information about the project.
project = "readability"
copyright = "2020, Yuri Baburov"
author = "Yuri Baburov"
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
# The short X.Y version.
version = readability.__version__
# The full version, including alpha/beta/rc tags.
release = readability.__version__
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This patterns also effect to html_static_path and html_extra_path
exclude_patterns = []
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = "sphinx"
# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = "sphinx_rtd_theme"
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = [] #'_static']
# -- Options for HTMLHelp output ------------------------------------------
# Output file base name for HTML help builder.
htmlhelp_basename = "readabilitydoc"
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#
# 'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#
# 'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#
# 'preamble': '',
# Latex figure (float) alignment
#
# 'figure_align': 'htbp',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [(master_doc, "readability.tex", "Readability Documentation", "Yuri Baburov", "manual")]
# -- Options for manual page output ---------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [(master_doc, "readability", "readability Documentation", [author], 1)]
# -- Options for Texinfo output -------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
(
master_doc,
"readability",
"Readability Documentation",
author,
"readability",
"One line description of project.",
"Miscellaneous",
)
]
intersphinx_mapping = {
"python": ("https://docs.python.org/3", None),
}

@@ -1,13 +0,0 @@
.. include:: ../../README.rst
.. toctree::
:maxdepth: 2
api
Indices and tables
------------------
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

@@ -1,3 +1 @@
__version__ = "0.8.1.1"
from .readability import Document

@@ -1,21 +0,0 @@
def open_in_browser(html):
"""
Open the HTML document in a web browser, saving it to a temporary
file to open it. Note that this does not delete the file after
use. This is mainly meant for debugging.
"""
import os
import webbrowser
import tempfile
handle, fn = tempfile.mkstemp(suffix=".html")
f = os.fdopen(handle, "wb")
try:
f.write(b"<meta charset='UTF-8' />")
f.write(html.encode("utf-8"))
finally:
# we leak the file itself here, but we should at least close it
f.close()
url = "file://" + fn.replace(os.path.sep, "/")
webbrowser.open(url)
return url
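A minimal usage sketch for this helper (assuming it is importable as readability.browser.open_in_browser, as on one side of this compare; the other side keeps an equivalent inside debug.py, shown further below)::

    from readability import Document
    from readability.browser import open_in_browser

    html = "<html><body><h1>Hello</h1><p>Readability demo text.</p></body></html>"  # illustrative input
    doc = Document(html)
    url = open_in_browser(doc.summary())  # writes a temporary .html file (not deleted) and opens it
    print(url)                            # file:// URL of the temporary file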

@@ -1,52 +1,39 @@
# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
# -*- encoding: utf-8 -*-
# strip out a set of nuisance html attributes that can mess up rendering
# in RSS feeds
import re
from lxml.html.clean import Cleaner
bad_attrs = ["width", "height", "style", "[-a-z]*color", "background[-a-z]*", "on*"]
bad_attrs = ['width', 'height', 'style',
'[-a-z]*color', 'background[-a-z]*', 'on*']
single_quoted = "'[^']+'"
double_quoted = '"[^"]+"'
non_space = "[^ \"'>]+"
htmlstrip = re.compile(
"<" # open
non_space = '[^ "\'>]+'
htmlstrip = re.compile("<" # open
"([^>]+) " # prefix
"(?:%s) *" % ("|".join(bad_attrs),)
+ "= *(?:%s|%s|%s)" # undesirable attributes
% (non_space, single_quoted, double_quoted)
+ "([^>]*)" # value # postfix
">", # end
re.I,
)
"(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
'= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
"([^>]*)" # postfix
">" # end
, re.I)
def clean_attributes(html):
while htmlstrip.search(html):
html = htmlstrip.sub("<\\1\\2>", html)
html = htmlstrip.sub('<\\1\\2>', html)
return html
def normalize_spaces(s):
if not s:
return ""
return ''
"""replace any sequence of whitespace
characters with a single space"""
return " ".join(s.split())
return ' '.join(s.split())
html_cleaner = Cleaner(
scripts=True,
javascript=True,
comments=True,
style=True,
links=True,
meta=False,
add_nofollow=False,
page_structure=False,
processing_instructions=True,
embedded=False,
frames=False,
forms=False,
annoying_tags=False,
remove_tags=None,
remove_unknown_tags=False,
safe_attrs_only=False,
)
html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
style=True, links=True, meta=False, add_nofollow=False,
page_structure=False, processing_instructions=True,
embedded=False, frames=False, forms=False,
annoying_tags=False, remove_tags=None,
remove_unknown_tags=False, safe_attrs_only=False)
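A quick illustration of the two helpers above (assuming this hunk is readability/cleaners.py; the input strings are made up)::

    from readability.cleaners import clean_attributes, normalize_spaces

    snippet = '<p style="color: red" width="300">Hi   there</p>'
    print(clean_attributes(snippet))       # width/style (and the other bad_attrs) are stripped: '<p>Hi   there</p>'
    print(normalize_spaces("Hi   there"))  # whitespace runs collapsed: 'Hi there'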

@@ -1,20 +0,0 @@
"""
This module contains compatibility helpers for Python 2/3 interoperability.
It mainly exists because there are certain incompatibilities in the Python
syntax that can only be solved by conditionally importing different functions.
"""
import sys
from lxml.etree import tostring
if sys.version_info[0] == 2:
bytes_ = str
str_ = unicode
def tostring_(s):
return tostring(s, encoding='utf-8').decode('utf-8')
elif sys.version_info[0] == 3:
bytes_ = bytes
str_ = str
def tostring_(s):
return tostring(s, encoding='utf-8')

@@ -1,6 +0,0 @@
def raise_with_traceback(exc_type, traceback, *args, **kwargs):
"""
Raise a new exception of type `exc_type` with an existing `traceback`. All
additional (keyword-)arguments are forwarded to `exc_type`
"""
raise exc_type(*args, **kwargs).with_traceback(traceback)

@@ -1,6 +0,0 @@
def raise_with_traceback(exc_type, traceback, *args, **kwargs):
"""
Raise a new exception of type `exc_type` with an existing `traceback`. All
additional (keyword-)arguments are forwarded to `exc_type`
"""
raise exc_type(*args, **kwargs), None, traceback
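For reference, readability.py (further down in this compare) picks one of these two shims at import time and calls it from summary()'s exception handler; a condensed, runnable sketch of that call site (the failure is simulated, and the import paths assume the master-side package layout)::

    import sys
    from readability.readability import Unparseable
    from readability.compat import str_
    if sys.version_info[0] == 2:
        from readability.compat.two import raise_with_traceback
    else:
        from readability.compat.three import raise_with_traceback

    try:
        try:
            raise ValueError("boom")  # stand-in for a parsing failure
        except Exception as e:
            raise_with_traceback(Unparseable, sys.exc_info()[2], str_(e))
    except Unparseable as wrapped:
        print(type(wrapped), wrapped)  # Unparseable carrying the original traceback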

@@ -1,25 +1,45 @@
import re
# FIXME: use with caution, can leak memory
uids = {}
uids_document = None
RE_COLLAPSE_WHITESPACES = re.compile('\s+', re.U)
def open_in_browser(html):
"""
Open the HTML document in a web browser, saving it to a temporary
file to open it. Note that this does not delete the file after
use. This is mainly meant for debugging.
"""
import os
import webbrowser
import tempfile
handle, fn = tempfile.mkstemp(suffix='.html')
f = os.fdopen(handle, 'wb')
try:
f.write("<meta charset='UTF-8' />")
f.write(html.encode('utf-8'))
finally:
# we leak the file itself here, but we should at least close it
f.close()
url = 'file://' + fn.replace(os.path.sep, '/')
webbrowser.open(url)
return url
def describe_node(node):
global uids
if node is None:
return ""
if not hasattr(node, "tag"):
return ''
if not hasattr(node, 'tag'):
return "[%s]" % type(node)
name = node.tag
if node.get("id", ""):
name += "#" + node.get("id")
if node.get("class", "").strip():
name += "." + ".".join(node.get("class").split())
if name[:4] in ["div#", "div."]:
if node.get('id', ''):
name += '#' + node.get('id')
if node.get('class', ''):
name += '.' + node.get('class').replace(' ', '.')
if name[:4] in ['div#', 'div.']:
name = name[3:]
if name in ["tr", "td", "div", "p"]:
if name in ['tr', 'td', 'div', 'p']:
uid = uids.get(node)
if uid is None:
uid = uids[node] = len(uids) + 1
@@ -27,25 +47,16 @@ def describe_node(node):
return name
def describe(node, depth=1):
global uids, uids_document
doc = node.getroottree().getroot()
if doc != uids_document:
uids = {}
uids_document = doc
# return repr(NodeRepr(node))
parent = ""
def describe(node, depth=2):
#return repr(NodeRepr(node))
parent = ''
if depth and node.getparent() is not None:
parent = describe(node.getparent(), depth=depth - 1) + ">"
return parent + describe_node(node)
RE_COLLAPSE_WHITESPACES = re.compile(r"\s+", re.U)
parent = describe(node.getparent(), depth=depth - 1)
return parent + '/' + describe_node(node)
def text_content(elem, length=40):
content = RE_COLLAPSE_WHITESPACES.sub(" ", elem.text_content().replace("\r", ""))
content = RE_COLLAPSE_WHITESPACES.sub(' ', elem.text_content().replace('\r', ''))
if len(content) < length:
return content
return content[:length] + "..."
return content[:length] + '...'
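Based on the code above (assuming this hunk is readability/debug.py), describe_node builds short CSS-selector-like labels for logging; a minimal sketch::

    import lxml.html
    from readability.debug import describe_node, text_content

    doc = lxml.html.document_fromstring(
        '<html><body><div id="content" class="article main"><p>Some text here</p></div></body></html>')
    node = doc.get_element_by_id("content")
    print(describe_node(node))   # roughly '#content.article.main' (the 'div' prefix is dropped)
    print(text_content(node))    # whitespace-collapsed text, truncated with '...' past 40 chars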

@@ -1,20 +1,22 @@
import re
import chardet
import sys
import logging
log = logging.getLogger(__name__)
RE_CHARSET = re.compile(br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
RE_PRAGMA = re.compile(br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
RE_XML = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
RE_CHARSET = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', re.I)
RE_PRAGMA = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', re.I)
RE_XML = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
CHARSETS = {
"big5": "big5hkscs",
"gb2312": "gb18030",
"ascii": "utf-8",
"maccyrillic": "cp1251",
"win1251": "cp1251",
"win-1251": "cp1251",
"windows-1251": "cp1251",
'big5': 'big5hkscs',
'gb2312': 'gb18030',
'ascii': 'utf-8',
'maccyrillic': 'cp1251',
'win1251': 'cp1251',
'win-1251': 'cp1251',
'windows-1251': 'cp1251',
}
@@ -27,37 +29,32 @@ def fix_charset(encoding):
def get_encoding(page):
# Regex for XML and HTML Meta charset declaration
declared_encodings = (
RE_CHARSET.findall(page) + RE_PRAGMA.findall(page) + RE_XML.findall(page)
)
declared_encodings = (RE_CHARSET.findall(page) +
RE_PRAGMA.findall(page) +
RE_XML.findall(page))
log.debug("Document has the following encodings: %s" % declared_encodings)
# Try any declared encodings
# Try declared encodings, if any
for declared_encoding in declared_encodings:
encoding = fix_charset(declared_encoding)
try:
if sys.version_info[0] == 3:
# declared_encoding will actually be bytes but .decode() only
# accepts `str` type. Decode blindly with ascii because no one should
# ever use non-ascii characters in the name of an encoding.
declared_encoding = declared_encoding.decode("ascii", "replace")
encoding = fix_charset(declared_encoding)
# Now let's decode the page
page.decode(encoding)
# It worked!
log.info('Using encoding "%s"' % encoding)
return encoding
except (UnicodeDecodeError, LookupError):
pass
except UnicodeDecodeError:
log.info('Encoding "%s", specified in the document as "%s" '
'didn\'t work' % (encoding, declared_encoding))
# Fallback to chardet if declared encodings fail
# Remove all HTML tags, and leave only text for chardet
text = re.sub(br"(\s*</?[^>]*>)+\s*", b" ", page).strip()
enc = "utf-8"
if len(text) < 10:
return enc # can't guess
text = re.sub('</?[^>]*>\s*', ' ', page)
enc = 'utf-8'
if not text.strip() or len(text) < 10:
log.debug("Can't guess encoding because text is too short")
return enc
res = chardet.detect(text)
enc = res["encoding"] or "utf-8"
# print '->', enc, "%.2f" % res['confidence']
enc = fix_charset(enc)
enc = fix_charset(res['encoding'])
log.info('Trying encoding "%s" guessed '
'with confidence %.2f' % (enc, res['confidence']))
#print '->', enc, "%.2f" % res['confidence']
return enc
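A minimal sketch of using get_encoding (assuming this hunk is readability/encoding.py; the file name is illustrative, and raw bytes are passed because one side of the diff expects bytes input)::

    from readability.encoding import get_encoding

    with open("page.html", "rb") as fh:    # any previously fetched/saved page
        raw = fh.read()
    enc = get_encoding(raw) or "utf-8"     # meta/XML declarations first, then chardet fallback
    text = raw.decode(enc, "replace")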

@@ -1,45 +1,46 @@
from cleaners import normalize_spaces, clean_attributes
from encoding import get_encoding
from lxml.html import tostring
import logging
import lxml.html
import re
from .cleaners import normalize_spaces, clean_attributes
from .encoding import get_encoding
from .compat import str_
log = logging.getLogger(__name__)
utf8_parser = lxml.html.HTMLParser(encoding="utf-8")
utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
def lxml_fromstring(doc):
return lxml.html.document_fromstring(doc, parser=utf8_parser)
def build_doc(page):
if isinstance(page, str_):
encoding = None
decoded_page = page
if isinstance(page, unicode):
enc = None
unicode_page = page
else:
encoding = get_encoding(page) or "utf-8"
decoded_page = page.decode(encoding, "replace")
# XXX: we have to do .decode and .encode even for utf-8 pages to remove bad characters
doc = lxml.html.document_fromstring(
decoded_page.encode("utf-8", "replace"), parser=utf8_parser
)
return doc, encoding
enc = get_encoding(page) or 'utf-8'
unicode_page = page.decode(enc, 'replace')
doc = lxml_fromstring(unicode_page.encode('utf-8', 'replace').replace('\r', ''))
return doc, enc
def js_re(src, pattern, flags, repl):
return re.compile(pattern, flags).sub(src, repl.replace("$", "\\"))
return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
def normalize_entities(cur_title):
entities = {
u"\u2014": "-",
u"\u2013": "-",
u"&mdash;": "-",
u"&ndash;": "-",
u"\u00A0": " ",
u"\u00AB": '"',
u"\u00BB": '"',
u"&quot;": '"',
u'\u2014': '-',
u'\u2013': '-',
u'&mdash;': '-',
u'&ndash;': '-',
u'\u00A0': ' ',
u'\u00AB': '"',
u'\u00BB': '"',
u'&quot;': '"',
}
for c, r in entities.items():
for c, r in entities.iteritems():
if c in cur_title:
cur_title = cur_title.replace(c, r)
@@ -51,9 +52,9 @@ def norm_title(title):
def get_title(doc):
title = doc.find(".//title")
if title is None or title.text is None or len(title.text) == 0:
return "[no-title]"
title = doc.find('.//title')
if title is None or len(title.text) == 0:
return '[no-title]'
return norm_title(title.text)
@@ -61,34 +62,25 @@ def get_title(doc):
def add_match(collection, text, orig):
text = norm_title(text)
if len(text.split()) >= 2 and len(text) >= 15:
if text.replace('"', "") in orig.replace('"', ""):
if text.replace('"', '') in orig.replace('"', ''):
collection.add(text)
TITLE_CSS_HEURISTICS = [
"#title",
"#head",
"#heading",
".pageTitle",
".news_title",
".title",
".head",
".heading",
".contentheading",
".small_header_red",
]
TITLE_CSS_HEURISTICS = ['#title', '#head', '#heading', '.pageTitle',
'.news_title', '.title', '.head', '.heading',
'.contentheading', '.small_header_red']
def shorten_title(doc):
title = doc.find(".//title")
title = doc.find('.//title')
if title is None or title.text is None or len(title.text) == 0:
return ""
return ''
title = orig = norm_title(title.text)
candidates = set()
for item in [".//h1", ".//h2", ".//h3"]:
for item in ['.//h1', './/h2', './/h3']:
for e in list(doc.iterfind(item)):
if e.text:
add_match(candidates, e.text, orig)
@@ -105,7 +97,7 @@ def shorten_title(doc):
if candidates:
title = sorted(candidates, key=len)[-1]
else:
for delimiter in [" | ", " - ", " :: ", " / "]:
for delimiter in [' | ', ' - ', ' :: ', ' / ']:
if delimiter in title:
parts = orig.split(delimiter)
if len(parts[0].split()) >= 4:
@@ -115,12 +107,12 @@ def shorten_title(doc):
title = parts[-1]
break
else:
if ": " in title:
parts = orig.split(": ")
if ': ' in title:
parts = orig.split(': ')
if len(parts[-1].split()) >= 4:
title = parts[-1]
else:
title = orig.split(": ", 1)[1]
title = orig.split(': ', 1)[1]
if not 15 < len(title) < 150:
return orig
@@ -128,17 +120,15 @@ def shorten_title(doc):
return title
# is it necessary? Cleaner from LXML is initialized correctly in cleaners.py
def get_body(doc):
for elem in doc.xpath(".//script | .//link | .//style"):
for elem in doc.xpath('.//script | .//link | .//style'):
elem.drop_tree()
# tostring() always returns a utf-8 encoded string
# FIXME: isn't it better to use tounicode?
raw_html = str_(tostring(doc.body or doc))
raw_html = unicode(tostring(doc.body or doc))
cleaned = clean_attributes(raw_html)
try:
# BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
#BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
return cleaned
except Exception: # FIXME find the equivalent lxml error
# logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
log.error("cleaning broken html content: "
"%s\n---------\n%s" % (raw_html, cleaned))
return raw_html
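The helpers above can also be used directly; a minimal sketch (assuming this hunk is readability/htmls.py; the HTML literal is illustrative)::

    from readability.htmls import build_doc, get_title, shorten_title, get_body

    html = b"<html><head><title>Example Domain | Example Site</title></head><body><p>Hello world.</p></body></html>"
    doc, encoding = build_doc(html)   # lxml tree plus the detected encoding
    print(get_title(doc))             # normalized <title> text
    print(shorten_title(doc))         # heuristically shortened title
    print(get_body(doc))              # body HTML with scripts/links/styles dropped and attributes cleaned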

@@ -1,9 +1,9 @@
#!/usr/bin/env python
from __future__ import print_function
import logging
import re
import sys
from lxml.etree import tostring
from lxml.etree import tounicode
from lxml.html import document_fromstring
from lxml.html import fragment_fromstring
@@ -14,36 +14,24 @@ from .htmls import build_doc
from .htmls import get_body
from .htmls import get_title
from .htmls import shorten_title
from .compat import str_, bytes_, tostring_
from .debug import describe, text_content
from encoding import get_encoding
from debug import describe, text_content, open_in_browser
log = logging.getLogger("readability.readability")
log = logging.getLogger(__file__)
REGEXES = {
"unlikelyCandidatesRe": re.compile(
r"combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter",
re.I,
),
"okMaybeItsACandidateRe": re.compile(r"and|article|body|column|main|shadow", re.I),
"positiveRe": re.compile(
r"article|body|content|entry|hentry|main|page|pagination|post|text|blog|story",
re.I,
),
"negativeRe": re.compile(
r"combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget",
re.I,
),
"divToPElementsRe": re.compile(
r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)", re.I
),
#'replaceBrsRe': re.compile(r'(<br[^>]*>[ \n\r\t]*){2,}',re.I),
#'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I),
#'trimRe': re.compile(r'^\s+|\s+$/'),
#'normalizeRe': re.compile(r'\s{2,}/'),
#'killBreaksRe': re.compile(r'(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
"videoRe": re.compile(r"https?:\/\/(www\.)?(youtube|vimeo)\.com", re.I),
# skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow', re.I),
'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story', re.I),
'negativeRe': re.compile('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget', re.I),
'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
#'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
#'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
#'trimRe': re.compile('^\s+|\s+$/'),
#'normalizeRe': re.compile('\s{2,}/'),
#'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
#'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
#skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
}
@@ -55,18 +43,16 @@ def to_int(x):
if not x:
return None
x = x.strip()
if x.endswith("px"):
if x.endswith('px'):
return int(x[:-2])
if x.endswith("em"):
if x.endswith('em'):
return int(x[:-2]) * 12
return int(x)
def clean(text):
# Many spaces make the following regexes run forever
text = re.sub(r"\s{255,}", " " * 255, text)
text = re.sub(r"\s*\n\s*", "\n", text)
text = re.sub(r"\t|[ \t]{2,}", " ", text)
text = re.sub('[ \t]+', ' ', text)
text = re.sub('\s*\n\s*', '\n', text)
return text.strip()
@@ -74,147 +60,97 @@ def text_length(i):
return len(clean(i.text_content() or ""))
regexp_type = type(re.compile('hello, world'))
def compile_pattern(elements):
if not elements:
return None
elif isinstance(elements, re._pattern_type):
if isinstance(elements, regexp_type):
return elements
elif isinstance(elements, (str_, bytes_)):
if isinstance(elements, bytes_):
elements = str_(elements, "utf-8")
elements = elements.split(u",")
if isinstance(elements, (list, tuple)):
return re.compile(u"|".join([re.escape(x.strip()) for x in elements]), re.U)
else:
raise Exception("Unknown type for the pattern: {}".format(type(elements)))
# assume string or string like object
if isinstance(elements, _basestring):
elements = elements.split(',')
return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
class Document:
"""Class to build a etree document out of html."""
TEXT_LENGTH_THRESHOLD = 25
RETRY_LENGTH = 250
def __init__(
self,
input,
positive_keywords=None,
negative_keywords=None,
url=None,
min_text_length=25,
retry_length=250,
xpath=False,
handle_failures="discard",
):
def __init__(self, input, positive_keywords=None, negative_keywords=None,
**options):
"""Generate the document
:param input: string of the html content.
:param positive_keywords: regex, list or comma-separated string of patterns in classes and ids
:param negative_keywords: regex, list or comma-separated string in classes and ids
:param min_text_length: Tunable. Set to a higher value for more precise detection of longer texts.
:param retry_length: Tunable. Set to a lower value for better detection of very small texts.
:param xpath: If set to True, adds x="..." attribute to each HTML node,
containing xpath path pointing to original document path (allows to
reconstruct selected summary in original document).
:param handle_failures: Parameter passed to `lxml` to control how link-resolution failures are handled.
Supported options = ["discard", "ignore", None]
Examples:
positive_keywords=["news-item", "block"]
positive_keywords=["news-item, block"]
positive_keywords=re.compile("news|block")
negative_keywords=["mysidebar", "related", "ads"]
A Document instance is not reusable across inputs:
create a new Document() for each HTML file you want to process.
API methods:
.title() -- full title
.short_title() -- cleaned up title
.content() -- full content
.summary() -- cleaned up content
kwargs:
- attributes:
- min_text_length:
- retry_length:
- url: will allow adjusting links to be absolute
- positive_keywords: the list of positive search patterns in
classes and ids, for example: ["news-item", "block"]
- negative_keywords: the list of negative
search patterns in classes
and ids, for example: ["mysidebar", "related", "ads"]
Also positive_keywords and negative_keywords could be a regexp.
"""
self.input = input
self.options = options
self.html = None
self.encoding = None
self.positive_keywords = compile_pattern(positive_keywords)
self.negative_keywords = compile_pattern(negative_keywords)
self.url = url
self.min_text_length = min_text_length
self.retry_length = retry_length
self.xpath = xpath
self.handle_failures = handle_failures
def _html(self, force=False):
if force or self.html is None:
self.html = self._parse(self.input)
if self.xpath:
root = self.html.getroottree()
for i in self.html.getiterator():
# print root.getpath(i)
i.attrib["x"] = root.getpath(i)
return self.html
def _parse(self, input):
doc, self.encoding = build_doc(input)
doc = html_cleaner.clean_html(doc)
base_href = self.url
base_href = self.options.get('url', None)
if base_href:
# trying to guard against bad links like <a href="http://[http://...">
try:
# such support is added in lxml 3.3.0
doc.make_links_absolute(
base_href,
resolve_base_href=True,
handle_failures=self.handle_failures,
)
except TypeError: # make_links_absolute() got an unexpected keyword argument 'handle_failures'
# then we have lxml < 3.3.0
# please upgrade to lxml >= 3.3.0 if you're failing here!
doc.make_links_absolute(
base_href,
resolve_base_href=True,
handle_failures=self.handle_failures,
)
doc.make_links_absolute(base_href, resolve_base_href=True)
else:
doc.resolve_base_href(handle_failures=self.handle_failures)
doc.resolve_base_href()
if self.options.get('xpath'):
root = doc.getroottree()
for i in doc.getiterator():
#print root.getpath(i)
i.attrib['x'] = root.getpath(i)
return doc
def content(self):
"""Returns document body"""
return get_body(self._html(True))
def title(self):
"""Returns document title"""
return get_title(self._html(True))
def short_title(self):
"""Returns cleaned up document title"""
return shorten_title(self._html(True))
def get_clean_html(self):
"""
An internal method, which can be overridden in subclasses, for example,
to disable or to improve DOM-to-text conversion in .summary() method
"""
return clean_attributes(tounicode(self.html, method="html"))
return clean_attributes(tounicode(self.html))
def summary(self, html_partial=False):
"""
Given a HTML file, extracts the text of the article.
"""Generate the summary of the html docuemnt
:param html_partial: return only the div of the document, don't wrap
in html and body tags.
in html and body tags.
Warning: It mutates internal DOM representation of the HTML document,
so it is better to call other API methods before this one.
"""
try:
ruthless = True
while True:
self._html(True)
for i in self.tags(self.html, "script", "style"):
for i in self.tags(self.html, 'script', 'style'):
i.drop_tree()
for i in self.tags(self.html, "body"):
i.set("id", "readabilityBody")
for i in self.tags(self.html, 'body'):
i.set('id', 'readabilityBody')
if ruthless:
self.remove_unlikely_candidates()
self.transform_misused_divs_into_paragraphs()
@@ -223,35 +159,29 @@ class Document:
best_candidate = self.select_best_candidate(candidates)
if best_candidate:
article = self.get_article(
candidates, best_candidate, html_partial=html_partial
)
article = self.get_article(candidates, best_candidate,
html_partial=html_partial)
else:
if ruthless:
log.info("ruthless removal did not work. ")
ruthless = False
log.debug(
(
"ended up stripping too much - "
"going for a safer _parse"
)
)
log.info(
("ended up stripping too much - "
"going for a safer parse"))
# try again
continue
else:
log.debug(
(
"Ruthless and lenient parsing did not work. "
"Returning raw html"
)
)
article = self.html.find("body")
log.info(
("Ruthless and lenient parsing did not work. "
"Returning raw html"))
article = self.html.find('body')
if article is None:
article = self.html
cleaned_article = self.sanitize(article, candidates)
article_length = len(cleaned_article or "")
retry_length = self.retry_length
article_length = len(cleaned_article or '')
retry_length = self.options.get(
'retry_length',
self.RETRY_LENGTH)
of_acceptable_length = article_length >= retry_length
if ruthless and not of_acceptable_length:
ruthless = False
@@ -260,37 +190,32 @@ class Document:
else:
return cleaned_article
except Exception as e:
log.exception("error getting summary: ")
if sys.version_info[0] == 2:
from .compat.two import raise_with_traceback
else:
from .compat.three import raise_with_traceback
raise_with_traceback(Unparseable, sys.exc_info()[2], str_(e))
log.exception('error getting summary: ')
raise Unparseable(str(e)), None, sys.exc_info()[2]
def get_article(self, candidates, best_candidate, html_partial=False):
# Now that we have the top candidate, look through its siblings for
# content that might also be related.
# Things like preambles, content split by ads that we removed, etc.
sibling_score_threshold = max([10, best_candidate["content_score"] * 0.2])
sibling_score_threshold = max([
10,
best_candidate['content_score'] * 0.2])
# create a new html document with a html->body->div
if html_partial:
output = fragment_fromstring("<div/>")
output = fragment_fromstring('<div/>')
else:
output = document_fromstring("<div/>")
best_elem = best_candidate["elem"]
parent = best_elem.getparent()
siblings = parent.getchildren() if parent is not None else [best_elem]
for sibling in siblings:
output = document_fromstring('<div/>')
best_elem = best_candidate['elem']
for sibling in best_elem.getparent().getchildren():
# in lxml there is no concept of simple text
# if isinstance(sibling, NavigableString): continue
append = False
if sibling is best_elem:
append = True
sibling_key = sibling # HashableElement(sibling)
if (
sibling_key in candidates
and candidates[sibling_key]["content_score"] >= sibling_score_threshold
):
if sibling_key in candidates and \
candidates[sibling_key]['content_score'] >= \
sibling_score_threshold:
append = True
if sibling.tag == "p":
@@ -300,11 +225,9 @@ class Document:
if node_length > 80 and link_density < 0.25:
append = True
elif (
node_length <= 80
and link_density == 0
and re.search(r"\.( |$)", node_content)
):
elif node_length <= 80 \
and link_density == 0 \
and re.search('\.( |$)', node_content):
append = True
if append:
@@ -315,7 +238,7 @@ class Document:
else:
output.getchildren()[0].getchildren()[0].append(sibling)
# if output is not None:
# output.append(best_elem)
# output.append(best_elem)
return output
def select_best_candidate(self, candidates):
@@ -323,11 +246,16 @@ class Document:
return None
sorted_candidates = sorted(
candidates.values(), key=lambda x: x["content_score"], reverse=True
candidates.values(),
key=lambda x: x['content_score'],
reverse=True
)
for candidate in sorted_candidates[:5]:
elem = candidate["elem"]
log.debug("Top 5 : %6.3f %s" % (candidate["content_score"], describe(elem)))
elem = candidate['elem']
log.info("Top 5 : %6.3f %s" % (
candidate['content_score'],
describe(elem)))
best_candidate = sorted_candidates[0]
return best_candidate
@@ -336,13 +264,15 @@ class Document:
link_length = 0
for i in elem.findall(".//a"):
link_length += text_length(i)
# if len(elem.findall(".//div") or elem.findall(".//p")):
#if len(elem.findall(".//div") or elem.findall(".//p")):
# link_length = link_length
total_length = text_length(elem)
return float(link_length) / max(total_length, 1)
def score_paragraphs(self):
MIN_LEN = self.min_text_length
def score_paragraphs(self, ):
MIN_LEN = self.options.get(
'min_text_length',
self.TEXT_LENGTH_THRESHOLD)
candidates = {}
ordered = []
for elem in self.tags(self._html(), "p", "pre", "td"):
@@ -363,20 +293,22 @@ class Document:
candidates[parent_node] = self.score_node(parent_node)
ordered.append(parent_node)
if grand_parent_node is not None and grand_parent_node not in candidates:
candidates[grand_parent_node] = self.score_node(grand_parent_node)
if grand_parent_node is not None and \
grand_parent_node not in candidates:
candidates[grand_parent_node] = self.score_node(
grand_parent_node)
ordered.append(grand_parent_node)
content_score = 1
content_score += len(inner_text.split(","))
content_score += len(inner_text.split(','))
content_score += min((inner_text_len / 100), 3)
# if elem not in candidates:
#if elem not in candidates:
# candidates[elem] = self.score_node(elem)
# WTF? candidates[elem]['content_score'] += content_score
candidates[parent_node]["content_score"] += content_score
#WTF? candidates[elem]['content_score'] += content_score
candidates[parent_node]['content_score'] += content_score
if grand_parent_node is not None:
candidates[grand_parent_node]["content_score"] += content_score / 2.0
candidates[grand_parent_node]['content_score'] += content_score / 2.0
# Scale the final candidates score based on link density. Good content
# should have a relatively small link density (5% or less) and be
@@ -384,35 +316,37 @@ class Document:
for elem in ordered:
candidate = candidates[elem]
ld = self.get_link_density(elem)
score = candidate["content_score"]
log.debug(
"Branch %6.3f %s link density %.3f -> %6.3f"
% (score, describe(elem), ld, score * (1 - ld))
)
candidate["content_score"] *= 1 - ld
score = candidate['content_score']
log.debug("Branch %6.3f %s link density %.3f -> %6.3f" % (
score, describe(elem), ld, score * (1 - ld)))
candidate['content_score'] *= (1 - ld)
return candidates
def class_weight(self, e):
weight = 0
for feature in [e.get("class", None), e.get("id", None)]:
for feature in [e.get('class', None), e.get('id', None)]:
if feature:
if REGEXES["negativeRe"].search(feature):
if REGEXES['negativeRe'].search(feature):
weight -= 25
if REGEXES["positiveRe"].search(feature):
if REGEXES['positiveRe'].search(feature):
weight += 25
if self.positive_keywords and self.positive_keywords.search(feature):
if self.positive_keywords and self.positive_keywords.search(
feature):
weight += 25
if self.negative_keywords and self.negative_keywords.search(feature):
if self.negative_keywords and self.negative_keywords.search(
feature):
weight -= 25
if self.positive_keywords and self.positive_keywords.match("tag-" + e.tag):
if self.positive_keywords and self.positive_keywords.match(
'tag-' + e.tag):
weight += 25
if self.negative_keywords and self.negative_keywords.match("tag-" + e.tag):
if self.negative_keywords and self.negative_keywords.match(
'tag-' + e.tag):
weight -= 25
return weight
@@ -420,58 +354,49 @@ class Document:
def score_node(self, elem):
content_score = self.class_weight(elem)
name = elem.tag.lower()
if name in ["div", "article"]:
if name == "div":
content_score += 5
elif name in ["pre", "td", "blockquote"]:
content_score += 3
elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form", "aside"]:
elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form"]:
content_score -= 3
elif name in [
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"th",
"header",
"footer",
"nav",
]:
elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th"]:
content_score -= 5
return {"content_score": content_score, "elem": elem}
return {
'content_score': content_score,
'elem': elem
}
def debug(self, *a):
log.warn("debug: " + a[0], *a[1:])
def remove_unlikely_candidates(self):
for elem in self.html.findall(".//*"):
s = "%s %s" % (elem.get("class", ""), elem.get("id", ""))
for elem in self.html.iter():
s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
if len(s) < 2:
continue
if (
REGEXES["unlikelyCandidatesRe"].search(s)
and (not REGEXES["okMaybeItsACandidateRe"].search(s))
and elem.tag not in ["html", "body"]
):
#self.debug(s)
if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag not in ['html', 'body']:
log.debug("Removing unlikely candidate - %s" % describe(elem))
elem.drop_tree()
def transform_misused_divs_into_paragraphs(self):
for elem in self.tags(self.html, "div"):
for elem in self.tags(self.html, 'div'):
# transform <div>s that do not contain other block elements into
# <p>s
# FIXME: The current implementation ignores all descendants that
# are not direct children of elem
# This results in incorrect results in case there is an <img>
# buried within an <a> for example
if not REGEXES["divToPElementsRe"].search(
str_(b"".join(map(tostring_, list(elem))))
):
# log.debug("Altering %s to p" % (describe(elem)))
if not REGEXES['divToPElementsRe'].search(
unicode(''.join(map(tostring, list(elem))))):
# self.debug("Altering %s to p" % describe(elem))
elem.tag = "p"
# print "Fixed element "+describe(elem)
# self.debug("Fixed element "+describe(elem))
for elem in self.tags(self.html, "div"):
for elem in self.tags(self.html, 'div'):
if elem.text and elem.text.strip():
p = fragment_fromstring("<p/>")
p = fragment_fromstring('<p/>')
p.text = elem.text
elem.text = None
elem.insert(0, p)
@@ -479,65 +404,56 @@ class Document:
for pos, child in reversed(list(enumerate(elem))):
if child.tail and child.tail.strip():
p = fragment_fromstring("<p/>")
p = fragment_fromstring('<p/>')
p.text = child.tail
child.tail = None
elem.insert(pos + 1, p)
# print "Inserted "+tounicode(p)+" to "+describe(elem)
if child.tag == "br":
if child.tag == 'br':
# print 'Dropped <br> at '+describe(elem)
child.drop_tree()
def tags(self, node, *tag_names):
for tag_name in tag_names:
for e in node.findall(".//%s" % tag_name):
for e in node.findall('.//%s' % tag_name):
yield e
def reverse_tags(self, node, *tag_names):
for tag_name in tag_names:
for e in reversed(node.findall(".//%s" % tag_name)):
for e in reversed(node.findall('.//%s' % tag_name)):
yield e
def sanitize(self, node, candidates):
MIN_LEN = self.min_text_length
MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
if self.class_weight(header) < 0 or \
self.get_link_density(header) > 0.33:
header.drop_tree()
for elem in self.tags(node, "form", "textarea"):
for elem in self.tags(node, "form", "iframe", "textarea"):
elem.drop_tree()
for elem in self.tags(node, "iframe"):
if "src" in elem.attrib and REGEXES["videoRe"].search(elem.attrib["src"]):
elem.text = "VIDEO" # ADD content to iframe text node to force <iframe></iframe> proper output
else:
elem.drop_tree()
allowed = {}
# Conditionally clean <table>s, <ul>s, and <div>s
for el in self.reverse_tags(
node, "table", "ul", "div", "aside", "header", "footer", "section"
):
for el in self.reverse_tags(node, "table", "ul", "div"):
if el in allowed:
continue
weight = self.class_weight(el)
if el in candidates:
content_score = candidates[el]["content_score"]
content_score = candidates[el]['content_score']
# print '!',el, '-> %6.3f' % content_score
else:
content_score = 0
tag = el.tag
if weight + content_score < 0:
log.debug(
"Removed %s with score %6.3f and weight %-3s"
% (describe(el), content_score, weight,)
)
log.info("Removed %s with score %6.3f and weight %-3s" %
(describe(el), content_score, weight, ))
el.drop_tree()
elif el.text_content().count(",") < 10:
counts = {}
for kind in ["p", "img", "li", "a", "embed", "input"]:
counts[kind] = len(el.findall(".//%s" % kind))
for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
counts[kind] = len(el.findall('.//%s' % kind))
counts["li"] -= 100
counts["input"] -= len(el.findall('.//input[@type="hidden"]'))
@ -547,210 +463,196 @@ class Document:
parent_node = el.getparent()
if parent_node is not None:
if parent_node in candidates:
content_score = candidates[parent_node]["content_score"]
content_score = candidates[
parent_node]['content_score']
else:
content_score = 0
# if parent_node is not None:
# pweight = self.class_weight(parent_node) + content_score
# pname = describe(parent_node)
# pweight = self.class_weight(parent_node) + content_score
# pname = describe(parent_node)
# else:
# pweight = 0
# pname = "no parent"
# pweight = 0
# pname = "no parent"
to_remove = False
reason = ""
# if el.tag == 'div' and counts["img"] >= 1:
# continue
if counts["p"] and counts["img"] > 1 + counts["p"] * 1.3:
reason = "too many images (%s)" % counts["img"]
# continue
if content_length and counts["img"] * 100 >= content_length:
reason = "too many images (%s) for text " % counts["img"]
to_remove = True
elif counts["li"] > counts["p"] and tag not in ("ol", "ul"):
elif counts["li"] > counts["p"] \
and tag != "ul" and tag != "ol":
reason = "more <li>s than <p>s"
to_remove = True
elif counts["input"] > (counts["p"] / 3):
reason = "less than 3x <p>s than <input>s"
to_remove = True
elif content_length < MIN_LEN and counts["img"] == 0:
reason = (
"too short content length %s without a single image"
% content_length
)
elif content_length < MIN_LEN and not counts["img"]:
reason = "too short content length %s and no images" % content_length
to_remove = True
elif content_length < MIN_LEN and counts["img"] > 2:
reason = (
"too short content length %s and too many images"
% content_length
)
reason = "too short content length %s and too much images" % content_length
to_remove = True
elif weight < 25 and link_density > 0.2:
reason = "too many links %.3f for its weight %s" % (
link_density,
weight,
)
to_remove = True
reason = "too many links %.3f for its weight %s" % (
link_density, weight)
to_remove = True
elif weight >= 25 and link_density > 0.5:
reason = "too many links %.3f for its weight %s" % (
link_density,
weight,
)
link_density, weight)
to_remove = True
elif (counts["embed"] == 1 and content_length < 75) or counts[
"embed"
] > 1:
reason = (
"<embed>s with too short content length, or too many <embed>s"
)
elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
reason = "<embed>s with too short content length, or too many <embed>s"
to_remove = True
elif not content_length:
reason = "no content"
to_remove = True
# if el.tag == 'div' and counts['img'] >= 1 and to_remove:
# imgs = el.findall('.//img')
# valid_img = False
# log.debug(tounicode(el))
# for img in imgs:
#
# height = img.get('height')
# text_length = img.get('text_length')
# log.debug ("height %s text_length %s" %(repr(height), repr(text_length)))
# if to_int(height) >= 100 or to_int(text_length) >= 100:
# valid_img = True
# log.debug("valid image" + tounicode(img))
# break
# if valid_img:
# to_remove = False
# log.debug("Allowing %s" %el.text_content())
# for desnode in self.tags(el, "table", "ul", "div"):
# allowed[desnode] = True
# find x non empty preceding and succeeding siblings
# if el.tag == 'div' and counts['img'] >= 1 and to_remove:
# imgs = el.findall('.//img')
# valid_img = False
# self.debug(tounicode(el))
# for img in imgs:
#
# height = img.get('height')
# text_length = img.get('text_length')
# self.debug ("height %s text_length %s" %(repr(height), repr(text_length)))
# if to_int(height) >= 100 or to_int(text_length) >= 100:
# valid_img = True
# self.debug("valid image" + tounicode(img))
# break
# if valid_img:
# to_remove = False
# self.debug("Allowing %s" %el.text_content())
# for desnode in self.tags(el, "table", "ul", "div"):
# allowed[desnode] = True
#find x non empty preceding and succeeding siblings
i, j = 0, 0
x = 1
siblings = []
for sib in el.itersiblings():
# log.debug(sib.text_content())
#self.debug(sib.text_content())
sib_content_length = text_length(sib)
if sib_content_length:
i = +1
i += 1
siblings.append(sib_content_length)
if i == x:
if i >= x:
break
for sib in el.itersiblings(preceding=True):
# log.debug(sib.text_content())
#self.debug(sib.text_content())
sib_content_length = text_length(sib)
if sib_content_length:
j = +1
j += 1
siblings.append(sib_content_length)
if j == x:
if j >= x:
break
# log.debug(str_(siblings))
#self.debug(str(siblings))
if siblings and sum(siblings) > 1000:
to_remove = False
log.debug("Allowing %s" % describe(el))
for desnode in self.tags(el, "table", "ul", "div", "section"):
log.info("Allowing %s" % describe(el))
for desnode in self.tags(el, "table", "ul", "div"):
allowed[desnode] = True
if to_remove:
log.debug(
"Removed %6.3f %s with weight %s cause it has %s."
% (content_score, describe(el), weight, reason)
)
# print tounicode(el)
# log.debug("pname %s pweight %.3f" %(pname, pweight))
log.info("Cleaned %s (score=%6.3f, weight=%s) cause it has %s: %s" %
(describe(el), content_score, weight, reason, text_content(el)))
#print tounicode(el)
#self.debug("pname %s pweight %.3f" %(pname, pweight))
el.drop_tree()
else:
log.debug(
"Not removing %s of length %s: %s"
% (describe(el), content_length, text_content(el))
)
log.info("Not cleaned %s of length %s: %s" %
(describe(el), content_length, text_content(el)))
for el in ([node] + [n for n in node.iter()]):
if not self.options.get('attributes', None):
# el.attrib = {} #FIXME:Checkout the effects of disabling this
pass
self.html = node
return self.get_clean_html()
def main():
VERBOSITY = {1: logging.WARNING, 2: logging.INFO, 3: logging.DEBUG}
class HashableElement():
def __init__(self, node):
self.node = node
self._path = None
def _get_path(self):
if self._path is None:
reverse_path = []
node = self.node
while node is not None:
node_id = (node.tag, tuple(node.attrib.items()), node.text)
reverse_path.append(node_id)
node = node.getparent()
self._path = tuple(reverse_path)
return self._path
path = property(_get_path)
def __hash__(self):
return hash(self.path)
def __eq__(self, other):
return self.path == other.path
def __getattr__(self, tag):
return getattr(self.node, tag)
VERBOSITY = {
1: logging.WARNING,
2: logging.INFO,
3: logging.DEBUG
}
from optparse import OptionParser
def main():
from optparse import OptionParser
parser = OptionParser(usage="%prog: [options] [file]")
parser.add_option("-v", "--verbose", action="count", default=0)
parser.add_option(
"-b", "--browser", default=None, action="store_true", help="open in browser"
)
parser.add_option(
"-l", "--log", default=None, help="save logs into file (appended)"
)
parser.add_option(
"-u", "--url", default=None, help="use URL instead of a local file"
)
parser.add_option("-x", "--xpath", default=None, help="add original xpath")
parser.add_option(
"-p",
"--positive-keywords",
default=None,
help="positive keywords (comma-separated)",
action="store",
)
parser.add_option(
"-n",
"--negative-keywords",
default=None,
help="negative keywords (comma-separated)",
action="store",
)
parser.add_option('-v', '--verbose', action='count', default=0)
parser.add_option('-b', '--browser', default=None, action='store_true', help="open in browser")
parser.add_option('-l', '--log', default=None, help="use filename for logs (appended)")
parser.add_option('-u', '--url', default=None, help="use URL instead of a local file")
parser.add_option('-s', '--show-xpath', default=None, help="show xpath")
parser.add_option('-x', '--xpath', default=None, help="use xpath")
parser.add_option('-t', '--support-text', default=None, help="use this support text")
parser.add_option('-p', '--positive-keywords', default=None, help="positive keywords (separated with comma)", action='store')
parser.add_option('-n', '--negative-keywords', default=None, help="negative keywords (separated with comma)", action='store')
(options, args) = parser.parse_args()
if options.verbose:
logging.basicConfig(
level=VERBOSITY[options.verbose],
filename=options.log,
format="%(asctime)s: %(levelname)s: %(message)s (at %(filename)s: %(lineno)d)",
)
if not (len(args) == 1 or options.url):
parser.print_help()
sys.exit(1)
if options.verbose:
logging.basicConfig(level=VERBOSITY[options.verbose], filename=options.log,
format='%(asctime)s: %(levelname)s: %(message)s (at %(filename)s: %(lineno)d)')
file = None
if options.url:
headers = {"User-Agent": "Mozilla/5.0"}
if sys.version_info[0] == 3:
import urllib.request, urllib.parse, urllib.error
request = urllib.request.Request(options.url, None, headers)
file = urllib.request.urlopen(request)
else:
import urllib2
request = urllib2.Request(options.url, None, headers)
file = urllib2.urlopen(request)
import urllib
file = urllib.urlopen(options.url)
else:
file = open(args[0], "rt")
file = open(args[0], 'rt')
html = file.read() # bytes object
encoding = get_encoding(html)
html = html.decode(encoding)
try:
doc = Document(
file.read(),
url=options.url,
positive_keywords=options.positive_keywords,
negative_keywords=options.negative_keywords,
)
doc = Document(html, url=options.url,
positive_keywords=options.positive_keywords,
negative_keywords=options.negative_keywords)
if options.browser:
from .browser import open_in_browser
result = "<h2>" + doc.short_title() + "</h2><br/>" + doc.summary()
result = 'Title: ' + doc.short_title() + '<br/>' + doc.summary()
open_in_browser(result)
else:
enc = (
sys.__stdout__.encoding or "utf-8"
) # XXX: this hack could not always work, better to set PYTHONIOENCODING
result = "Title:" + doc.short_title() + "\n" + doc.summary()
if sys.version_info[0] == 3:
print(result)
else:
print(result.encode(enc, "replace"))
# XXX: a hack, better to set PYTHONIOENCODING explicitly
output_encoding = sys.__stdout__.encoding or 'utf-8'
print 'Title:', doc.short_title().encode(output_encoding, 'replace')
print doc.summary().encode(output_encoding, 'replace')
finally:
file.close()
if __name__ == "__main__":
if __name__ == '__main__':
main()
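main() above wires the command-line options into the Document API. A minimal programmatic sketch of the same flow (the URL is illustrative; this assumes a UTF-8 page, whereas main() detects the encoding with get_encoding()):

import urllib.request

from readability import Document

url = "http://example.com/article.html"  # illustrative, not a real article
request = urllib.request.Request(url, None, {"User-Agent": "Mozilla/5.0"})
html = urllib.request.urlopen(request).read().decode("utf-8")  # assuming UTF-8

doc = Document(
    html,
    url=url,
    positive_keywords="article,content",   # comma-separated, as passed in by the CLI
    negative_keywords="sidebar,comment",
)
print("Title:", doc.short_title())
print(doc.summary())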

@ -1,6 +0,0 @@
lxml
chardet
nose
pep8
coverage
timeout_decorator

@ -1,78 +1,34 @@
#!/usr/bin/env python
from __future__ import print_function
import codecs
import os
import re
from setuptools import setup
from setuptools import setup, find_packages
import sys
lxml_requirement = "lxml"
if sys.platform == "darwin":
if sys.platform == 'darwin':
import platform
mac_ver = platform.mac_ver()[0]
mac_major, mac_minor = mac_ver.split('.')[:2]
if int(mac_major) == 10 and int(mac_minor) < 9:
print("Using lxml<2.4")
if int(mac_ver.split('.')[1]) < 9:
print "Using lxml<2.4 for Mac OS X < 10.9"
lxml_requirement = "lxml<2.4"
test_deps = [
# Test timeouts
"timeout_decorator",
]
extras = {
"test": test_deps,
}
# Adapted from https://github.com/pypa/pip/blob/master/setup.py
def find_version(*file_paths):
here = os.path.abspath(os.path.dirname(__file__))
# Intentionally *not* adding an encoding option to open, See:
# https://github.com/pypa/virtualenv/issues/201#issuecomment-3145690
with codecs.open(os.path.join(here, *file_paths), "r") as fp:
version_file = fp.read()
version_match = re.search(
r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M,
)
if version_match:
return version_match.group(1)
raise RuntimeError("Unable to find version string.")
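A quick illustration of what find_version() matches; the version value and file contents here are placeholders, not the project's real ones:

import re

version_file = '__version__ = "0.1.0"\n'  # placeholder contents of readability/__init__.py
match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
print(match.group(1))  # -> 0.1.0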
setup(
name="readability-lxml",
version=find_version("readability", "__init__.py"),
version="0.5.0.3",
author="Yuri Baburov",
author_email="burchik@gmail.com",
description="fast html to text parser (article readability tool) with python 3 support",
test_suite="tests.test_article_only",
long_description=open("README.rst").read(),
long_description_content_type='text/x-rst',
description="fast python port of arc90's readability tool",
test_suite = "tests.test_article_only",
long_description=open("README").read(),
license="Apache License 2.0",
url="http://github.com/buriy/python-readability",
packages=["readability", "readability.compat"],
install_requires=["chardet", lxml_requirement, "cssselect"],
tests_require=test_deps,
extras_require=extras,
packages=['readability'],
install_requires=[
"chardet",
lxml_requirement
],
classifiers=[
"Environment :: Web Environment",
"Intended Audience :: Developers",
"Operating System :: OS Independent",
"Topic :: Text Processing :: Indexing",
"Topic :: Utilities",
"Topic :: Internet",
"Topic :: Software Development :: Libraries :: Python Modules",
"Programming Language :: Python",
"Programming Language :: Python :: 2",
"Programming Language :: Python :: 2.7",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.5",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
],
],
)

File diff suppressed because one or more lines are too long

@ -1,60 +0,0 @@
<!DOCTYPE html>
<html lang="ja">
<body>
<div>
<article>
<div>
草枕
夏目漱石
 山路を登りながら、こう考えた。
 智に働けば角が立つ。情に棹させば流される。意地を通せば窮屈だ。とかくに人の世は住みにくい。
 住みにくさが高じると、安い所へ引き越したくなる。どこへ越しても住みにくいと悟った時、詩が生れて、画が出来る。
 人の世を作ったものは神でもなければ鬼でもない。やはり向う三軒両隣りにちらちらするただの人である。ただの人が作った人の世が住みにくいからとて、越す国はあるまい。あれば人でなしの国へ行くばかりだ。人でなしの国は人の世よりもなお住みにくかろう。
 越す事のならぬ世が住みにくければ、住みにくい所をどれほどか、寛容て、束の間の命を、束の間でも住みよくせねばならぬ。ここに詩人という天職が出来て、ここに画家という使命が降る。あらゆる芸術の士は人の世を長閑にし、人の心を豊かにするが故に尊とい。
 住みにくき世から、住みにくき煩いを引き抜いて、ありがたい世界をまのあたりに写すのが詩である、画である。あるは音楽と彫刻である。こまかに云えば写さないでもよい。ただまのあたりに見れば、そこに詩も生き、歌も湧く。着想を紙に落さぬとも※(「王膠のつくり」、第3水準1-88-22)鏘の音は胸裏に起る。丹青は画架に向って塗抹せんでも五彩の絢爛は自から心眼に映る。ただおのが住む世を、かく観じ得て、霊台方寸のカメラに澆季溷濁の俗界を清くうららかに収め得れば足る。この故に無声の詩人には一句なく、無色の画家には尺※(「糸賺のつくり」、第3水準1-90-17)なきも、かく人世を観じ得るの点において、かく煩悩を解脱するの点において、かく清浄界に出入し得るの点において、またこの不同不二の乾坤を建立し得るの点において、我利私慾の覊絆を掃蕩するの点において、――千金の子よりも、万乗の君よりも、あらゆる俗界の寵児よりも幸福である。
 世に住むこと二十年にして、住むに甲斐ある世と知った。二十五年にして明暗は表裏のごとく、日のあたる所にはきっと影がさすと悟った。三十の今日はこう思うている。――喜びの深きとき憂いよいよ深く、楽みの大いなるほど苦しみも大きい。これを切り放そうとすると身が持てぬ。片づけようとすれば世が立たぬ。金は大事だ、大事なものが殖えれば寝る間も心配だろう。恋はうれしい、嬉しい恋が積もれば、恋をせぬ昔がかえって恋しかろ。閣僚の肩は数百万人の足を支えている。背中には重い天下がおぶさっている。うまい物も食わねば惜しい。少し食えば飽き足らぬ。存分食えばあとが不愉快だ。……
 余の考がここまで漂流して来た時に、余の右足は突然坐りのわるい角石の端を踏み損くなった。平衡を保つために、すわやと前に飛び出した左足が、仕損じの埋め合せをすると共に、余の腰は具合よく方三尺ほどな岩の上に卸りた。肩にかけた絵の具箱が腋の下から躍り出しただけで、幸いと何の事もなかった。
 立ち上がる時に向うを見ると、路から左の方にバケツを伏せたような峰が聳えている。杉か檜か分からないが根元から頂きまでことごとく蒼黒い中に、山桜が薄赤くだんだらに棚引いて、続ぎ目が確と見えぬくらい靄が濃い。少し手前に禿山が一つ、群をぬきんでて眉に逼る。禿げた側面は巨人の斧で削り去ったか、鋭どき平面をやけに谷の底に埋めている。天辺に一本見えるのは赤松だろう。枝の間の空さえ判然している。行く手は二丁ほどで切れているが、高い所から赤い毛布が動いて来るのを見ると、登ればあすこへ出るのだろう。路はすこぶる難義だ。
 土をならすだけならさほど手間も入るまいが、土の中には大きな石がある。土は平らにしても石は平らにならぬ。石は切り砕いても、岩は始末がつかぬ。掘崩した土の上に悠然と峙って、吾らのために道を譲る景色はない。向うで聞かぬ上は乗り越すか、廻らなければならん。巌のない所でさえ歩るきよくはない。左右が高くって、中心が窪んで、まるで一間幅を三角に穿って、その頂点が真中を貫いていると評してもよい。路を行くと云わんより川底を渉ると云う方が適当だ。固より急ぐ旅でないから、ぶらぶらと七曲りへかかる。
 たちまち足の下で雲雀の声がし出した。谷を見下したが、どこで鳴いてるか影も形も見えぬ。ただ声だけが明らかに聞える。せっせと忙しく、絶間なく鳴いている。方幾里の空気が一面に蚤に刺されていたたまれないような気がする。あの鳥の鳴く音には瞬時の余裕もない。のどかな春の日を鳴き尽くし、鳴きあかし、また鳴き暮らさなければ気が済まんと見える。その上どこまでも登って行く、いつまでも登って行く。雲雀はきっと雲の中で死ぬに相違ない。登り詰めた揚句は、流れて雲に入って、漂うているうちに形は消えてなくなって、ただ声だけが空の裡に残るのかも知れない。
 巌角を鋭どく廻って、按摩なら真逆様に落つるところを、際どく右へ切れて、横に見下すと、菜の花が一面に見える。雲雀はあすこへ落ちるのかと思った。いいや、あの黄金の原から飛び上がってくるのかと思った。次には落ちる雲雀と、上る雲雀が十文字にすれ違うのかと思った。最後に、落ちる時も、上る時も、また十文字に擦れ違うときにも元気よく鳴きつづけるだろうと思った。
 春は眠くなる。猫は鼠を捕る事を忘れ、人間は借金のある事を忘れる。時には自分の魂の居所さえ忘れて正体なくなる。ただ菜の花を遠く望んだときに眼が醒める。雲雀の声を聞いたときに魂のありかが判然する。雲雀の鳴くのは口で鳴くのではない、魂全体が鳴くのだ。魂の活動が声にあらわれたもののうちで、あれほど元気のあるものはない。ああ愉快だ。こう思って、こう愉快になるのが詩である。
 たちまちシェレーの雲雀の詩を思い出して、口のうちで覚えたところだけ暗誦して見たが、覚えているところは二三句しかなかった。その二三句のなかにこんなのがある。
  We look before and after
    And pine for what is not:
  Our sincerest laughter
    With some pain is fraught;
Our sweetest songs are those that tell of saddest thought.
「前をみては、後えを見ては、物欲しと、あこがるるかなわれ。腹からの、笑といえど、苦しみの、そこにあるべし。うつくしき、極みの歌に、悲しさの、極みの想、籠るとぞ知れ」
 なるほどいくら詩人が幸福でも、あの雲雀のように思い切って、一心不乱に、前後を忘却して、わが喜びを歌う訳には行くまい。西洋の詩は無論の事、支那の詩にも、よく万斛の愁などと云う字がある。詩人だから万斛で素人なら一合で済むかも知れぬ。して見ると詩人は常の人よりも苦労性で、凡骨の倍以上に神経が鋭敏なのかも知れん。超俗の喜びもあろうが、無量の悲も多かろう。そんならば詩人になるのも考え物だ。
 しばらくは路が平で、右は雑木山、左は菜の花の見つづけである。足の下に時々蒲公英を踏みつける。鋸のような葉が遠慮なく四方へのして真中に黄色な珠を擁護している。菜の花に気をとられて、踏みつけたあとで、気の毒な事をしたと、振り向いて見ると、黄色な珠は依然として鋸のなかに鎮座している。呑気なものだ。また考えをつづける。
 詩人に憂はつきものかも知れないが、あの雲雀を聞く心持になれば微塵の苦もない。菜の花を見ても、ただうれしくて胸が躍るばかりだ。蒲公英もその通り、桜も――桜はいつか見えなくなった。こう山の中へ来て自然の景物に接すれば、見るものも聞くものも面白い。面白いだけで別段の苦しみも起らぬ。起るとすれば足が草臥れて、旨いものが食べられぬくらいの事だろう。
 しかし苦しみのないのはなぜだろう。ただこの景色を一幅の画として観、一巻の詩として読むからである。画であり詩である以上は地面を貰って、開拓する気にもならねば、鉄道をかけて一儲けする了見も起らぬ。ただこの景色が――腹の足しにもならぬ、月給の補いにもならぬこの景色が景色としてのみ、余が心を楽ませつつあるから苦労も心配も伴わぬのだろう。自然の力はここにおいて尊とい。吾人の性情を瞬刻に陶冶して醇乎として醇なる詩境に入らしむるのは自然である。
 恋はうつくしかろ、孝もうつくしかろ、忠君愛国も結構だろう。しかし自身がその局に当れば利害の旋風に捲き込まれて、うつくしき事にも、結構な事にも、目は眩んでしまう。したがってどこに詩があるか自身には解しかねる。
 これがわかるためには、わかるだけの余裕のある第三者の地位に立たねばならぬ。三者の地位に立てばこそ芝居は観て面白い。小説も見て面白い。芝居を見て面白い人も、小説を読んで面白い人も、自己の利害は棚へ上げている。見たり読んだりする間だけは詩人である。
 それすら、普通の芝居や小説では人情を免かれぬ。苦しんだり、怒ったり、騒いだり、泣いたりする。見るものもいつかその中に同化して苦しんだり、怒ったり、騒いだり、泣いたりする。取柄は利慾が交らぬと云う点に存するかも知れぬが、交らぬだけにその他の情緒は常よりは余計に活動するだろう。それが嫌だ。
 苦しんだり、怒ったり、騒いだり、泣いたりは人の世につきものだ。余も三十年の間それを仕通して、飽々した。飽き飽きした上に芝居や小説で同じ刺激を繰り返しては大変だ。余が欲する詩はそんな世間的の人情を鼓舞するようなものではない。俗念を放棄して、しばらくでも塵界を離れた心持ちになれる詩である。いくら傑作でも人情を離れた芝居はない、理非を絶した小説は少かろう。どこまでも世間を出る事が出来ぬのが彼らの特色である。ことに西洋の詩になると、人事が根本になるからいわゆる詩歌の純粋なるものもこの境を解脱する事を知らぬ。どこまでも同情だとか、愛だとか、正義だとか、自由だとか、浮世の勧工場にあるものだけで用を弁じている。いくら詩的になっても地面の上を馳けてあるいて、銭の勘定を忘れるひまがない。シェレーが雲雀を聞いて嘆息したのも無理はない。
 うれしい事に東洋の詩歌はそこを解脱したのがある。採菊東籬下、悠然見南山。ただそれぎりの裏に暑苦しい世の中をまるで忘れた光景が出てくる。垣の向うに隣りの娘が覗いてる訳でもなければ、南山に親友が奉職している次第でもない。超然と出世間的に利害損得の汗を流し去った心持ちになれる。独坐幽篁裏、弾琴復長嘯、深林人不知、明月来相照。ただ二十字のうちに優に別乾坤を建立している。この乾坤の功徳は「不如帰」や「金色夜叉」の功徳ではない。汽船、汽車、権利、義務、道徳、礼義で疲れ果てた後に、すべてを忘却してぐっすり寝込むような功徳である。
 二十世紀に睡眠が必要ならば、二十世紀にこの出世間的の詩味は大切である。惜しい事に今の詩を作る人も、詩を読む人もみんな、西洋人にかぶれているから、わざわざ呑気な扁舟を泛べてこの桃源に溯るものはないようだ。余は固より詩人を職業にしておらんから、王維や淵明の境界を今の世に布教して広げようと云う心掛も何もない。ただ自分にはこう云う感興が演芸会よりも舞踏会よりも薬になるように思われる。ファウストよりも、ハムレットよりもありがたく考えられる。こうやって、ただ一人絵の具箱と三脚几を担いで春の山路をのそのそあるくのも全くこれがためである。淵明、王維の詩境を直接に自然から吸収して、すこしの間でも非人情の天地に逍遥したいからの願。一つの酔興だ。
 もちろん人間の一分子だから、いくら好きでも、非人情はそう長く続く訳には行かぬ。淵明だって年が年中南山を見詰めていたのでもあるまいし、王維も好んで竹藪の中に蚊帳を釣らずに寝た男でもなかろう。やはり余った菊は花屋へ売りこかして、生えた筍は八百屋へ払い下げたものと思う。こう云う余もその通り。いくら雲雀と菜の花が気に入ったって、山のなかへ野宿するほど非人情が募ってはおらん。こんな所でも人間に逢う。じんじん端折りの頬冠りや、赤い腰巻の姉さんや、時には人間より顔の長い馬にまで逢う。百万本の檜に取り囲まれて、海面を抜く何百尺かの空気を呑んだり吐いたりしても、人の臭いはなかなか取れない。それどころか、山を越えて落ちつく先の、今宵の宿は那古井の温泉場だ。
 ただ、物は見様でどうでもなる。レオナルド・ダ・ヴィンチが弟子に告げた言に、あの鐘の音を聞け、鐘は一つだが、音はどうとも聞かれるとある。一人の男、一人の女も見様次第でいかようとも見立てがつく。どうせ非人情をしに出掛けた旅だから、そのつもりで人間を見たら、浮世小路の何軒目に狭苦しく暮した時とは違うだろう。よし全く人情を離れる事が出来んでも、せめて御能拝見の時くらいは淡い心持ちにはなれそうなものだ。能にも人情はある。七騎落でも、墨田川でも泣かぬとは保証が出来ん。しかしあれは情三分芸七分で見せるわざだ。我らが能から享けるありがた味は下界の人情をよくそのままに写す手際から出てくるのではない。そのままの上へ芸術という着物を何枚も着せて、世の中にあるまじき悠長な振舞をするからである。
 しばらくこの旅中に起る出来事と、旅中に出逢う人間を能の仕組と能役者の所作に見立てたらどうだろう。まるで人情を棄てる訳には行くまいが、根が詩的に出来た旅だから、非人情のやりついでに、なるべく節倹してそこまでは漕ぎつけたいものだ。南山や幽篁とは性の違ったものに相違ないし、また雲雀や菜の花といっしょにする事も出来まいが、なるべくこれに近づけて、近づけ得る限りは同じ観察点から人間を視てみたい。芭蕉と云う男は枕元へ馬が尿するのをさえ雅な事と見立てて発句にした。余もこれから逢う人物を――百姓も、町人も、村役場の書記も、爺さんも婆さんも――ことごとく大自然の点景として描き出されたものと仮定して取こなして見よう。もっとも画中の人物と違って、彼らはおのがじし勝手な真似をするだろう。しかし普通の小説家のようにその勝手な真似の根本を探ぐって、心理作用に立ち入ったり、人事葛藤の詮議立てをしては俗になる。動いても構わない。画中の人間が動くと見れば差し支ない。画中の人物はどう動いても平面以外に出られるものではない。平面以外に飛び出して、立方的に働くと思えばこそ、こっちと衝突したり、利害の交渉が起ったりして面倒になる。面倒になればなるほど美的に見ている訳に行かなくなる。これから逢う人間には超然と遠き上から見物する気で、人情の電気がむやみに双方で起らないようにする。そうすれば相手がいくら働いても、こちらの懐には容易に飛び込めない訳だから、つまりは画の前へ立って、画中の人物が画面の中をあちらこちらと騒ぎ廻るのを見るのと同じ訳になる。間三尺も隔てていれば落ちついて見られる。あぶな気なしに見られる。言を換えて云えば、利害に気を奪われないから、全力を挙げて彼らの動作を芸術の方面から観察する事が出来る。余念もなく美か美でないかと鑒識する事が出来る。
 ここまで決心をした時、空があやしくなって来た。煮え切れない雲が、頭の上へ靠垂れ懸っていたと思ったが、いつのまにか、崩れ出して、四方はただ雲の海かと怪しまれる中から、しとしとと春の雨が降り出した。菜の花は疾くに通り過して、今は山と山の間を行くのだが、雨の糸が濃かでほとんど霧を欺くくらいだから、隔たりはどれほどかわからぬ。時々風が来て、高い雲を吹き払うとき、薄黒い山の背が右手に見える事がある。何でも谷一つ隔てて向うが脈の走っている所らしい。左はすぐ山の裾と見える。深く罩める雨の奥から松らしいものが、ちょくちょく顔を出す。出すかと思うと、隠れる。雨が動くのか、木が動くのか、夢が動くのか、何となく不思議な心持ちだ。
 路は存外広くなって、かつ平だから、あるくに骨は折れんが、雨具の用意がないので急ぐ。帽子から雨垂れがぽたりぽたりと落つる頃、五六間先きから、鈴の音がして、黒い中から、馬子がふうとあらわれた。
「ここらに休む所はないかね」
「もう十五丁行くと茶屋がありますよ。だいぶ濡れたね」
 まだ十五丁かと、振り向いているうちに、馬子の姿は影画のように雨につつまれて、またふうと消えた。
 糠のように見えた粒は次第に太く長くなって、今は一筋ごとに風に捲かれる様までが目に入る。羽織はとくに濡れ尽して肌着に浸み込んだ水が、身体の温度で生暖く感ぜられる。気持がわるいから、帽を傾けて、すたすた歩行く。
 茫々たる薄墨色の世界を、幾条の銀箭が斜めに走るなかを、ひたぶるに濡れて行くわれを、われならぬ人の姿と思えば、詩にもなる、句にも咏まれる。有体なる己れを忘れ尽して純客観に眼をつくる時、始めてわれは画中の人物として、自然の景物と美しき調和を保つ。ただ降る雨の心苦しくて、踏む足の疲れたるを気に掛ける瞬間に、われはすでに詩中の人にもあらず、画裡の人にもあらず。依然として市井の一豎子に過ぎぬ。雲煙飛動の趣も眼に入らぬ。落花啼鳥の情けも心に浮ばぬ。蕭々として独り春山を行く吾の、いかに美しきかはなおさらに解せぬ。初めは帽を傾けて歩行た。後にはただ足の甲のみを見詰めてあるいた。終りには肩をすぼめて、恐る恐る歩行た。雨は満目の樹梢を揺かして四方より孤客に逼る。非人情がちと強過ぎたようだ。
</div>
</article>
</div>
<div>
<a href="https://www.aozora.gr.jp/cards/000148/card776.html">青空文庫 - 図書カードNo.776</a>
</div>
</html>

@ -2,17 +2,14 @@ import os
import unittest
from readability import Document
import timeout_decorator
SAMPLES = os.path.join(os.path.dirname(__file__), "samples")
SAMPLES = os.path.join(os.path.dirname(__file__), 'samples')
def load_sample(filename):
"""Helper to get the content out of the sample files"""
with open(os.path.join(SAMPLES, filename)) as f:
html = f.read()
return html
return open(os.path.join(SAMPLES, filename)).read()
class TestArticleOnly(unittest.TestCase):
@ -26,101 +23,17 @@ class TestArticleOnly(unittest.TestCase):
def test_si_sample(self):
"""Using the si sample, load article with only opening body element"""
sample = load_sample("si-game.sample.html")
sample = load_sample('si-game.sample.html')
doc = Document(
sample,
url="http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html",
)
url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
res = doc.summary()
self.assertEqual("<html><body><div><div class", res[0:27])
self.assertEqual('<html><body><div><div class', res[0:27])
def test_si_sample_html_partial(self):
"""Using the si sample, make sure we can get the article alone."""
sample = load_sample("si-game.sample.html")
doc = Document(
sample,
url="http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html",
)
sample = load_sample('si-game.sample.html')
doc = Document(sample, url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
res = doc.summary(html_partial=True)
self.assertEqual('<div><div class="', res[0:17])
def test_too_many_images_sample_html_partial(self):
"""Using the too-many-images sample, make sure we still get the article."""
sample = load_sample("too-many-images.sample.html")
doc = Document(sample)
res = doc.summary(html_partial=True)
self.assertEqual('<div><div class="post-body', res[0:26])
def test_wrong_link_issue_49(self):
"""We shouldn't break on bad HTML."""
sample = load_sample("the-hurricane-rubin-carter-denzel-washington.html")
doc = Document(sample)
res = doc.summary(html_partial=True)
self.assertEqual('<div><div class="content__article-body ', res[0:39])
def test_best_elem_is_root_and_passing(self):
sample = (
'<html class="article" id="body">'
" <body>"
" <p>1234567890123456789012345</p>"
" </body>"
"</html>"
)
doc = Document(sample)
doc.summary()
def test_correct_cleanup(self):
sample = """
<html>
<body>
<section>test section</section>
<article class="">
<p>Lot of text here.</p>
<div id="advertisement"><a href="link">Ad</a></div>
<p>More text is written here, and contains punctuation and dots.</p>
</article>
<aside id="comment1"/>
<div id="comment2">
<a href="asd">spam</a>
<a href="asd">spam</a>
<a href="asd">spam</a>
</div>
<div id="comment3"/>
<aside id="comment4">A small comment.</aside>
<div id="comment5"><p>The comment is also helpful, but it's
still not the correct item to be extracted.</p>
<p>It's even longer than the article itself!"</p></div>
</body>
</html>
"""
doc = Document(sample)
s = doc.summary()
# print(s)
assert "punctuation" in s
assert not "comment" in s
assert not "aside" in s
# Many spaces make some regexes run forever
@timeout_decorator.timeout(seconds=3, use_signals=False)
def test_many_repeated_spaces(self):
long_space = " " * 1000000
sample = "<html><body><p>foo" + long_space + "</p></body></html>"
doc = Document(sample)
s = doc.summary()
assert "foo" in s
def test_not_self_closing(self):
sample = '<h2><a href="#"></a>foobar</h2>'
doc = Document(sample)
assert (
'<body id="readabilityBody"><h2><a href="#"></a>foobar</h2></body>'
== doc.summary()
)
def test_utf8_kanji(self):
"""Using the UTF-8 kanji sample, load article which is written in kanji"""
sample = load_sample("utf-8-kanji.sample.html")
doc = Document(sample)
res = doc.summary()

@ -1,33 +0,0 @@
# Tox (http://tox.testrun.org/) is a tool for running tests
# in multiple virtualenvs. This configuration file will run the
# test suite on all supported python versions. To use it, "pip install tox"
# and then run "tox" from this directory.
[tox]
envlist =
py{27,35,36,37,38,py,py3}, doc
skip_missing_interpreters =
True
[testenv]
deps =
pytest
doc: sphinx
doc: sphinx_rtd_theme
doc: recommonmark
# This creates the virtual envs with --site-packages so that packages which are
# already installed will be reused. This is especially useful on Windows.
# Since we use a prebuilt lxml wheel instead of compiling lxml locally (which
# would require a compiler and the build dependencies), you can download
# it from http://www.lfd.uci.edu/~gohlke/pythonlibs/#lxml and install it via
# $PYTHONDIR\Scripts\pip.exe install *.whl
sitepackages=
True
commands =
pip install -r requirements.txt -e ".[test]"
py.test
[testenv:doc]
commands =
python setup.py build_sphinx