From 58c69651d37af756caf40d30ddf8dd18b4703e93 Mon Sep 17 00:00:00 2001 From: Richard Harding Date: Wed, 18 Apr 2012 21:31:42 -0400 Subject: [PATCH] Update README to be a rst file and clean up a little bit. --- README => README.rst | 60 ++++++++++++++++++----------- src/readability_lxml/readability.py | 8 +--- 2 files changed, 39 insertions(+), 29 deletions(-) rename README => README.rst (52%) diff --git a/README b/README.rst similarity index 52% rename from README rename to README.rst index 9e0471c..ee9aa66 100644 --- a/README +++ b/README.rst @@ -1,14 +1,14 @@ -This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0 +readability_lxml +================ -This is a python port of a ruby port of arc90's readability project +This is a python port of a ruby port of `arc90's readability`_ project -http://lab.arc90.com/experiments/readability/ - -In few words, Given a html document, it pulls out the main body text and cleans it up. It also can clean up title based on latest readability.js code. -Based on: + +Inspiration +----------- - Latest readability.js ( https://github.com/MHordecki/readability-redux/blob/master/readability/readability.js ) - Ruby port by starrhorne and iterationlabs - Python port by gfxmonk ( https://github.com/gfxmonk/python-readability , based on BeautifulSoup ) @@ -16,13 +16,29 @@ Based on: - "BR to P" fix from readability.js which improves quality for smaller texts. - Github users contributions. 
-Installation:: - easy_install readability-lxml - or - pip install readability-lxml +Installation +------------- +:: + + $ easy_install readability-lxml + # or + $ pip install readability-lxml + + +Usage +------ -Usage:: +Command Line Client +~~~~~~~~~~~~~~~~~~~ +:: + + $ readability http://pypi.python.org/pypi/readability-lxml + $ readability /home/rharding/sampledoc.html + +As a Library +~~~~~~~~~~~~ +:: from readability.readability import Document import urllib @@ -30,21 +46,19 @@ Usage:: readable_article = Document(html).summary() readable_title = Document(html).short_title() -Command-line usage:: - - python -m readability.readability -u http://pypi.python.org/pypi/readability-lxml - +Optional `Document` keyword arguments: -Document() kwarg options: +- attributes: +- debug: output debug messages +- min_text_length: +- retry_length: +- url: will allow adjusting links to be absolute - - attributes: - - debug: output debug messages - - min_text_length: - - retry_length: - - url: will allow adjusting links to be absolute +History +------- -Updates + - ``0.2.5`` Update setup.py for uploading .tar.gz to pypi - - 0.2.5 Update setup.py for uploading .tar.gz to pypi +.. _arc90's readability: http://lab.arc90.com/experiments/readability/ diff --git a/src/readability_lxml/readability.py b/src/readability_lxml/readability.py index 2c8c630..c83b46f 100755 --- a/src/readability_lxml/readability.py +++ b/src/readability_lxml/readability.py @@ -102,11 +102,6 @@ class Document: self.options = options self.html = None - def _html(self, force=False): - if force or self.html is None: - self.html = self._parse(self.input_doc) - return self.html - def _parse(self, input_doc): doc = build_doc(input_doc) doc = html_cleaner.clean_html(doc) @@ -136,7 +131,8 @@ class Document: try: ruthless = True while True: - self._html(True) + self.html = self._parse(self.input_doc) + for i in self.tags(self.html, 'script', 'style'): i.drop_tree() for i in self.tags(self.html, 'body'):