diff --git a/README b/README index b42803f..a29322f 100644 --- a/README +++ b/README @@ -13,9 +13,12 @@ Based on: - Ruby port by starrhorne and iterationlabs - Python port by gfxmonk ( https://github.com/gfxmonk/python-readability , based on BeautifulSoup ) - Decruft effort to move to lxml ( http://www.minvolai.com/blog/decruft-arc90s-readability-in-python/ ) + - "BR to P" fix from readability.js which improves quality for smaller texts. + - GitHub users' contributions. Usage: +from readability.readability import Document import urllib html = urllib.urlopen(url).read() readable_article = Document(html).summary() @@ -23,4 +26,4 @@ readable_title = Document(html).short_title() Command-line usage: -python -m readability.readability -u http://yoursite.com/yourpage +python -m readability.readability -u http://pypi.python.org/pypi/readability-lxml diff --git a/readability/readability.py b/readability/readability.py index 94d43fe..4ef86cb 100644 --- a/readability/readability.py +++ b/readability/readability.py @@ -120,7 +120,9 @@ class Document: continue else: logging.debug("Ruthless and lenient parsing did not work. Returning raw html") - article = self.html.find('body') or self.html + article = self.html.find('body') + if article is None: + article = self.html cleaned_article = self.sanitize(article, candidates) of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH) diff --git a/setup.py b/setup.py index 53eb747..639d24f 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup( name="readability-lxml", - version="0.1dev", + version="0.2", author="Yuri Baburov", author_email="burchik@gmail.com", description="python port of arc90's readability bookmarklet",