Release version 0.7. Better HTML5 support and an important bugfix.

pull/74/merge
Yuri Baburov 6 years ago
parent 537de2b8f6
commit 0e50b53d05

@@ -11,7 +11,7 @@ env:
 before_install:
 # work around https://github.com/travis-ci/travis-ci/issues/8363
-- pyenv global system 3.5
+- pyenv global system 3.6
 install:
 - travis_retry pip install -U pip wheel tox

@@ -1,10 +1,9 @@
 # Makefile to help automate tasks
 WD := $(shell pwd)
-PY := .env/bin/python
-PIP := .env/bin/pip
-PEP8 := .env/bin/pep8
-NOSE := .env/bin/nosetests
+PY := .venv/bin/python
+PIP := .venv/bin/pip
+PEP8 := .venv/bin/pep8
+NOSE := .venv/bin/nosetests
 # ###########
 # Tests rule!
@@ -22,16 +21,17 @@ $(NOSE):
 .PHONY: all
 all: venv develop
-venv: bin/python
-bin/python:
-	virtualenv .env
+venv: .venv/bin/python
+.venv/bin/python:
+	virtualenv .venv
 .PHONY: clean_venv
 clean_venv:
-	rm -rf .env
+	rm -rf .venv
-develop: .env/lib/python*/site-packages/readability-lxml.egg-link
-.env/lib/python*/site-packages/readability-lxml.egg-link:
+develop: .venv/lib/python*/site-packages/readability-lxml.egg-link
+.venv/lib/python*/site-packages/readability-lxml.egg-link:
 	$(PY) setup.py develop

@@ -35,13 +35,15 @@ Usage
 Change Log
 ----------
-- 0.3 Added Document.encoding, positive\_keywords and
-  negative\_keywords
-- 0.4 Added Videos loading and allowed more images per paragraph
-- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and
-  3.4
+- 0.7 Improved HTML5 tags handling. Heuristics were changed for a lot of sites. Fixed an important
+  bug with stripping unwanted HTML nodes (only the first matching node was removed before).
 - 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3
   and 3.4
+- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and
+  3.4
+- 0.4 Added Videos loading and allowed more images per paragraph
+- 0.3 Added Document.encoding, positive\_keywords and
+  negative\_keywords
 
 Licensing
 =========
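The bugfix called out in the 0.7 entry concerns node removal during cleanup. As a purely illustrative sketch in plain lxml (not the library's actual code), the difference between the old and the fixed behaviour looks roughly like this:

# Illustrative sketch of the bug class described above: dropping only the
# first matching node leaves later duplicates in the tree.
import lxml.html

html = "<div><span class='ad'>x</span><p>text</p><span class='ad'>y</span></div>"

# Buggy pattern: only the first unwanted node is removed.
doc = lxml.html.fromstring(html)
bad = doc.find(".//span[@class='ad']")
if bad is not None:
    bad.drop_tree()            # the second matching span survives

# Fixed pattern: take a snapshot of all matches, then drop each one.
doc = lxml.html.fromstring(html)
for el in doc.findall(".//span[@class='ad']"):
    el.drop_tree()             # no matching span remains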

@@ -381,13 +381,13 @@ class Document:
     def score_node(self, elem):
         content_score = self.class_weight(elem)
         name = elem.tag.lower()
-        if name == "div":
+        if name in ["div", "article"]:
             content_score += 5
         elif name in ["pre", "td", "blockquote"]:
             content_score += 3
-        elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form"]:
+        elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form", "aside"]:
             content_score -= 3
-        elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th"]:
+        elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th", "header", "footer", "nav"]:
             content_score -= 5
         return {
             'content_score': content_score,
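To make the effect of the new branches concrete, here is a small standalone sketch that mirrors only the tag-name part of score_node shown above; the class_weight contribution is left out and the function name is ours, not the library's:

# Sketch mirroring the tag scores in the diff above; not the library's API.
def tag_score(name):
    name = name.lower()
    if name in ["div", "article"]:
        return 5      # <article> now scores like <div>
    elif name in ["pre", "td", "blockquote"]:
        return 3
    elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form", "aside"]:
        return -3     # <aside> is now penalized like lists and forms
    elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th", "header", "footer", "nav"]:
        return -5     # page chrome tags are now penalized like headings
    return 0

assert tag_score("article") == 5
assert tag_score("nav") == -5
assert tag_score("p") == 0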
@@ -463,7 +463,7 @@ class Document:
         allowed = {}
         # Conditionally clean <table>s, <ul>s, and <div>s
-        for el in self.reverse_tags(node, "table", "ul", "div"):
+        for el in self.reverse_tags(node, "table", "ul", "div", "aside", "header", "footer", "section"):
             if el in allowed:
                 continue
             weight = self.class_weight(el)
@@ -577,7 +577,7 @@ class Document:
             if siblings and sum(siblings) > 1000:
                 to_remove = False
                 log.debug("Allowing %s" % describe(el))
-                for desnode in self.tags(el, "table", "ul", "div"):
+                for desnode in self.tags(el, "table", "ul", "div", "section"):
                     allowed[desnode] = True
             if to_remove:
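For orientation, the iteration helpers used by these two cleanup loops, self.tags and self.reverse_tags, are assumed to behave roughly like the sketch below (generators over matching descendants, with reverse_tags visiting them in reverse document order); this is a sketch, not the library's verbatim code:

# Assumed behaviour of the helpers (sketch): tags() yields matching
# descendants of node in document order, reverse_tags() in reverse order
# so inner containers are cleaned before their ancestors.
def tags(node, *tag_names):
    for tag_name in tag_names:
        for e in node.findall('.//%s' % tag_name):
            yield e

def reverse_tags(node, *tag_names):
    for tag_name in tag_names:
        for e in reversed(node.findall('.//%s' % tag_name)):
            yield e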

@@ -14,7 +14,7 @@ if sys.platform == 'darwin':
 setup(
     name="readability-lxml",
-    version="0.6.2",
+    version="0.7",
     author="Yuri Baburov",
     author_email="burchik@gmail.com",
     description="fast html to text parser (article readability tool) with python3 support",
@@ -43,6 +43,5 @@ setup(
         "Programming Language :: Python :: 3.4",
         "Programming Language :: Python :: 3.5",
         "Programming Language :: Python :: 3.6",
     ],
 )
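A minimal post-upgrade smoke test, assuming the 0.7 package is installed in the active environment (for example via pip install -U readability-lxml) and relying only on the Document(...).summary() entry point exercised by the test below:

# Sketch: check the installed version and run the documented entry point.
import pkg_resources
from readability import Document

print(pkg_resources.get_distribution("readability-lxml").version)  # expect "0.7"

html = "<html><body><article><p>Some article text.</p></article></body></html>"
print(Document(html).summary())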

@@ -61,3 +61,34 @@ class TestArticleOnly(unittest.TestCase):
         )
         doc = Document(sample)
         doc.summary()
+
+    def test_correct_cleanup(self):
+        sample = """
+        <html>
+            <body>
+                <section>test section</section>
+                <article class="">
+                    <p>Lot of text here.</p>
+                    <div id="advertisement"><a href="link">Ad</a></div>
+                    <p>More text is written here, and contains punctuation and dots.</p>
+                </article>
+                <aside id="comment1"/>
+                <div id="comment2">
+                    <a href="asd">spam</a>
+                    <a href="asd">spam</a>
+                    <a href="asd">spam</a>
+                </div>
+                <div id="comment3"/>
+                <aside id="comment4">A small comment.</aside>
+                <div id="comment5"><p>The comment is also helpful, but it's
+                    still not the correct item to be extracted.</p>
+                    <p>It's even longer than the article itself!"</p></div>
+            </body>
+        </html>
+        """
+        doc = Document(sample)
+        s = doc.summary()
+        #print(s)
+        assert('punctuation' in s)
+        assert(not 'comment' in s)
+        assert(not 'aside' in s)
