Updated docs for positive_keywords and negative_keywords, cleaner implementation.

pull/74/merge
Yuri Baburov 6 years ago
parent 0e50b53d05
commit 0c8f040d53

@ -31,6 +31,7 @@ clean_venv:
rm -rf .venv rm -rf .venv
develop: .venv/lib/python*/site-packages/readability-lxml.egg-link develop: .venv/lib/python*/site-packages/readability-lxml.egg-link
.venv/lib/python*/site-packages/readability-lxml.egg-link: .venv/lib/python*/site-packages/readability-lxml.egg-link:
$(PY) setup.py develop $(PY) setup.py develop

@ -6,6 +6,9 @@ syntax that can only be solved by conditionally importing different functions.
""" """
import sys import sys
if sys.version_info[0] == 2: if sys.version_info[0] == 2:
bytes_ = str
str_ = unicode str_ = unicode
elif sys.version_info[0] == 3: elif sys.version_info[0] == 3:
bytes_ = bytes
str_ = str str_ = str

@ -16,7 +16,7 @@ from .htmls import build_doc
from .htmls import get_body from .htmls import get_body
from .htmls import get_title from .htmls import get_title
from .htmls import shorten_title from .htmls import shorten_title
from .compat import str_ from .compat import str_, bytes_
from .debug import describe, text_content from .debug import describe, text_content
@ -67,14 +67,18 @@ regexp_type = type(re.compile('hello, world'))
def compile_pattern(elements): def compile_pattern(elements):
if not elements: if not elements:
return None return None
elif isinstance(elements, (str_, bytes_)):
if isinstance(elements, bytes_):
elements = str_(elements, 'utf-8')
elements = elements.split(u',')
return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
elif isinstance(elements, (list, tuple)): elif isinstance(elements, (list, tuple)):
return list(elements) return list(elements)
elif isinstance(elements, regexp_type): elif isinstance(elements, regexp_type):
return elements return elements
else: else:
raise Exception("Unknown format for the pattern")
# assume string or string like object # assume string or string like object
elements = elements.split(',')
return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
class Document: class Document:
"""Class to build a etree document out of html.""" """Class to build a etree document out of html."""
@ -84,16 +88,18 @@ class Document:
"""Generate the document """Generate the document
:param input: string of the html content. :param input: string of the html content.
:param positive_keywords: regex or list of patterns in classes and ids :param positive_keywords: regex, list or comma-separated string of patterns in classes and ids
:param negative_keywords: regex or list of patterns in classes and ids :param negative_keywords: regex, list or comma-separated string of patterns in classes and ids
:param min_text_length: Tunable. Set to a higher value for more precise detection of longer texts. :param min_text_length: Tunable. Set to a higher value for more precise detection of longer texts.
:param retry_length: Tunable. Set to a lower value for better detection of very small texts. :param retry_length: Tunable. Set to a lower value for better detection of very small texts.
:param xpath: If set to True, adds x="..." attribute to each HTML node, :param xpath: If set to True, adds x="..." attribute to each HTML node,
containing xpath path pointing to original document path (allows to containing xpath path pointing to original document path (allows to
reconstruct selected summary in original document). reconstruct selected summary in original document).
Example: Examples:
positive_keywords=["news-item", "block"] positive_keywords=["news-item", "block"]
positive_keywords="news-item,block"
positive_keywords=re.compile("news|block")
negative_keywords=["mysidebar", "related", "ads"] negative_keywords=["mysidebar", "related", "ads"]
The Document class is not re-enterable. The Document class is not re-enterable.

Loading…
Cancel
Save