From 0c8f040d5328baccc5bdf5803ce79e88e1f5b7c2 Mon Sep 17 00:00:00 2001 From: Yuri Baburov Date: Mon, 7 May 2018 18:27:06 +0700 Subject: [PATCH] Updated docs for positive_keywords and negative_keywords, cleaner implementation. --- Makefile | 1 + readability/compat/__init__.py | 3 +++ readability/readability.py | 18 ++++++++++++------ 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 3daf2d1..81a1452 100644 --- a/Makefile +++ b/Makefile @@ -31,6 +31,7 @@ clean_venv: rm -rf .venv develop: .venv/lib/python*/site-packages/readability-lxml.egg-link + .venv/lib/python*/site-packages/readability-lxml.egg-link: $(PY) setup.py develop diff --git a/readability/compat/__init__.py b/readability/compat/__init__.py index 4d89b0d..d02b65c 100644 --- a/readability/compat/__init__.py +++ b/readability/compat/__init__.py @@ -6,6 +6,9 @@ syntax that can only be solved by conditionally importing different functions. """ import sys if sys.version_info[0] == 2: + bytes_ = str str_ = unicode + elif sys.version_info[0] == 3: + bytes_ = bytes str_ = str diff --git a/readability/readability.py b/readability/readability.py index 12f3d95..2a8a30f 100755 --- a/readability/readability.py +++ b/readability/readability.py @@ -16,7 +16,7 @@ from .htmls import build_doc from .htmls import get_body from .htmls import get_title from .htmls import shorten_title -from .compat import str_ +from .compat import str_, bytes_ from .debug import describe, text_content @@ -67,14 +67,18 @@ regexp_type = type(re.compile('hello, world')) def compile_pattern(elements): if not elements: return None + elif isinstance(elements, (str_, bytes_)): + if isinstance(elements, bytes_): + elements = str_(elements, 'utf-8') + elements = elements.split(u',') + return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U) elif isinstance(elements, (list, tuple)): return list(elements) elif isinstance(elements, regexp_type): return elements else: + raise Exception("Unknown 
format for the pattern") # assume string or string like object - elements = elements.split(',') - return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U) class Document: """Class to build a etree document out of html.""" @@ -84,16 +88,18 @@ class Document: """Generate the document :param input: string of the html content. - :param positive_keywords: regex or list of patterns in classes and ids - :param negative_keywords: regex or list of patterns in classes and ids + :param positive_keywords: regex, list or comma-separated string of patterns in classes and ids + :param negative_keywords: regex, list or comma-separated string of patterns in classes and ids :param min_text_length: Tunable. Set to a higher value for more precise detection of longer texts. :param retry_length: Tunable. Set to a lower value for better detection of very small texts. :param xpath: If set to True, adds x="..." attribute to each HTML node, containing xpath path pointing to original document path (allows to reconstruct selected summary in original document). - Example: + Examples: positive_keywords=["news-item", "block"] + positive_keywords="news-item,block" + positive_keywords=re.compile("news|block") negative_keywords=["mysidebar", "related", "ads"] The Document class is not re-enterable.