Updated docs for positive_keywords and negative_keywords, cleaner implementation.

6 years ago · 0c8f040d53
parent 0e50b53d05
commit 0c8f040d53
3 changed files with 16 additions and 6 deletions
--- a/1
+++ b/1
@@ -31,6 +31,7 @@ clean_venv:
 	rm -rf .venv

 develop: .venv/lib/python*/site-packages/readability-lxml.egg-link
+
 .venv/lib/python*/site-packages/readability-lxml.egg-link:
 	$(PY) setup.py develop

--- a/readability/compat/__init__.py
+++ b/readability/compat/__init__.py
@@ -6,6 +6,9 @@ syntax that can only be solved by conditionally importing different functions.
 """
 import sys
 if sys.version_info[0] == 2:
+    bytes_ = str
    str_ = unicode
+    
 elif sys.version_info[0] == 3:
+    bytes_ = bytes
    str_ = str
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -16,7 +16,7 @@ from .htmls import build_doc
 from .htmls import get_body
 from .htmls import get_title
 from .htmls import shorten_title
-from .compat import str_
+from .compat import str_, bytes_
 from .debug import describe, text_content


@@ -67,14 +67,18 @@ regexp_type = type(re.compile('hello, world'))
 def compile_pattern(elements):
    if not elements:
        return None
+    elif isinstance(elements, (str_, bytes_)):
+        if isinstance(elements, bytes_):
+            elements = str_(elements, 'utf-8')
+        elements = elements.split(u',')
+        return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
    elif isinstance(elements, (list, tuple)):
        return list(elements)
    elif isinstance(elements, regexp_type):
        return elements
    else:
+        raise Exception("Unknown format for the pattern")
        # assume string or string like object
-        elements = elements.split(',')
-        return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)

 class Document:
    """Class to build a etree document out of html."""
@@ -84,16 +88,18 @@ class Document:
        """Generate the document

        :param input: string of the html content.
-        :param positive_keywords: regex or list of patterns in classes and ids
-        :param negative_keywords: regex or list of patterns in classes and ids
+        :param positive_keywords: regex, list or comma-separated string of patterns in classes and ids
+        :param negative_keywords: regex, list or comma-separated string of patterns in classes and ids
        :param min_text_length: Tunable. Set to a higher value for more precise detection of longer texts.
        :param retry_length: Tunable. Set to a lower value for better detection of very small texts.
        :param xpath: If set to True, adds x="..." attribute to each HTML node,
        containing xpath path pointing to original document path (allows to
        reconstruct selected summary in original document).
        
-        Example:
+        Examples:
            positive_keywords=["news-item", "block"]
+            positive_keywords="news-item,block"
+            positive_keywords=re.compile("news|block")
            negative_keywords=["mysidebar", "related", "ads"]

        The Document class is not re-enterable.