From 0c8f040d5328baccc5bdf5803ce79e88e1f5b7c2 Mon Sep 17 00:00:00 2001
From: Yuri Baburov <burchik@gmail.com>
Date: Mon, 7 May 2018 18:27:06 +0700
Subject: [PATCH] Updated docs for positive_keywords and negative_keywords,
 cleaner implementation.

---
 Makefile                       |  1 +
 readability/compat/__init__.py |  3 +++
 readability/readability.py     | 18 ++++++++++++------
 3 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/Makefile b/Makefile
index 3daf2d1..81a1452 100644
--- a/Makefile
+++ b/Makefile
@@ -31,6 +31,7 @@ clean_venv:
 	rm -rf .venv
 
 develop: .venv/lib/python*/site-packages/readability-lxml.egg-link
+
 .venv/lib/python*/site-packages/readability-lxml.egg-link:
 	$(PY) setup.py develop
 
diff --git a/readability/compat/__init__.py b/readability/compat/__init__.py
index 4d89b0d..d02b65c 100644
--- a/readability/compat/__init__.py
+++ b/readability/compat/__init__.py
@@ -6,6 +6,9 @@ syntax that can only be solved by conditionally importing different functions.
 """
 import sys
 if sys.version_info[0] == 2:
+    bytes_ = str
     str_ = unicode
+    
 elif sys.version_info[0] == 3:
+    bytes_ = bytes
     str_ = str
diff --git a/readability/readability.py b/readability/readability.py
index 12f3d95..2a8a30f 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -16,7 +16,7 @@ from .htmls import build_doc
 from .htmls import get_body
 from .htmls import get_title
 from .htmls import shorten_title
-from .compat import str_
+from .compat import str_, bytes_
 from .debug import describe, text_content
 
 
@@ -67,14 +67,18 @@ regexp_type = type(re.compile('hello, world'))
 def compile_pattern(elements):
     if not elements:
         return None
+    elif isinstance(elements, (str_, bytes_)):
+        if isinstance(elements, bytes_):
+            elements = str_(elements, 'utf-8')
+        elements = elements.split(u',')
+        return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
     elif isinstance(elements, (list, tuple)):
         return list(elements)
     elif isinstance(elements, regexp_type):
         return elements
     else:
+        raise Exception("Unknown format for the pattern")
         # assume string or string like object
-        elements = elements.split(',')
-        return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
 
 class Document:
     """Class to build a etree document out of html."""
@@ -84,16 +88,18 @@ class Document:
         """Generate the document
 
         :param input: string of the html content.
-        :param positive_keywords: regex or list of patterns in classes and ids
-        :param negative_keywords: regex or list of patterns in classes and ids
+        :param positive_keywords: regex, list or comma-separated string of patterns in classes and ids
+        :param negative_keywords: regex, list or comma-separated string of patterns in classes and ids
         :param min_text_length: Tunable. Set to a higher value for more precise detection of longer texts.
         :param retry_length: Tunable. Set to a lower value for better detection of very small texts.
         :param xpath: If set to True, adds x="..." attribute to each HTML node,
         containing xpath path pointing to original document path (allows to
         reconstruct selected summary in original document).
         
-        Example:
+        Examples:
             positive_keywords=["news-item", "block"]
+            positive_keywords="news-item,block"
+            positive_keywords=re.compile("news|block")
             negative_keywords=["mysidebar", "related", "ads"]
 
         The Document class is not re-enterable.