From 747c46abce4cf4307cde626e628c47e8eb262cd3 Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Wed, 26 Sep 2018 08:26:08 +0300 Subject: [PATCH 1/2] Trim many repeated spaces to make clean() faster When Readability encounters many repeated whitespace characters, the cleanup regexes in clean() take forever to run, so trim the amount of whitespace to 255 characters. Additionally, test the extraction performance with "timeout_decorator". --- readability/readability.py | 3 +++ setup.py | 4 ++++ tests/test_article_only.py | 12 ++++++++++++ 3 files changed, 19 insertions(+) diff --git a/readability/readability.py b/readability/readability.py index 54874ac..91f8a94 100755 --- a/readability/readability.py +++ b/readability/readability.py @@ -54,6 +54,9 @@ def to_int(x): def clean(text): + # Many spaces make the following regexes run forever + text = re.sub(r'\s{255,}', ' ' * 255, text) + text = re.sub('\s*\n\s*', '\n', text) text = re.sub('\t|[ \t]{2,}', ' ', text) return text.strip() diff --git a/setup.py b/setup.py index 09744b8..9672aff 100755 --- a/setup.py +++ b/setup.py @@ -28,6 +28,10 @@ setup( lxml_requirement, "cssselect" ], + tests_require=[ + # Test timeouts + "timeout_decorator", + ], classifiers=[ "Environment :: Web Environment", "Intended Audience :: Developers", diff --git a/tests/test_article_only.py b/tests/test_article_only.py index 87e623c..e545025 100644 --- a/tests/test_article_only.py +++ b/tests/test_article_only.py @@ -2,6 +2,7 @@ import os import unittest from readability import Document +import timeout_decorator SAMPLES = os.path.join(os.path.dirname(__file__), 'samples') @@ -92,3 +93,14 @@ class TestArticleOnly(unittest.TestCase): assert('punctuation' in s) assert(not 'comment' in s) assert(not 'aside' in s) + + # Many spaces make some regexes run forever + @timeout_decorator.timeout(seconds=3, use_signals=False) + def test_many_repeated_spaces(self): + long_space = ' ' * 1000000 + sample = '

foo' + long_space + '

' + + doc = Document(sample) + s = doc.summary() + + assert 'foo' in s From 2bbb70b3e5957ff9d224aa84493b560e0efd802c Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Wed, 26 Sep 2018 08:42:11 +0300 Subject: [PATCH 2/2] Fix Travis build Add "test" extra and install dependencies for said extra as detailed in: https://stackoverflow.com/a/41398850/200603 --- .travis.yml | 2 +- setup.py | 14 ++++++++++---- tox.ini | 2 +- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8cbcf71..bd0fd94 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,7 +15,7 @@ before_install: install: - travis_retry pip install -U pip wheel tox - - travis_retry pip install -U -r requirements.txt -e . + - travis_retry pip install -U -r requirements.txt -e ".[test]" script: - tox -e $TOX_ENV diff --git a/setup.py b/setup.py index 9672aff..e845c34 100755 --- a/setup.py +++ b/setup.py @@ -12,6 +12,14 @@ if sys.platform == 'darwin': print("Using lxml<2.4") lxml_requirement = "lxml<2.4" +test_deps = [ + # Test timeouts + "timeout_decorator", +] +extras = { + 'test': test_deps, +} + setup( name="readability-lxml", version="0.7", @@ -28,10 +36,8 @@ setup( lxml_requirement, "cssselect" ], - tests_require=[ - # Test timeouts - "timeout_decorator", - ], + tests_require=test_deps, + extras_require=extras, classifiers=[ "Environment :: Web Environment", "Intended Audience :: Developers", diff --git a/tox.ini b/tox.ini index 89239db..9296cc4 100644 --- a/tox.ini +++ b/tox.ini @@ -16,5 +16,5 @@ deps=pytest # $PYTHONDIR\Scripts\pip.exe install *.whl sitepackages=True commands = - pip install -r requirements.txt + pip install -r requirements.txt -e ".[test]" py.test