Merge pull request #105 from pypt/many_repeated_spaces_timeout

Trim many repeated spaces to make clean() faster
pull/109/head
Yuri Baburov 6 years ago committed by GitHub
commit 59b99ffa0b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -24,7 +24,7 @@ before_install:
install:
- travis_retry pip install -U pip wheel tox
- travis_retry pip install -U -r requirements.txt -e .
- travis_retry pip install -U -r requirements.txt -e ".[test]"
script:
- tox -e $TOX_ENV

@ -54,6 +54,9 @@ def to_int(x):
def clean(text):
    """Normalize the whitespace in ``text`` and strip both ends.

    The steps run in order:

    1. Any run of 255 or more whitespace characters is replaced by exactly
       255 spaces, so that the later patterns never see a huge run.
    2. Each newline, together with the whitespace around it, becomes a
       single newline.
    3. A tab, or a run of two or more spaces/tabs, becomes a single space.

    :param text: the string to normalize.
    :returns: the normalized string with leading/trailing whitespace removed.
    """
    # Many spaces make the following regexes run forever
    text = re.sub(r'\s{255,}', ' ' * 255, text)
    # Raw strings keep ``\s`` from being parsed as an (invalid) string
    # escape; the compiled patterns are unchanged.
    text = re.sub(r'\s*\n\s*', '\n', text)
    # The ``\t`` alternative is tried first, so a tab that starts a match is
    # replaced on its own rather than as part of a longer run.
    text = re.sub(r'\t|[ \t]{2,}', ' ', text)
    return text.strip()

@ -16,6 +16,14 @@ if sys.platform == 'darwin':
print("Using lxml<2.4")
lxml_requirement = "lxml<2.4"
# Packages needed only by the test suite; timeout_decorator provides the
# test timeouts.
test_deps = ["timeout_decorator"]

# Optional dependency groups, keyed by the name of the extra.
extras = dict(test=test_deps)
# Adapted from https://github.com/pypa/pip/blob/master/setup.py
def find_version(*file_paths):
@ -35,7 +43,6 @@ def find_version(*file_paths):
raise RuntimeError("Unable to find version string.")
setup(
name="readability-lxml",
version=find_version("readability", "__init__.py"),
@ -52,6 +59,8 @@ setup(
lxml_requirement,
"cssselect"
],
tests_require=test_deps,
extras_require=extras,
classifiers=[
"Environment :: Web Environment",
"Intended Audience :: Developers",

@ -2,6 +2,7 @@ import os
import unittest
from readability import Document
import timeout_decorator
SAMPLES = os.path.join(os.path.dirname(__file__), 'samples')
@ -94,3 +95,14 @@ class TestArticleOnly(unittest.TestCase):
assert('punctuation' in s)
assert(not 'comment' in s)
assert(not 'aside' in s)
# Many spaces make some regexes run forever
@timeout_decorator.timeout(seconds=3, use_signals=False)
def test_many_repeated_spaces(self):
    # A paragraph whose text is followed by one million spaces; the
    # decorator above fails the test if processing exceeds three seconds.
    padding = ' ' * 1000000
    html = ''.join(['<html><body><p>foo', padding, '</p></body></html>'])
    summary = Document(html).summary()
    # The real content must still be present in the summary.
    assert 'foo' in summary

@ -16,5 +16,5 @@ deps=pytest
# $PYTHONDIR\Scripts\pip.exe install *.whl
sitepackages=True
commands =
pip install -r requirements.txt
pip install -r requirements.txt -e ".[test]"
py.test

Loading…
Cancel
Save