Trim many repeated spaces to make clean() faster

When Readability encounters a long run of repeated whitespace, the cleanup
regexes in clean() take forever to run, so trim any whitespace run down
to 255 characters.

Additionally, test extraction performance with "timeout_decorator".
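
For context (illustrative, not part of the commit): the slowdown comes from
patterns like \s*\n\s* backtracking across an unbroken run of spaces, which is
roughly quadratic in the run length. A minimal sketch of the effect, with
sizes chosen only for demonstration:

    import re
    import time

    for n in (10000, 20000, 40000):
        run = ' ' * n  # all spaces, no '\n', so the pattern never matches
        start = time.monotonic()
        re.sub(r'\s*\n\s*', '\n', run)
        print(n, round(time.monotonic() - start, 3))

    # Each doubling of n roughly quadruples the runtime; at the
    # 1,000,000-character runs exercised by the new test, the
    # substitution effectively never finishes.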
pull/105/head
Linas Valiukas 6 years ago
parent 8235f0794c
commit 747c46abce

@@ -54,6 +54,9 @@ def to_int(x):
def clean(text):
    # Many spaces make the following regexes run forever
    text = re.sub(r'\s{255,}', ' ' * 255, text)
    text = re.sub(r'\s*\n\s*', '\n', text)
    text = re.sub(r'\t|[ \t]{2,}', ' ', text)
    return text.strip()
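
A quick sanity check of the patched clean() on pathological input
(interactive session written for illustration, not taken from the commit):

    >>> clean('foo' + ' ' * 1000000 + '\n  bar')
    'foo bar'

The first substitution caps the million-character whitespace run at 255
spaces, so the two slower normalization regexes only ever see short,
bounded runs.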

@@ -28,6 +28,10 @@ setup(
        lxml_requirement,
        "cssselect"
    ],
    tests_require=[
        # Test timeouts
        "timeout_decorator",
    ],
    classifiers=[
        "Environment :: Web Environment",
        "Intended Audience :: Developers",

@@ -2,6 +2,7 @@ import os
import unittest
from readability import Document
import timeout_decorator
SAMPLES = os.path.join(os.path.dirname(__file__), 'samples')
@@ -92,3 +93,14 @@ class TestArticleOnly(unittest.TestCase):
        assert('punctuation' in s)
        assert(not 'comment' in s)
        assert(not 'aside' in s)

    # Many spaces make some regexes run forever
    @timeout_decorator.timeout(seconds=3, use_signals=False)
    def test_many_repeated_spaces(self):
        long_space = ' ' * 1000000
        sample = '<html><body><p>foo' + long_space + '</p></body></html>'
        doc = Document(sample)
        s = doc.summary()
        assert 'foo' in s
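
Passing use_signals=False makes timeout_decorator enforce the limit by
running the test body in a separate process (via multiprocessing) instead of
relying on SIGALRM, so the 3-second cap also works on platforms without
POSIX signals; the trade-off is that the decorated function's arguments and
return value must be picklable.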
