From f02fe79840b3d52daa155b74ad047eba52f11f32 Mon Sep 17 00:00:00 2001 From: Jerry Charumilind Date: Thu, 21 Jul 2011 09:56:04 -0700 Subject: [PATCH] Checkpoint multi-page readability work Restructured code to better support multi-page readability. Improved tests. Conflicts: src/readability_lxml/readability.py src/tests/regression.py --- src/readability_lxml/readability.py | 3 +- src/tests/regression.py | 1 + test_data/basic-multi-page-3.html | 60 +++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 1 deletion(-) create mode 100644 test_data/basic-multi-page-3.html diff --git a/src/readability_lxml/readability.py b/src/readability_lxml/readability.py index 75a59ec..d6f8365 100755 --- a/src/readability_lxml/readability.py +++ b/src/readability_lxml/readability.py @@ -27,6 +27,7 @@ log = logging.getLogger() REGEXES = { +<<<<<<< HEAD:src/readability_lxml/readability.py 'unlikelyCandidatesRe': re.compile( ('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|' 'shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|' @@ -46,7 +47,7 @@ REGEXES = { 'divToPElementsRe': re.compile( '<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I), # Match: next, continue, >, >>, but not >|, as those usually mean last. - 'nextLink': re.compile(r'(next|weiter|continue|>[^\|]|$)', re.I), + 'nextLink': re.compile(r'(next|weiter|continue|>[^\|]$)', re.I), # Match: next, continue, >, >>, but not >|, as those usually mean last. 'prevLink': re.compile(r'(prev|earl|old|new|<)', re.I), 'page': re.compile(r'pag(e|ing|inat)', re.I), 'firstLast': re.compile(r'(first|last)', re.I) diff --git a/src/tests/regression.py b/src/tests/regression.py index b95505f..5c6555c 100644 --- a/src/tests/regression.py +++ b/src/tests/regression.py @@ -17,6 +17,7 @@ import os.path import re import sys import unittest +import readability.urlfetch import yaml from lxml.html import builder as B diff --git a/test_data/basic-multi-page-3.html b/test_data/basic-multi-page-3.html new file mode 100644 index 0000000..7e4cad8 --- /dev/null +++ b/test_data/basic-multi-page-3.html @@ -0,0 +1,60 @@ + + + + A Simple Multi-Page Article For Testing : Page 3 + + +

A Simple Multi-Page Article For Testing : Page 3

+

+ Nullam laoreet, nibh non faucibus dictum, tellus libero varius + erat, lobortis varius est massa quis metus. Donec vitae justo + lacus, nec convallis metus. Suspendisse potenti. Nunc et rutrum + justo. Maecenas ultrices ipsum in magna fermentum eleifend. Fusce + sagittis pretium aliquam. Vestibulum et gravida lorem. Sed turpis + quam, placerat ac ultrices eu, tempor sit amet elit. Curabitur eu + imperdiet velit. Quisque pharetra ornare nunc, a volutpat metus + aliquam quis. Vivamus semper aliquam cursus. Nullam ac nibh nulla, + luctus pharetra nunc. Etiam ut sapien sem. Fusce vehicula, sem sit + amet viverra pretium, magna tortor suscipit nisi, id interdum lorem + orci in tellus. Vivamus vel ipsum eros. Fusce porttitor convallis + ultricies. Etiam in risus diam, viverra suscipit felis. Duis vitae + imperdiet est. +

+

+ Nunc nunc magna, facilisis blandit venenatis ut, scelerisque ac + tortor. Cras condimentum fermentum lectus ac convallis. Suspendisse + cursus, lacus sit amet sodales molestie, dui erat varius velit, non + tincidunt metus dui sed nulla. Aliquam lacus orci, convallis ut + pellentesque ac, molestie et dolor. Ut pretium enim ut nunc auctor + eget placerat magna luctus. Duis mollis ligula a orci ultrices in + facilisis felis feugiat. Morbi eget odio eget erat pulvinar + placerat sed nec erat. Duis dignissim, dolor a lacinia commodo, + metus erat laoreet dui, in lacinia felis lacus vitae nulla. Fusce + imperdiet condimentum volutpat. Vivamus ut lacus a eros cursus + scelerisque non sit amet orci. Phasellus id quam odio. Nulla + adipiscing venenatis lorem nec feugiat. Aenean sit amet nisl odio, + tincidunt scelerisque nisl. Curabitur ut nisl a dui facilisis + vulputate. Mauris eu elit et felis hendrerit blandit. Cras magna + dolor, imperdiet eget rutrum tempus, euismod nec augue. +

+

+ Ut in sem sit amet felis scelerisque elementum. Suspendisse vitae + neque magna, in laoreet felis. Aenean elit ligula, tempor in + vestibulum ac, porttitor nec lacus. Aenean urna mi, dictum feugiat + placerat eget, congue nec dolor. Etiam pellentesque dictum nulla id + vulputate. Etiam sit amet vehicula purus. Integer quis mi nisl, + gravida malesuada enim. Donec malesuada felis nisi. Etiam id magna + a libero pulvinar ullamcorper in nec neque. Duis pulvinar massa nec + magna scelerisque vitae vulputate ipsum luctus. +

+ + +