diff --git a/src/readability_lxml/readability.py b/src/readability_lxml/readability.py
index 5a19374..61a8c86 100755
--- a/src/readability_lxml/readability.py
+++ b/src/readability_lxml/readability.py
@@ -22,18 +22,28 @@ log = logging.getLogger()
REGEXES = {
- 'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
- 'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow', re.I),
- 'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story', re.I),
- 'negativeRe': re.compile('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget', re.I),
- 'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
+ 'unlikelyCandidatesRe': re.compile(
+ ('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|'
+ 'shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|'
+ 'tweet|twitter'), re.I),
+ 'okMaybeItsACandidateRe': re.compile(
+ 'and|article|body|column|main|shadow', re.I),
+ 'positiveRe': re.compile(
+ ('article|body|content|entry|hentry|main|page|pagination|post|text|'
+ 'blog|story'), re.I),
+ 'negativeRe': re.compile(
+ ('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|'
+ 'outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|'
+ 'tool|widget'), re.I),
+ 'divToPElementsRe': re.compile(
+ '<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
#'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
#'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
#'trimRe': re.compile('^\s+|\s+$/'),
#'normalizeRe': re.compile('\s{2,}/'),
#'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
#'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
- #skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
+ #skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
}
@@ -132,8 +142,8 @@ class Document:
def summary(self, enclose_with_html_tag=False):
"""Generate the summary of the html docuemnt
- :param enclose_with_html_tag: return only the div of the document, don't wrap
- in html and body tags.
+ :param enclose_with_html_tag: return only the div of the document,
+ don't wrap in html and body tags.
"""
try:
@@ -187,7 +197,8 @@ class Document:
log.exception('error getting summary: ')
raise Unparseable(str(e)), None, sys.exc_info()[2]
- def get_article(self, candidates, best_candidate, enclose_with_html_tag=False):
+ def get_article(self, candidates, best_candidate,
+ enclose_with_html_tag=False):
# Now that we have the top candidate, look through its siblings for
# content that might also be related.
# Things like preambles, content split by ads that we removed, etc.
@@ -235,7 +246,9 @@ class Document:
return output
def select_best_candidate(self, candidates):
- sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
+ sorted_candidates = sorted(candidates.values(),
+ key=lambda x: x['content_score'],
+ reverse=True)
for candidate in sorted_candidates[:5]:
elem = candidate['elem']
self.debug("Top 5 : %6.3f %s" % (
@@ -466,7 +479,8 @@ class Document:
reason = "less than 3x <p>s than <a>s"
to_remove = True
elif content_length < (MIN_LEN) and (counts["img"] == 0 or counts["img"] > 2):
- reason = "too short content length %s without a single image" % content_length
+ reason = ('too short content length %s without a single'
+ ' image') % content_length
to_remove = True
elif weight < 25 and link_density > 0.2:
reason = "too many links %.3f for its weight %s" % (
@@ -477,36 +491,26 @@ class Document:
link_density, weight)
to_remove = True
elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
- reason = "