|
|
|
@ -23,17 +23,17 @@ from .debug import describe, text_content
|
|
|
|
|
# Module-level logger. The name is hard-coded to "readability.readability"
# rather than derived from __name__, so logging configuration that targets
# that exact logger name keeps applying.
log = logging.getLogger("readability.readability")
|
|
|
|
|
|
|
|
|
|
# Pre-compiled, case-insensitive patterns, looked up by key.
#
# Every pattern is a raw string so that regex escapes such as ``\.`` and
# ``\/`` reach the regex engine verbatim. In a non-raw literal they are
# invalid string escapes (DeprecationWarning since Python 3.6,
# SyntaxWarning since 3.12). Each key is defined exactly once: a duplicate
# key in a dict literal silently overrides the earlier value.
REGEXES = {
    # Matched against an element's class/id text (presumably; confirm at the
    # call sites, which are outside this block).
    'unlikelyCandidatesRe': re.compile(r'combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
    'okMaybeItsACandidateRe': re.compile(r'and|article|body|column|main|shadow', re.I),
    'positiveRe': re.compile(r'article|body|content|entry|hentry|main|page|pagination|post|text|blog|story', re.I),
    'negativeRe': re.compile(r'combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget', re.I),
    # Opening tag of an element listed here; group 1 captures the tag name.
    'divToPElementsRe': re.compile(r'<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
    # Disabled entries, kept for reference. The trailing "/" in several of
    # them and the /.../i literal at the end look like leftovers from a
    # JavaScript regex table -- NOTE(review): confirm before re-enabling.
    #'replaceBrsRe': re.compile(r'(<br[^>]*>[ \n\r\t]*){2,}',re.I),
    #'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I),
    #'trimRe': re.compile(r'^\s+|\s+$/'),
    #'normalizeRe': re.compile(r'\s{2,}/'),
    #'killBreaksRe': re.compile(r'(<br\s*\/?>(\s| ?)*){1,}/'),
    # http(s) URLs on youtube.com or vimeo.com, with an optional "www.".
    'videoRe': re.compile(r'https?:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
    #skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
}
|
|
|
|
|
|
|
|
|
@ -57,8 +57,8 @@ def clean(text):
|
|
|
|
|
# Many spaces make the following regexes run forever
|
|
|
|
|
text = re.sub(r'\s{255,}', ' ' * 255, text)
|
|
|
|
|
|
|
|
|
|
text = re.sub('\s*\n\s*', '\n', text)
|
|
|
|
|
text = re.sub('\t|[ \t]{2,}', ' ', text)
|
|
|
|
|
text = re.sub(r'\s*\n\s*', '\n', text)
|
|
|
|
|
text = re.sub(r'\t|[ \t]{2,}', ' ', text)
|
|
|
|
|
return text.strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -271,7 +271,7 @@ class Document:
|
|
|
|
|
append = True
|
|
|
|
|
elif node_length <= 80 \
|
|
|
|
|
and link_density == 0 \
|
|
|
|
|
and re.search('\.( |$)', node_content):
|
|
|
|
|
and re.search(r'\.( |$)', node_content):
|
|
|
|
|
append = True
|
|
|
|
|
|
|
|
|
|
if append:
|
|
|
|
|