From 6c1c6391e2836c4d65c056a1b54a6ce99d2643b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=89loi=20Rivard?=
Date: Sun, 29 Dec 2019 15:59:33 +0100
Subject: [PATCH] Fixed a few regex warnings

---
 readability/debug.py       |  2 +-
 readability/encoding.py    |  2 +-
 readability/readability.py | 28 ++++++++++++++--------------
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/readability/debug.py b/readability/debug.py
index f14f682..061014d 100644
--- a/readability/debug.py
+++ b/readability/debug.py
@@ -41,7 +41,7 @@ def describe(node, depth=1):
     return parent + describe_node(node)
 
 
-RE_COLLAPSE_WHITESPACES = re.compile('\s+', re.U)
+RE_COLLAPSE_WHITESPACES = re.compile(r'\s+', re.U)
 
 
 def text_content(elem, length=40):
diff --git a/readability/encoding.py b/readability/encoding.py
index cc14320..ff50182 100644
--- a/readability/encoding.py
+++ b/readability/encoding.py
@@ -51,7 +51,7 @@ def get_encoding(page):
 
     # Fallback to chardet if declared encodings fail
     # Remove all HTML tags, and leave only text for chardet
-    text = re.sub(b'(\s*</?[^>]*>)+\s*', b' ', page).strip()
+    text = re.sub(br'(\s*</?[^>]*>)+\s*', b' ', page).strip()
     enc = 'utf-8'
     if len(text) < 10:
         return enc # can't guess
diff --git a/readability/readability.py b/readability/readability.py
index d7caa5b..1c84005 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -23,17 +23,17 @@ from .debug import describe, text_content
 log = logging.getLogger("readability.readability")
 
 REGEXES = {
-    'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
-    'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow', re.I),
-    'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story', re.I),
-    'negativeRe': re.compile('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget', re.I),
-    'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
-    #'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
-    #'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
-    #'trimRe': re.compile('^\s+|\s+$/'),
-    #'normalizeRe': re.compile('\s{2,}/'),
-    #'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
-    'videoRe': re.compile('https?:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
+    'unlikelyCandidatesRe': re.compile(r'combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
+    'okMaybeItsACandidateRe': re.compile(r'and|article|body|column|main|shadow', re.I),
+    'positiveRe': re.compile(r'article|body|content|entry|hentry|main|page|pagination|post|text|blog|story', re.I),
+    'negativeRe': re.compile(r'combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget', re.I),
+    'divToPElementsRe': re.compile(r'<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
+    #'replaceBrsRe': re.compile(r'(<br[^>]*>[ \n\r\t]*){2,}',re.I),
+    #'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I),
+    #'trimRe': re.compile(r'^\s+|\s+$/'),
+    #'normalizeRe': re.compile(r'\s{2,}/'),
+    #'killBreaksRe': re.compile(r'(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
+    'videoRe': re.compile(r'https?:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
     #skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
 }
 
@@ -57,8 +57,8 @@ def clean(text):
 
     # Many spaces make the following regexes run forever
     text = re.sub(r'\s{255,}', ' ' * 255, text)
-    text = re.sub('\s*\n\s*', '\n', text)
-    text = re.sub('\t|[ \t]{2,}', ' ', text)
+    text = re.sub(r'\s*\n\s*', '\n', text)
+    text = re.sub(r'\t|[ \t]{2,}', ' ', text)
     return text.strip()
 
 
@@ -271,7 +271,7 @@ class Document:
                     append = True
                 elif node_length <= 80 \
                     and link_density == 0 \
-                    and re.search('\.( |$)', node_content):
+                    and re.search(r'\.( |$)', node_content):
                     append = True
 
             if append:
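Note on the warnings this patch silences (illustration only, not part of the patch): since CPython 3.6, an escape sequence that Python itself does not recognize in a plain string literal, such as \s or \., emits a DeprecationWarning when the literal is compiled (it became a SyntaxWarning in 3.12). Unknown escapes are passed through unchanged, so adding the r prefix produces the exact same pattern string while avoiding the warning. A minimal standalone sketch, assuming only the standard library; the file name sketch.py is hypothetical:

# sketch.py (hypothetical, standalone): shows the warning that the r'' prefixes remove.
import re
import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # compile() parses the literal at run time, so the warning is raised here,
    # where it can be captured, rather than when this file itself is compiled.
    code = compile(r"'\s+'", "<sketch>", "eval")

# CPython 3.6-3.11 reports a DeprecationWarning for the invalid escape '\s';
# 3.12+ reports a SyntaxWarning instead.
print([(w.category.__name__, str(w.message)) for w in caught])

# Unknown escapes are passed through verbatim, so the resulting pattern string
# is identical with or without the raw-string prefix.
assert eval(code) == r"\s+"
assert re.compile(r"\s+", re.U).pattern == eval(code)

Doubling the backslashes ('\\s+') would silence the warning just as well; the raw-string form used in the patch is simply the usual convention for regular expressions.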