You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
python-readability/src/readability_lxml/cleaners.py

39 lines
1.2 KiB
Python

# strip out a set of nuisance html attributes that can mess up rendering in
# RSS feeds
import re
from lxml.html.clean import Cleaner
bad_attrs = ['width', 'height', 'style', '[-a-z]*color',
'background[-a-z]*', 'on*']
single_quoted = "'[^']+'"
double_quoted = '"[^"]+"'
non_space = '[^ "\'>]+'
htmlstrip = re.compile("<" # open
"([^>]+) " # prefix
"(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
'= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
"([^>]*)" # postfix
">", # end
re.I)
def clean_attributes(html):
while htmlstrip.search(html):
html = htmlstrip.sub('<\\1\\2>', html)
return html
def normalize_spaces(s):
"""replace any sequence of whitespace characters with a single space"""
if not s:
return ''
return ' '.join(s.split())
html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
style=True, links=True, meta=False, add_nofollow=False,
page_structure=False, processing_instructions=True,
embedded=False, frames=False, forms=False,
annoying_tags=False, remove_tags=None,
remove_unknown_tags=False, safe_attrs_only=False)