You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
53 lines
1.2 KiB
Python
53 lines
1.2 KiB
Python
# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
|
|
import re
|
|
from lxml.html.clean import Cleaner
|
|
|
|
# Regex fragments naming the attributes to strip from tags.  The last entry
# targets event-handler attributes such as ``onclick``; it was previously the
# glob-style "on*", which as a regex only means "o followed by any number of
# n" and so never matched a real handler name.
bad_attrs = [
    "width",
    "height",
    "style",
    "[-a-z]*color",
    "background[-a-z]*",
    "on[a-z]+",
]

# The three spellings an attribute value may take in the markup.
single_quoted = "'[^']+'"
double_quoted = '"[^"]+"'
non_space = "[^ \"'>]+"

# Matches one whole tag containing an unwanted attribute.  Group 1 captures
# the part of the tag before the attribute and group 2 the part after its
# value, so substituting groups 1 and 2 back drops exactly that attribute.
# Matching is case-insensitive (re.I).
htmlstrip = re.compile(
    "<"  # open
    "([^>]+) "  # prefix
    "(?:%s) *" % ("|".join(bad_attrs),)  # undesirable attributes
    + "= *(?:%s|%s|%s)" % (non_space, single_quoted, double_quoted)  # value
    + "([^>]*)"  # postfix
    ">",  # end
    re.I,
)
|
|
|
|
|
|
def clean_attributes(html):
    """Return *html* with every attribute matched by ``htmlstrip`` removed.

    A single substitution pass drops at most one attribute per tag, because
    the pattern consumes the whole tag.  The pass is therefore repeated until
    it no longer replaces anything.
    """
    while True:
        html, replaced = htmlstrip.subn("<\\1\\2>", html)
        if not replaced:
            return html
|
|
|
|
|
|
def normalize_spaces(s):
    """Replace any sequence of whitespace characters with a single space.

    Leading and trailing whitespace is dropped as well.  A falsy argument
    (``None`` or ``""``) yields the empty string.
    """
    if not s:
        return ""
    return " ".join(s.split())
|
|
|
|
|
|
# Module-level lxml Cleaner instance; every option is given explicitly.
html_cleaner = Cleaner(
    # Options switched on.
    comments=True,
    javascript=True,
    links=True,
    processing_instructions=True,
    scripts=True,
    style=True,
    # Options switched off.
    add_nofollow=False,
    annoying_tags=False,
    embedded=False,
    forms=False,
    frames=False,
    meta=False,
    page_structure=False,
    remove_unknown_tags=False,
    safe_attrs_only=False,
    # remove_tags is left unset (None).
    remove_tags=None,
)
|