use a more lenient parser

pull/1/head
gfxmonk 14 years ago
parent ad3d52ade4
commit c0ca60ee26

File diff suppressed because it is too large

page_parser.py
@@ -0,0 +1,145 @@
import re
from url_helpers import absolute_url
from BeautifulSoup import BeautifulSoup, HTMLParseError, UnicodeDammit
from logging import error

__all__ = [
    'Unparseable',
    'parse',
    'get_title',
    'get_body',
    'ascii']

def debug(s): pass

class Unparseable(ValueError):
    pass

def parse(raw_content, base_href=None, notify=lambda x: None):
    for parse_method in _parse_methods():
        try:
            return parse_method(raw_content, base_href)
        except HTMLParseError, e:
            notify("parsing (%s) failed: %s" % (parse_method.__name__, e))
            continue
    raise Unparseable()

def get_title(soup):
    title = unicode(getattr(soup.title, 'string', ''))
    if not title:
        return None
    return normalize_spaces(title)

def get_body(soup):
    [ elem.extract() for elem in soup.findAll(['script', 'link', 'style']) ]
    raw_html = unicode(soup.body or soup)
    cleaned = clean_attributes(raw_html)
    try:
        BeautifulSoup(cleaned)
        return cleaned
    except HTMLParseError:
        error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
        return raw_html

def ascii(s):
    return s.decode('ascii', 'ignore')

class Replacement(object):
    def __init__(self, desc, regex, replacement):
        self.desc = desc
        self.regex = regex
        self.replacement = replacement

    def apply(self, content):
        # # useful for debugging:
        # try:
        #     print self.desc + ':' + str(self.regex.findall(content))
        # except RuntimeError: pass
        return self.regex.sub(self.replacement, content)

def beautiful_soup(content, base_href):
    soup = BeautifulSoup(content)
    if base_href:
        _fix_references(soup, base_href)
    return soup

def _make_absolute_links(soup, base_href):
    for link in soup.findAll('a', attrs={'href':True}):
        link['href'] = absolute_url(link['href'], base_href)

def _make_absolute_images(soup, base_href):
    for img in soup.findAll('img', attrs={'src':True}):
        img['src'] = absolute_url(img['src'], base_href)

def _fix_references(soup, base_href):
    _make_absolute_links(soup, base_href)
    _make_absolute_images(soup, base_href)

# a bunch of regexes to hack around lousy html
dodgy_regexes = (
    Replacement('javascript',
        regex=re.compile('<script.*?</script[^>]*>', re.DOTALL | re.IGNORECASE),
        replacement=''),

    Replacement('double double-quoted attributes',
        regex=re.compile('(="[^"]+")"+'),
        replacement='\\1'),

    Replacement('unclosed tags',
        regex = re.compile('(<[a-zA-Z]+[^>]*)(<[a-zA-Z]+[^<>]*>)'),
        replacement='\\1>\\2'),

    Replacement('unclosed (numerical) attribute values',
        regex = re.compile('(<[^>]*[a-zA-Z]+\s*=\s*"[0-9]+)( [a-zA-Z]+="\w+"|/?>)'),
        replacement='\\1"\\2'),
    )

# helpers for parsing

def normalize_spaces(s):
    """replace any sequence of whitespace
    characters with a single space"""
    return ' '.join(s.split())

def _remove_crufty_html(content):
    for replacement in dodgy_regexes:
        content = replacement.apply(content)
    return content

def _parse_methods():
    def unicode_cleansed(content, base_href):
        content = UnicodeDammit(content, isHTML=True).markup
        cleaned = _remove_crufty_html(content)
        debug("Cleaned content: %s" % (cleaned,))
        return beautiful_soup(cleaned, base_href)

    def ascii_cleansed(content, base_href):
        content = ascii(content)
        cleaned = _remove_crufty_html(content)
        debug("Cleaned content: %s" % (cleaned,))
        return beautiful_soup(cleaned, base_href)

    return (
        beautiful_soup,
        unicode_cleansed,
        ascii_cleansed)

# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
bad_attrs = ['width','height','style','[-a-z]*color','background[-a-z]*']
single_quoted = "'[^']+'"
double_quoted = '"[^"]+"'
non_space = '[^ "\'>]+'
htmlstrip = re.compile("<" # open
    "([^>]+) " # prefix
    "(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
    '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
    "([^>]*)" # postfix
    ">" # end
    , re.I)

def clean_attributes(html):
    while htmlstrip.search(html):
        html = htmlstrip.sub('<\\1\\2>', html)
    return html
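
Illustrative only, not part of the commit: a minimal sketch of the lenient fallback above, assuming the new module is importable as page_parser (it is imported that way in the second file of this diff) and runs under the same Python 2 / BeautifulSoup 3 environment. parse() tries each method from _parse_methods() in turn, so markup that fails the plain pass can still come back as a soup after the regex cleanup; get_title() and get_body() then normalize the result.

# Illustrative sketch -- not part of this commit.
from page_parser import parse, get_title, get_body, Unparseable

lousy_html = '''<html><head><title>Example   page</title></head>
<body><p width="200" style="color:red">hello <a href="foo">link</a></p>
<script>alert("junk")</script></body></html>'''

try:
    soup = parse(lousy_html, base_href='http://example.com/articles/1',
                 notify=lambda msg: None)  # ignore per-method failure notices
    print get_title(soup)  # whitespace collapsed: u'Example page'
    print get_body(soup)   # script removed, width/style stripped, href made absolute
except Unparseable:
    print 'every parse method failed'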

@@ -1,5 +1,6 @@
 #!/usr/bin/env python
-from BeautifulSoup import BeautifulSoup, NavigableString
+from BeautifulSoup import NavigableString
+from page_parser import parse
 import re

 REGEXES = { 'unlikelyCandidatesRe': re.compile('combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor',re.I),

@@ -34,8 +35,7 @@ class Document:
         self.make_html()

     def make_html(self):
-        self.html = BeautifulSoup(self.input)
+        self.html = parse(self.input, self.options['url'])

     def content(self, remove_unlikely_candidates = True):
         def remove(tag): [i.extract() for i in self.html.findAll(tag)]

@@ -60,7 +60,7 @@ class Document:
         # Things like preambles, content split by ads that we removed, etc.
         sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])

-        output = BeautifulSoup("<div/>")
+        output = parse("<div/>")
         for sibling in best_candidate['elem'].parent.contents:
             if isinstance(sibling, NavigableString): continue
             append = False
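
A worked look at the sibling threshold in the last hunk above (illustrative, not part of the commit): the variable name suggests that siblings of the best candidate are only appended to the output when their score clears a floor of max(10, 20% of the best candidate's score), though the comparison itself falls outside the lines shown.

# Illustrative sketch -- same formula as sibling_score_threshold in the hunk above.
def sibling_score_threshold(best_score):
    return max([10, best_score * 0.2])

print sibling_score_threshold(30)   # -> 10 (the fixed floor dominates)
print sibling_score_threshold(200)  # -> 40.0 (20% of a high-scoring candidate)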

url_helpers.py
@@ -0,0 +1,52 @@
import logging
from urlparse import urlparse

def host_for_url(url):
    """
    >>> host_for_url('http://base/whatever/fdsh')
    'base'
    >>> host_for_url('invalid')
    """
    host = urlparse(url)[1]
    if not host:
        logging.error("could not extract host from URL: %r" % (url,))
        return None
    return host

def absolute_url(url, base_href):
    """
    >>> absolute_url('foo', 'http://base/whatever/ooo/fdsh')
    'http://base/whatever/ooo/foo'

    >>> absolute_url('foo/bar/', 'http://base')
    'http://base/foo/bar/'

    >>> absolute_url('/foo/bar', 'http://base/whatever/fdskf')
    'http://base/foo/bar'

    >>> absolute_url('\\n/foo/bar', 'http://base/whatever/fdskf')
    'http://base/foo/bar'

    >>> absolute_url('http://localhost/foo', 'http://base/whatever/fdskf')
    'http://localhost/foo'
    """
    url = url.strip()
    proto = urlparse(url)[0]
    if proto:
        return url

    base_url_parts = urlparse(base_href)
    base_server = '://'.join(base_url_parts[:2])
    if url.startswith('/'):
        return base_server + url
    else:
        path = base_url_parts[2]
        if '/' in path:
            path = path.rsplit('/', 1)[0] + '/'
        else:
            path = '/'
        return base_server + path + url

if __name__ == '__main__':
    import doctest
    doctest.testmod()