Make sure we don't drop the html tag when removing unlikely candidate elements

pull/19/head
Richard Harding 12 years ago
parent 46f0302ebc
commit f1a79fb8f8

@@ -350,8 +350,9 @@ class Document:
if len(s) < 2:
continue
#self.debug(s)
if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag != 'body':
if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag not in ['html', 'body']:
self.debug("Removing unlikely candidate - %s" % describe(elem))
import ipdb; ipdb.set_trace()
elem.drop_tree()
def transform_misused_divs_into_paragraphs(self):

Loading…
Cancel
Save