From f1a79fb8f87b2f4b0df926e7d5c9cb8502f4344e Mon Sep 17 00:00:00 2001
From: Richard Harding
Date: Tue, 17 Apr 2012 11:04:36 -0400
Subject: [PATCH] Update to make sure we don't drop the html tag when ditching elements

---
Reviewer notes (this section is ignored by `git am`):
 * The original revision of this patch also added
   `import ipdb; ipdb.set_trace()` right before `elem.drop_tree()`.
   That is a leftover interactive debugger breakpoint: it would stop
   every run that removes an unlikely candidate and fail with
   ImportError where ipdb is not installed. It has been removed here,
   and the hunk header and diffstat were adjusted to match.
 * The blob id after `..` on the `index` line still refers to the
   original post-image and no longer matches the content this patch
   produces.
 * Context-line indentation was reconstructed from the Python block
   structure; verify it against readability/readability.py before
   applying.

 readability/readability.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/readability/readability.py b/readability/readability.py
index a4db7a1..d690861 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -350,8 +350,8 @@ class Document:
             if len(s) < 2:
                 continue
             #self.debug(s)
-            if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag != 'body':
+            if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag not in ['html', 'body']:
                 self.debug("Removing unlikely candidate - %s" % describe(elem))
                 elem.drop_tree()
 
     def transform_misused_divs_into_paragraphs(self):