From cdd30f625eaedbaf47e11385666199245f31a309 Mon Sep 17 00:00:00 2001 From: Jerry Charumilind Date: Tue, 5 Jul 2011 13:35:36 -0700 Subject: [PATCH] Return confidence level when retrieving summary --- readability/readability.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/readability/readability.py b/readability/readability.py index c7ddcd7..a6534ed 100644 --- a/readability/readability.py +++ b/readability/readability.py @@ -59,6 +59,17 @@ def text_length(i): class Unparseable(ValueError): pass +class Summary: + ''' + The type of object returned by Document.summary(). This includes the + confidence level we have in our summary. If this is low (<35), our summary + may not be valid, though we did our best. + ''' + + def __init__(self, confidence, html): + self.confidence = confidence + self.html = html + class Document: TEXT_LENGTH_THRESHOLD = 25 RETRY_LENGTH = 250 @@ -111,6 +122,7 @@ class Document: best_candidate = self.select_best_candidate(candidates) if best_candidate: + confidence = best_candidate['content_score'] article = self.get_article(candidates, best_candidate) else: if ruthless: @@ -121,6 +133,7 @@ class Document: continue else: logging.debug("Ruthless and lenient parsing did not work. Returning raw html") + confidence = 0; article = self.html.find('body') or self.html cleaned_article = self.sanitize(article, candidates) @@ -129,7 +142,7 @@ class Document: ruthless = False continue # try again else: - return cleaned_article + return Summary(confidence, cleaned_article) except StandardError, e: #logging.exception('error getting summary: ' + str(traceback.format_exception(*sys.exc_info()))) logging.exception('error getting summary: ' )