Merge Jerry: pull in the ability to get back confidence score as well as the processed html

0.3.0.dev
Jerry Charumilind 13 years ago committed by Richard Harding
parent a2b17e757c
commit b78d7e8501

@ -54,6 +54,16 @@ As a Library
readable_article = Document(html).summary()
readable_title = Document(html).short_title()
You can also use the `get_summary_with_metadata` method to get back other
metadata such as the confidence score found while processing the input.
::
doc = Document(html).get_summary_with_metadata()
print doc.html
print doc.confidence
Optional `Document` keyword argument:
- attributes:

@ -4,6 +4,7 @@ import re
import sys
from collections import defaultdict
from collections import namedtuple
from lxml.etree import tostring
from lxml.etree import tounicode
from lxml.html import document_fromstring
@ -87,6 +88,11 @@ def text_length(i):
return len(clean(i.text_content() or ""))
# We want to change over the Summary to a namedtuple to be more memory
# efficient and because it doesn't need to be mutable.
Summary = namedtuple('Summary', ['html', 'confidence'])
class Document:
"""Class to build a etree document out of html."""
TEXT_LENGTH_THRESHOLD = 25
@ -139,11 +145,33 @@ class Document:
def short_title(self):
return shorten_title(self.html)
def get_summary_with_metadata(self, enclose_with_html_tag=True):
    """Parse the input content and return a Summary object.

    Unlike ``summary()``, this returns the raw result of ``_summary()``
    rather than only its ``html`` field, so the caller can also read the
    ``confidence`` value.

    :param enclose_with_html_tag: Bool do you want a full <html> document
        or just the <div> html partial.
    :returns: the ``Summary`` produced by ``_summary()``, exposing
        ``html`` and ``confidence``.

    """
    # For this call return the raw Summary object.
    return self._summary(enclose_with_html_tag=enclose_with_html_tag)
def summary(self, enclose_with_html_tag=True):
    """Generate the summary of the html document.

    :param enclose_with_html_tag: Bool do you want a full <html> document
        or just the <div> html partial.
    :returns: only the ``html`` field of the result of ``_summary()``.

    """
    result = self._summary(enclose_with_html_tag=enclose_with_html_tag)
    # Only return the html to be consistent with the backwards api.
    return result.html
def _summary(self, enclose_with_html_tag=True):
"""Helper used in a few places to generate the summary of the content
:param enclose_with_html_tag: return only the div of the document,
don't wrap in html and body tags.
:param enclose_with_html_tag: Bool do you want a full <html> document
or just the <div> html partial.
"""
try:
@ -162,6 +190,7 @@ class Document:
best_candidate = self.select_best_candidate(candidates)
if best_candidate:
confidence = best_candidate['content_score']
article = self.get_article(candidates, best_candidate,
enclose_with_html_tag=enclose_with_html_tag)
else:
@ -177,7 +206,8 @@ class Document:
log.debug(
("Ruthless and lenient parsing did not work. "
"Returning raw html"))
article = self.html.find('body')
article = self.html.find('body') or self.html
confidence = 0
if article is None:
article = self.html
cleaned_article = self.sanitize(article, candidates)
@ -191,7 +221,7 @@ class Document:
# Loop through and try again.
continue
else:
return cleaned_article
return Summary(confidence=confidence, html=cleaned_article)
except StandardError, e:
log.exception('error getting summary: ')
raise Unparseable(str(e)), None, sys.exc_info()[2]

@ -36,3 +36,15 @@ class TestArticleOnly(unittest.TestCase):
doc = Document(sample, url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
res = doc.summary(enclose_with_html_tag=False)
self.assertEqual('<div><div class="', res[0:17])
def test_si_sample_full_summary(self):
    """We should parse the doc and get a full summary with confidence"""
    sample = load_sample('si-game.sample.html')
    doc = Document(sample, url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
    res = doc.get_summary_with_metadata(enclose_with_html_tag=False)
    # The result must expose both fields of the summary.
    self.assertTrue(hasattr(res, 'html'), 'res should have an html attrib')
    # The failure message names the attribute actually being checked.
    self.assertTrue(hasattr(res, 'confidence'), 'res should have a confidence attrib')
    self.assertEqual('<div><div class="', res.html[0:17])
    self.assertTrue(res.confidence > 50,
        'The confidence score should be larger than 50: ' + str(res.confidence))

Loading…
Cancel
Save