Add the title and the short title to the metadata set.

- Tested for a performance hit: 100 iterations add 0.03s to the total time.
- Added the -m flag to the cmd line client to get all metadata output.
- Added a test to make sure the title/short title come back as well.
0.3.0.dev
Richard Harding 12 years ago
parent b1966df1c3
commit 7dc373e9c5

@ -14,7 +14,12 @@ def parse_args():
parser.add_argument('-v', '--verbose',
action='store_true',
default=False,
help="Increase logging verbosity to DEBUG.")
help='Increase logging verbosity to DEBUG.')
parser.add_argument('-m', '--metadata',
action='store_true',
default=False,
help='print all metadata as well as content for the content')
parser.add_argument('path', metavar='P', type=str, nargs=1,
help="The url or file path to process in readable form.")
@ -47,7 +52,14 @@ def main():
doc = Document(target.read(),
debug=args.verbose,
url=url)
print doc.summary().encode(enc, 'replace')
if args.metadata:
m = doc.summary_with_metadata()
print m.title()
print m.short_title()
print m.confidence
print m.html.encode(enc, 'replace')
else:
print doc.summary().encode(enc, 'replace')
finally:
target.close()

@ -52,6 +52,11 @@ class Unparseable(ValueError):
pass
# We want to change over the Summary to a namedtuple to be more memory
# efficient and because it doesn't need to be mutable.
Summary = namedtuple('Summary', ['html', 'confidence', 'title', 'short_title'])
def describe(node, depth=1):
if not hasattr(node, 'tag'):
return "[%s]" % type(node)
@ -88,9 +93,7 @@ def text_length(i):
return len(clean(i.text_content() or ""))
# We want to change over the Summary to a namedtuple to be more memory
# efficient and because it doesn't need to be mutable.
Summary = namedtuple('Summary', ['html', 'confidence'])
class Document:
@ -221,7 +224,10 @@ class Document:
# Loop through and try again.
continue
else:
return Summary(confidence=confidence, html=cleaned_article)
return Summary(confidence=confidence,
html=cleaned_article,
short_title=self.short_title,
title=self.title)
except StandardError, e:
log.exception('error getting summary: ')
raise Unparseable(str(e)), None, sys.exc_info()[2]

@ -42,8 +42,14 @@ class TestArticleOnly(unittest.TestCase):
sample = load_sample('si-game.sample.html')
doc = Document(sample, url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
res = doc.summary_with_metadata(enclose_with_html_tag=False)
self.assertTrue(hasattr(res, 'html'), 'res should have an html attrib')
self.assertTrue(hasattr(res, 'confidence'), 'res should have an html attrib')
self.assertTrue(hasattr(res, 'html'),
'res should have an html attrib')
self.assertTrue(hasattr(res, 'confidence'),
'res should have an html attrib')
self.assertTrue(hasattr(res, 'title'),
'res should have an titile attrib')
self.assertTrue(hasattr(res, 'short_title'),
'res should have an short_title attrib')
self.assertEqual('<div><div class="', res.html[0:17])
self.assertTrue(res.confidence > 50,
'The confidence score should be larger than 50: ' + str(res.confidence))

Loading…
Cancel
Save