Checkpoint multi-page readability work

Restructured code to better support multi-page readability. Improved tests.

Rick:
This generally works and the tests pass, but there are still some broken cases
in the multipage handling that are causing me grief. It does pass the one
multipage test case. I made multipage an option rather than the default
behavior. The more I change the code, the harder future merges will be, but it
badly needs cleanup, reorganization, and comments.
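
A rough usage sketch of the new option (assuming the keyword-style constructor
that the updated regression test uses; the file name and URL are made up):

    from readability_lxml import readability

    # Hypothetical first page of a paginated article.
    html = open('article.html').read()
    doc = readability.Document(
        html,
        url='http://example.com/article.html',  # used to resolve next-page links
        multipage=True,  # opt in; page stitching is now off by default
    )
    summary = doc.summary_with_metadata()
    print summary.html  # article body with the following pages appended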

Conflicts:

	src/readability_lxml/readability.py
	src/tests/regression.py
0.3.0.dev
Jerry Charumilind 13 years ago committed by Richard Harding
parent 99efa5c10b
commit f8315d011c

@ -13,6 +13,8 @@ from lxml.html import document_fromstring
from lxml.html import fragment_fromstring
from cleaners import clean_attributes
from cleaners import html_cleaner
from htmls import build_doc
from htmls import get_body
@ -150,7 +152,7 @@ def transform_misused_divs_into_paragraphs(doc):
# transform <div>s that do not contain other block elements into <p>s
if not REGEXES['divToPElementsRe'].search(
unicode(''.join(map(tostring, list(elem))))):
logging.debug("Altering %s to p" % (describe(elem)))
# log.debug("Altering %s to p" % (describe(elem)))
elem.tag = "p"
#print "Fixed element "+describe(elem)
@ -160,7 +162,7 @@ def transform_misused_divs_into_paragraphs(doc):
p.text = elem.text
elem.text = None
elem.insert(0, p)
logging.debug("Appended %s to %s" % (tounicode(p), describe(elem)))
# log.debug("Appended %s to %s" % (tounicode(p), describe(elem)))
#print "Appended "+tounicode(p)+" to "+describe(elem)
for pos, child in reversed(list(enumerate(elem))):
@ -169,9 +171,9 @@ def transform_misused_divs_into_paragraphs(doc):
p.text = child.tail
child.tail = None
elem.insert(pos + 1, p)
logging.debug("Inserted %s to %s" % (
tounicode(p),
describe(elem)))
# log.debug("Inserted %s to %s" % (
# tounicode(p),
# describe(elem)))
#print "Inserted "+tounicode(p)+" to "+describe(elem)
if child.tag == 'br':
#print 'Dropped <br> at '+describe(elem)
@ -181,13 +183,13 @@ def transform_misused_divs_into_paragraphs(doc):
def remove_unlikely_candidates(doc):
for elem in doc.iter():
s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
#logging.debug(s)
#log.debug(s)
if (REGEXES['unlikelyCandidatesRe'].search(s) and
(not REGEXES['okMaybeItsACandidateRe'].search(s)) and
elem.tag != 'body' and
elem.getparent() is not None
):
logging.debug("Removing unlikely candidate - %s" % describe(elem))
# log.debug("Removing unlikely candidate - %s" % describe(elem))
elem.drop_tree()
@ -200,14 +202,13 @@ def get_link_density(elem):
total_length = text_length(elem)
return float(link_length) / max(total_length, 1)
def score_paragraphs(doc, min_text_len):
def score_paragraphs(doc, options):
candidates = {}
#logging.debug(str([describe(node) for node in tags(doc, "div")]))
#log.debug(str([describe(node) for node in tags(doc, "div")]))
ordered = []
for elem in tags(doc, "p", "pre", "td"):
logging.debug('Scoring %s' % describe(elem))
# log.debug('Scoring %s' % describe(elem))
parent_node = elem.getparent()
if parent_node is None:
continue
@ -217,7 +218,7 @@ def score_paragraphs(doc, min_text_len):
inner_text_len = len(inner_text)
# If this paragraph is less than 25 characters, don't even count it.
if inner_text_len < min_text_len:
if inner_text_len < options['min_text_length']:
continue
if parent_node not in candidates:
@ -246,11 +247,11 @@ def score_paragraphs(doc, min_text_len):
candidate = candidates[elem]
ld = get_link_density(elem)
score = candidate['content_score']
logging.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (
score,
describe(elem),
ld,
score * (1 - ld)))
# log.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (
# score,
# describe(elem),
# ld,
# score * (1 - ld)))
candidate['content_score'] *= (1 - ld)
return candidates
@ -280,7 +281,7 @@ def reverse_tags(node, *tag_names):
yield e
def sanitize(node, candidates, min_text_len):
def sanitize(node, candidates, options):
for header in tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
if class_weight(header) < 0 or get_link_density(header) > 0.33:
header.drop_tree()
@ -301,8 +302,8 @@ def sanitize(node, candidates, min_text_len):
tag = el.tag
if weight + content_score < 0:
logging.debug("Cleaned %s with score %6.3f and weight %-3s" %
(describe(el), content_score, weight, ))
# log.debug("Cleaned %s with score %6.3f and weight %-3s" %
# (describe(el), content_score, weight, ))
el.drop_tree()
elif el.text_content().count(",") < 10:
counts = {}
@ -339,7 +340,7 @@ def sanitize(node, candidates, min_text_len):
elif counts["input"] > (counts["p"] / 3):
reason = "less than 3x <p>s than <input>s"
to_remove = True
elif content_length < (min_text_len) and (counts["img"] == 0 or counts["img"] > 2):
elif content_length < options['min_text_length'] and (counts["img"] == 0 or counts["img"] > 2):
reason = "too short content length %s without a single image" % content_length
to_remove = True
elif weight < 25 and link_density > 0.2:
@ -365,7 +366,7 @@ def sanitize(node, candidates, min_text_len):
x = 1
siblings = []
for sib in el.itersiblings():
#logging.debug(sib.text_content())
#log.debug(sib.text_content())
sib_content_length = text_length(sib)
if sib_content_length:
i += 1
@ -373,25 +374,25 @@ def sanitize(node, candidates, min_text_len):
if i == x:
break
for sib in el.itersiblings(preceding=True):
#logging.debug(sib.text_content())
#log.debug(sib.text_content())
sib_content_length = text_length(sib)
if sib_content_length:
j += 1
siblings.append(sib_content_length)
if j == x:
break
#logging.debug(str(siblings))
#log.debug(str(siblings))
if siblings and sum(siblings) > 1000:
to_remove = False
logging.debug("Allowing %s" % describe(el))
log.debug("Allowing %s" % describe(el))
for desnode in tags(el, "table", "ul", "div"):
allowed[desnode] = True
if to_remove:
logging.debug("Cleaned %6.3f %s with weight %s cause it has %s." %
(content_score, describe(el), weight, reason))
# log.debug("Cleaned %6.3f %s with weight %s cause it has %s." %
# (content_score, describe(el), weight, reason))
#print tounicode(el)
#logging.debug("pname %s pweight %.3f" %(pname, pweight))
#log.debug("pname %s pweight %.3f" %(pname, pweight))
el.drop_tree()
# for el in ([node] + [n for n in node.iter()]):
@ -412,49 +413,52 @@ def get_raw_article(candidates, best_candidate, enclose_with_html_tag=True):
else:
output = fragment_fromstring('<div/>')
best_elem = best_candidate['elem']
for sibling in best_elem.getparent().getchildren():
#if isinstance(sibling, NavigableString): continue#in lxml there no
# concept of simple text
append = False
if sibling is best_elem:
append = True
sibling_key = sibling # HashableElement(sibling)
# Print out sibling information for debugging.
if sibling_key in candidates:
sibling_candidate = candidates[sibling_key]
logging.debug(
"Sibling: %6.3f %s" %
(sibling_candidate['content_score'], describe(sibling))
)
else:
logging.debug("Sibling: %s" % describe(sibling))
if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
append = True
if sibling.tag == "p":
link_density = get_link_density(sibling)
node_content = sibling.text or ""
node_length = len(node_content)
if node_length > 80 and link_density < 0.25:
if best_elem.getparent() is not None:
for sibling in best_elem.getparent().getchildren():
#if isinstance(sibling, NavigableString): continue#in lxml there no
# concept of simple text
append = False
if sibling is best_elem:
append = True
elif node_length < 80 and link_density == 0 and re.search('\.( |$)', node_content):
sibling_key = sibling # HashableElement(sibling)
# Print out sibling information for debugging.
if sibling_key in candidates:
sibling_candidate = candidates[sibling_key]
log.debug(
"Sibling: %6.3f %s" %
(sibling_candidate['content_score'], describe(sibling))
)
else:
log.debug("Sibling: %s" % describe(sibling))
if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
append = True
if append:
# We don't want to append directly to output, but the div
# in html->body->div
if enclose_with_html_tag:
output.getchildren()[0].getchildren()[0].append(sibling)
else:
output.append(sibling)
if sibling.tag == "p":
link_density = get_link_density(sibling)
node_content = sibling.text or ""
node_length = len(node_content)
if node_length > 80 and link_density < 0.25:
append = True
elif node_length < 80 and link_density == 0 and re.search('\.( |$)', node_content):
append = True
if append:
# We don't want to append directly to output, but the div
# in html->body->div
if enclose_with_html_tag:
output.getchildren()[0].getchildren()[0].append(sibling)
else:
output.append(sibling)
else:
output = best_elem
return output
def get_article(doc, min_text_len, retry_len, enclose_with_html_tag=True):
def get_article(doc, options, enclose_with_html_tag=True):
try:
ruthless = True
while True:
@ -465,8 +469,7 @@ def get_article(doc, min_text_len, retry_len, enclose_with_html_tag=True):
if ruthless:
remove_unlikely_candidates(doc)
transform_misused_divs_into_paragraphs(doc)
candidates = score_paragraphs(doc, min_text_len)
candidates = score_paragraphs(doc, options)
best_candidate = select_best_candidate(candidates)
if best_candidate:
confidence = best_candidate['content_score']
@ -474,22 +477,18 @@ def get_article(doc, min_text_len, retry_len, enclose_with_html_tag=True):
enclose_with_html_tag=enclose_with_html_tag)
else:
if ruthless:
logging.debug("ruthless removal did not work. ")
log.debug("ruthless removal did not work. ")
ruthless = False
logging.debug("ended up stripping too much - going for a safer parse")
log.debug("ended up stripping too much - going for a safer parse")
# try again
continue
else:
logging.debug("Ruthless and lenient parsing did not work. Returning raw html")
return Summary(0, None, '', '')
log.debug("Ruthless and lenient parsing did not work. Returning raw html")
return Summary(None, 0, '', '')
cleaned_article = sanitize(
article,
candidates,
min_text_len
)
cleaned_article = sanitize(article, candidates, options)
of_acceptable_length = len(cleaned_article or '') >= retry_len
of_acceptable_length = len(cleaned_article or '') >= options['retry_length']
if ruthless and not of_acceptable_length:
ruthless = False
continue # try again
@ -500,7 +499,7 @@ def get_article(doc, min_text_len, retry_len, enclose_with_html_tag=True):
title=get_title(doc))
except StandardError as e:
logging.exception('error getting summary: ')
log.exception('error getting summary: ')
raise Unparseable(str(e)), None, sys.exc_info()[2]
@ -615,10 +614,13 @@ def find_base_url(url):
cleaned_segments = clean_segments(segments)
new_path = '/'.join(cleaned_segments)
new_parts = (parts.scheme, parts.netloc, new_path, '', '')
return urlparse.urlunsplit(new_parts)
base_url = urlparse.urlunsplit(new_parts)
log.debug('url: %s' % url)
log.debug('base_url: %s' % base_url)
return base_url
class CandidatePage():
class NextPageCandidate():
'''
An object that tracks a single href that is a candidate for the location of
the next page. Note that this is distinct from the candidates used when
@ -650,7 +652,7 @@ def eval_href(parsed_urls, url, base_url, link):
return None, None, False
href = strip_trailing_slash(raw_href)
logging.debug('evaluating next page link: %s' % href)
# log.debug('evaluating next page link: %s' % href)
# If we've already seen this page, ignore it.
if href == base_url or href == url or href in parsed_urls:
@ -658,7 +660,7 @@ def eval_href(parsed_urls, url, base_url, link):
# If it's on a different domain, skip it.
if url is not None and not same_domain(url, href):
logging.debug('rejecting %s: different domain' % href)
# log.debug('rejecting %s: different domain' % href)
return raw_href, href, False
return raw_href, href, True
@ -672,7 +674,7 @@ def eval_link_text(link):
return link_text, True
def find_or_create_page(candidates, href, link_text):
def find_or_create_page_candidate(candidates, href, link_text):
'''
Finds or creates a candidate page object for a next-page href. If one
exists already, which happens if there are multiple links with the same
@ -683,14 +685,12 @@ def find_or_create_page(candidates, href, link_text):
if href in candidates:
return candidates[href], False
else:
candidate = CandidatePage(link_text, href)
candidate = NextPageCandidate(link_text, href)
candidates[href] = candidate
return candidate, True
def eval_possible_next_page_link(
parsed_urls, url, base_url, candidates, link):
def eval_possible_next_page_link(parsed_urls, url, base_url, candidates, link):
raw_href, href, ok = eval_href(parsed_urls, url, base_url, link)
if not ok:
return
@ -706,21 +706,31 @@ def eval_possible_next_page_link(
if not re.search(r'\d', href_leftover):
return
candidate, created = find_or_create_page(candidates, href, link_text)
candidate, created = find_or_create_page_candidate(
candidates,
href,
link_text
)
if not created:
candidate.link_text += ' | ' + link_text
link_class_name = link.get('class') or ''
link_id = link.get('id') or ''
link_data = ' '.join([link_text, link_class_name, link_id])
# log.debug('link: %s' % tostring(link))
log.debug('link_data: %s' % link_data)
if base_url is not None and href.find(base_url) != 0:
log.debug('no base_url')
candidate.score -= 25
if REGEXES['nextLink'].search(link_data):
log.debug('link_data nextLink regex match')
candidate.score += 50
if REGEXES['page'].search(link_data):
log.debug('link_data page regex match')
candidate.score += 25
if REGEXES['firstLast'].search(link_data):
@ -754,6 +764,7 @@ def eval_possible_next_page_link(
parent = parent.getparent()
if REGEXES['page'].search(href):
log.debug('href regex match')
candidate.score += 25
if REGEXES['extraneous'].search(href):
@ -768,16 +779,15 @@ def eval_possible_next_page_link(
candidate.score -= 10
else:
candidate.score += max(0, 10 - link_text_as_int)
except ValueError as e:
except ValueError as exc:
pass
def find_next_page_link(parsed_urls, url, elem):
def find_next_page_url(parsed_urls, url, elem):
links = tags(elem, 'a')
base_url = find_base_url(url)
# candidates is a mapping from URLs to CandidatePage objects that represent
# information used to determine if a URL points to the next page in the
# article.
# candidates is a mapping from URLs to NextPageCandidate objects that
# represent information used to determine if a URL points to the next page
# in the article.
candidates = {}
for link in links:
eval_possible_next_page_link(
@ -789,28 +799,44 @@ def find_next_page_link(parsed_urls, url, elem):
)
top_page = None
for url, page in candidates.items():
logging.debug('next page score of %s: %s' % (url, page.score))
log.debug('next page score of %s: %s' % (url, page.score))
if 50 <= page.score and (not top_page or top_page.score < page.score):
top_page = page
if top_page:
logging.debug('next page link found: %s' % top_page.href)
log.debug('next page link found: %s' % top_page.href)
parsed_urls.add(top_page.href)
return top_page.href
else:
return None
def append_next_page(fetcher, next_page_link, doc):
# html = fetcher.urlread(next_page_link)
# page_doc = parse(html, next_page_link)
pass
def append_next_page(parsed_urls, page_url, doc, options):
log.debug(str((parsed_urls, page_url, doc, options)))
log.debug('appending next page: %s' % page_url)
fetcher = options['urlfetch']
html = fetcher.urlread(page_url)
orig_page_doc = parse(html, page_url)
next_page_url = find_next_page_url(parsed_urls, page_url, orig_page_doc)
page_article = get_article(orig_page_doc, options)
log.debug('Appending ' + str(page_article))
if page_article.html:
page_doc = fragment_fromstring(page_article.html)
# page_doc is a singular element containing the page article elements. We
# want to add its children to the main article document to which we are
# appending a page.
for elem in page_doc:
doc.append(elem)
if next_page_url is not None:
append_next_page(parsed_urls, next_page_url, doc, options)
def parse(input, url):
raw_doc = build_doc(input)
doc = html_cleaner.clean_html(raw_doc)
log.debug('parse url: %s', url)
if url:
log.debug('making links absolute')
doc.make_links_absolute(url, resolve_base_href=True)
else:
doc.resolve_base_href()
@ -831,6 +857,8 @@ class Document:
- attributes:
- debug: output debug messages
- min_text_length:
- multipage: should we check for page 2/3 of article and build
together?
- retry_length:
- url: will allow adjusting links to be absolute
@ -840,7 +868,12 @@ class Document:
self.input_doc = input_doc
self.options = options
self.options['urlfetch'] = urlfetch.UrlFetch()
self.options['urlfetch'] = self.options.get('urlfetch',
urlfetch.UrlFetch())
self.options['min_text_length'] = self.options.get('min_text_length',
self.TEXT_LENGTH_THRESHOLD)
self.options['retry_length'] = self.options.get('retry_length',
self.RETRY_LENGTH)
self._html = None
@property
@ -877,23 +910,20 @@ class Document:
return summary.html
def _summary(self, enclose_with_html_tag=True):
# the first page parsed into a elementree element
doc = self.html
# the set of urls we've processed so far
parsed_urls = set()
url = self.options.get('url', None)
if url is not None:
parsed_urls.add(url)
next_page_link = find_next_page_link(parsed_urls, url, doc)
if next_page_link is not None:
fetcher = self.options.get('urlfetch')
append_next_page(fetcher, next_page_link, doc)
min_text_len = self.options.get(
'min_text_length',
self.TEXT_LENGTH_THRESHOLD
)
retry_len = self.options.get('retry_length', self.RETRY_LENGTH)
return get_article(doc, min_text_len, retry_len,
enclose_with_html_tag=enclose_with_html_tag)
def debug(self, *a):
if self.options.get('debug', False):
log.debug(*a)
# check the current doc for a next page if requested
if self.options.get('multipage', False):
next_page_link = find_next_page_url(parsed_urls, url, doc)
if next_page_link is not None:
append_next_page(parsed_urls, next_page_link, doc, self.options)
return get_article(doc, self.options,
enclose_with_html_tag=enclose_with_html_tag)

@ -9,6 +9,7 @@ This allows you to tweak and change the readability algorithm and see how it
changes existing results, hopefully for the better.
"""
import logging
import lxml.html
import lxml.html.diff
import os
@ -20,6 +21,7 @@ import yaml
from lxml.html import builder as B
from readability_lxml import readability
from readability_lxml import urlfetch
DIFF_SUFFIX = '-diff.html'
@ -100,13 +102,24 @@ del img {
class ReadabilityTest:
def __init__(
self, dir_path, enabled, name, desc, notes, orig_path, rdbl_path
self,
dir_path,
enabled,
name,
url,
desc,
notes,
url_map,
orig_path,
rdbl_path
):
self.dir_path = dir_path
self.enabled = enabled
self.name = name
self.url = url
self.desc = desc
self.notes = notes
self.url_map = url_map
self.orig_path = orig_path
self.rdbl_path = rdbl_path
@ -137,20 +150,17 @@ def make_path(dir_path, name, suffix):
def make_readability_test(dir_path, name, spec_dict):
if 'enabled' in spec_dict:
enabled = spec_dict['enabled']
else:
enabled = True
if 'notes' in spec_dict:
notes = spec_dict['notes']
else:
notes = ''
enabled = spec_dict.get('enabled', True)
notes = spec_dict.get('notes', '')
url_map = spec_dict.get('url_map', dict())
return ReadabilityTest(
dir_path,
enabled,
name,
spec_dict['url'],
spec_dict['test_description'],
notes,
url_map,
make_path(dir_path, name, ORIGINAL_SUFFIX),
make_path(dir_path, name, READABLE_SUFFIX)
)
@ -180,7 +190,13 @@ def execute_test(test_data):
if test_data is None:
return None
else:
doc = readability.Document(test_data.orig_html)
url = test_data.test.url
fetcher = urlfetch.MockUrlFetch(test_data.test.url_map)
doc = readability.Document(
test_data.orig_html,
url=url,
urlfetch=fetcher
)
summary = doc.summary_with_metadata()
diff = lxml.html.diff.htmldiff(test_data.rdbl_html, summary.html)
return ReadabilityTestResult(test_data, summary.html, diff)
@ -193,6 +209,7 @@ def element_string_lengths(elems):
class ResultSummary():
def __init__(self, result):
# logging.debug('diff: %s' % result.diff_html)
doc = lxml.html.fragment_fromstring(result.diff_html)
insertions = doc.xpath('//ins')
@ -319,6 +336,7 @@ def run_readability_tests():
write_summary(TEST_SUMMARY_PATH, zip(tests, results))
def main():
logging.basicConfig(level = logging.DEBUG)
if len(sys.argv) > 1 and sys.argv[1] == 'unittest':
del sys.argv[1]
return unittest.main()
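
To make the new test wiring concrete, here is a rough sketch of what
execute_test now does, with made-up URLs and fixture names; the exact url_map
format is defined by MockUrlFetch (not shown in this diff) and is assumed here
to map a next-page URL to a local fixture file:

    from readability_lxml import readability
    from readability_lxml import urlfetch

    # Hypothetical spec values; in the real tests these come from the YAML
    # spec's 'url' and 'url_map' keys.
    url = 'http://example.com/article.html'
    url_map = {
        'http://example.com/article.html?pagewanted=2': 'simple-multi-page-2.html',
        'http://example.com/article.html?pagewanted=3': 'simple-multi-page-3.html',
    }
    fetcher = urlfetch.MockUrlFetch(url_map)
    doc = readability.Document(
        open('simple-multi-page-orig.html').read(),
        url=url,
        urlfetch=fetcher,
    )
    summary = doc.summary_with_metadata()

Note that multipage=True would additionally need to be passed (as in the
sketch under the commit message) for the next-page fetching to kick in, since
the option now defaults to off.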

@ -0,0 +1,60 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<title>A Simple Multi-Page Article For Testing : Page 3</title>
</head>
<body>
<h1>A Simple Multi-Page Article For Testing : Page 3</h1>
<p>
Nullam laoreet, nibh non faucibus dictum, tellus libero varius
erat, lobortis varius est massa quis metus. Donec vitae justo
lacus, nec convallis metus. Suspendisse potenti. Nunc et rutrum
justo. Maecenas ultrices ipsum in magna fermentum eleifend. Fusce
sagittis pretium aliquam. Vestibulum et gravida lorem. Sed turpis
quam, placerat ac ultrices eu, tempor sit amet elit. Curabitur eu
imperdiet velit. Quisque pharetra ornare nunc, a volutpat metus
aliquam quis. Vivamus semper aliquam cursus. Nullam ac nibh nulla,
luctus pharetra nunc. Etiam ut sapien sem. Fusce vehicula, sem sit
amet viverra pretium, magna tortor suscipit nisi, id interdum lorem
orci in tellus. Vivamus vel ipsum eros. Fusce porttitor convallis
ultricies. Etiam in risus diam, viverra suscipit felis. Duis vitae
imperdiet est.
</p>
<p>
Nunc nunc magna, facilisis blandit venenatis ut, scelerisque ac
tortor. Cras condimentum fermentum lectus ac convallis. Suspendisse
cursus, lacus sit amet sodales molestie, dui erat varius velit, non
tincidunt metus dui sed nulla. Aliquam lacus orci, convallis ut
pellentesque ac, molestie et dolor. Ut pretium enim ut nunc auctor
eget placerat magna luctus. Duis mollis ligula a orci ultrices in
facilisis felis feugiat. Morbi eget odio eget erat pulvinar
placerat sed nec erat. Duis dignissim, dolor a lacinia commodo,
metus erat laoreet dui, in lacinia felis lacus vitae nulla. Fusce
imperdiet condimentum volutpat. Vivamus ut lacus a eros cursus
scelerisque non sit amet orci. Phasellus id quam odio. Nulla
adipiscing venenatis lorem nec feugiat. Aenean sit amet nisl odio,
tincidunt scelerisque nisl. Curabitur ut nisl a dui facilisis
vulputate. Mauris eu elit et felis hendrerit blandit. Cras magna
dolor, imperdiet eget rutrum tempus, euismod nec augue.
</p>
<p>
Ut in sem sit amet felis scelerisque elementum. Suspendisse vitae
neque magna, in laoreet felis. Aenean elit ligula, tempor in
vestibulum ac, porttitor nec lacus. Aenean urna mi, dictum feugiat
placerat eget, congue nec dolor. Etiam pellentesque dictum nulla id
vulputate. Etiam sit amet vehicula purus. Integer quis mi nisl,
gravida malesuada enim. Donec malesuada felis nisi. Etiam id magna
a libero pulvinar ullamcorper in nec neque. Duis pulvinar massa nec
magna scelerisque vitae vulputate ipsum luctus.
</p>
<ul id="pageNumbers">
<li> 1 </li>
<li>
<a title="Page 1" href="/article.html">1</a>
</li>
<li>
<a title="Page 2" href="/article.html?pagewanted=2">2</a>
</li>
</ul>
</body>
</html>