Checkpoint multi-page readability work

Restructured code to better support multi-page readability. Improved tests.

Rick:
This generally works and the tests pass, but there are some broken cases in
the multipage handling that are causing me grief. It does pass the one test case.
I made multipage an option rather than the default. The more I change the
code, the harder future merges will be, but man it needs some cleanup, reorg,
and comments.

Conflicts:

	src/readability_lxml/readability.py
	src/tests/regression.py
0.3.0.dev
Jerry Charumilind, committed by Richard Harding
parent 99efa5c10b
commit f8315d011c
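
Since this commit makes the multipage behavior opt-in, a minimal usage sketch may help. It is based on the Document options and the regression-test call in the diff below; the file name and URL are invented for illustration, and it assumes the keyword arguments land in the options dict the way the test harness suggests:

    from readability_lxml import readability

    # First page of a multi-page article; url is needed so next-page links resolve.
    html = open('article.html').read()
    doc = readability.Document(
        html,
        url='http://example.com/article.html',
        multipage=True,   # opt in; without it only the supplied page is summarized
    )
    summary = doc.summary_with_metadata()
    print summary.html    # article text with any detected follow-on pages appended

With multipage left off (the default), the summary covers only the page passed in, matching the previous behavior.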

@@ -13,6 +13,8 @@ from lxml.html import document_fromstring
 from lxml.html import fragment_fromstring
 from cleaners import clean_attributes
 from cleaners import html_cleaner
 from htmls import build_doc
 from htmls import get_body
@@ -150,7 +152,7 @@ def transform_misused_divs_into_paragraphs(doc):
 # transform <div>s that do not contain other block elements into <p>s
 if not REGEXES['divToPElementsRe'].search(
 unicode(''.join(map(tostring, list(elem))))):
-logging.debug("Altering %s to p" % (describe(elem)))
+# log.debug("Altering %s to p" % (describe(elem)))
 elem.tag = "p"
 #print "Fixed element "+describe(elem)
@@ -160,7 +162,7 @@ def transform_misused_divs_into_paragraphs(doc):
 p.text = elem.text
 elem.text = None
 elem.insert(0, p)
-logging.debug("Appended %s to %s" % (tounicode(p), describe(elem)))
+# log.debug("Appended %s to %s" % (tounicode(p), describe(elem)))
 #print "Appended "+tounicode(p)+" to "+describe(elem)
 for pos, child in reversed(list(enumerate(elem))):
@@ -169,9 +171,9 @@ def transform_misused_divs_into_paragraphs(doc):
 p.text = child.tail
 child.tail = None
 elem.insert(pos + 1, p)
-logging.debug("Inserted %s to %s" % (
-tounicode(p),
-describe(elem)))
+# log.debug("Inserted %s to %s" % (
+# tounicode(p),
+# describe(elem)))
 #print "Inserted "+tounicode(p)+" to "+describe(elem)
 if child.tag == 'br':
 #print 'Dropped <br> at '+describe(elem)
@@ -181,13 +183,13 @@ def transform_misused_divs_into_paragraphs(doc):
 def remove_unlikely_candidates(doc):
 for elem in doc.iter():
 s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
-#logging.debug(s)
+#log.debug(s)
 if (REGEXES['unlikelyCandidatesRe'].search(s) and
 (not REGEXES['okMaybeItsACandidateRe'].search(s)) and
 elem.tag != 'body' and
 elem.getparent() is not None
 ):
-logging.debug("Removing unlikely candidate - %s" % describe(elem))
+# log.debug("Removing unlikely candidate - %s" % describe(elem))
 elem.drop_tree()
@@ -200,14 +202,13 @@ def get_link_density(elem):
 total_length = text_length(elem)
 return float(link_length) / max(total_length, 1)
-def score_paragraphs(doc, min_text_len):
+def score_paragraphs(doc, options):
 candidates = {}
-#logging.debug(str([describe(node) for node in tags(doc, "div")]))
+#log.debug(str([describe(node) for node in tags(doc, "div")]))
 ordered = []
 for elem in tags(doc, "p", "pre", "td"):
-logging.debug('Scoring %s' % describe(elem))
+# log.debug('Scoring %s' % describe(elem))
 parent_node = elem.getparent()
 if parent_node is None:
 continue
@@ -217,7 +218,7 @@ def score_paragraphs(doc, min_text_len):
 inner_text_len = len(inner_text)
 # If this paragraph is less than 25 characters, don't even count it.
-if inner_text_len < min_text_len:
+if inner_text_len < options['min_text_length']:
 continue
 if parent_node not in candidates:
@@ -246,11 +247,11 @@ def score_paragraphs(doc, min_text_len):
 candidate = candidates[elem]
 ld = get_link_density(elem)
 score = candidate['content_score']
-logging.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (
-score,
-describe(elem),
-ld,
-score * (1 - ld)))
+# log.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (
+# score,
+# describe(elem),
+# ld,
+# score * (1 - ld)))
 candidate['content_score'] *= (1 - ld)
 return candidates
@@ -280,7 +281,7 @@ def reverse_tags(node, *tag_names):
 yield e
-def sanitize(node, candidates, min_text_len):
+def sanitize(node, candidates, options):
 for header in tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
 if class_weight(header) < 0 or get_link_density(header) > 0.33:
 header.drop_tree()
@@ -301,8 +302,8 @@ def sanitize(node, candidates, min_text_len):
 tag = el.tag
 if weight + content_score < 0:
-logging.debug("Cleaned %s with score %6.3f and weight %-3s" %
-(describe(el), content_score, weight, ))
+# log.debug("Cleaned %s with score %6.3f and weight %-3s" %
+# (describe(el), content_score, weight, ))
 el.drop_tree()
 elif el.text_content().count(",") < 10:
 counts = {}
@@ -339,7 +340,7 @@ def sanitize(node, candidates, min_text_len):
 elif counts["input"] > (counts["p"] / 3):
 reason = "less than 3x <p>s than <input>s"
 to_remove = True
-elif content_length < (min_text_len) and (counts["img"] == 0 or counts["img"] > 2):
+elif content_length < options['min_text_length'] and (counts["img"] == 0 or counts["img"] > 2):
 reason = "too short content length %s without a single image" % content_length
 to_remove = True
 elif weight < 25 and link_density > 0.2:
@@ -365,7 +366,7 @@ def sanitize(node, candidates, min_text_len):
 x = 1
 siblings = []
 for sib in el.itersiblings():
-#logging.debug(sib.text_content())
+#log.debug(sib.text_content())
 sib_content_length = text_length(sib)
 if sib_content_length:
 i += 1
@@ -373,25 +374,25 @@ def sanitize(node, candidates, min_text_len):
 if i == x:
 break
 for sib in el.itersiblings(preceding=True):
-#logging.debug(sib.text_content())
+#log.debug(sib.text_content())
 sib_content_length = text_length(sib)
 if sib_content_length:
 j += 1
 siblings.append(sib_content_length)
 if j == x:
 break
-#logging.debug(str(siblings))
+#log.debug(str(siblings))
 if siblings and sum(siblings) > 1000:
 to_remove = False
-logging.debug("Allowing %s" % describe(el))
+log.debug("Allowing %s" % describe(el))
 for desnode in tags(el, "table", "ul", "div"):
 allowed[desnode] = True
 if to_remove:
-logging.debug("Cleaned %6.3f %s with weight %s cause it has %s." %
-(content_score, describe(el), weight, reason))
+# log.debug("Cleaned %6.3f %s with weight %s cause it has %s." %
+# (content_score, describe(el), weight, reason))
 #print tounicode(el)
-#logging.debug("pname %s pweight %.3f" %(pname, pweight))
+#log.debug("pname %s pweight %.3f" %(pname, pweight))
 el.drop_tree()
 # for el in ([node] + [n for n in node.iter()]):
@@ -412,49 +413,52 @@ def get_raw_article(candidates, best_candidate, enclose_with_html_tag=True):
 else:
 output = fragment_fromstring('<div/>')
 best_elem = best_candidate['elem']
-for sibling in best_elem.getparent().getchildren():
-#if isinstance(sibling, NavigableString): continue#in lxml there no
-# concept of simple text
-append = False
-if sibling is best_elem:
-append = True
-sibling_key = sibling # HashableElement(sibling)
-# Print out sibling information for debugging.
-if sibling_key in candidates:
-sibling_candidate = candidates[sibling_key]
-logging.debug(
-"Sibling: %6.3f %s" %
-(sibling_candidate['content_score'], describe(sibling))
-)
-else:
-logging.debug("Sibling: %s" % describe(sibling))
-if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
-append = True
-if sibling.tag == "p":
-link_density = get_link_density(sibling)
-node_content = sibling.text or ""
-node_length = len(node_content)
-if node_length > 80 and link_density < 0.25:
-append = True
-elif node_length < 80 and link_density == 0 and re.search('\.( |$)', node_content):
-append = True
-if append:
-# We don't want to append directly to output, but the div
-# in html->body->div
-if enclose_with_html_tag:
-output.getchildren()[0].getchildren()[0].append(sibling)
-else:
-output.append(sibling)
+if best_elem.getparent() is not None:
+for sibling in best_elem.getparent().getchildren():
+#if isinstance(sibling, NavigableString): continue#in lxml there no
+# concept of simple text
+append = False
+if sibling is best_elem:
+append = True
+sibling_key = sibling # HashableElement(sibling)
+# Print out sibling information for debugging.
+if sibling_key in candidates:
+sibling_candidate = candidates[sibling_key]
+log.debug(
+"Sibling: %6.3f %s" %
+(sibling_candidate['content_score'], describe(sibling))
+)
+else:
+log.debug("Sibling: %s" % describe(sibling))
+if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
+append = True
+if sibling.tag == "p":
+link_density = get_link_density(sibling)
+node_content = sibling.text or ""
+node_length = len(node_content)
+if node_length > 80 and link_density < 0.25:
+append = True
+elif node_length < 80 and link_density == 0 and re.search('\.( |$)', node_content):
+append = True
+if append:
+# We don't want to append directly to output, but the div
+# in html->body->div
+if enclose_with_html_tag:
+output.getchildren()[0].getchildren()[0].append(sibling)
+else:
+output.append(sibling)
+else:
+output = best_elem
 return output
-def get_article(doc, min_text_len, retry_len, enclose_with_html_tag=True):
+def get_article(doc, options, enclose_with_html_tag=True):
 try:
 ruthless = True
 while True:
@@ -465,8 +469,7 @@ def get_article(doc, min_text_len, retry_len, enclose_with_html_tag=True):
 if ruthless:
 remove_unlikely_candidates(doc)
 transform_misused_divs_into_paragraphs(doc)
-candidates = score_paragraphs(doc, min_text_len)
+candidates = score_paragraphs(doc, options)
 best_candidate = select_best_candidate(candidates)
 if best_candidate:
 confidence = best_candidate['content_score']
@@ -474,22 +477,18 @@ def get_article(doc, min_text_len, retry_len, enclose_with_html_tag=True):
 enclose_with_html_tag=enclose_with_html_tag)
 else:
 if ruthless:
-logging.debug("ruthless removal did not work. ")
+log.debug("ruthless removal did not work. ")
 ruthless = False
-logging.debug("ended up stripping too much - going for a safer parse")
+log.debug("ended up stripping too much - going for a safer parse")
 # try again
 continue
 else:
-logging.debug("Ruthless and lenient parsing did not work. Returning raw html")
+log.debug("Ruthless and lenient parsing did not work. Returning raw html")
-return Summary(0, None, '', '')
+return Summary(None, 0, '', '')
-cleaned_article = sanitize(
-article,
-candidates,
-min_text_len
-)
+cleaned_article = sanitize(article, candidates, options)
-of_acceptable_length = len(cleaned_article or '') >= retry_len
+of_acceptable_length = len(cleaned_article or '') >= options['retry_length']
 if ruthless and not of_acceptable_length:
 ruthless = False
 continue # try again
@@ -500,7 +499,7 @@ def get_article(doc, min_text_len, retry_len, enclose_with_html_tag=True):
 title=get_title(doc))
 except StandardError as e:
-logging.exception('error getting summary: ')
+log.exception('error getting summary: ')
 raise Unparseable(str(e)), None, sys.exc_info()[2]
@@ -615,10 +614,13 @@ def find_base_url(url):
 cleaned_segments = clean_segments(segments)
 new_path = '/'.join(cleaned_segments)
 new_parts = (parts.scheme, parts.netloc, new_path, '', '')
-return urlparse.urlunsplit(new_parts)
+base_url = urlparse.urlunsplit(new_parts)
+log.debug('url: %s' % url)
+log.debug('base_url: %s' % base_url)
+return base_url
-class CandidatePage():
+class NextPageCandidate():
 '''
 An object that tracks a single href that is a candidate for the location of
 the next page. Note that this is distinct from the candidates used when
@@ -650,7 +652,7 @@ def eval_href(parsed_urls, url, base_url, link):
 return None, None, False
 href = strip_trailing_slash(raw_href)
-logging.debug('evaluating next page link: %s' % href)
+# log.debug('evaluating next page link: %s' % href)
 # If we've already seen this page, ignore it.
 if href == base_url or href == url or href in parsed_urls:
@@ -658,7 +660,7 @@ def eval_href(parsed_urls, url, base_url, link):
 # If it's on a different domain, skip it.
 if url is not None and not same_domain(url, href):
-logging.debug('rejecting %s: different domain' % href)
+# log.debug('rejecting %s: different domain' % href)
 return raw_href, href, False
 return raw_href, href, True
@@ -672,7 +674,7 @@ def eval_link_text(link):
 return link_text, True
-def find_or_create_page(candidates, href, link_text):
+def find_or_create_page_candidate(candidates, href, link_text):
 '''
 Finds or creates a candidate page object for a next-page href. If one
 exists already, which happens if there are multiple links with the same
@@ -683,14 +685,12 @@ def find_or_create_page(candidates, href, link_text):
 if href in candidates:
 return candidates[href], False
 else:
-candidate = CandidatePage(link_text, href)
+candidate = NextPageCandidate(link_text, href)
 candidates[href] = candidate
 return candidate, True
-def eval_possible_next_page_link(
-parsed_urls, url, base_url, candidates, link):
+def eval_possible_next_page_link(parsed_urls, url, base_url, candidates, link):
 raw_href, href, ok = eval_href(parsed_urls, url, base_url, link)
 if not ok:
 return
@@ -706,21 +706,31 @@ def eval_possible_next_page_link(
 if not re.search(r'\d', href_leftover):
 return
-candidate, created = find_or_create_page(candidates, href, link_text)
+candidate, created = find_or_create_page_candidate(
+candidates,
+href,
+link_text
+)
 if not created:
 candidate.link_text += ' | ' + link_text
 link_class_name = link.get('class') or ''
 link_id = link.get('id') or ''
 link_data = ' '.join([link_text, link_class_name, link_id])
+# log.debug('link: %s' % tostring(link))
+log.debug('link_data: %s' % link_data)
 if base_url is not None and href.find(base_url) != 0:
+log.debug('no base_url')
 candidate.score -= 25
 if REGEXES['nextLink'].search(link_data):
+log.debug('link_data nextLink regex match')
 candidate.score += 50
 if REGEXES['page'].search(link_data):
+log.debug('link_data page regex match')
 candidate.score += 25
 if REGEXES['firstLast'].search(link_data):
@@ -754,6 +764,7 @@ def eval_possible_next_page_link(
 parent = parent.getparent()
 if REGEXES['page'].search(href):
+log.debug('href regex match')
 candidate.score += 25
 if REGEXES['extraneous'].search(href):
@@ -768,16 +779,15 @@ def eval_possible_next_page_link(
 candidate.score -= 10
 else:
 candidate.score += max(0, 10 - link_text_as_int)
-except ValueError as e:
+except ValueError as exc:
 pass
-def find_next_page_link(parsed_urls, url, elem):
+def find_next_page_url(parsed_urls, url, elem):
 links = tags(elem, 'a')
 base_url = find_base_url(url)
-# candidates is a mapping from URLs to CandidatePage objects that represent
-# information used to determine if a URL points to the next page in the
-# article.
+# candidates is a mapping from URLs to NextPageCandidate objects that
+# represent information used to determine if a URL points to the next page
+# in the article.
 candidates = {}
 for link in links:
 eval_possible_next_page_link(
@@ -789,28 +799,44 @@ def find_next_page_link(parsed_urls, url, elem):
 )
 top_page = None
 for url, page in candidates.items():
-logging.debug('next page score of %s: %s' % (url, page.score))
+log.debug('next page score of %s: %s' % (url, page.score))
 if 50 <= page.score and (not top_page or top_page.score < page.score):
 top_page = page
 if top_page:
-logging.debug('next page link found: %s' % top_page.href)
+log.debug('next page link found: %s' % top_page.href)
 parsed_urls.add(top_page.href)
 return top_page.href
 else:
 return None
-def append_next_page(fetcher, next_page_link, doc):
-# html = fetcher.urlread(next_page_link)
-# page_doc = parse(html, next_page_link)
-pass
+def append_next_page(parsed_urls, page_url, doc, options):
+log.debug(str((parsed_urls, page_url, doc, options)))
+log.debug('appending next page: %s' % page_url)
+fetcher = options['urlfetch']
+html = fetcher.urlread(page_url)
+orig_page_doc = parse(html, page_url)
+next_page_url = find_next_page_url(parsed_urls, page_url, orig_page_doc)
+page_article = get_article(orig_page_doc, options)
+log.debug('Appending ' + str(page_article))
+if page_article.html:
+page_doc = fragment_fromstring(page_article.html)
+# page_doc is a singular element containing the page article elements. We
+# want to add its children to the main article document to which we are
+# appending a page.
+for elem in page_doc:
+doc.append(elem)
+if next_page_url is not None:
+append_next_page(parsed_urls, next_page_url, doc, options)
 def parse(input, url):
 raw_doc = build_doc(input)
 doc = html_cleaner.clean_html(raw_doc)
+log.debug('parse url: %s', url)
 if url:
+log.debug('making links absolute')
 doc.make_links_absolute(url, resolve_base_href=True)
 else:
 doc.resolve_base_href()
@@ -831,6 +857,8 @@ class Document:
 - attributes:
 - debug: output debug messages
 - min_text_length:
+- multipage: should we check for page 2/3 of article and build
+together?
 - retry_length:
 - url: will allow adjusting links to be absolute
@@ -840,7 +868,12 @@ class Document:
 self.input_doc = input_doc
 self.options = options
-self.options['urlfetch'] = urlfetch.UrlFetch()
+self.options['urlfetch'] = self.options.get('urlfetch',
+urlfetch.UrlFetch())
+self.options['min_text_length'] = self.options.get('min_text_length',
+self.TEXT_LENGTH_THRESHOLD)
+self.options['retry_length'] = self.options.get('retry_length',
+self.RETRY_LENGTH)
 self._html = None
 @property
@@ -877,23 +910,20 @@ class Document:
 return summary.html
 def _summary(self, enclose_with_html_tag=True):
+# the first page parsed into a elementree element
 doc = self.html
+# the set of urls we've processed so far
 parsed_urls = set()
 url = self.options.get('url', None)
 if url is not None:
 parsed_urls.add(url)
-next_page_link = find_next_page_link(parsed_urls, url, doc)
-if next_page_link is not None:
-fetcher = self.options.get('urlfetch')
-append_next_page(fetcher, next_page_link, doc)
-min_text_len = self.options.get(
-'min_text_length',
-self.TEXT_LENGTH_THRESHOLD
-)
-retry_len = self.options.get('retry_length', self.RETRY_LENGTH)
-return get_article(doc, min_text_len, retry_len,
-enclose_with_html_tag=enclose_with_html_tag)
-def debug(self, *a):
-if self.options.get('debug', False):
-log.debug(*a)
+# check the current doc for a next page if requested
+if self.options.get('multipage', False):
+next_page_link = find_next_page_url(parsed_urls, url, doc)
+if next_page_link is not None:
+append_next_page(parsed_urls, next_page_link, doc, self.options)
+return get_article(doc, self.options,
+enclose_with_html_tag=enclose_with_html_tag)

@@ -9,6 +9,7 @@ This allows you to tweak and change the readability algorithm and see how it
 changes existing results, hopefully for the better.
 """
+import logging
 import lxml.html
 import lxml.html.diff
 import os
@@ -20,6 +21,7 @@ import yaml
 from lxml.html import builder as B
 from readability_lxml import readability
+from readability_lxml import urlfetch
 DIFF_SUFFIX = '-diff.html'
@@ -100,13 +102,24 @@ del img {
 class ReadabilityTest:
 def __init__(
-self, dir_path, enabled, name, desc, notes, orig_path, rdbl_path
+self,
+dir_path,
+enabled,
+name,
+url,
+desc,
+notes,
+url_map,
+orig_path,
+rdbl_path
 ):
 self.dir_path = dir_path
 self.enabled = enabled
 self.name = name
+self.url = url
 self.desc = desc
 self.notes = notes
+self.url_map = url_map
 self.orig_path = orig_path
 self.rdbl_path = rdbl_path
@@ -137,20 +150,17 @@ def make_path(dir_path, name, suffix):
 def make_readability_test(dir_path, name, spec_dict):
-if 'enabled' in spec_dict:
-enabled = spec_dict['enabled']
-else:
-enabled = True
-if 'notes' in spec_dict:
-notes = spec_dict['notes']
-else:
-notes = ''
+enabled = spec_dict.get('enabled', True)
+notes = spec_dict.get('notes', '')
+url_map = spec_dict.get('url_map', dict())
 return ReadabilityTest(
 dir_path,
 enabled,
 name,
+spec_dict['url'],
 spec_dict['test_description'],
 notes,
+url_map,
 make_path(dir_path, name, ORIGINAL_SUFFIX),
 make_path(dir_path, name, READABLE_SUFFIX)
 )
@@ -180,7 +190,13 @@ def execute_test(test_data):
 if test_data is None:
 return None
 else:
-doc = readability.Document(test_data.orig_html)
+url = test_data.test.url
+fetcher = urlfetch.MockUrlFetch(test_data.test.url_map)
+doc = readability.Document(
+test_data.orig_html,
+url=url,
+urlfetch=fetcher
+)
 summary = doc.summary_with_metadata()
 diff = lxml.html.diff.htmldiff(test_data.rdbl_html, summary.html)
 return ReadabilityTestResult(test_data, summary.html, diff)
@@ -193,6 +209,7 @@ def element_string_lengths(elems):
 class ResultSummary():
 def __init__(self, result):
+# logging.debug('diff: %s' % result.diff_html)
 doc = lxml.html.fragment_fromstring(result.diff_html)
 insertions = doc.xpath('//ins')
@@ -319,6 +336,7 @@ def run_readability_tests():
 write_summary(TEST_SUMMARY_PATH, zip(tests, results))
 def main():
+logging.basicConfig(level = logging.DEBUG)
 if len(sys.argv) > 1 and sys.argv[1] == 'unittest':
 del sys.argv[1]
 return unittest.main()

@@ -0,0 +1,60 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<title>A Simple Multi-Page Article For Testing : Page 3</title>
</head>
<body>
<h1>A Simple Multi-Page Article For Testing : Page 3</h1>
<p>
Nullam laoreet, nibh non faucibus dictum, tellus libero varius
erat, lobortis varius est massa quis metus. Donec vitae justo
lacus, nec convallis metus. Suspendisse potenti. Nunc et rutrum
justo. Maecenas ultrices ipsum in magna fermentum eleifend. Fusce
sagittis pretium aliquam. Vestibulum et gravida lorem. Sed turpis
quam, placerat ac ultrices eu, tempor sit amet elit. Curabitur eu
imperdiet velit. Quisque pharetra ornare nunc, a volutpat metus
aliquam quis. Vivamus semper aliquam cursus. Nullam ac nibh nulla,
luctus pharetra nunc. Etiam ut sapien sem. Fusce vehicula, sem sit
amet viverra pretium, magna tortor suscipit nisi, id interdum lorem
orci in tellus. Vivamus vel ipsum eros. Fusce porttitor convallis
ultricies. Etiam in risus diam, viverra suscipit felis. Duis vitae
imperdiet est.
</p>
<p>
Nunc nunc magna, facilisis blandit venenatis ut, scelerisque ac
tortor. Cras condimentum fermentum lectus ac convallis. Suspendisse
cursus, lacus sit amet sodales molestie, dui erat varius velit, non
tincidunt metus dui sed nulla. Aliquam lacus orci, convallis ut
pellentesque ac, molestie et dolor. Ut pretium enim ut nunc auctor
eget placerat magna luctus. Duis mollis ligula a orci ultrices in
facilisis felis feugiat. Morbi eget odio eget erat pulvinar
placerat sed nec erat. Duis dignissim, dolor a lacinia commodo,
metus erat laoreet dui, in lacinia felis lacus vitae nulla. Fusce
imperdiet condimentum volutpat. Vivamus ut lacus a eros cursus
scelerisque non sit amet orci. Phasellus id quam odio. Nulla
adipiscing venenatis lorem nec feugiat. Aenean sit amet nisl odio,
tincidunt scelerisque nisl. Curabitur ut nisl a dui facilisis
vulputate. Mauris eu elit et felis hendrerit blandit. Cras magna
dolor, imperdiet eget rutrum tempus, euismod nec augue.
</p>
<p>
Ut in sem sit amet felis scelerisque elementum. Suspendisse vitae
neque magna, in laoreet felis. Aenean elit ligula, tempor in
vestibulum ac, porttitor nec lacus. Aenean urna mi, dictum feugiat
placerat eget, congue nec dolor. Etiam pellentesque dictum nulla id
vulputate. Etiam sit amet vehicula purus. Integer quis mi nisl,
gravida malesuada enim. Donec malesuada felis nisi. Etiam id magna
a libero pulvinar ullamcorper in nec neque. Duis pulvinar massa nec
magna scelerisque vitae vulputate ipsum luctus.
</p>
<ul id="pageNumbers">
<li> 1 </li>
<li>
<a title="Page 1" href="/article.html">1</a>
</li>
<li>
<a title="Page 2" href="/article.html?pagewanted=2">2</a>
</li>
</ul>
</body>
</html>