diff --git a/src/readability_lxml/readability.py b/src/readability_lxml/readability.py
index 7a89d98..75a59ec 100755
--- a/src/readability_lxml/readability.py
+++ b/src/readability_lxml/readability.py
@@ -13,6 +13,8 @@
 from lxml.html import document_fromstring
 from lxml.html import fragment_fromstring
 from cleaners import clean_attributes
+
+from cleaners import html_cleaner
 from htmls import build_doc
 from htmls import get_body
@@ -150,7 +152,7 @@ def transform_misused_divs_into_paragraphs(doc):
         # transform <div>s that do not contain other block elements into <p>s
         if not REGEXES['divToPElementsRe'].search(
                 unicode(''.join(map(tostring, list(elem))))):
-            logging.debug("Altering %s to p" % (describe(elem)))
+            # log.debug("Altering %s to p" % (describe(elem)))
             elem.tag = "p"
             #print "Fixed element "+describe(elem)
@@ -160,7 +162,7 @@ def transform_misused_divs_into_paragraphs(doc):
             p.text = elem.text
             elem.text = None
             elem.insert(0, p)
-            logging.debug("Appended %s to %s" % (tounicode(p), describe(elem)))
+            # log.debug("Appended %s to %s" % (tounicode(p), describe(elem)))
             #print "Appended "+tounicode(p)+" to "+describe(elem)

         for pos, child in reversed(list(enumerate(elem))):
@@ -169,9 +171,9 @@ def transform_misused_divs_into_paragraphs(doc):
                 p.text = child.tail
                 child.tail = None
                 elem.insert(pos + 1, p)
-                logging.debug("Inserted %s to %s" % (
-                    tounicode(p),
-                    describe(elem)))
+                # log.debug("Inserted %s to %s" % (
+                #     tounicode(p),
+                #     describe(elem)))
                 #print "Inserted "+tounicode(p)+" to "+describe(elem)
                 if child.tag == 'br':
                     #print 'Dropped <br> at '+describe(elem)
@@ -181,13 +183,13 @@ def transform_misused_divs_into_paragraphs(doc):

 def remove_unlikely_candidates(doc):
     for elem in doc.iter():
         s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
-        #logging.debug(s)
+        #log.debug(s)
         if (REGEXES['unlikelyCandidatesRe'].search(s)
                 and (not REGEXES['okMaybeItsACandidateRe'].search(s))
                 and elem.tag != 'body'
                 and elem.getparent() is not None
                 ):
-            logging.debug("Removing unlikely candidate - %s" % describe(elem))
+            # log.debug("Removing unlikely candidate - %s" % describe(elem))
             elem.drop_tree()
@@ -200,14 +202,13 @@ def get_link_density(elem):
     total_length = text_length(elem)
     return float(link_length) / max(total_length, 1)

-
-def score_paragraphs(doc, min_text_len):
+def score_paragraphs(doc, options):
     candidates = {}
-    #logging.debug(str([describe(node) for node in tags(doc, "div")]))
+    #log.debug(str([describe(node) for node in tags(doc, "div")]))
     ordered = []
     for elem in tags(doc, "p", "pre", "td"):
-        logging.debug('Scoring %s' % describe(elem))
+        # log.debug('Scoring %s' % describe(elem))
         parent_node = elem.getparent()
         if parent_node is None:
             continue
@@ -217,7 +218,7 @@ def score_paragraphs(doc, min_text_len):
         inner_text_len = len(inner_text)

         # If this paragraph is less than 25 characters, don't even count it.
-        if inner_text_len < min_text_len:
+        if inner_text_len < options['min_text_length']:
             continue

         if parent_node not in candidates:
@@ -246,11 +247,11 @@ def score_paragraphs(doc, min_text_len):
         candidate = candidates[elem]
         ld = get_link_density(elem)
         score = candidate['content_score']
-        logging.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (
-            score,
-            describe(elem),
-            ld,
-            score * (1 - ld)))
+        # log.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (
+        #     score,
+        #     describe(elem),
+        #     ld,
+        #     score * (1 - ld)))
         candidate['content_score'] *= (1 - ld)

     return candidates
@@ -280,7 +281,7 @@ def reverse_tags(node, *tag_names):
         yield e


-def sanitize(node, candidates, min_text_len):
+def sanitize(node, candidates, options):
     for header in tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
         if class_weight(header) < 0 or get_link_density(header) > 0.33:
             header.drop_tree()
@@ -301,8 +302,8 @@ def sanitize(node, candidates, min_text_len):
             tag = el.tag

             if weight + content_score < 0:
-                logging.debug("Cleaned %s with score %6.3f and weight %-3s" %
-                    (describe(el), content_score, weight, ))
+                # log.debug("Cleaned %s with score %6.3f and weight %-3s" %
+                #     (describe(el), content_score, weight, ))
                 el.drop_tree()
             elif el.text_content().count(",") < 10:
                 counts = {}
@@ -339,7 +340,7 @@ def sanitize(node, candidates, min_text_len):
                 elif counts["input"] > (counts["p"] / 3):
                     reason = "less than 3x <p>s than <input>s"
                     to_remove = True
-                elif content_length < (min_text_len) and (counts["img"] == 0 or counts["img"] > 2):
+                elif content_length < options['min_text_length'] and (counts["img"] == 0 or counts["img"] > 2):
                     reason = "too short content length %s without a single image" % content_length
                     to_remove = True
                 elif weight < 25 and link_density > 0.2:
@@ -365,7 +366,7 @@ def sanitize(node, candidates, min_text_len):
                     x = 1
                     siblings = []
                     for sib in el.itersiblings():
-                        #logging.debug(sib.text_content())
+                        #log.debug(sib.text_content())
                         sib_content_length = text_length(sib)
                         if sib_content_length:
                             i += 1
@@ -373,25 +374,25 @@ def sanitize(node, candidates, min_text_len):
                             if i == x:
                                 break
                     for sib in el.itersiblings(preceding=True):
-                        #logging.debug(sib.text_content())
+                        #log.debug(sib.text_content())
                         sib_content_length = text_length(sib)
                         if sib_content_length:
                             j += 1
                             siblings.append(sib_content_length)
                             if j == x:
                                 break
-                    #logging.debug(str(siblings))
+                    #log.debug(str(siblings))
                     if siblings and sum(siblings) > 1000:
                         to_remove = False
-                        logging.debug("Allowing %s" % describe(el))
+                        log.debug("Allowing %s" % describe(el))
                         for desnode in tags(el, "table", "ul", "div"):
                             allowed[desnode] = True

                 if to_remove:
-                    logging.debug("Cleaned %6.3f %s with weight %s cause it has %s." %
-                        (content_score, describe(el), weight, reason))
+                    # log.debug("Cleaned %6.3f %s with weight %s cause it has %s." %
+                    #     (content_score, describe(el), weight, reason))
                     #print tounicode(el)
-                    #logging.debug("pname %s pweight %.3f" %(pname, pweight))
+                    #log.debug("pname %s pweight %.3f" %(pname, pweight))
                     el.drop_tree()

     # for el in ([node] + [n for n in node.iter()]):
@@ -412,49 +413,52 @@ def get_raw_article(candidates, best_candidate, enclose_with_html_tag=True):
     else:
         output = fragment_fromstring('<div/>')

     best_elem = best_candidate['elem']
-    for sibling in best_elem.getparent().getchildren():
-        #if isinstance(sibling, NavigableString): continue#in lxml there no
-        # concept of simple text
-        append = False
-        if sibling is best_elem:
-            append = True
-        sibling_key = sibling  # HashableElement(sibling)
-
-        # Print out sibling information for debugging.
-        if sibling_key in candidates:
-            sibling_candidate = candidates[sibling_key]
-            logging.debug(
-                "Sibling: %6.3f %s" %
-                (sibling_candidate['content_score'], describe(sibling))
-            )
-        else:
-            logging.debug("Sibling: %s" % describe(sibling))
-
-        if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
-            append = True
-
-        if sibling.tag == "p":
-            link_density = get_link_density(sibling)
-            node_content = sibling.text or ""
-            node_length = len(node_content)
-
-            if node_length > 80 and link_density < 0.25:
+    if best_elem.getparent() is not None:
+        for sibling in best_elem.getparent().getchildren():
+            #if isinstance(sibling, NavigableString): continue#in lxml there no
+            # concept of simple text
+            append = False
+            if sibling is best_elem:
                 append = True
-            elif node_length < 80 and link_density == 0 and re.search('\.( |$)', node_content):
+            sibling_key = sibling  # HashableElement(sibling)
+
+            # Print out sibling information for debugging.
+            if sibling_key in candidates:
+                sibling_candidate = candidates[sibling_key]
+                log.debug(
+                    "Sibling: %6.3f %s" %
+                    (sibling_candidate['content_score'], describe(sibling))
+                )
+            else:
+                log.debug("Sibling: %s" % describe(sibling))
+
+            if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
                 append = True

-        if append:
-            # We don't want to append directly to output, but the div
-            # in html->body->div
-            if enclose_with_html_tag:
-                output.getchildren()[0].getchildren()[0].append(sibling)
-            else:
-                output.append(sibling)
+            if sibling.tag == "p":
+                link_density = get_link_density(sibling)
+                node_content = sibling.text or ""
+                node_length = len(node_content)
+
+                if node_length > 80 and link_density < 0.25:
+                    append = True
+                elif node_length < 80 and link_density == 0 and re.search('\.( |$)', node_content):
+                    append = True
+
+            if append:
+                # We don't want to append directly to output, but the div
+                # in html->body->div
+                if enclose_with_html_tag:
+                    output.getchildren()[0].getchildren()[0].append(sibling)
+                else:
+                    output.append(sibling)
+    else:
+        output = best_elem

     return output


-def get_article(doc, min_text_len, retry_len, enclose_with_html_tag=True):
+def get_article(doc, options, enclose_with_html_tag=True):
     try:
         ruthless = True
         while True:
@@ -465,8 +469,7 @@ def get_article(doc, min_text_len, retry_len, enclose_with_html_tag=True):
             if ruthless:
                 remove_unlikely_candidates(doc)
             transform_misused_divs_into_paragraphs(doc)
-            candidates = score_paragraphs(doc, min_text_len)
-
+            candidates = score_paragraphs(doc, options)
             best_candidate = select_best_candidate(candidates)
             if best_candidate:
                 confidence = best_candidate['content_score']
@@ -474,22 +477,18 @@ def get_article(doc, min_text_len, retry_len, enclose_with_html_tag=True):
                     enclose_with_html_tag=enclose_with_html_tag)
             else:
                 if ruthless:
-                    logging.debug("ruthless removal did not work. ")
+                    log.debug("ruthless removal did not work. ")
                     ruthless = False
-                    logging.debug("ended up stripping too much - going for a safer parse")
+                    log.debug("ended up stripping too much - going for a safer parse")
                     # try again
                     continue
                 else:
-                    logging.debug("Ruthless and lenient parsing did not work. 
-                        Returning raw html")
-                    return Summary(0, None, '', '')
+                    log.debug("Ruthless and lenient parsing did not work. Returning raw html")
+                    return Summary(None, 0, '', '')

-            cleaned_article = sanitize(
-                article,
-                candidates,
-                min_text_len
-            )
+            cleaned_article = sanitize(article, candidates, options)

-            of_acceptable_length = len(cleaned_article or '') >= retry_len
+            of_acceptable_length = len(cleaned_article or '') >= options['retry_length']
             if ruthless and not of_acceptable_length:
                 ruthless = False
                 continue  # try again
@@ -500,7 +499,7 @@ def get_article(doc, min_text_len, retry_len, enclose_with_html_tag=True):
                 title=get_title(doc))

     except StandardError as e:
-        logging.exception('error getting summary: ')
+        log.exception('error getting summary: ')
         raise Unparseable(str(e)), None, sys.exc_info()[2]
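The hunks above thread every tunable through a single options dict instead of passing min_text_len and retry_len positionally. A minimal sketch of the resulting caller-facing API, assuming the keyword defaults introduced in the Document constructor later in this patch (the threshold values here are illustrative examples, and html_string is a hypothetical variable holding the page's HTML, not part of this diff):

    from readability_lxml import readability

    doc = readability.Document(
        html_string,                       # hypothetical: raw HTML of the page
        url='http://example.com/article',  # lets links be made absolute
        min_text_length=25,   # paragraphs shorter than this are not scored
        retry_length=250,     # retry with a lenient parse below this length
    )
    summary = doc.summary_with_metadata()
    print summary.html

Keys the caller omits fall back to Document.TEXT_LENGTH_THRESHOLD and Document.RETRY_LENGTH, so existing single-argument callers keep their current behavior.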
@@ -615,10 +614,13 @@ def find_base_url(url):
     cleaned_segments = clean_segments(segments)
     new_path = '/'.join(cleaned_segments)
     new_parts = (parts.scheme, parts.netloc, new_path, '', '')
-    return urlparse.urlunsplit(new_parts)
+    base_url = urlparse.urlunsplit(new_parts)
+    log.debug('url: %s' % url)
+    log.debug('base_url: %s' % base_url)
+    return base_url


-class CandidatePage():
+class NextPageCandidate():
     '''
     An object that tracks a single href that is a candidate for the location
     of the next page. Note that this is distinct from the candidates used when
@@ -650,7 +652,7 @@ def eval_href(parsed_urls, url, base_url, link):
         return None, None, False

     href = strip_trailing_slash(raw_href)
-    logging.debug('evaluating next page link: %s' % href)
+    # log.debug('evaluating next page link: %s' % href)

     # If we've already seen this page, ignore it.
     if href == base_url or href == url or href in parsed_urls:
@@ -658,7 +660,7 @@ def eval_href(parsed_urls, url, base_url, link):

     # If it's on a different domain, skip it.
     if url is not None and not same_domain(url, href):
-        logging.debug('rejecting %s: different domain' % href)
+        # log.debug('rejecting %s: different domain' % href)
         return raw_href, href, False

     return raw_href, href, True
@@ -672,7 +674,7 @@ def eval_link_text(link):
     return link_text, True


-def find_or_create_page(candidates, href, link_text):
+def find_or_create_page_candidate(candidates, href, link_text):
     '''
     Finds or creates a candidate page object for a next-page href.
     If one exists already, which happens if there are multiple links with the same
@@ -683,14 +685,12 @@ def find_or_create_page_candidate(candidates, href, link_text):
     if href in candidates:
         return candidates[href], False
     else:
-        candidate = CandidatePage(link_text, href)
+        candidate = NextPageCandidate(link_text, href)
         candidates[href] = candidate
         return candidate, True


-def eval_possible_next_page_link(
-        parsed_urls, url, base_url, candidates, link):
-
+def eval_possible_next_page_link(parsed_urls, url, base_url, candidates, link):
     raw_href, href, ok = eval_href(parsed_urls, url, base_url, link)
     if not ok:
         return
@@ -706,21 +706,31 @@ def eval_possible_next_page_link(parsed_urls, url, base_url, candidates, link):
     if not re.search(r'\d', href_leftover):
         return

-    candidate, created = find_or_create_page(candidates, href, link_text)
+    candidate, created = find_or_create_page_candidate(
+        candidates,
+        href,
+        link_text
+    )
+
     if not created:
         candidate.link_text += ' | ' + link_text

     link_class_name = link.get('class') or ''
     link_id = link.get('id') or ''
     link_data = ' '.join([link_text, link_class_name, link_id])
+    # log.debug('link: %s' % tostring(link))
+    log.debug('link_data: %s' % link_data)

     if base_url is not None and href.find(base_url) != 0:
+        log.debug('no base_url')
         candidate.score -= 25

     if REGEXES['nextLink'].search(link_data):
+        log.debug('link_data nextLink regex match')
         candidate.score += 50

     if REGEXES['page'].search(link_data):
+        log.debug('link_data page regex match')
         candidate.score += 25

     if REGEXES['firstLast'].search(link_data):
@@ -754,6 +764,7 @@ def eval_possible_next_page_link(parsed_urls, url, base_url, candidates, link):
         parent = parent.getparent()

     if REGEXES['page'].search(href):
+        log.debug('href regex match')
         candidate.score += 25

     if REGEXES['extraneous'].search(href):
@@ -768,16 +779,15 @@ def eval_possible_next_page_link(parsed_urls, url, base_url, candidates, link):
             candidate.score -= 10
         else:
             candidate.score += max(0, 10 - link_text_as_int)
-    except ValueError as e:
+    except ValueError as exc:
         pass

-
-def find_next_page_link(parsed_urls, url, elem):
+def find_next_page_url(parsed_urls, url, elem):
     links = tags(elem, 'a')
     base_url = find_base_url(url)
-    # candidates is a mapping from URLs to CandidatePage objects that represent
-    # information used to determine if a URL points to the next page in the
-    # article.
+    # candidates is a mapping from URLs to NextPageCandidate objects that
+    # represent information used to determine if a URL points to the next page
+    # in the article.
     candidates = {}
     for link in links:
         eval_possible_next_page_link(
             parsed_urls,
             url,
             base_url,
             candidates,
             link
         )
@@ -789,28 +799,44 @@
     top_page = None
     for url, page in candidates.items():
-        logging.debug('next page score of %s: %s' % (url, page.score))
+        log.debug('next page score of %s: %s' % (url, page.score))
         if 50 <= page.score and (not top_page or top_page.score < page.score):
             top_page = page

     if top_page:
-        logging.debug('next page link found: %s' % top_page.href)
+        log.debug('next page link found: %s' % top_page.href)
         parsed_urls.add(top_page.href)
         return top_page.href
     else:
         return None


-def append_next_page(fetcher, next_page_link, doc):
-    # html = fetcher.urlread(next_page_link)
-    # page_doc = parse(html, next_page_link)
-    pass
+def append_next_page(parsed_urls, page_url, doc, options):
+    log.debug(str((parsed_urls, page_url, doc, options)))
+    log.debug('appending next page: %s' % page_url)
+    fetcher = options['urlfetch']
+    html = fetcher.urlread(page_url)
+    orig_page_doc = parse(html, page_url)
+    next_page_url = find_next_page_url(parsed_urls, page_url, orig_page_doc)
+    page_article = get_article(orig_page_doc, options)
+    log.debug('Appending ' + str(page_article))
+    if page_article.html:
+        page_doc = fragment_fromstring(page_article.html)
+        # page_doc is a singular element containing the page article elements.
+        # We want to add its children to the main article document to which we
+        # are appending a page.
+        for elem in page_doc:
+            doc.append(elem)
+    if next_page_url is not None:
+        append_next_page(parsed_urls, next_page_url, doc, options)


 def parse(input, url):
     raw_doc = build_doc(input)
     doc = html_cleaner.clean_html(raw_doc)
+    log.debug('parse url: %s', url)
     if url:
+        log.debug('making links absolute')
         doc.make_links_absolute(url, resolve_base_href=True)
     else:
         doc.resolve_base_href()
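Taken together, eval_possible_next_page_link and find_next_page_url implement a regex-driven vote: every anchor starts at zero, gains points for next/page-like link text and hrefs, loses points for off-site or extraneous-looking targets, and only a candidate scoring 50 or more is followed. A rough sketch of exercising the lookup in isolation (the URLs and markup are invented for illustration; the point values in the comments are approximations of the rules in the hunks above):

    import lxml.html
    from readability_lxml import readability

    html = '''
    <html><body>
      <p>... article text ...</p>
      <a href="http://example.com/article/2">Next Page</a>
    </body></html>
    '''
    doc = lxml.html.document_fromstring(html)
    parsed_urls = set(['http://example.com/article'])

    next_url = readability.find_next_page_url(
        parsed_urls, 'http://example.com/article', doc)
    # 'Next Page' should match the nextLink regex (+50) and the page regex
    # (+25), clearing the 50-point threshold, so next_url should come back as
    # http://example.com/article/2 and be added to parsed_urls.

Recording the winner in parsed_urls before returning it is what lets the recursive append_next_page walk stop instead of revisiting pages it has already stitched in.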
@@ -831,6 +857,8 @@ class Document:
     - attributes:
     - debug: output debug messages
     - min_text_length:
+    - multipage: should we check for page 2/3 of article and build
+      together?
     - retry_length:
     - url: will allow adjusting links to be absolute

@@ -840,7 +868,12 @@ class Document:
         self.input_doc = input_doc
         self.options = options
-        self.options['urlfetch'] = urlfetch.UrlFetch()
+        self.options['urlfetch'] = self.options.get('urlfetch',
+                                                    urlfetch.UrlFetch())
+        self.options['min_text_length'] = self.options.get('min_text_length',
+                                                           self.TEXT_LENGTH_THRESHOLD)
+        self.options['retry_length'] = self.options.get('retry_length',
+                                                        self.RETRY_LENGTH)
         self._html = None

     @property
@@ -877,23 +910,20 @@ class Document:
         return summary.html

     def _summary(self, enclose_with_html_tag=True):
+        # the first page parsed into an ElementTree element
         doc = self.html
+
+        # the set of urls we've processed so far
         parsed_urls = set()
         url = self.options.get('url', None)
         if url is not None:
             parsed_urls.add(url)
-        next_page_link = find_next_page_link(parsed_urls, url, doc)
-        if next_page_link is not None:
-            fetcher = self.options.get('urlfetch')
-            append_next_page(fetcher, next_page_link, doc)
-        min_text_len = self.options.get(
-            'min_text_length',
-            self.TEXT_LENGTH_THRESHOLD
-        )
-        retry_len = self.options.get('retry_length', self.RETRY_LENGTH)
-        return get_article(doc, min_text_len, retry_len,
-                           enclose_with_html_tag=enclose_with_html_tag)
-    def debug(self, *a):
-        if self.options.get('debug', False):
-            log.debug(*a)
+        # check the current doc for a next page if requested
+        if self.options.get('multipage', False):
+            next_page_link = find_next_page_url(parsed_urls, url, doc)
+            if next_page_link is not None:
+                append_next_page(parsed_urls, next_page_link, doc, self.options)
+
+        return get_article(doc, self.options,
+                           enclose_with_html_tag=enclose_with_html_tag)
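End to end, the new multipage flag is the only opt-in the caller makes; _summary then drives find_next_page_url and the recursive append_next_page with the shared options dict. A sketch of the intended usage, where first_page_html is a hypothetical variable holding page 1 of an article (the default urlfetch.UrlFetch is assumed to do the actual network reads; its internals are not part of this diff):

    from readability_lxml import readability

    doc = readability.Document(
        first_page_html,
        url='http://example.com/article',  # needed to resolve next-page links
        multipage=True,                    # defaults to False: old behavior
    )
    # Pages 2..n are fetched via options['urlfetch'], summarized with the
    # same options, and their children appended to page 1's article element.
    summary = doc.summary_with_metadata()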
""" +import logging import lxml.html import lxml.html.diff import os @@ -20,6 +21,7 @@ import yaml from lxml.html import builder as B from readability_lxml import readability +from readability_lxml import urlfetch DIFF_SUFFIX = '-diff.html' @@ -100,13 +102,24 @@ del img { class ReadabilityTest: def __init__( - self, dir_path, enabled, name, desc, notes, orig_path, rdbl_path + self, + dir_path, + enabled, + name, + url, + desc, + notes, + url_map, + orig_path, + rdbl_path ): self.dir_path = dir_path self.enabled = enabled self.name = name + self.url = url self.desc = desc self.notes = notes + self.url_map = url_map self.orig_path = orig_path self.rdbl_path = rdbl_path @@ -137,20 +150,17 @@ def make_path(dir_path, name, suffix): def make_readability_test(dir_path, name, spec_dict): - if 'enabled' in spec_dict: - enabled = spec_dict['enabled'] - else: - enabled = True - if 'notes' in spec_dict: - notes = spec_dict['notes'] - else: - notes = '' + enabled = spec_dict.get('enabled', True) + notes = spec_dict.get('notes', '') + url_map = spec_dict.get('url_map', dict()) return ReadabilityTest( dir_path, enabled, name, + spec_dict['url'], spec_dict['test_description'], notes, + url_map, make_path(dir_path, name, ORIGINAL_SUFFIX), make_path(dir_path, name, READABLE_SUFFIX) ) @@ -180,7 +190,13 @@ def execute_test(test_data): if test_data is None: return None else: - doc = readability.Document(test_data.orig_html) + url = test_data.test.url + fetcher = urlfetch.MockUrlFetch(test_data.test.url_map) + doc = readability.Document( + test_data.orig_html, + url=url, + urlfetch=fetcher + ) summary = doc.summary_with_metadata() diff = lxml.html.diff.htmldiff(test_data.rdbl_html, summary.html) return ReadabilityTestResult(test_data, summary.html, diff) @@ -193,6 +209,7 @@ def element_string_lengths(elems): class ResultSummary(): def __init__(self, result): + # logging.debug('diff: %s' % result.diff_html) doc = lxml.html.fragment_fromstring(result.diff_html) insertions = doc.xpath('//ins') @@ -319,6 +336,7 @@ def run_readability_tests(): write_summary(TEST_SUMMARY_PATH, zip(tests, results)) def main(): + logging.basicConfig(level = logging.DEBUG) if len(sys.argv) > 1 and sys.argv[1] == 'unittest': del sys.argv[1] return unittest.main() diff --git a/src/tests/regression_test_data/basic-multi-page-3.html b/src/tests/regression_test_data/basic-multi-page-3.html new file mode 100644 index 0000000..7e4cad8 --- /dev/null +++ b/src/tests/regression_test_data/basic-multi-page-3.html @@ -0,0 +1,60 @@ + + + + A Simple Multi-Page Article For Testing : Page 3 + + +

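On the test side, MockUrlFetch stands in for the network so multi-page fixtures stay hermetic. The diff only shows it being constructed from a spec's url_map; assuming it mirrors UrlFetch.urlread and serves the mapped fixture's contents, a spec for a multi-page article might look like the following sketch (the url and url_map keys come from make_readability_test above; the mapping values and the urlread semantics are assumptions, not shown in this diff):

    from readability_lxml import urlfetch

    # Hypothetical spec, as loaded from a test's YAML file:
    spec_dict = {
        'test_description': 'A simple multi-page article.',
        'url': 'http://example.com/article',
        'url_map': {
            # next-page URL -> fixture the mock fetcher should serve
            'http://example.com/article-2': 'basic-multi-page-2.html',
            'http://example.com/article-3': 'basic-multi-page-3.html',
        },
    }

    fetcher = urlfetch.MockUrlFetch(spec_dict['url_map'])
    html = fetcher.urlread('http://example.com/article-3')  # assumed interface

The new fixture below is the final page such a test would stitch onto the article.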
diff --git a/src/tests/regression_test_data/basic-multi-page-3.html b/src/tests/regression_test_data/basic-multi-page-3.html
new file mode 100644
index 0000000..7e4cad8
--- /dev/null
+++ b/src/tests/regression_test_data/basic-multi-page-3.html
@@ -0,0 +1,60 @@
+<html>
+  <head>
+    <title>A Simple Multi-Page Article For Testing : Page 3</title>
+  </head>
+  <body>
+    <h1>A Simple Multi-Page Article For Testing : Page 3</h1>
+    <p>
+      Nullam laoreet, nibh non faucibus dictum, tellus libero varius
+      erat, lobortis varius est massa quis metus. Donec vitae justo
+      lacus, nec convallis metus. Suspendisse potenti. Nunc et rutrum
+      justo. Maecenas ultrices ipsum in magna fermentum eleifend. Fusce
+      sagittis pretium aliquam. Vestibulum et gravida lorem. Sed turpis
+      quam, placerat ac ultrices eu, tempor sit amet elit. Curabitur eu
+      imperdiet velit. Quisque pharetra ornare nunc, a volutpat metus
+      aliquam quis. Vivamus semper aliquam cursus. Nullam ac nibh nulla,
+      luctus pharetra nunc. Etiam ut sapien sem. Fusce vehicula, sem sit
+      amet viverra pretium, magna tortor suscipit nisi, id interdum lorem
+      orci in tellus. Vivamus vel ipsum eros. Fusce porttitor convallis
+      ultricies. Etiam in risus diam, viverra suscipit felis. Duis vitae
+      imperdiet est.
+    </p>
+    <p>
+      Nunc nunc magna, facilisis blandit venenatis ut, scelerisque ac
+      tortor. Cras condimentum fermentum lectus ac convallis. Suspendisse
+      cursus, lacus sit amet sodales molestie, dui erat varius velit, non
+      tincidunt metus dui sed nulla. Aliquam lacus orci, convallis ut
+      pellentesque ac, molestie et dolor. Ut pretium enim ut nunc auctor
+      eget placerat magna luctus. Duis mollis ligula a orci ultrices in
+      facilisis felis feugiat. Morbi eget odio eget erat pulvinar
+      placerat sed nec erat. Duis dignissim, dolor a lacinia commodo,
+      metus erat laoreet dui, in lacinia felis lacus vitae nulla. Fusce
+      imperdiet condimentum volutpat. Vivamus ut lacus a eros cursus
+      scelerisque non sit amet orci. Phasellus id quam odio. Nulla
+      adipiscing venenatis lorem nec feugiat. Aenean sit amet nisl odio,
+      tincidunt scelerisque nisl. Curabitur ut nisl a dui facilisis
+      vulputate. Mauris eu elit et felis hendrerit blandit. Cras magna
+      dolor, imperdiet eget rutrum tempus, euismod nec augue.
+    </p>
+    <p>
+      Ut in sem sit amet felis scelerisque elementum. Suspendisse vitae
+      neque magna, in laoreet felis. Aenean elit ligula, tempor in
+      vestibulum ac, porttitor nec lacus. Aenean urna mi, dictum feugiat
+      placerat eget, congue nec dolor. Etiam pellentesque dictum nulla id
+      vulputate. Etiam sit amet vehicula purus. Integer quis mi nisl,
+      gravida malesuada enim. Donec malesuada felis nisi. Etiam id magna
+      a libero pulvinar ullamcorper in nec neque. Duis pulvinar massa nec
+      magna scelerisque vitae vulputate ipsum luctus.
+    </p>
+  </body>
+</html>