From f55f16baa118f7280546ad49aecce26e3531442f Mon Sep 17 00:00:00 2001
From: Yuri Baburov
Date: Wed, 1 Jun 2011 12:16:32 +0700
Subject: [PATCH] Updated scoring algorithm to match readability.js v1.7.1

---
 readability/cleaners.py    |   2 +-
 readability/readability.py | 322 +++++++++++++++++++++----------------
 2 files changed, 187 insertions(+), 137 deletions(-)

diff --git a/readability/cleaners.py b/readability/cleaners.py
index a9f1d37..9b158c5 100644
--- a/readability/cleaners.py
+++ b/readability/cleaners.py
@@ -2,7 +2,7 @@
 import re
 from lxml.html.clean import Cleaner
 
-bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*']
+bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on*']
 single_quoted = "'[^']+'"
 double_quoted = '"[^"]+"'
 non_space = '[^ "\'>]+'
diff --git a/readability/readability.py b/readability/readability.py
index 0802c6b..50e191d 100644
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -1,8 +1,9 @@
 #!/usr/bin/env python
-from collections import defaultdict
 from cleaners import html_cleaner, clean_attributes
+from collections import defaultdict
 from htmls import build_doc, get_body, get_title, shorten_title
 from lxml.etree import tostring, tounicode
+from lxml.html import fragment_fromstring, document_fromstring
 import logging
 import re
 import sys
@@ -10,10 +11,10 @@ import sys
 logging.basicConfig(level=logging.INFO)
 
 REGEXES = {
-    'unlikelyCandidatesRe': re.compile('share|bookmark|adwrapper|ad_wrapper|combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor',re.I),
-    'okMaybeItsACandidateRe': re.compile('and|article|body|column|main',re.I),
-    'positiveRe': re.compile('caption|article|body|content|entry|hentry|page|pagination|post|text',re.I),
-    'negativeRe': re.compile('adwrapper|ad_wrapper|share|bookmark|nav|combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget',re.I),
+    'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter',re.I),
+    'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow',re.I),
+    'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story',re.I),
+    'negativeRe': re.compile('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget',re.I),
     'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)',re.I),
     #'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
     #'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
@@ -21,21 +22,21 @@ REGEXES = {
     #'normalizeRe': re.compile('\s{2,}/'),
     #'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
    #'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
+    #skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
 }
 
-def describe(node):
+def describe(node, depth=1):
     if not hasattr(node, 'tag'):
-        return "[text]"
-    return "%s#%s.%s" % (
-        node.tag, node.get('id', ''), node.get('class',''))
-
-
-def log_candidates(candidates, print_format=""):
-    for candidate, value in candidates.items():
-        logging.info( "%s\t%s\t%s\t%s" %(id(candidate), describe(candidate), value['content_score'], describe(value['elem'])))
-
-#def _text(node):
-#    return " ".join(node.findall(text=True))
+        return "[%s]" % type(node)
+    name = node.tag
+    if node.get('id', ''): name += '#'+node.get('id')
+    if node.get('class', ''):
+        name += '.' + node.get('class').replace(' ','.')
+    if name[:4] in ['div#', 'div.']:
+        name = name[3:]
+    if depth and node.getparent() is not None:
+        return name+' - '+describe(node.getparent(), depth-1)
+    return name
 
 def to_int(x):
     if not x: return None
     x = x.strip()
     if x.endswith('px'):
         return int(x[:-2])
     if x.endswith('em'):
         return int(x[:-2]) * 12
     return int(x)
 
+def clean(text):
+    text = re.sub('\s*\n\s*', '\n', text)
+    text = re.sub('[ \t]{2,}', ' ', text)
+    return text.strip()
+
+def text_length(i):
+    return len(clean(i.text_content() or ""))
+
 class Unparseable(ValueError):
     pass
 
@@ -92,12 +101,12 @@ class Document:
 
         for i in self.tags(self.html, 'script', 'style'):
             i.drop_tree()
-
+        for i in self.tags(self.html, 'body'):
+            i.set('id', 'readabilityBody')
         if ruthless:
             self.remove_unlikely_candidates()
         self.transform_misused_divs_into_paragraphs()
-        candidates = self.score_paragraphs(self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD))
-        #log_candidates(candidates)
+        candidates = self.score_paragraphs()
 
         best_candidate = self.select_best_candidate(candidates)
         if best_candidate:
@@ -130,11 +139,12 @@ class Document:
         # Things like preambles, content split by ads that we removed, etc.
         sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
-        output = self._parse("<div/>")
") - for sibling in best_candidate['elem'].getparent().getchildren(): + output = document_fromstring('
+        best_elem = best_candidate['elem']
+        for sibling in best_elem.getparent().getchildren():
             #if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text
             append = False
-            if sibling is best_candidate['elem']:
+            if sibling is best_elem:
                 append = True
             sibling_key = sibling #HashableElement(sibling)
             if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
                 append = True
@@ -152,65 +162,78 @@ class Document:
             if append:
                 output.append(sibling)
-        if output is not None: output.append(best_candidate['elem'])
+        #if output is not None:
+        #    output.append(best_elem)
         return output
 
     def select_best_candidate(self, candidates):
         sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
-        self.debug("Top 5 candidates:")
         for candidate in sorted_candidates[:5]:
             elem = candidate['elem']
-            self.debug("Candidate %s with score %s" % (describe(elem), candidate['content_score']))
+            self.debug("Top 5 : %6.3f %s" % (candidate['content_score'], describe(elem)))
 
         if len(sorted_candidates) == 0:
             return None
+
         best_candidate = sorted_candidates[0]
-        #self.debug("Best candidate %s with score %s" % (describe(best_candidate['elem']), best_candidate['content_score']))
         return best_candidate
 
-    def get_link_density(self, elem):
-        link_length = len("".join([i.text or "" for i in elem.findall(".//a")]))
-        text_length = len(elem.text_content())
-        return float(link_length) / max(text_length, 1)
-    def score_paragraphs(self, min_text_length):
+    def get_link_density(self, elem):
+        link_length = 0
+        for i in elem.findall(".//a"):
+            link_length += text_length(i)
+        #if len(elem.findall(".//div") or elem.findall(".//p")):
+        #    link_length = link_length
+        total_length = text_length(elem)
+        return float(link_length) / max(total_length, 1)
+
+    def score_paragraphs(self, ):
+        MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
         candidates = {}
-        self.debug(str([describe(node) for node in self.tags(self.html, "div")]))
-        elems = self.tags(self.html, "div", "p", "td", 'li', "a")
+        #self.debug(str([describe(node) for node in self.tags(self.html, "div")]))
 
-        for elem in elems:
+        ordered = []
+        for elem in self.tags(self.html, "p", "pre", "td"):
             parent_node = elem.getparent()
+            if parent_node is None:
+                continue
             grand_parent_node = parent_node.getparent()
 
-            elem_key = elem#HashableElement(elem)
-            parent_key = parent_node#HashableElement(parent_node)
-            grand_parent_key = grand_parent_node#HashableElement(grand_parent_node)
-            inner_text = elem.text_content()
+            inner_text = clean(elem.text_content() or "")
+            inner_text_len = len(inner_text)
 
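# A minimal sketch of the rewritten link-density arithmetic above, assuming
# only lxml (already a dependency of this module); link_density() is a
# hypothetical standalone mirror of Document.get_link_density() built on the
# new clean()/text_length() helpers:

from lxml.html import fragment_fromstring

def link_density(elem):
    link_length = sum(text_length(a) for a in elem.findall('.//a'))
    return float(link_length) / max(text_length(elem), 1)

el = fragment_fromstring('<div>Read the article <a href="#">here</a> '
                         'and <a href="#">here</a></div>')
print link_density(el)  # 8 link chars / 30 cleaned chars -> ~0.27, kept
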
             # If this paragraph is less than 25 characters, don't even count it.
-            if (not inner_text) or len(inner_text) < min_text_length:
+            if inner_text_len < MIN_LEN:
                 continue
 
-            if parent_key not in candidates:
-                candidates[parent_key] = self.score_node(parent_node)
-            if grand_parent_node is not None and grand_parent_key not in candidates:
-                candidates[grand_parent_key] = self.score_node(grand_parent_node)
+            if parent_node not in candidates:
+                candidates[parent_node] = self.score_node(parent_node)
+                ordered.append(parent_node)
+
+            if grand_parent_node is not None and grand_parent_node not in candidates:
+                candidates[grand_parent_node] = self.score_node(grand_parent_node)
+                ordered.append(grand_parent_node)
 
             content_score = 1
             content_score += len(inner_text.split(','))
-            content_score += min([(len(inner_text) / 100), 3])
-            if elem not in candidates:
-                candidates[elem_key] = self.score_node(elem)
-            candidates[elem_key]['content_score'] += content_score
-            candidates[parent_key]['content_score'] += content_score
+            content_score += min((inner_text_len / 100), 3)
+            #if elem not in candidates:
+            #    candidates[elem] = self.score_node(elem)
+
+            #WTF? candidates[elem]['content_score'] += content_score
+            candidates[parent_node]['content_score'] += content_score
             if grand_parent_node is not None:
-                candidates[grand_parent_key]['content_score'] += content_score / 2.0
+                candidates[grand_parent_node]['content_score'] += content_score / 2.0
 
         # Scale the final candidates score based on link density. Good content should have a
         # relatively small link density (5% or less) and be mostly unaffected by this operation.
-        for elem, candidate in candidates.items():
-            candidate['content_score'] *= (1 - self.get_link_density(elem))
-            #self.debug("candidate %s scored %s" % (describe(elem), candidate['content_score']))
+        for elem in ordered:
+            candidate = candidates[elem]
+            ld = self.get_link_density(elem)
+            score = candidate['content_score']
+            self.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (score, describe(elem), ld, score*(1-ld)))
+            candidate['content_score'] *= (1 - ld)
 
         return candidates
 
@@ -237,43 +260,68 @@ class Document:
         name = elem.tag.lower()
         if name == "div":
             content_score += 5
-        elif name == "blockquote":
+        elif name in ["pre", "td", "blockquote"]:
             content_score += 3
-        elif name == "form":
+        elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form"]:
             content_score -= 3
-        elif name == "th":
+        elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th"]:
             content_score -= 5
-        return { 'content_score': content_score, 'elem': elem }
+        return {
+            'content_score': content_score,
+            'elem': elem
+        }
 
     def debug(self, *a):
         #if self.options['debug']:
         logging.debug(*a)
 
     def remove_unlikely_candidates(self):
-
         for elem in self.html.iter():
-            s = "%s%s" % (elem.get('class', ''), elem.get('id', ''))
-            self.debug(s)
+            s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
+            #self.debug(s)
             if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag != 'body':
-                self.debug("Removing unlikely candidate - %s" % (s,))
+                self.debug("Removing unlikely candidate - %s" % describe(elem))
                 elem.drop_tree()
 
     def transform_misused_divs_into_paragraphs(self):
-        for elem in self.html.iter():
-            if not isinstance(elem.tag, basestring):
-                raise Exception("You have to strip html comments!")
-            if elem.tag.lower() == "div":
-                # transform <div>s that do not contain other block elements into <p>s
-                if not REGEXES['divToPElementsRe'].search(unicode(''.join(map(tostring, list(elem))))):
-                    self.debug("Altering div(#%s.%s) to p" % (elem.get('id', ''), elem.get('class', '')))
-                    elem.tag = "p"
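# How the retuned regexes at the top of this patch combine inside
# remove_unlikely_candidates() above: class and id are joined into one
# string, and a node is dropped only when unlikelyCandidatesRe fires while
# okMaybeItsACandidateRe does not (and the tag is not body). A sketch with
# hypothetical markup:
#
#     <div class="sidebar">        dropped: 'sidebar' matches, no rescue word
#     <div class="main-sidebar">   kept: 'main' matches okMaybeItsACandidateRe
#     <body class="comment">       kept: the body element is never removed
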
+        for elem in self.tags(self.html, 'div'):
+            # transform <div>s that do not contain other block elements into <p>s
+            if not REGEXES['divToPElementsRe'].search(unicode(''.join(map(tostring, list(elem))))):
+                #self.debug("Altering %s to p" % (describe(elem)))
+                elem.tag = "p"
+                #print "Fixed element "+describe(elem)
+
+        for elem in self.tags(self.html, 'div'):
+            if elem.text and elem.text.strip():
+                p = fragment_fromstring('<p/>')
+                p.text = elem.text
+                elem.text = None
+                elem.insert(0, p)
+                #print "Appended "+tounicode(p)+" to "+describe(elem)
+
+            for pos, child in reversed(list(enumerate(elem))):
+                if child.tail and child.tail.strip():
+                    p = fragment_fromstring('<p/>')
+                    p.text = child.tail
+                    child.tail = None
+                    elem.insert(pos + 1, p)
+                    #print "Inserted "+tounicode(p)+" to "+describe(elem)
+                if child.tag == 'br':
+                    #print 'Dropped <br> at '+describe(elem)
+                    child.drop_tree()
 
     def tags(self, node, *tag_names):
         for tag_name in tag_names:
-            for e in node.findall('.//%s' %tag_name):
+            for e in node.findall('.//%s' % tag_name):
+                yield e
+
+    def reverse_tags(self, node, *tag_names):
+        for tag_name in tag_names:
+            for e in reversed(node.findall('.//%s' % tag_name)):
                 yield e
 
     def sanitize(self, node, candidates):
+        MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
         for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
             if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
                 header.drop_tree()
@@ -282,47 +330,48 @@ class Document:
             elem.drop_tree()
         allowed = {}
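# Worked example for transform_misused_divs_into_paragraphs() above, given a
# hypothetical fragment:
#
#     <div>intro<div>inner</div>tail</div>
#
# The first pass retags the inner <div> (no block-level children) as <p>;
# the second pass wraps the outer <div>'s leading text and the dangling tail
# in fresh <p> elements, yielding roughly:
#
#     <div><p>intro</p><p>inner</p><p>tail</p></div>
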
         # Conditionally clean <table>s, <ul>s, and <div>s
-        for el in self.tags(node, "table", "ul", "div"):
+        for el in self.reverse_tags(node, "table", "ul", "div"):
             if el in allowed:
                 continue
             weight = self.class_weight(el)
-            el_key = el #HashableElement(el)
-            if el_key in candidates:
-                content_score = candidates[el_key]['content_score']
+            if el in candidates:
+                content_score = candidates[el]['content_score']
+                #print '!',el, '-> %6.3f' % content_score
             else:
                 content_score = 0
             tag = el.tag
 
             if weight + content_score < 0:
+                self.debug("Cleaned %s with score %6.3f and weight %-3s" %
+                    (describe(el), content_score, weight, ))
                 el.drop_tree()
-                self.debug("Conditionally cleaned %s with weight %s and content score %s because score + content score was less than zero." %
-                    (describe(el), weight, content_score))
-            elif len(el.text_content().split(",")) < 10:
+            elif el.text_content().count(",") < 10:
                 counts = {}
                 for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
                     counts[kind] = len(el.findall('.//%s' %kind))
                 counts["li"] -= 100
 
-                content_length = len(el.text_content()) # Count the text length excluding any surrounding whitespace
+                content_length = text_length(el) # Count the text length excluding any surrounding whitespace
                 link_density = self.get_link_density(el)
                 parent_node = el.getparent()
-                if parent_node:
+                if parent_node is not None:
                     if parent_node in candidates:
                         content_score = candidates[parent_node]['content_score']
                     else:
                         content_score = 0
-                    pweight = self.class_weight(parent_node) + content_score
-                    pname = parent_node.tag
-                else:
-                    pweight = 0
-                    pname = "no parent"
+                #if parent_node is not None:
+                    #pweight = self.class_weight(parent_node) + content_score
+                    #pname = describe(parent_node)
+                #else:
+                    #pweight = 0
+                    #pname = "no parent"
                 to_remove = False
                 reason = ""
 
                 #if el.tag == 'div' and counts["img"] >= 1:
                 #    continue
                 if counts["p"] and counts["img"] > counts["p"]:
-                    reason = "too many images"
+                    reason = "too many images (%s)" % counts["img"]
                     to_remove = True
                 elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
                     reason = "more <li>s than <p>s"
                     to_remove = True
                 elif counts["input"] > (counts["p"] / 3):
                     reason = "less than 3x <p>s than <input>s"
                     to_remove = True
-                elif content_length < (self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)) and (counts["img"] == 0 or counts["img"] > 2):
-                    reason = "too short a content length without a single image"
+                elif content_length < (MIN_LEN) and (counts["img"] == 0 or counts["img"] > 2):
+                    reason = "too short content length %s without a single image" % content_length
                     to_remove = True
                 elif weight < 25 and link_density > 0.2:
-                    reason = "too many links for its weight less than 25 (#{weight})"
+                    reason = "too many links %.3f for its weight %s" % (link_density, weight)
                     to_remove = True
                 elif weight >= 25 and link_density > 0.5:
-                    reason = "too many links for its weight (#{weight})"
+                    reason = "too many links %.3f for its weight %s" % (link_density, weight)
                     to_remove = True
                 elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
-                    reason = "<embed>s with too short a content length, or too many <embed>s"
+                    reason = "<embed>s with too short content length, or too many <embed>s"
                     to_remove = True
 
-                if el.tag == 'div' and counts['img'] >= 1 and to_remove:
-                    imgs = el.findall('.//img')
-                    valid_img = False
-                    self.debug(tounicode(el))
-                    for img in imgs:
-
-                        height = img.get('height')
-                        width = img.get('width')
-                        self.debug ("height %s width %s" %(repr(height), repr(width)))
-                        if to_int(height) >= 100 or to_int(width) >= 100:
-                            valid_img = True
-                            self.debug("valid image" + tounicode(img))
-                            break
-                    if valid_img:
-                        to_remove = False
-                        self.debug("Allowing %s" %el.text_content())
-                        for desnode in self.tags(el, "table", "ul", "div"):
-                            allowed[desnode] = True
+#                if el.tag == 'div' and counts['img'] >= 1 and to_remove:
+#                    imgs = el.findall('.//img')
+#                    valid_img = False
+#                    self.debug(tounicode(el))
+#                    for img in imgs:
+#
+#                        height = img.get('height')
+#                        text_length = img.get('text_length')
+#                        self.debug ("height %s text_length %s" %(repr(height), repr(text_length)))
+#                        if to_int(height) >= 100 or to_int(text_length) >= 100:
+#                            valid_img = True
+#                            self.debug("valid image" + tounicode(img))
+#                            break
+#                    if valid_img:
+#                        to_remove = False
+#                        self.debug("Allowing %s" %el.text_content())
+#                        for desnode in self.tags(el, "table", "ul", "div"):
+#                            allowed[desnode] = True
+
+                #find x non empty preceding and succeeding siblings
+                i, j = 0, 0
+                x = 1
+                siblings = []
+                for sib in el.itersiblings():
+                    #self.debug(sib.text_content())
+                    sib_content_length = text_length(sib)
+                    if sib_content_length:
+                        i += 1
+                        siblings.append(sib_content_length)
+                        if i == x:
+                            break
+                for sib in el.itersiblings(preceding=True):
+                    #self.debug(sib.text_content())
+                    sib_content_length = text_length(sib)
+                    if sib_content_length:
+                        j += 1
+                        siblings.append(sib_content_length)
+                        if j == x:
+                            break
+                #self.debug(str(siblings))
+                if siblings and sum(siblings) > 1000 :
+                    to_remove = False
+                    self.debug("Allowing %s" % describe(el))
+                    for desnode in self.tags(el, "table", "ul", "div"):
+                        allowed[desnode] = True
 
-                #find x non empty preceeding and succeeding siblings
-                """
-                i, j = 0, 0
-                x = 1
-                siblings = []
-                for sib in el.itersiblings():
-                    self.debug(sib.text_content())
-                    sib_content_length = len(sib.text_content())
-                    if sib_content_length:
-                        i =+ 1
-                        siblings.append(sib_content_length)
-                        if i == x:
-                            break
-                for sib in el.itersiblings(preceding=True):
-                    self.debug(sib.text_content())
-                    sib_content_length = len(sib.text_content())
-                    if sib_content_length:
-                        j =+ 1
-                        siblings.append(sib_content_length)
-                        if j == x:
-                            break
-                self.debug(str(siblings))
-                if siblings and sum(siblings) > 1000 :
-                    to_remove = False
-                    self.debug("Allowing %s" %el.text_content())
-                    for desnode in self.tags(el, "table", "ul", "div"):
-                        allowed[desnode] = True
-                """
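# The sibling check above looks at one non-empty preceding and one non-empty
# following sibling; if together they carry more than 1000 characters of
# cleaned text, the borderline block is assumed to sit mid-article, is kept,
# and its <table>/<ul>/<div> descendants are whitelisted through `allowed`.
# E.g. a short promo <div> sandwiched between two ~600-character paragraphs
# survives sanitize().
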
+                if to_remove:
-                    self.debug("Conditionally cleaned %s#%s.%s with weight %s and content score %s because it has %s." %
-                        (el.tag, el.get('id',''), el.get('class', ''), weight, content_score, reason))
-                    self.debug("pname %s pweight %s" %(pname, pweight))
+                    self.debug("Cleaned %6.3f %s with weight %s cause it has %s." %
+                        (content_score, describe(el), weight, reason))
+                    #print tounicode(el)
+                    #self.debug("pname %s pweight %.3f" %(pname, pweight))
                     el.drop_tree()
 
         for el in ([node] + [n for n in node.iter()]):
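# A worked example of the new paragraph scoring: a 250-character <p>
# containing two commas adds
#     1                      base score
#   + 3                      len(inner_text.split(',')) -> three chunks
#   + min(250 / 100, 3)      -> 2 under Python 2 integer division
# = 6 points to its parent and 6 / 2.0 = 3.0 to its grandparent; each
# candidate's total is then scaled by (1 - link density), so link-heavy
# containers fall behind text-heavy ones.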