diff --git a/src/readability_lxml/readability.py b/src/readability_lxml/readability.py
index 5a19374..61a8c86 100755
--- a/src/readability_lxml/readability.py
+++ b/src/readability_lxml/readability.py
@@ -22,18 +22,28 @@ log = logging.getLogger()


 REGEXES = {
-    'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
-    'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow', re.I),
-    'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story', re.I),
-    'negativeRe': re.compile('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget', re.I),
-    'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
+    'unlikelyCandidatesRe': re.compile(
+        ('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|'
+         'shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|'
+         'tweet|twitter'), re.I),
+    'okMaybeItsACandidateRe': re.compile(
+        'and|article|body|column|main|shadow', re.I),
+    'positiveRe': re.compile(
+        ('article|body|content|entry|hentry|main|page|pagination|post|text|'
+         'blog|story'), re.I),
+    'negativeRe': re.compile(
+        ('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|'
+         'outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|'
+         'tool|widget'), re.I),
+    'divToPElementsRe': re.compile(
+        '<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
     #'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
     #'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
     #'trimRe': re.compile('^\s+|\s+$/'),
     #'normalizeRe': re.compile('\s{2,}/'),
     #'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
     #'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
-    #skipFootnoteLink:      /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
+    #skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
 }


@@ -132,8 +142,8 @@ class Document:
     def summary(self, enclose_with_html_tag=False):
         """Generate the summary of the html docuemnt

-        :param enclose_with_html_tag: return only the div of the document, don't wrap
-            in html and body tags.
+        :param enclose_with_html_tag: return only the div of the document,
+            don't wrap in html and body tags.

         """
         try:
@@ -187,7 +197,8 @@ class Document:
             log.exception('error getting summary: ')
             raise Unparseable(str(e)), None, sys.exc_info()[2]

-    def get_article(self, candidates, best_candidate, enclose_with_html_tag=False):
+    def get_article(self, candidates, best_candidate,
+                    enclose_with_html_tag=False):
         # Now that we have the top candidate, look through its siblings for
         # content that might also be related.
         # Things like preambles, content split by ads that we removed, etc.
@@ -235,7 +246,9 @@ class Document:
         return output

     def select_best_candidate(self, candidates):
-        sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
+        sorted_candidates = sorted(candidates.values(),
+                                   key=lambda x: x['content_score'],
+                                   reverse=True)
         for candidate in sorted_candidates[:5]:
             elem = candidate['elem']
             self.debug("Top 5 : %6.3f %s" % (
@@ -466,7 +479,8 @@ class Document:
                     reason = "less than 3x <p>s than <input>s"
                     to_remove = True
                 elif content_length < (MIN_LEN) and (counts["img"] == 0 or counts["img"] > 2):
-                    reason = "too short content length %s without a single image" % content_length
+                    reason = ('too short content length %s without a single'
+                              ' image') % content_length
                     to_remove = True
                 elif weight < 25 and link_density > 0.2:
                     reason = "too many links %.3f for its weight %s" % (
@@ -477,36 +491,26 @@ class Document:
                         link_density, weight)
                     to_remove = True
                 elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
-                    reason = "<embed>s with too short content length, or too many <embed>s"
+                    reason = ('<embed>s with too short content length, or too'
+                              ' many <embed>s')
                     to_remove = True
-#                if el.tag == 'div' and counts['img'] >= 1 and to_remove:
-#                    imgs = el.findall('.//img')
-#                    valid_img = False
-#                    self.debug(tounicode(el))
-#                    for img in imgs:
-#
-#                        height = img.get('height')
-#                        text_length = img.get('text_length')
-#                        self.debug ("height %s text_length %s" %(repr(height), repr(text_length)))
-#                        if to_int(height) >= 100 or to_int(text_length) >= 100:
-#                            valid_img = True
-#                            self.debug("valid image" + tounicode(img))
-#                            break
-#                    if valid_img:
-#                        to_remove = False
-#                        self.debug("Allowing %s" %el.text_content())
-#                        for desnode in self.tags(el, "table", "ul", "div"):
-#                            allowed[desnode] = True
-
-                #find x non empty preceding and succeeding siblings
+
+                # I don't really understand what this is doing. Originally
+                # the i/j assignments used `=+`, which just sets the value
+                # to 1. I think that was supposed to be `+=`, which would
+                # increment. But then the counter is compared to x, which
+                # is hard-set to 1, so each sibling loop only ever runs a
+                # single iteration. Will have to investigate when we get
+                # to testing more pages.
                 i, j = 0, 0
                 x = 1
+
                 siblings = []
                 for sib in el.itersiblings():
                     #self.debug(sib.text_content())
                     sib_content_length = text_length(sib)
                     if sib_content_length:
-                        i =+ 1
+                        i += 1
                         siblings.append(sib_content_length)
                         if i == x:
                             break
@@ -514,7 +518,7 @@ class Document:
                     #self.debug(sib.text_content())
                     sib_content_length = text_length(sib)
                     if sib_content_length:
-                        j =+ 1
+                        j += 1
                         siblings.append(sib_content_length)
                         if j == x:
                             break
@@ -526,7 +530,8 @@ class Document:
                             allowed[desnode] = True

                 if to_remove:
-                    self.debug("Cleaned %6.3f %s with weight %s cause it has %s." %
+                    self.debug(
+                        "Cleaned %6.3f %s with weight %s cause it has %s." %
                         (content_score, describe(el), weight, reason))
                     #print tounicode(el)
                     #self.debug("pname %s pweight %.3f" %(pname, pweight))