From 9765d13e90879c5faa980385916a06545736df34 Mon Sep 17 00:00:00 2001 From: Richard Harding Date: Sat, 21 Apr 2012 13:28:39 -0400 Subject: [PATCH] Garden --- src/readability_lxml/readability.py | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/src/readability_lxml/readability.py b/src/readability_lxml/readability.py index 8f52402..b755927 100755 --- a/src/readability_lxml/readability.py +++ b/src/readability_lxml/readability.py @@ -4,7 +4,6 @@ import re import sys import urlparse -from collections import defaultdict from collections import namedtuple from lxml.etree import tostring from lxml.etree import tounicode @@ -232,9 +231,11 @@ def same_domain(lhs, rhs): else: return split_lhs.netloc == split_rhs.netloc + def strip_trailing_slash(s): return re.sub(r'/$', '', s) + def eval_possible_next_page_link( parsed_urls, url, @@ -336,6 +337,7 @@ def eval_possible_next_page_link( except ValueError as e: pass + def find_next_page_link(parsed_urls, url, elem): links = tags(elem, 'a') base_url = find_base_url(url) @@ -814,26 +816,6 @@ class Document: ' many s') to_remove = True - -# if el.tag == 'div' and counts['img'] >= 1 and to_remove: -# imgs = el.findall('.//img') -# valid_img = False -# self.debug(tounicode(el)) -# for img in imgs: -# -# height = img.get('height') -# text_length = img.get('text_length') -# self.debug ("height %s text_length %s" %(repr(height), repr(text_length))) -# if to_int(height) >= 100 or to_int(text_length) >= 100: -# valid_img = True -# self.debug("valid image" + tounicode(img)) -# break -# if valid_img: -# to_remove = False -# self.debug("Allowing %s" %el.text_content()) -# for desnode in tags(el, "table", "ul", "div"): -# allowed[desnode] = True - # don't really understand what this is doing. Originally # the i/j were =+ which sets the value to 1. I think that # was supposed to be += which would increment. But then