0.3.0.dev
Richard Harding 12 years ago
parent 32d1764e83
commit 9765d13e90

@ -4,7 +4,6 @@ import re
import sys
import urlparse
from collections import defaultdict
from collections import namedtuple
from lxml.etree import tostring
from lxml.etree import tounicode
@ -232,9 +231,11 @@ def same_domain(lhs, rhs):
else:
return split_lhs.netloc == split_rhs.netloc
def strip_trailing_slash(s):
    """Return ``s`` with a single trailing ``/`` removed.

    Only one slash is stripped, so a string ending in ``//`` keeps one of
    them.  A string with no trailing slash (including the empty string) is
    returned unchanged.

    Note: the pattern anchors with ``$``, which also matches immediately
    before a final newline, so a slash directly followed by one trailing
    newline is removed as well (the newline itself is kept).

    :param s: the string (typically a URL) to normalize.
    :returns: ``s`` without its trailing slash.
    """
    return re.sub(r'/$', '', s)
def eval_possible_next_page_link(
parsed_urls,
url,
@ -336,6 +337,7 @@ def eval_possible_next_page_link(
except ValueError as e:
pass
def find_next_page_link(parsed_urls, url, elem):
links = tags(elem, 'a')
base_url = find_base_url(url)
@ -814,26 +816,6 @@ class Document:
' many <embed>s')
to_remove = True
# if el.tag == 'div' and counts['img'] >= 1 and to_remove:
# imgs = el.findall('.//img')
# valid_img = False
# self.debug(tounicode(el))
# for img in imgs:
#
# height = img.get('height')
# text_length = img.get('text_length')
# self.debug ("height %s text_length %s" %(repr(height), repr(text_length)))
# if to_int(height) >= 100 or to_int(text_length) >= 100:
# valid_img = True
# self.debug("valid image" + tounicode(img))
# break
# if valid_img:
# to_remove = False
# self.debug("Allowing %s" %el.text_content())
# for desnode in tags(el, "table", "ul", "div"):
# allowed[desnode] = True
# don't really understand what this is doing. Originally
# the i/j were =+ which sets the value to 1. I think that
# was supposed to be += which would increment. But then

Loading…
Cancel
Save