|
|
|
@ -4,7 +4,6 @@ import re
|
|
|
|
|
import sys
|
|
|
|
|
import urlparse
|
|
|
|
|
|
|
|
|
|
from collections import defaultdict
|
|
|
|
|
from collections import namedtuple
|
|
|
|
|
from lxml.etree import tostring
|
|
|
|
|
from lxml.etree import tounicode
|
|
|
|
@ -232,9 +231,11 @@ def same_domain(lhs, rhs):
|
|
|
|
|
else:
|
|
|
|
|
return split_lhs.netloc == split_rhs.netloc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def strip_trailing_slash(s):
    """Return *s* with a single trailing '/' removed, if one is present.

    Only one slash is dropped: 'a//' becomes 'a/'.  Strings that do not
    end in '/' are returned unchanged.

    Note: in the ``re`` module ``$`` also matches just before a newline
    that terminates the string, so 'a/\\n' becomes 'a\\n'.
    """
    # Anchor the slash to the end of the string so interior slashes
    # (scheme separators, path segments) are never touched.
    trailing_slash = re.compile(r'/$')
    return trailing_slash.sub('', s)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def eval_possible_next_page_link(
|
|
|
|
|
parsed_urls,
|
|
|
|
|
url,
|
|
|
|
@ -336,6 +337,7 @@ def eval_possible_next_page_link(
|
|
|
|
|
except ValueError as e:
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def find_next_page_link(parsed_urls, url, elem):
|
|
|
|
|
links = tags(elem, 'a')
|
|
|
|
|
base_url = find_base_url(url)
|
|
|
|
@ -814,26 +816,6 @@ class Document:
|
|
|
|
|
' many <embed>s')
|
|
|
|
|
to_remove = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# if el.tag == 'div' and counts['img'] >= 1 and to_remove:
|
|
|
|
|
# imgs = el.findall('.//img')
|
|
|
|
|
# valid_img = False
|
|
|
|
|
# self.debug(tounicode(el))
|
|
|
|
|
# for img in imgs:
|
|
|
|
|
#
|
|
|
|
|
# height = img.get('height')
|
|
|
|
|
# text_length = img.get('text_length')
|
|
|
|
|
# self.debug ("height %s text_length %s" %(repr(height), repr(text_length)))
|
|
|
|
|
# if to_int(height) >= 100 or to_int(text_length) >= 100:
|
|
|
|
|
# valid_img = True
|
|
|
|
|
# self.debug("valid image" + tounicode(img))
|
|
|
|
|
# break
|
|
|
|
|
# if valid_img:
|
|
|
|
|
# to_remove = False
|
|
|
|
|
# self.debug("Allowing %s" %el.text_content())
|
|
|
|
|
# for desnode in tags(el, "table", "ul", "div"):
|
|
|
|
|
# allowed[desnode] = True
|
|
|
|
|
|
|
|
|
|
# don't really understand what this is doing. Originally
|
|
|
|
|
# the i/j were =+ which sets the value to 1. I think that
|
|
|
|
|
# was supposed to be += which would increment. But then
|
|
|
|
|