|
|
|
@ -94,7 +94,7 @@ def text_length(i):
|
|
|
|
|
return len(clean(i.text_content() or ""))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def clean_segment_extension(num_segments, index, segment):
|
|
|
|
|
def clean_segment_extension(segments, index, segment):
|
|
|
|
|
if segment.find('.') == -1:
|
|
|
|
|
return segment
|
|
|
|
|
else:
|
|
|
|
@ -107,7 +107,7 @@ def clean_segment_extension(num_segments, index, segment):
|
|
|
|
|
return split_segment[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def clean_segment_ewcms(num_segments, index, segment):
|
|
|
|
|
def clean_segment_ewcms(segments, index, segment):
|
|
|
|
|
"""
|
|
|
|
|
EW-CMS specific segment cleaning. Quoth the original source:
|
|
|
|
|
"EW-CMS specific segment replacement. Ugly.
|
|
|
|
@ -116,10 +116,10 @@ def clean_segment_ewcms(num_segments, index, segment):
|
|
|
|
|
return segment.replace(',00', '')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def clean_segment_page_number(num_segments, index, segment):
|
|
|
|
|
def clean_segment_page_number(segments, index, segment):
|
|
|
|
|
# If our first or second segment has anything looking like a page number,
|
|
|
|
|
# remove it.
|
|
|
|
|
if index >= (num_segments - 2):
|
|
|
|
|
if index >= (len(segments) - 2):
|
|
|
|
|
pattern = r'((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$'
|
|
|
|
|
cleaned = re.sub(pattern, '', segment, re.IGNORECASE)
|
|
|
|
|
if cleaned == '':
|
|
|
|
@ -130,23 +130,37 @@ def clean_segment_page_number(num_segments, index, segment):
|
|
|
|
|
return segment
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def clean_segment_number(segments, index, segment):
    """
    Drop a segment that is nothing but a one- or two-digit number.

    segments is the full list of URL path segments, index is the position
    of ``segment`` within that list, and segment is the (possibly already
    partially cleaned) text of that segment.

    Returns None when the segment should be removed, otherwise the segment
    unchanged.
    """
    # Only the last two positions of the list are candidates; the original
    # readability source calls these the "first or second segment". A bare
    # 1-2 digit number there is probably a page number.
    in_last_two = index >= (len(segments) - 2)
    if in_last_two and re.search(r'^\d{1,2}$', segment):
        return None
    return segment
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def clean_segment_index(segments, index, segment):
    """
    Drop a trailing "index" segment.

    segments is the full list of URL path segments, index is the position
    of ``segment`` within that list, and segment is the text being cleaned.

    Returns None when ``segment`` sits in the final position and equals
    "index" (compared case-insensitively); otherwise returns the segment
    unchanged.
    """
    is_last = index == (len(segments) - 1)
    if is_last and segment.lower() == 'index':
        return None
    return segment
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def clean_segment(num_segments, index, segment):
|
|
|
|
|
def clean_segment_short(segments, index, segment):
    """
    Drop a very short segment near the end of the path.

    segments is the full list of URL path segments, index is the position
    of ``segment`` within that list, and segment is the text being cleaned.

    Returns None when all of the following hold, otherwise the segment
    unchanged:

    * ``segment`` is in one of the last two positions of ``segments``;
    * ``segment`` is shorter than three characters;
    * the *last* entry of ``segments`` contains no ASCII letters.
    """
    # It is not clear what this is meant to accomplish. The original
    # readability source just says:
    #
    #   "If our first or second segment is smaller than 3 characters, and the
    #   first segment was purely alphas, remove it."
    #
    # However, the code actually requires that there be NO alphas, rather
    # than purely alphas. Note that the letter test is applied to the final
    # entry of ``segments``, not to ``segment`` itself.
    alphas = re.search(r'[a-z]', segments[-1], re.IGNORECASE)
    in_last_two = index >= (len(segments) - 2)
    if in_last_two and len(segment) < 3 and not alphas:
        return None
    return segment
|
|
|
|
|
|
|
|
|
|
def clean_segment(segments, index, segment):
|
|
|
|
|
"""
|
|
|
|
|
Cleans a single segment of a URL to find the base URL. The base URL is as
|
|
|
|
|
a reference when evaluating URLs that might be next-page links. Returns a
|
|
|
|
@ -158,13 +172,14 @@ def clean_segment(num_segments, index, segment):
|
|
|
|
|
clean_segment_ewcms,
|
|
|
|
|
clean_segment_page_number,
|
|
|
|
|
clean_segment_number,
|
|
|
|
|
clean_segment_index
|
|
|
|
|
clean_segment_index,
|
|
|
|
|
clean_segment_short
|
|
|
|
|
]
|
|
|
|
|
cleaned_segment = segment
|
|
|
|
|
for func in funcs:
|
|
|
|
|
if cleaned_segment is None:
|
|
|
|
|
break
|
|
|
|
|
cleaned_segment = func(num_segments, index, cleaned_segment)
|
|
|
|
|
cleaned_segment = func(segments, index, cleaned_segment)
|
|
|
|
|
return cleaned_segment
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -174,7 +189,7 @@ def filter_none(seq):
|
|
|
|
|
|
|
|
|
|
def clean_segments(segments):
    """
    Run clean_segment over every entry of ``segments``.

    Each call receives the whole segment list (not just its length), the
    entry's index, and the entry itself, so the individual cleaners can
    inspect neighbouring segments. The per-segment results are then passed
    through filter_none -- presumably to discard the entries the cleaners
    rejected by returning None; confirm against filter_none's definition.
    """
    cleaned = [
        clean_segment(segments, position, piece)
        for position, piece in enumerate(segments)
    ]
    return filter_none(cleaned)
|
|
|
|
|