diff --git a/src/readability_lxml/readability.py b/src/readability_lxml/readability.py
index ff79ef7..b4d1e10 100755
--- a/src/readability_lxml/readability.py
+++ b/src/readability_lxml/readability.py
@@ -94,7 +94,7 @@ def text_length(i):
     return len(clean(i.text_content() or ""))
 
 
-def clean_segment_extension(num_segments, index, segment):
+def clean_segment_extension(segments, index, segment):
     if segment.find('.') == -1:
         return segment
     else:
@@ -107,7 +107,7 @@ def clean_segment_extension(num_segments, index, segment):
             return split_segment[0]
 
 
-def clean_segment_ewcms(num_segments, index, segment):
+def clean_segment_ewcms(segments, index, segment):
     """
     EW-CMS specific segment cleaning. Quoth the original source:
         "EW-CMS specific segment replacement. Ugly.
@@ -116,10 +116,10 @@ def clean_segment_ewcms(num_segments, index, segment):
     return segment.replace(',00', '')
 
 
-def clean_segment_page_number(num_segments, index, segment):
+def clean_segment_page_number(segments, index, segment):
     # If our first or second segment has anything looking like a page number,
     # remove it.
-    if index >= (num_segments - 2):
+    if index >= (len(segments) - 2):
         pattern = r'((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$'
         cleaned = re.sub(pattern, '', segment, re.IGNORECASE)
         if cleaned == '':
@@ -130,23 +130,37 @@ def clean_segment_page_number(num_segments, index, segment):
         return segment
 
 
-def clean_segment_number(num_segments, index, segment):
+def clean_segment_number(segments, index, segment):
     # If this is purely a number, and it's the first or second segment, it's
     # probably a page number. Remove it.
-    if index >= (num_segments - 2) and re.search(r'^\d{1,2}$', segment):
+    if index >= (len(segments) - 2) and re.search(r'^\d{1,2}$', segment):
         return None
     else:
         return segment
 
-
-def clean_segment_index(num_segments, index, segment):
-    if index == (num_segments - 1) and segment.lower() == 'index':
+def clean_segment_index(segments, index, segment):
+    if index == (len(segments) - 1) and segment.lower() == 'index':
         return None
     else:
         return segment
 
 
-def clean_segment(num_segments, index, segment):
+def clean_segment_short(segments, index, segment):
+    # It is not clear to me what this is accomplishing. The original
+    # readability source just says:
+    #
+    #   "If our first or second segment is smaller than 3 characters, and the
+    #    first segment was purely alphas, remove it."
+    #
+    # However, the code actually checks to make sure that there are no alphas
+    # in the segment, rather than checking for purely alphas.
+    alphas = re.search(r'[a-z]', segments[-1], re.IGNORECASE)
+    if index >= (len(segments) - 2) and len(segment) < 3 and not alphas:
+        return None
+    else:
+        return segment
+
+def clean_segment(segments, index, segment):
     """
     Cleans a single segment of a URL to find the base URL. The base URL is as
     a reference when evaluating URLs that might be next-page links. Returns a
@@ -158,13 +172,14 @@ def clean_segment(num_segments, index, segment):
         clean_segment_ewcms,
         clean_segment_page_number,
         clean_segment_number,
-        clean_segment_index
+        clean_segment_index,
+        clean_segment_short
     ]
     cleaned_segment = segment
     for func in funcs:
         if cleaned_segment is None:
             break
-        cleaned_segment = func(num_segments, index, cleaned_segment)
+        cleaned_segment = func(segments, index, cleaned_segment)
     return cleaned_segment
 
 
@@ -174,7 +189,7 @@ def filter_none(seq):
 
 def clean_segments(segments):
     cleaned = [
-        clean_segment(len(segments), i, s)
+        clean_segment(segments, i, s)
         for i, s in enumerate(segments)
     ]
     return filter_none(cleaned)