Add scoring of next page link ancestry and href

This adds scoring of next-page link candidates' ancestry and href values,
as done in the readability algorithm.
0.3.0.dev
Jerry Charumilind 13 years ago committed by Richard Harding
parent 0951647c8e
commit 32d1764e83

@ -300,8 +300,41 @@ def eval_possible_next_page_link(
if REGEXES['prevLink'].search(link_data):
candidate.score -= 200
# TODO: Score ancestry.
# TODO: Score a bunch of other stuff.
parent = link.getparent()
positive_node_match = False
negative_node_match = False
while parent is not None:
parent_class = parent.get('class') or ''
parent_id = parent.get('id') or ''
parent_class_and_id = ' '.join([parent_class, parent_id])
if not positive_node_match:
if REGEXES['page'].search(parent_class_and_id):
positive_node_match = True
candidate.score += 25
if not negative_node_match:
if REGEXES['negativeRe'].search(parent_class_and_id):
if not REGEXES['positiveRe'].search(parent_class_and_id):
negative_node_match = True
candidate.score -= 25
parent = parent.getparent()
if REGEXES['page'].search(href):
candidate.score += 25
if REGEXES['extraneous'].search(href):
candidate.score -= 15
try:
link_text_as_int = int(link_text)
# Punish 1 since we're either already there, or it's probably before
# what we want anyways.
if link_text_as_int == 1:
candidate.score -= 10
else:
candidate.score += max(0, 10 - link_text_as_int)
except ValueError as e:
pass
def find_next_page_link(parsed_urls, url, elem):
links = tags(elem, 'a')

Loading…
Cancel
Save