Add scoring of next page link ancestry and href

This adds scoring of next-page link candidates based on their ancestry (the class and id values of ancestor elements) and their href values, as done in the Readability algorithm.
13 years ago · 32d1764e83
parent 0951647c8e
commit 32d1764e83
1 changed files with 35 additions and 2 deletions
--- a/src/readability_lxml/readability.py
+++ b/src/readability_lxml/readability.py
@@ -300,8 +300,41 @@ def eval_possible_next_page_link(
    if REGEXES['prevLink'].search(link_data):
        candidate.score -= 200

-    # TODO: Score ancestry.
-    # TODO: Score a bunch of other stuff.
+    parent = link.getparent()
+    positive_node_match = False
+    negative_node_match = False
+    while parent is not None:
+        parent_class = parent.get('class') or ''
+        parent_id = parent.get('id') or ''
+        parent_class_and_id = ' '.join([parent_class, parent_id])
+        if not positive_node_match:
+            if REGEXES['page'].search(parent_class_and_id):
+                positive_node_match = True
+                candidate.score += 25
+        if not negative_node_match:
+            if REGEXES['negativeRe'].search(parent_class_and_id):
+                if not REGEXES['positiveRe'].search(parent_class_and_id):
+                    negative_node_match = True
+                    candidate.score -= 25
+        parent = parent.getparent()
+
+    if REGEXES['page'].search(href):
+        candidate.score += 25
+
+    if REGEXES['extraneous'].search(href):
+        candidate.score -= 15
+
+    try:
+        link_text_as_int = int(link_text)
+
+        # Punish 1 since we're either already there, or it's probably before
+        # what we want anyways.
+        if link_text_as_int == 1:
+            candidate.score -= 10
+        else:
+            candidate.score += max(0, 10 - link_text_as_int)
+    except ValueError as e:
+        pass

 def find_next_page_link(parsed_urls, url, elem):
    links = tags(elem, 'a')