Parameterize numerous hard coded numbers.

9 years ago · e634273705
parent ea28266265
commit e634273705
1 changed files with 14 additions and 6 deletions
--- a/readability/readability.py
+++ b/readability/readability.py
@ -276,6 +276,9 @@ class Document:
        total_length = text_length(elem)
        return float(link_length) / max(total_length, 1)

+    CONTENT_SCORE_START = 1
+    CONTENT_SCORE_INNER_TEXT_MIN_BONUS = 3
+    CONTENT_SCORE_GRAND_PARENT_BONUS_FACTOR = 2.0
    def score_paragraphs(self, ):
        MIN_LEN = self.options.get(
            'min_text_length',
@ -305,16 +308,17 @@ class Document:
                    grand_parent_node)
                ordered.append(grand_parent_node)

-            content_score = 1
+            
+            content_score = self.CONTENT_SCORE_START
            content_score += len(inner_text.split(','))
-            content_score += min((inner_text_len / 100), 3)
+            content_score += min((inner_text_len / 100), self.CONTENT_SCORE_INNER_TEXT_MIN_BONUS)
            #if elem not in candidates:
            #    candidates[elem] = self.score_node(elem)

            #WTF? candidates[elem]['content_score'] += content_score
            candidates[parent_node]['content_score'] += content_score
            if grand_parent_node is not None:
-                candidates[grand_parent_node]['content_score'] += content_score / 2.0
+                candidates[grand_parent_node]['content_score'] += content_score / self.CONTENT_SCORE_GRAND_PARENT_BONUS_FACTOR

        # Scale the final candidates score based on link density. Good content
        # should have a relatively small link density (5% or less) and be
@ -335,15 +339,18 @@ class Document:

        return candidates

+    CLASS_WEIGHT_NEGATIVE_RE_PENALTY = 25
+    CLASS_WEIGHT_POSITVE_RE_BONUS = 25
+
    def class_weight(self, e):
        weight = 0
        for feature in [e.get('class', None), e.get('id', None)]:
            if feature:
                if REGEXES['negativeRe'].search(feature):
-                    weight -= 25
+                    weight -= self.CLASS_WEIGHT_NEGATIVE_RE_PENALTY

                if REGEXES['positiveRe'].search(feature):
-                    weight += 25
+                    weight += self.CLASS_WEIGHT_POSITVE_RE_BONUS

                if self.positive_keywords and self.positive_keywords.search(feature):
                    weight += 25
@ -444,6 +451,7 @@ class Document:
    HIGH_WEIGHT_LINK_DENSITY_THRESHOLD = 0.5
    MIN_EMBED_COMMENT_LENGTH = 75
    SIBLING_CONTENT_LENGTH_SUM = 1000
+    LI_COUNT_REDUCTION = 100

    def sanitize(self, node, candidates):
        MIN_LEN = self.options.get('min_text_length',
@ -475,7 +483,7 @@ class Document:
                counts = {}
                for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
                    counts[kind] = len(el.findall('.//%s' % kind))
-                counts["li"] -= 100
+                counts["li"] -=  self.LI_COUNT_REDUCTION
                counts["input"] -= len(el.findall('.//input[@type="hidden"]'))

                # Count the text length excluding any surrounding whitespace