Parameterize CONTENT_SCORE_DIV_BONUS, CONTENT_SCORE_PRE_TD_BONUS, CONTENT_SCORE_ADDRESS_OL_PENALTY, CONTENT_SCORE_HEADER_PENALTY

pull/60/head
David Larochelle 10 years ago
parent 1923f4d1a7
commit ea28266265

@ -359,17 +359,22 @@ class Document:
return weight
# Tunable weights read by score_node via self.<NAME> when adjusting an
# element's content score according to its tag name.
# Added to the score when the tag is "div".
CONTENT_SCORE_DIV_BONUS = 5
# Added to the score when the tag is "pre", "td" or "blockquote".
CONTENT_SCORE_PRE_TD_BONUS = 3
# Subtracted from the score when the tag is "address", "ol", "ul", "dl",
# "dd", "dt", "li" or "form".
CONTENT_SCORE_ADDRESS_OL_PENALTY = 3
# Subtracted from the score when the tag is "h1"-"h6" or "th".
CONTENT_SCORE_HEADER_PENALTY = 5
def score_node(self, elem):
content_score = self.class_weight(elem)
name = elem.tag.lower()
if name == "div":
content_score += 5
content_score += self.CONTENT_SCORE_DIV_BONUS
elif name in ["pre", "td", "blockquote"]:
content_score += 3
content_score += self.CONTENT_SCORE_PRE_TD_BONUS
elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form"]:
content_score -= 3
content_score -= self.CONTENT_SCORE_ADDRESS_OL_PENALTY
elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th"]:
content_score -= 5
content_score -= self.CONTENT_SCORE_HEADER_PENALTY
return {
'content_score': content_score,
'elem': elem

Loading…
Cancel
Save