s
- for el in self.reverse_tags(node, "table", "ul", "div", "aside", "header", "footer", "section"):
+ for el in self.reverse_tags(
+ node, "table", "ul", "div", "aside", "header", "footer", "section"
+ ):
if el in allowed:
continue
weight = self.class_weight(el)
if el in candidates:
- content_score = candidates[el]['content_score']
- #print '!',el, '-> %6.3f' % content_score
+ content_score = candidates[el]["content_score"]
+ # print '!',el, '-> %6.3f' % content_score
else:
content_score = 0
tag = el.tag
if weight + content_score < 0:
- log.debug("Removed %s with score %6.3f and weight %-3s" %
- (describe(el), content_score, weight, ))
+ log.debug(
+ "Removed %s with score %6.3f and weight %-3s"
+ % (describe(el), content_score, weight,)
+ )
el.drop_tree()
elif el.text_content().count(",") < 10:
counts = {}
- for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
- counts[kind] = len(el.findall('.//%s' % kind))
+ for kind in ["p", "img", "li", "a", "embed", "input"]:
+ counts[kind] = len(el.findall(".//%s" % kind))
counts["li"] -= 100
counts["input"] -= len(el.findall('.//input[@type="hidden"]'))
@@ -501,21 +548,21 @@ class Document:
parent_node = el.getparent()
if parent_node is not None:
if parent_node in candidates:
- content_score = candidates[parent_node]['content_score']
+ content_score = candidates[parent_node]["content_score"]
else:
content_score = 0
- #if parent_node is not None:
- #pweight = self.class_weight(parent_node) + content_score
- #pname = describe(parent_node)
- #else:
- #pweight = 0
- #pname = "no parent"
+ # if parent_node is not None:
+ # pweight = self.class_weight(parent_node) + content_score
+ # pname = describe(parent_node)
+ # else:
+ # pweight = 0
+ # pname = "no parent"
to_remove = False
reason = ""
- #if el.tag == 'div' and counts["img"] >= 1:
+ # if el.tag == 'div' and counts["img"] >= 1:
# continue
- if counts["p"] and counts["img"] > 1 + counts["p"]*1.3:
+ if counts["p"] and counts["img"] > 1 + counts["p"] * 1.3:
reason = "too many images (%s)" % counts["img"]
to_remove = True
elif counts["li"] > counts["p"] and tag not in ("ol", "ul"):
@@ -525,65 +572,79 @@ class Document:
reason = "less than 3x <p>s than <input>s"
to_remove = True
elif content_length < MIN_LEN and counts["img"] == 0:
- reason = "too short content length %s without a single image" % content_length
+ reason = (
+ "too short content length %s without a single image"
+ % content_length
+ )
to_remove = True
elif content_length < MIN_LEN and counts["img"] > 2:
- reason = "too short content length %s and too many images" % content_length
+ reason = (
+ "too short content length %s and too many images"
+ % content_length
+ )
to_remove = True
elif weight < 25 and link_density > 0.2:
- reason = "too many links %.3f for its weight %s" % (
- link_density, weight)
- to_remove = True
+ reason = "too many links %.3f for its weight %s" % (
+ link_density,
+ weight,
+ )
+ to_remove = True
elif weight >= 25 and link_density > 0.5:
reason = "too many links %.3f for its weight %s" % (
- link_density, weight)
+ link_density,
+ weight,
+ )
to_remove = True
- elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
- reason = "