s
for el in self.reverse_tags(
node, "table", "ul", "div", "aside", "header", "footer", "section"
):
if el in allowed:
continue
weight = self.class_weight(el)
if el in candidates:
content_score = candidates[el]["content_score"]
# print '!',el, '-> %6.3f' % content_score
else:
content_score = 0
tag = el.tag
if weight + content_score < 0:
log.debug(
"Removed %s with score %6.3f and weight %-3s"
% (describe(el), content_score, weight,)
)
el.drop_tree()
elif el.text_content().count(",") < 10:
counts = {}
for kind in ["p", "img", "li", "a", "embed", "input"]:
counts[kind] = len(el.findall(".//%s" % kind))
counts["li"] -= 100
counts["input"] -= len(el.findall('.//input[@type="hidden"]'))
# Count the text length excluding any surrounding whitespace
content_length = text_length(el)
link_density = self.get_link_density(el)
parent_node = el.getparent()
if parent_node is not None:
if parent_node in candidates:
content_score = candidates[parent_node]["content_score"]
else:
content_score = 0
# if parent_node is not None:
# pweight = self.class_weight(parent_node) + content_score
# pname = describe(parent_node)
# else:
# pweight = 0
# pname = "no parent"
to_remove = False
reason = ""
# if el.tag == 'div' and counts["img"] >= 1:
# continue
if counts["p"] and counts["img"] > 1 + counts["p"] * 1.3:
reason = "too many images (%s)" % counts["img"]
to_remove = True
elif counts["li"] > counts["p"] and tag not in ("ol", "ul"):
reason = "more
s than s"
to_remove = True
elif counts["input"] > (counts["p"] / 3):
reason = "less than 3x
s than s"
to_remove = True
elif content_length < MIN_LEN and counts["img"] == 0:
reason = (
"too short content length %s without a single image"
% content_length
)
to_remove = True
elif content_length < MIN_LEN and counts["img"] > 2:
reason = (
"too short content length %s and too many images"
% content_length
)
to_remove = True
elif weight < 25 and link_density > 0.2:
reason = "too many links %.3f for its weight %s" % (
link_density,
weight,
)
to_remove = True
elif weight >= 25 and link_density > 0.5:
reason = "too many links %.3f for its weight %s" % (
link_density,
weight,
)
to_remove = True
elif (counts["embed"] == 1 and content_length < 75) or counts[
"embed"
] > 1:
reason = (
"