#!/usr/bin/env python
import logging
import re
import sys
import urlparse
import urlfetch
from collections import namedtuple
from lxml.etree import tostring
from lxml.etree import tounicode
from lxml.html import document_fromstring
from lxml.html import fragment_fromstring
from cleaners import clean_attributes
from cleaners import html_cleaner
from htmls import build_doc
from htmls import get_body
from htmls import get_title
from htmls import shorten_title
# Configure the root logger at import time so INFO and above are emitted.
# NOTE(review): this is a module-level side effect that affects every importer.
logging.basicConfig(level=logging.INFO)
# Module-wide logger; getLogger() without a name returns the root logger.
log = logging.getLogger()
# NOTE(review): not referenced in the visible part of this file; presumably
# the CSS class given to per-page wrapper elements -- confirm against usage.
PAGE_CLASS = 'article-page'
# Pre-compiled, case-insensitive patterns shared by the scoring and cleaning
# helpers in this module.
REGEXES = {
    # Searched in an element's "class id" text; a hit marks the element as
    # an unlikely content candidate (see remove_unlikely_candidates).
    'unlikelyCandidatesRe': re.compile(
        ('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|'
         'shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|'
         'tweet|twitter'), re.I),
    # A hit here overrides a hit of 'unlikelyCandidatesRe'.
    'okMaybeItsACandidateRe': re.compile(
        'and|article|body|column|main|shadow', re.I),
    # class/id hints worth +25 in class_weight.
    'positiveRe': re.compile(
        ('article|body|content|entry|hentry|main|page|pagination|post|text|'
         'blog|story'), re.I),
    # class/id hints worth -25 in class_weight.
    'negativeRe': re.compile(
        ('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|'
         'outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|'
         'tool|widget'), re.I),
    # NOTE(review): not used in the visible part of this file; presumably
    # applied to link text/URLs when looking for pagination links.
    'extraneous': re.compile(
        (r'print|archive|comment|discuss|e[\-]?mail|share|reply|all|login'
         '|sign|single'), re.I),
    # Opening tags of block-level children; a <div> whose serialized
    # children match this is kept as a <div> instead of becoming a <p>.
    'divToPElementsRe': re.compile(
        '<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
    # Match: next, continue, >, >>, but not >|, as those usually mean last.
    # The '>' must be followed by something other than '|' or by the end of
    # the text; the previous form ('>[^\|]$') could not match a lone '>'.
    # The inner group is non-capturing so the group count is unchanged.
    'nextLink': re.compile(r'(next|weiter|continue|>(?:[^\|]|$))', re.I),
    'prevLink': re.compile(r'(prev|earl|old|new|<)', re.I),
    'page': re.compile(r'pag(e|ing|inat)', re.I),
    'firstLast': re.compile(r'(first|last)', re.I),
    # Patterns that were present but disabled here: replaceBrsRe,
    # replaceFontsRe, trimRe, normalizeRe, killBreaksRe, videoRe,
    # skipFootnoteLink.
}
class Unparseable(ValueError):
    """Error type for a document that could not be processed.

    Subclasses ValueError, so existing ``except ValueError`` handlers also
    catch it.  NOTE(review): no raise site is visible in this part of the
    file; presumably raised by the parsing code.
    """
# Summary is a namedtuple rather than a mutable object: it is more memory
# efficient and its fields never need to change once it has been built.
Summary = namedtuple('Summary', 'html confidence title short_title')
def describe(node, depth=1):
    """Return a short, selector-like label for *node* (used in log output).

    The label is ``tag#id.class1.class2``; a leading ``div`` is dropped when
    an id or class follows it.  While *depth* is non-zero and the node has a
    parent, the parent's label is appended after ``' - '`` with *depth*
    reduced by one.  Objects without a ``tag`` attribute are rendered as
    ``[<type>]``.
    """
    if not hasattr(node, 'tag'):
        return "[%s]" % type(node)

    label = node.tag
    node_id = node.get('id', '')
    if node_id:
        label = label + '#' + node_id
    css_classes = node.get('class', '')
    if css_classes:
        label = label + '.' + '.'.join(css_classes.split(' '))

    # "div#main" / "div.post" read better as "#main" / ".post".
    if label.startswith(('div#', 'div.')):
        label = label[3:]

    parent = node.getparent() if depth else None
    if parent is None:
        return label
    return label + ' - ' + describe(parent, depth - 1)
def to_int(x):
    """Convert a length string such as ``"12"``, ``"12px"`` or ``"2em"`` to int.

    Surrounding whitespace is ignored.  A ``px`` suffix is dropped; an ``em``
    suffix is dropped and the value multiplied by 12.  Fractional values
    (``"1.5em"``, ``"10.5px"``) are accepted and truncated towards zero
    after scaling.

    Returns None for empty, None or whitespace-only input.
    Raises ValueError when the remaining text is not a number.
    """
    if not x:
        return None
    x = x.strip()
    if not x:
        # Whitespace-only input is treated like the empty string.
        return None

    scale = 1
    if x.endswith('px'):
        x = x[:-2]
    elif x.endswith('em'):
        x = x[:-2]
        scale = 12

    try:
        # Exact path for plain integers (no float rounding involved).
        return int(x) * scale
    except ValueError:
        pass
    try:
        # Fallback for fractional lengths; float() raises ValueError itself
        # when the text is not numeric at all.
        return int(float(x) * scale)
    except OverflowError:
        # int(inf) raises OverflowError; keep the ValueError contract.
        raise ValueError("invalid length: %r" % x)
def clean(text):
    """Normalize whitespace in *text* and return the stripped result.

    Every newline, together with any whitespace around it, collapses to a
    single ``'\\n'``; every run of two or more spaces/tabs collapses to one
    space; leading and trailing whitespace is removed.
    """
    # Raw strings: '\s' in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    text = re.sub(r'\s*\n\s*', '\n', text)
    text = re.sub(r'[ \t]{2,}', ' ', text)
    return text.strip()
def text_length(i):
    """Return the length of element *i*'s text after clean() normalization.

    A falsy ``text_content()`` result is treated as the empty string.
    """
    content = i.text_content()
    if not content:
        content = ""
    return len(clean(content))
def tags(node, *tag_names):
    """Iterate over the descendants of *node* whose tag is in *tag_names*.

    Results are grouped by tag name in the order the names were given; within
    a group they come in the order ``node.findall('.//<name>')`` returns them.
    """
    return (
        match
        for wanted in tag_names
        for match in node.findall('.//%s' % wanted)
    )
def class_weight(e):
    """Score element *e* from the hints in its ``class`` and ``id`` attributes.

    Each non-empty attribute contributes -25 when it matches
    ``REGEXES['negativeRe']`` and +25 when it matches
    ``REGEXES['positiveRe']``; both may apply to the same attribute.
    """
    total = 0
    for attr in ('class', 'id'):
        value = e.get(attr, None)
        if not value:
            continue
        if REGEXES['negativeRe'].search(value):
            total -= 25
        if REGEXES['positiveRe'].search(value):
            total += 25
    return total
def score_node(elem):
    """Build the initial candidate record for *elem*.

    The starting score is class_weight(elem) adjusted by the element's tag:
    +5 for div, +3 for pre/td/blockquote, -3 for list-like and form tags,
    -5 for headings and th.  Returns a dict with keys ``'content_score'``
    and ``'elem'``.
    """
    tag_adjustments = (
        (("div",), 5),
        (("pre", "td", "blockquote"), 3),
        (("address", "ol", "ul", "dl", "dd", "dt", "li", "form"), -3),
        (("h1", "h2", "h3", "h4", "h5", "h6", "th"), -5),
    )
    score = class_weight(elem)
    tag = elem.tag.lower()
    for tag_group, delta in tag_adjustments:
        if tag in tag_group:
            score += delta
            break
    return {'content_score': score, 'elem': elem}
def transform_misused_divs_into_paragraphs(doc):
    """Rewrite <div>s that are really paragraphs so they can be scored as <p>s.

    Works in place on *doc* in two passes:

    1. A <div> whose serialized children contain none of the tags listed in
       ``REGEXES['divToPElementsRe']`` is retagged as <p>.
    2. In every remaining <div>, non-blank loose text (the div's own leading
       text and the tail text after each child) is moved into a new <p>
       inserted at the same position, and <br> children are dropped.
    """
    for elem in tags(doc, 'div'):
        # transform <div>s that do not contain other block elements into
        # <p>s
        # NOTE(review): `unicode` is the Python 2 builtin; this module
        # targets Python 2.
        if not REGEXES['divToPElementsRe'].search(
                unicode(''.join(map(tostring, list(elem))))):
            elem.tag = "p"

    for elem in tags(doc, 'div'):
        if elem.text and elem.text.strip():
            # Wrap the div's leading text in a paragraph of its own.
            p = fragment_fromstring('<p/>')
            p.text = elem.text
            elem.text = None
            elem.insert(0, p)

        # Walk the children back to front so that inserting at pos + 1 does
        # not shift the positions of children still to be visited.
        for pos, child in reversed(list(enumerate(elem))):
            if child.tail and child.tail.strip():
                # Wrap text that follows a child in a paragraph placed
                # right after that child.
                p = fragment_fromstring('<p/>')
                p.text = child.tail
                child.tail = None
                elem.insert(pos + 1, p)
            if child.tag == 'br':
                child.drop_tree()
def remove_unlikely_candidates(doc):
    """Drop elements whose class/id suggest they are not article content.

    An element is removed (via ``drop_tree()``) when its combined
    ``"class id"`` text matches ``REGEXES['unlikelyCandidatesRe']`` and does
    not match ``REGEXES['okMaybeItsACandidateRe']``.  The <body> element and
    any element without a parent are never removed.  Works in place on *doc*.
    """
    # Iterate over a snapshot: dropping elements while walking the live
    # doc.iter() iterator mutates the tree under it, which is not safe and
    # can cause elements to be skipped.
    for elem in list(doc.iter()):
        s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
        if (REGEXES['unlikelyCandidatesRe'].search(s) and
                (not REGEXES['okMaybeItsACandidateRe'].search(s)) and
                elem.tag != 'body' and
                elem.getparent() is not None):
            elem.drop_tree()
def get_link_density(elem):
    """Return the fraction of *elem*'s text that sits inside <a> descendants.

    Computed as the summed text_length of every ``.//a`` match divided by
    text_length(elem); the divisor is clamped to at least 1 so an element
    without text yields 0.0 instead of dividing by zero.
    """
    anchor_chars = sum(text_length(anchor) for anchor in elem.findall(".//a"))
    all_chars = text_length(elem)
    return float(anchor_chars) / max(all_chars, 1)
def score_paragraphs(doc, options):
    """Score the parents and grandparents of text blocks found in *doc*.

    Every <p>, <pre> and <td> whose cleaned text is at least
    ``options['min_text_length']`` characters long earns points:
    1, plus the number of comma-separated pieces in its text, plus
    ``min(length / 100, 3)``.  The parent receives the full amount and the
    grandparent (when there is one) half of it.  Candidate records are
    created on first use with score_node().  Finally each candidate's score
    is multiplied by ``1 - link density`` so link-heavy containers rank
    lower.

    Returns a dict mapping element -> candidate record
    (``{'content_score': ..., 'elem': ...}``).
    """
    scored = {}
    # Candidates in the order they were first seen.
    first_seen = []

    def ensure_candidate(node):
        # Create the record for *node* the first time it is encountered.
        if node not in scored:
            scored[node] = score_node(node)
            first_seen.append(node)

    for block in tags(doc, "p", "pre", "td"):
        parent = block.getparent()
        if parent is None:
            continue
        grand_parent = parent.getparent()

        text = clean(block.text_content() or "")
        length = len(text)
        # Blocks shorter than the configured minimum are not counted at all.
        if length < options['min_text_length']:
            continue

        ensure_candidate(parent)
        if grand_parent is not None:
            ensure_candidate(grand_parent)

        points = 1 + len(text.split(',')) + min((length / 100), 3)
        scored[parent]['content_score'] += points
        if grand_parent is not None:
            scored[grand_parent]['content_score'] += points / 2.0

    # Scale each candidate by its link density.  Good content should have a
    # relatively small link density (5% or less) and be mostly unaffected.
    for node in first_seen:
        scored[node]['content_score'] *= (1 - get_link_density(node))
    return scored
def select_best_candidate(candidates):
    """Return the candidate record with the highest ``'content_score'``.

    *candidates* maps elements to records as produced by score_paragraphs.
    The five best records are written to the debug log.  Returns None when
    *candidates* is empty.
    """
    ranked = list(candidates.values())
    if not ranked:
        return None
    ranked.sort(key=lambda record: record['content_score'], reverse=True)
    for record in ranked[:5]:
        log.debug("Top 5 : %6.3f %s" % (
            record['content_score'],
            describe(record['elem'])))
    return ranked[0]
def reverse_tags(node, *tag_names):
    """Like tags(), but yield each tag's matches in reverse order.

    Tag names are still processed in the order given; only the matches
    within each name are reversed.
    """
    for wanted in tag_names:
        matches = node.findall('.//%s' % wanted)
        for match in matches[::-1]:
            yield match
def sanitize(node, candidates, options):
for header in tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
if class_weight(header) < 0 or get_link_density(header) > 0.33:
header.drop_tree()
for elem in tags(node, "form", "iframe", "textarea"):
elem.drop_tree()
allowed = {}
# Conditionally clean <table>s, <ul>s, and <div>s
for el in reverse_tags(node, "table", "ul", "div"):
if el in allowed:
continue
weight = class_weight(el)
if el in candidates:
content_score = candidates[el]['content_score']
#print '!',el, '-> %6.3f' % content_score
else:
content_score = 0
tag = el.tag
if weight + content_score < 0:
# log.debug("Cleaned %s with score %6.3f and weight %-3s" %
# (describe(el), content_score, weight, ))
el.drop_tree()
elif el.text_content().count(",") < 10:
counts = {}
for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
counts[kind] = len(el.findall('.//%s' % kind))
counts["li"] -= 100
# Count the text length excluding any surrounding whitespace
content_length = text_length(el)
link_density = get_link_density(el)
parent_node = el.getparent()
if parent_node is not None:
if parent_node in candidates:
content_score = candidates[parent_node]['content_score']
else:
content_score = 0
#if parent_node is not None:
#pweight = class_weight(parent_node) + content_score
#pname = describe(parent_node)
#else:
#pweight = 0
#pname = "no parent"
to_remove = False
reason = ""
#if el.tag == 'div' and counts["img"] >= 1:
# continue
if counts["p"] and counts["img"] > counts["p"]:
reason = "too many images (%s)" % counts["img"]
to_remove = True
elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
reason = "more