Compare commits


7 Commits
master ... dev

Author SHA1 Message Date
Yuri Baburov e8f86bdcf9 Several updates from dev version. 9 years ago
Yuri Baburov 40e430c27d Makefile updates 9 years ago
Yuri Baburov 0a082ff020 Fix for Mac OS X 10.10 9 years ago
Yuri Baburov 8048160d66 WIP: update to support python2 and python3 9 years ago
Yuri Baburov 71294f094f Encoding improvements 10 years ago
Yuri Baburov 5855beb32a WIP; Backported features from stable branch 10 years ago
Yuri Baburov ae1f1adfff Switched to use python logging module. Added xpath option (undocumented yet). 10 years ago

@@ -1,9 +1,9 @@
 # Makefile to help automate tasks
 WD := $(shell pwd)
-PY := bin/python
-PIP := bin/pip
-PEP8 := bin/pep8
-NOSE := bin/nosetests
+PY := .env/bin/python
+PIP := .env/bin/pip
+PEP8 := .env/bin/pep8
+NOSE := .env/bin/nosetests
 # ###########
@@ -24,14 +24,14 @@ all: venv develop
 venv: bin/python
 bin/python:
-	virtualenv .
+	virtualenv .env
 .PHONY: clean_venv
 clean_venv:
-	rm -rf bin include lib local man
+	rm -rf .env
-develop: lib/python*/site-packages/bookie-api.egg-link
-lib/python*/site-packages/bookie-api.egg-link:
+develop: .env/lib/python*/site-packages/readability-lxml.egg-link
+.env/lib/python*/site-packages/readability-lxml.egg-link:
 	$(PY) setup.py develop

@@ -1,15 +1,20 @@
-# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
+# -*- encoding: utf-8 -*-
+# strip out a set of nuisance html attributes that can mess up rendering
+# in RSS feeds
 import re
 from lxml.html.clean import Cleaner

-bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on*']
+bad_attrs = ['width', 'height', 'style',
+             '[-a-z]*color', 'background[-a-z]*', 'on*']
 single_quoted = "'[^']+'"
 double_quoted = '"[^"]+"'
 non_space = '[^ "\'>]+'
 htmlstrip = re.compile("<"  # open
     "([^>]+) "  # prefix
     "(?:%s) *" % ('|'.join(bad_attrs),) +  # undesirable attributes
     '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) +  # value
     "([^>]*)"  # postfix
     ">"  # end
     , re.I)
@@ -20,13 +25,15 @@ def clean_attributes(html):
     return html

 def normalize_spaces(s):
-    if not s: return ''
+    if not s:
+        return ''
     """replace any sequence of whitespace
     characters with a single space"""
     return ' '.join(s.split())

 html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
                        style=True, links=True, meta=False, add_nofollow=False,
-                       page_structure=False, processing_instructions=True, embedded=False,
-                       frames=False, forms=False, annoying_tags=False, remove_tags=None,
+                       page_structure=False, processing_instructions=True,
+                       embedded=False, frames=False, forms=False,
+                       annoying_tags=False, remove_tags=None,
                        remove_unknown_tags=False, safe_attrs_only=False)
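
The attribute-stripping and whitespace helpers above are plain functions, so they are easy to try in isolation. Below is a minimal usage sketch, not part of the diff: it assumes the modules are importable as readability.cleaners (the relative imports added in readability.py point that way) and uses made-up sample markup.

    from readability.cleaners import clean_attributes, normalize_spaces

    # presentation attributes such as style/width/height are stripped out
    html = '<div style="color: red" width="300"><p>Hello   world</p></div>'
    print(clean_attributes(html))

    # any run of whitespace collapses to a single space
    print(normalize_spaces('Hello \t  world\n'))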

@@ -1,25 +1,62 @@
-def save_to_file(text, filename):
-    f = open(filename, 'wt')
-    f.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
-    f.write(text.encode('utf-8'))
-    f.close()
-
-uids = {}
-def describe(node, depth=2):
+import re
+
+uids = {}
+RE_COLLAPSE_WHITESPACES = re.compile('\s+', re.U)
+
+def open_in_browser(html):
+    """
+    Open the HTML document in a web browser, saving it to a temporary
+    file to open it. Note that this does not delete the file after
+    use. This is mainly meant for debugging.
+    """
+    import os
+    import webbrowser
+    import tempfile
+    handle, fn = tempfile.mkstemp(suffix='.html')
+    f = os.fdopen(handle, 'wb')
+    try:
+        f.write("<meta charset='UTF-8' />")
+        f.write(html.encode('utf-8'))
+    finally:
+        # we leak the file itself here, but we should at least close it
+        f.close()
+    url = 'file://' + fn.replace(os.path.sep, '/')
+    webbrowser.open(url)
+    return url
+
+def describe_node(node):
+    if node is None:
+        return ''
     if not hasattr(node, 'tag'):
         return "[%s]" % type(node)
     name = node.tag
-    if node.get('id', ''): name += '#'+node.get('id')
-    if node.get('class', ''):
-        name += '.' + node.get('class').replace(' ','.')
+    if node.get('id', ''):
+        name += '#' + node.get('id')
+    if node.get('class', ''):
+        name += '.' + node.get('class').replace(' ', '.')
     if name[:4] in ['div#', 'div.']:
         name = name[3:]
     if name in ['tr', 'td', 'div', 'p']:
-        if not node in uids:
-            uid = uids[node] = len(uids)+1
-        else:
-            uid = uids.get(node)
-        name += "%02d" % (uid)
-    if depth and node.getparent() is not None:
-        return name+' - '+describe(node.getparent(), depth-1)
+        uid = uids.get(node)
+        if uid is None:
+            uid = uids[node] = len(uids) + 1
+        name += "{%02d}" % uid
     return name
+
+def describe(node, depth=2):
+    #return repr(NodeRepr(node))
+    parent = ''
+    if depth and node.getparent() is not None:
+        parent = describe(node.getparent(), depth=depth - 1)
+    return parent + '/' + describe_node(node)
+
+def text_content(elem, length=40):
+    content = RE_COLLAPSE_WHITESPACES.sub(' ', elem.text_content().replace('\r', ''))
+    if len(content) < length:
+        return content
+    return content[:length] + '...'
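
The new debug module drops save_to_file in favour of a browser preview and path-style node descriptions. A small sketch of how these helpers behave, assuming they are importable as readability.debug and that an lxml tree is at hand (the sample markup is invented):

    import lxml.html
    from readability.debug import describe, text_content, open_in_browser

    root = lxml.html.fromstring(
        '<div id="main" class="post body"><p>Some article text goes here</p></div>')
    p = root.find('.//p')

    # describe() now builds a parent/child path, e.g. something like '/#main.post.body/p'
    print(describe(p))

    # text_content() collapses whitespace and truncates to ~40 characters with '...'
    print(text_content(p))

    # open_in_browser(html_string) writes a temporary .html file and opens it in the default
    # browser; handy for eyeballing Document.summary() output while debugging
    # open_in_browser(lxml.html.tostring(root))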

@@ -1,48 +1,60 @@
 import re
 import chardet
+import logging
+
+log = logging.getLogger(__name__)
+
+RE_CHARSET = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', re.I)
+RE_PRAGMA = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', re.I)
+RE_XML = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
+
+CHARSETS = {
+    'big5': 'big5hkscs',
+    'gb2312': 'gb18030',
+    'ascii': 'utf-8',
+    'maccyrillic': 'cp1251',
+    'win1251': 'cp1251',
+    'win-1251': 'cp1251',
+    'windows-1251': 'cp1251',
+}
+
+def fix_charset(encoding):
+    """Overrides encoding when charset declaration
+       or charset determination is a subset of a larger
+       charset. Created because of issues with Chinese websites"""
+    encoding = encoding.lower()
+    return CHARSETS.get(encoding, encoding)
+
 def get_encoding(page):
-    # Regex for XML and HTML Meta charset declaration
-    charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
-    pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
-    xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
-
-    declared_encodings = (charset_re.findall(page) +
-                          pragma_re.findall(page) +
-                          xml_re.findall(page))
-
-    # Try any declared encodings
-    if len(declared_encodings) > 0:
-        for declared_encoding in declared_encodings:
-            try:
-                page.decode(custom_decode(declared_encoding))
-                return custom_decode(declared_encoding)
-            except UnicodeDecodeError:
-                pass
+    declared_encodings = (RE_CHARSET.findall(page) +
+                          RE_PRAGMA.findall(page) +
+                          RE_XML.findall(page))
+
+    log.debug("Document has the following encodings: %s" % declared_encodings)
+
+    # Try declared encodings, if any
+    for declared_encoding in declared_encodings:
+        encoding = fix_charset(declared_encoding)
+        try:
+            page.decode(encoding)
+            log.info('Using encoding "%s"' % encoding)
+            return encoding
+        except UnicodeDecodeError:
+            log.info('Encoding "%s", specified in the document as "%s" '
+                     'didn\'t work' % (encoding, declared_encoding))

     # Fallback to chardet if declared encodings fail
     text = re.sub('</?[^>]*>\s*', ' ', page)
     enc = 'utf-8'
     if not text.strip() or len(text) < 10:
-        return enc  # can't guess
+        log.debug("Can't guess encoding because text is too short")
+        return enc
     res = chardet.detect(text)
-    enc = res['encoding']
+    enc = fix_charset(res['encoding'])
+    log.info('Trying encoding "%s" guessed '
+             'with confidence %.2f' % (enc, res['confidence']))
     #print '->', enc, "%.2f" % res['confidence']
-    enc = custom_decode(enc)
     return enc
-
-def custom_decode(encoding):
-    """Overrides encoding when charset declaration
-       or charset determination is a subset of a larger
-       charset. Created because of issues with Chinese websites"""
-    encoding = encoding.lower()
-    alternates = {
-        'big5': 'big5hkscs',
-        'gb2312': 'gb18030',
-        'ascii': 'utf-8',
-        'MacCyrillic': 'cp1251',
-    }
-    if encoding in alternates:
-        return alternates[encoding]
-    else:
-        return encoding
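
The encoding changes split the charset-override table (CHARSETS / fix_charset) out of detection and add logging. A minimal sketch of the intended call pattern, not part of the diff: it assumes the module is importable as readability.encoding, Python 2 string semantics as in the code above, and a placeholder file name page.html.

    from readability.encoding import get_encoding, fix_charset

    raw = open('page.html', 'rb').read()        # raw bytes, as fetched from disk or the network
    enc = get_encoding(raw) or 'utf-8'          # declared charsets are tried first, then a chardet guess
    text = raw.decode(enc, 'replace')

    # fix_charset() widens narrow declarations to their supersets, e.g.:
    print(fix_charset('GB2312'))                # -> 'gb18030'
    print(fix_charset('windows-1251'))          # -> 'cp1251'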

@@ -3,28 +3,36 @@ from encoding import get_encoding
 from lxml.html import tostring
 import logging
 import lxml.html
-import re, sys
+import re
+
+log = logging.getLogger(__name__)

 utf8_parser = lxml.html.HTMLParser(encoding='utf-8')

+def lxml_fromstring(doc):
+    return lxml.html.document_fromstring(doc, parser=utf8_parser)
+
 def build_doc(page):
     if isinstance(page, unicode):
         enc = None
-        page_unicode = page
+        unicode_page = page
     else:
         enc = get_encoding(page) or 'utf-8'
-        page_unicode = page.decode(enc, 'replace')
-    doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
+        unicode_page = page.decode(enc, 'replace')
+    doc = lxml_fromstring(unicode_page.encode('utf-8', 'replace').replace('\r', ''))
     return doc, enc

 def js_re(src, pattern, flags, repl):
     return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))

 def normalize_entities(cur_title):
     entities = {
-        u'\u2014':'-',
-        u'\u2013':'-',
+        u'\u2014': '-',
+        u'\u2013': '-',
         u'&mdash;': '-',
         u'&ndash;': '-',
         u'\u00A0': ' ',
@@ -38,9 +46,11 @@ def normalize_entities(cur_title):
     return cur_title

 def norm_title(title):
     return normalize_entities(normalize_spaces(title))

 def get_title(doc):
     title = doc.find('.//title')
     if title is None or len(title.text) == 0:
@@ -48,12 +58,19 @@ def get_title(doc):
     return norm_title(title.text)

 def add_match(collection, text, orig):
     text = norm_title(text)
     if len(text.split()) >= 2 and len(text) >= 15:
         if text.replace('"', '') in orig.replace('"', ''):
             collection.add(text)

+TITLE_CSS_HEURISTICS = ['#title', '#head', '#heading', '.pageTitle',
+                        '.news_title', '.title', '.head', '.heading',
+                        '.contentheading', '.small_header_red']
+
 def shorten_title(doc):
     title = doc.find('.//title')
     if title is None or title.text is None or len(title.text) == 0:
@@ -70,7 +87,7 @@ def shorten_title(doc):
         if e.text_content():
             add_match(candidates, e.text_content(), orig)

-    for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
+    for item in TITLE_CSS_HEURISTICS:
         for e in doc.cssselect(item):
             if e.text:
                 add_match(candidates, e.text, orig)
@@ -102,13 +119,16 @@ def shorten_title(doc):
     return title

 def get_body(doc):
-    [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
+    for elem in doc.xpath('.//script | .//link | .//style'):
+        elem.drop_tree()
     raw_html = unicode(tostring(doc.body or doc))
     cleaned = clean_attributes(raw_html)
     try:
         #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
         return cleaned
-    except Exception: #FIXME find the equivalent lxml error
-        #logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
+    except Exception:  # FIXME find the equivalent lxml error
+        log.error("cleaning broken html content: "
+                  "%s\n---------\n%s" % (raw_html, cleaned))
         return raw_html
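
htmls.py keeps the document-building and title helpers together; build_doc now routes through the shared lxml_fromstring helper and strips carriage returns. A short usage sketch, not part of the diff: it assumes the module is importable as readability.htmls and a placeholder saved page page.html.

    from readability.htmls import build_doc, get_title, shorten_title, get_body

    raw = open('page.html', 'rb').read()
    doc, enc = build_doc(raw)          # lxml tree plus the encoding used to decode the input
    print(get_title(doc))              # <title> text with entities and whitespace normalized
    print(shorten_title(doc))          # heuristic short title (h1/h2 plus the TITLE_CSS_HEURISTICS selectors)
    print(get_body(doc)[:200])         # body markup with script/link/style dropped and attributes cleaned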

@@ -3,23 +3,21 @@ import logging
 import re
 import sys

-from collections import defaultdict
 from lxml.etree import tostring
 from lxml.etree import tounicode
 from lxml.html import document_fromstring
 from lxml.html import fragment_fromstring
-from cleaners import clean_attributes
-from cleaners import html_cleaner
-from htmls import build_doc
-from htmls import get_body
-from htmls import get_title
-from htmls import shorten_title
+from .cleaners import clean_attributes
+from .cleaners import html_cleaner
+from .htmls import build_doc
+from .htmls import get_body
+from .htmls import get_title
+from .htmls import shorten_title
+from encoding import get_encoding
+from debug import describe, text_content, open_in_browser

-logging.basicConfig(level=logging.INFO)
-log = logging.getLogger()
+log = logging.getLogger(__file__)

 REGEXES = {
     'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
@@ -41,21 +39,6 @@ class Unparseable(ValueError):
     pass

-def describe(node, depth=1):
-    if not hasattr(node, 'tag'):
-        return "[%s]" % type(node)
-    name = node.tag
-    if node.get('id', ''):
-        name += '#' + node.get('id')
-    if node.get('class', ''):
-        name += '.' + node.get('class').replace(' ', '.')
-    if name[:4] in ['div#', 'div.']:
-        name = name[3:]
-    if depth and node.getparent() is not None:
-        return name + ' - ' + describe(node.getparent(), depth - 1)
-    return name

 def to_int(x):
     if not x:
         return None
@@ -68,43 +51,50 @@ def to_int(x):

 def clean(text):
+    text = re.sub('[ \t]+', ' ', text)
     text = re.sub('\s*\n\s*', '\n', text)
-    text = re.sub('[ \t]{2,}', ' ', text)
     return text.strip()

 def text_length(i):
     return len(clean(i.text_content() or ""))

 regexp_type = type(re.compile('hello, world'))

 def compile_pattern(elements):
     if not elements:
         return None
     if isinstance(elements, regexp_type):
         return elements
-    if isinstance(elements, basestring):
+    if isinstance(elements, _basestring):
         elements = elements.split(',')
     return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)

 class Document:
     """Class to build a etree document out of html."""
     TEXT_LENGTH_THRESHOLD = 25
     RETRY_LENGTH = 250

-    def __init__(self, input, positive_keywords=None, negative_keywords=None, **options):
+    def __init__(self, input, positive_keywords=None, negative_keywords=None,
+                 **options):
         """Generate the document

         :param input: string of the html content.

         kwargs:
             - attributes:
-            - debug: output debug messages
             - min_text_length:
             - retry_length:
             - url: will allow adjusting links to be absolute
-            - positive_keywords: the list of positive search patterns in classes and ids, for example: ["news-item", "block"]
-            - negative_keywords: the list of negative search patterns in classes and ids, for example: ["mysidebar", "related", "ads"]
+            - positive_keywords: the list of positive search patterns in
+              classes and ids, for example: ["news-item", "block"]
+            - negative_keywords: the list of negative
+              search patterns in classes
+              and ids, for example: ["mysidebar", "related", "ads"]
             Also positive_keywords and negative_keywords could be a regexp.
         """
         self.input = input
@@ -127,6 +117,11 @@ class Document:
             doc.make_links_absolute(base_href, resolve_base_href=True)
         else:
             doc.resolve_base_href()
+        if self.options.get('xpath'):
+            root = doc.getroottree()
+            for i in doc.getiterator():
+                #print root.getpath(i)
+                i.attrib['x'] = root.getpath(i)
         return doc

     def content(self):
@@ -139,7 +134,7 @@ class Document:
         return shorten_title(self._html(True))

     def get_clean_html(self):
         return clean_attributes(tounicode(self.html))

     def summary(self, html_partial=False):
         """Generate the summary of the html docuemnt
@@ -165,18 +160,18 @@ class Document:
                 if best_candidate:
                     article = self.get_article(candidates, best_candidate,
                                                html_partial=html_partial)
                 else:
                     if ruthless:
-                        log.debug("ruthless removal did not work. ")
+                        log.info("ruthless removal did not work. ")
                         ruthless = False
-                        self.debug(
+                        log.info(
                             ("ended up stripping too much - "
-                             "going for a safer _parse"))
+                             "going for a safer parse"))
                         # try again
                         continue
                     else:
-                        log.debug(
+                        log.info(
                             ("Ruthless and lenient parsing did not work. "
                              "Returning raw html"))
                         article = self.html.find('body')
@@ -194,7 +189,7 @@ class Document:
                     continue
                 else:
                     return cleaned_article
-        except StandardError, e:
+        except Exception as e:
             log.exception('error getting summary: ')
             raise Unparseable(str(e)), None, sys.exc_info()[2]
@@ -219,7 +214,8 @@ class Document:
             append = True
             sibling_key = sibling  # HashableElement(sibling)
             if sibling_key in candidates and \
-                candidates[sibling_key]['content_score'] >= sibling_score_threshold:
+                candidates[sibling_key]['content_score'] >= \
+                sibling_score_threshold:
                 append = True

             if sibling.tag == "p":
@@ -230,8 +226,8 @@ class Document:
                 if node_length > 80 and link_density < 0.25:
                     append = True
                 elif node_length <= 80 \
                     and link_density == 0 \
                     and re.search('\.( |$)', node_content):
                     append = True

             if append:
@@ -241,21 +237,26 @@ class Document:
                     output.append(sibling)
                 else:
                     output.getchildren()[0].getchildren()[0].append(sibling)
-        #if output is not None:
+        # if output is not None:
         #    output.append(best_elem)
         return output

     def select_best_candidate(self, candidates):
-        sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
+        if not candidates:
+            return None
+        sorted_candidates = sorted(
+            candidates.values(),
+            key=lambda x: x['content_score'],
+            reverse=True
+        )
         for candidate in sorted_candidates[:5]:
             elem = candidate['elem']
-            self.debug("Top 5 : %6.3f %s" % (
+            log.info("Top 5 : %6.3f %s" % (
                 candidate['content_score'],
                 describe(elem)))

-        if len(sorted_candidates) == 0:
-            return None
         best_candidate = sorted_candidates[0]
         return best_candidate
@@ -292,7 +293,8 @@ class Document:
                 candidates[parent_node] = self.score_node(parent_node)
                 ordered.append(parent_node)

-            if grand_parent_node is not None and grand_parent_node not in candidates:
+            if grand_parent_node is not None and \
+                    grand_parent_node not in candidates:
                 candidates[grand_parent_node] = self.score_node(
                     grand_parent_node)
                 ordered.append(grand_parent_node)
@@ -315,11 +317,8 @@ class Document:
             candidate = candidates[elem]
             ld = self.get_link_density(elem)
             score = candidate['content_score']
-            self.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (
-                score,
-                describe(elem),
-                ld,
-                score * (1 - ld)))
+            log.debug("Branch %6.3f %s link density %.3f -> %6.3f" % (
+                score, describe(elem), ld, score * (1 - ld)))
             candidate['content_score'] *= (1 - ld)

         return candidates
@@ -334,16 +333,20 @@ class Document:
                 if REGEXES['positiveRe'].search(feature):
                     weight += 25

-                if self.positive_keywords and self.positive_keywords.search(feature):
+                if self.positive_keywords and self.positive_keywords.search(
+                        feature):
                     weight += 25

-                if self.negative_keywords and self.negative_keywords.search(feature):
+                if self.negative_keywords and self.negative_keywords.search(
+                        feature):
                     weight -= 25

-        if self.positive_keywords and self.positive_keywords.match('tag-'+e.tag):
+        if self.positive_keywords and self.positive_keywords.match(
+                'tag-' + e.tag):
             weight += 25

-        if self.negative_keywords and self.negative_keywords.match('tag-'+e.tag):
+        if self.negative_keywords and self.negative_keywords.match(
+                'tag-' + e.tag):
             weight -= 25

         return weight
@@ -365,8 +368,7 @@ class Document:
         }

     def debug(self, *a):
-        if self.options.get('debug', False):
-            log.debug(*a)
+        log.warn("debug: " + a[0], *a[1:])

     def remove_unlikely_candidates(self):
         for elem in self.html.iter():
@@ -375,22 +377,22 @@ class Document:
                 continue
             #self.debug(s)
             if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag not in ['html', 'body']:
-                self.debug("Removing unlikely candidate - %s" % describe(elem))
+                log.debug("Removing unlikely candidate - %s" % describe(elem))
                 elem.drop_tree()

     def transform_misused_divs_into_paragraphs(self):
         for elem in self.tags(self.html, 'div'):
             # transform <div>s that do not contain other block elements into
             # <p>s
-            #FIXME: The current implementation ignores all descendants that
+            # FIXME: The current implementation ignores all descendants that
             # are not direct children of elem
             # This results in incorrect results in case there is an <img>
             # buried within an <a> for example
             if not REGEXES['divToPElementsRe'].search(
                     unicode(''.join(map(tostring, list(elem))))):
-                #self.debug("Altering %s to p" % (describe(elem)))
+                # self.debug("Altering %s to p" % describe(elem))
                 elem.tag = "p"
-                #print "Fixed element "+describe(elem)
+                # self.debug("Fixed element "+describe(elem))

         for elem in self.tags(self.html, 'div'):
             if elem.text and elem.text.strip():
@@ -398,7 +400,7 @@ class Document:
                 p.text = elem.text
                 elem.text = None
                 elem.insert(0, p)
-                #print "Appended "+tounicode(p)+" to "+describe(elem)
+                # print "Appended "+tounicode(p)+" to "+describe(elem)

             for pos, child in reversed(list(enumerate(elem))):
                 if child.tail and child.tail.strip():
@@ -406,9 +408,9 @@ class Document:
                     p.text = child.tail
                     child.tail = None
                     elem.insert(pos + 1, p)
-                    #print "Inserted "+tounicode(p)+" to "+describe(elem)
+                    # print "Inserted "+tounicode(p)+" to "+describe(elem)
                 if child.tag == 'br':
-                    #print 'Dropped <br> at '+describe(elem)
+                    # print 'Dropped <br> at '+describe(elem)
                     child.drop_tree()

     def tags(self, node, *tag_names):
@@ -422,14 +424,15 @@ class Document:
                 yield e

     def sanitize(self, node, candidates):
-        MIN_LEN = self.options.get('min_text_length',
-            self.TEXT_LENGTH_THRESHOLD)
+        MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
         for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
-            if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
+            if self.class_weight(header) < 0 or \
+                    self.get_link_density(header) > 0.33:
                 header.drop_tree()

         for elem in self.tags(node, "form", "iframe", "textarea"):
             elem.drop_tree()
         allowed = {}
         # Conditionally clean <table>s, <ul>s, and <div>s
         for el in self.reverse_tags(node, "table", "ul", "div"):
@@ -438,13 +441,13 @@ class Document:
             weight = self.class_weight(el)
             if el in candidates:
                 content_score = candidates[el]['content_score']
-                #print '!',el, '-> %6.3f' % content_score
+                # print '!',el, '-> %6.3f' % content_score
             else:
                 content_score = 0
             tag = el.tag

             if weight + content_score < 0:
-                self.debug("Cleaned %s with score %6.3f and weight %-3s" %
+                log.info("Removed %s with score %6.3f and weight %-3s" %
                     (describe(el), content_score, weight, ))
                 el.drop_tree()
             elif el.text_content().count(",") < 10:
@@ -452,6 +455,7 @@ class Document:
                 for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
                     counts[kind] = len(el.findall('.//%s' % kind))
                 counts["li"] -= 100
+                counts["input"] -= len(el.findall('.//input[@type="hidden"]'))

                 # Count the text length excluding any surrounding whitespace
                 content_length = text_length(el)
@@ -459,31 +463,36 @@ class Document:
                 parent_node = el.getparent()
                 if parent_node is not None:
                     if parent_node in candidates:
-                        content_score = candidates[parent_node]['content_score']
+                        content_score = candidates[
+                            parent_node]['content_score']
                     else:
                         content_score = 0
-                #if parent_node is not None:
-                    #pweight = self.class_weight(parent_node) + content_score
-                    #pname = describe(parent_node)
-                #else:
-                    #pweight = 0
-                    #pname = "no parent"
+                # if parent_node is not None:
+                    # pweight = self.class_weight(parent_node) + content_score
+                    # pname = describe(parent_node)
+                # else:
+                    # pweight = 0
+                    # pname = "no parent"
                 to_remove = False
                 reason = ""

-                #if el.tag == 'div' and counts["img"] >= 1:
+                # if el.tag == 'div' and counts["img"] >= 1:
                 #    continue
-                if counts["p"] and counts["img"] > counts["p"]:
-                    reason = "too many images (%s)" % counts["img"]
+                if content_length and counts["img"] * 100 >= content_length:
+                    reason = "too many images (%s) for text " % counts["img"]
                     to_remove = True
-                elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
+                elif counts["li"] > counts["p"] \
+                        and tag != "ul" and tag != "ol":
                     reason = "more <li>s than <p>s"
                     to_remove = True
                 elif counts["input"] > (counts["p"] / 3):
                     reason = "less than 3x <p>s than <input>s"
                     to_remove = True
-                elif content_length < (MIN_LEN) and (counts["img"] == 0 or counts["img"] > 2):
-                    reason = "too short content length %s without a single image" % content_length
+                elif content_length < MIN_LEN and not counts["img"]:
+                    reason = "too short content length %s and no images" % content_length
+                    to_remove = True
+                elif content_length < MIN_LEN and counts["img"] > 2:
+                    reason = "too short content length %s and too much images" % content_length
                     to_remove = True
                 elif weight < 25 and link_density > 0.2:
                     reason = "too many links %.3f for its weight %s" % (
@@ -496,6 +505,9 @@ class Document:
                 elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
                     reason = "<embed>s with too short content length, or too many <embed>s"
                     to_remove = True
+                elif not content_length:
+                    reason = "no content"
+                    to_remove = True
                 # if el.tag == 'div' and counts['img'] >= 1 and to_remove:
                 #     imgs = el.findall('.//img')
                 #     valid_img = False
@@ -523,35 +535,38 @@ class Document:
                         #self.debug(sib.text_content())
                         sib_content_length = text_length(sib)
                         if sib_content_length:
-                            i =+ 1
+                            i += 1
                             siblings.append(sib_content_length)
-                            if i == x:
+                            if i >= x:
                                 break
                     for sib in el.itersiblings(preceding=True):
                         #self.debug(sib.text_content())
                         sib_content_length = text_length(sib)
                         if sib_content_length:
-                            j =+ 1
+                            j += 1
                             siblings.append(sib_content_length)
-                            if j == x:
+                            if j >= x:
                                 break
                     #self.debug(str(siblings))
                     if siblings and sum(siblings) > 1000:
                         to_remove = False
-                        self.debug("Allowing %s" % describe(el))
+                        log.info("Allowing %s" % describe(el))
                         for desnode in self.tags(el, "table", "ul", "div"):
                             allowed[desnode] = True

                 if to_remove:
-                    self.debug("Cleaned %6.3f %s with weight %s cause it has %s." %
-                        (content_score, describe(el), weight, reason))
+                    log.info("Cleaned %s (score=%6.3f, weight=%s) cause it has %s: %s" %
+                        (describe(el), content_score, weight, reason, text_content(el)))
                     #print tounicode(el)
                     #self.debug("pname %s pweight %.3f" %(pname, pweight))
                     el.drop_tree()
+                else:
+                    log.info("Not cleaned %s of length %s: %s" %
+                        (describe(el), content_length, text_content(el)))

         for el in ([node] + [n for n in node.iter()]):
             if not self.options.get('attributes', None):
-                #el.attrib = {} #FIXME:Checkout the effects of disabling this
+                # el.attrib = {}  # FIXME:Checkout the effects of disabling this
                 pass

         self.html = node
@@ -584,12 +599,23 @@ class HashableElement():
     def __getattr__(self, tag):
         return getattr(self.node, tag)

+VERBOSITY = {
+    1: logging.WARNING,
+    2: logging.INFO,
+    3: logging.DEBUG
+}
+
 def main():
     from optparse import OptionParser
     parser = OptionParser(usage="%prog: [options] [file]")
-    parser.add_option('-v', '--verbose', action='store_true')
+    parser.add_option('-v', '--verbose', action='count', default=0)
+    parser.add_option('-b', '--browser', default=None, action='store_true', help="open in browser")
+    parser.add_option('-l', '--log', default=None, help="use filename for logs (appended)")
     parser.add_option('-u', '--url', default=None, help="use URL instead of a local file")
+    parser.add_option('-s', '--show-xpath', default=None, help="show xpath")
+    parser.add_option('-x', '--xpath', default=None, help="use xpath")
+    parser.add_option('-t', '--support-text', default=None, help="use this support text")
     parser.add_option('-p', '--positive-keywords', default=None, help="positive keywords (separated with comma)", action='store')
     parser.add_option('-n', '--negative-keywords', default=None, help="negative keywords (separated with comma)", action='store')
     (options, args) = parser.parse_args()
@@ -598,20 +624,33 @@ def main():
         parser.print_help()
         sys.exit(1)

+    if options.verbose:
+        logging.basicConfig(level=VERBOSITY[options.verbose], filename=options.log,
+            format='%(asctime)s: %(levelname)s: %(message)s (at %(filename)s: %(lineno)d)')
+
     file = None
     if options.url:
         import urllib
         file = urllib.urlopen(options.url)
     else:
         file = open(args[0], 'rt')
-    enc = sys.__stdout__.encoding or 'utf-8'  # XXX: this hack could not always work, better to set PYTHONIOENCODING
+    html = file.read()  # bytes object
+    encoding = get_encoding(html)
+    html = html.decode(encoding)
     try:
-        print Document(file.read(),
-            debug=options.verbose,
-            url=options.url,
-            positive_keywords = options.positive_keywords,
-            negative_keywords = options.negative_keywords,
-        ).summary().encode(enc, 'replace')
+        doc = Document(html, url=options.url,
+            positive_keywords=options.positive_keywords,
+            negative_keywords=options.negative_keywords)
+        if options.browser:
+            result = 'Title: ' + doc.short_title() + '<br/>' + doc.summary()
+            open_in_browser(result)
+        else:
+            # XXX: a hack, better to set PYTHONIOENCODING explicitly
+            output_encoding = sys.__stdout__.encoding or 'utf-8'
+            print 'Title:', doc.short_title().encode(output_encoding, 'replace')
+            print doc.summary().encode(output_encoding, 'replace')
     finally:
         file.close()
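
The rewritten main() above is a thin CLI over the Document class, so the same flow can be driven from code. A sketch of the high-level API as it stands in this diff; the import path and input file name are illustrative (Document may also be re-exported from the package root), and the logging call mirrors what -v/-vv now do since the module no longer calls basicConfig at import time.

    import logging
    from readability.readability import Document

    logging.basicConfig(level=logging.INFO)

    raw = open('article.html', 'rb').read()                    # placeholder input file
    doc = Document(raw,
                   url='http://example.com/article',           # lets links be made absolute
                   positive_keywords=['article', 'content'],   # comma string, list, or regexp
                   negative_keywords=['sidebar', 'comment'],
                   xpath=True)                                 # new option: tags nodes with their xpath in an 'x' attribute
    print(doc.short_title())
    print(doc.summary(html_partial=True))                      # main content only, without html/body wrappers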

@@ -6,13 +6,13 @@ lxml_requirement = "lxml"
 if sys.platform == 'darwin':
     import platform
     mac_ver = platform.mac_ver()[0]
-    if mac_ver < '10.9':
-        print "Using lxml<2.4"
+    if int(mac_ver.split('.')[1]) < 9:
+        print "Using lxml<2.4 for Mac OS X < 10.9"
         lxml_requirement = "lxml<2.4"

 setup(
     name="readability-lxml",
-    version="0.3.0.3",
+    version="0.5.0.3",
     author="Yuri Baburov",
     author_email="burchik@gmail.com",
     description="fast python port of arc90's readability tool",
