diff --git a/README b/README index 72b3b3d..92c0cfb 100644 --- a/README +++ b/README @@ -35,6 +35,11 @@ Command-line usage:: python -m readability.readability -u http://pypi.python.org/pypi/readability-lxml +Using positive/negative keywords example:: + + python -m readability.readability -p intro -n newsindex,homepage-box,news-section -u http://python.org + + Document() kwarg options: - attributes: @@ -42,6 +47,8 @@ Document() kwarg options: - min_text_length: - retry_length: - url: will allow adjusting links to be absolute + - positive_keywords: the list of positive search patterns in classes and ids, for example: ["news-item", "block"] + - negative_keywords: the list of negative search patterns in classes and ids, for example: ["mysidebar", "related", "ads"] Updates @@ -49,3 +56,4 @@ Updates - 0.2.5 Update setup.py for uploading .tar.gz to pypi - 0.2.6 Don't crash on documents with no title - 0.2.6.1 Document.short_title() properly works + - 0.3 Added Document.encoding, positive_keywords and negative_keywords diff --git a/readability/htmls.py b/readability/htmls.py index 9b59993..edaaa52 100644 --- a/readability/htmls.py +++ b/readability/htmls.py @@ -3,9 +3,7 @@ from encoding import get_encoding from lxml.html import tostring import logging import lxml.html -import re - -logging.getLogger().setLevel(logging.DEBUG) +import re, sys utf8_parser = lxml.html.HTMLParser(encoding='utf-8') @@ -14,9 +12,19 @@ def build_doc(page): page_unicode = page else: enc = get_encoding(page) - page_unicode = page.decode(enc, 'replace') + if enc: + page_unicode = page.decode(enc, 'replace') + encoding = enc + else: + try: + #try utf-8 + page_unicode = page.decode('utf-8', 'strict') + encoding = 'utf-8' + except UnicodeDecodeError: + page_unicode = page.decode('utf-8', 'replace') + encoding = 'utf-8' doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser) - return doc + return doc, encoding def js_re(src, pattern, flags, repl): return re.compile(pattern, flags).sub(src, repl.replace('$', '\\')) @@ -111,5 +119,5 @@ def get_body(doc): #BeautifulSoup(cleaned) #FIXME do we really need to try loading it? return cleaned except Exception: #FIXME find the equivalent lxml error - logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned)) + #logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned)) return raw_html diff --git a/readability/readability.py b/readability/readability.py index fc37636..bf058ed 100755 --- a/readability/readability.py +++ b/readability/readability.py @@ -76,13 +76,23 @@ def clean(text): def text_length(i): return len(clean(i.text_content() or "")) +regexp_type = type(re.compile('hello, world')) + +def compile_pattern(elements): + if not elements: + return None + if isinstance(elements, regexp_type): + return elements + if isinstance(elements, basestring): + elements = elements.split(',') + return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U) class Document: """Class to build a etree document out of html.""" TEXT_LENGTH_THRESHOLD = 25 RETRY_LENGTH = 250 - def __init__(self, input, **options): + def __init__(self, input, positive_keywords=None, negative_keywords=None, **options): """Generate the document :param input: string of the html content. @@ -93,11 +103,16 @@ class Document: - min_text_length: - retry_length: - url: will allow adjusting links to be absolute - + - positive_keywords: the list of positive search patterns in classes and ids, for example: ["news-item", "block"] + - negative_keywords: the list of negative search patterns in classes and ids, for example: ["mysidebar", "related", "ads"] + Also positive_keywords and negative_keywords could be a regexp. """ self.input = input self.options = options self.html = None + self.encoding = None + self.positive_keywords = compile_pattern(positive_keywords) + self.negative_keywords = compile_pattern(negative_keywords) def _html(self, force=False): if force or self.html is None: @@ -105,7 +120,7 @@ class Document: return self.html def _parse(self, input): - doc = build_doc(input) + doc, self.encoding = build_doc(input) doc = html_cleaner.clean_html(doc) base_href = self.options.get('url', None) if base_href: @@ -311,19 +326,25 @@ class Document: def class_weight(self, e): weight = 0 - if e.get('class', None): - if REGEXES['negativeRe'].search(e.get('class')): - weight -= 25 + for feature in [e.get('class', None), e.get('id', None)]: + if feature: + if REGEXES['negativeRe'].search(feature): + weight -= 25 + + if REGEXES['positiveRe'].search(feature): + weight += 25 + + if self.positive_keywords and self.positive_keywords.search(feature): + weight += 25 - if REGEXES['positiveRe'].search(e.get('class')): - weight += 25 + if self.negative_keywords and self.negative_keywords.search(feature): + weight -= 25 - if e.get('id', None): - if REGEXES['negativeRe'].search(e.get('id')): - weight -= 25 + if self.positive_keywords and self.positive_keywords.match('tag-'+e.tag): + weight += 25 - if REGEXES['positiveRe'].search(e.get('id')): - weight += 25 + if self.negative_keywords and self.negative_keywords.match('tag-'+e.tag): + weight -= 25 return weight @@ -569,6 +590,8 @@ def main(): parser = OptionParser(usage="%prog: [options] [file]") parser.add_option('-v', '--verbose', action='store_true') parser.add_option('-u', '--url', default=None, help="use URL instead of a local file") + parser.add_option('-p', '--positive-keywords', default=None, help="positive keywords (separated with comma)", action='store') + parser.add_option('-n', '--negative-keywords', default=None, help="negative keywords (separated with comma)", action='store') (options, args) = parser.parse_args() if not (len(args) == 1 or options.url): @@ -581,11 +604,14 @@ def main(): file = urllib.urlopen(options.url) else: file = open(args[0], 'rt') - enc = sys.__stdout__.encoding or 'utf-8' + enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING try: print Document(file.read(), debug=options.verbose, - url=options.url).summary().encode(enc, 'replace') + url=options.url, + positive_keywords = options.positive_keywords, + negative_keywords = options.negative_keywords, + ).summary().encode(enc, 'replace') finally: file.close() diff --git a/setup.py b/setup.py index 571a9ed..3558177 100755 --- a/setup.py +++ b/setup.py @@ -1,9 +1,15 @@ #!/usr/bin/env python from setuptools import setup, find_packages +import sys + +if sys.platform == 'darwin': + lxml = "lxml<2.4" +else: + lxml = "lxml" setup( name="readability-lxml", - version="0.2.6.1", + version="0.3", author="Yuri Baburov", author_email="burchik@gmail.com", description="fast python port of arc90's readability tool", @@ -14,7 +20,7 @@ setup( packages=['readability'], install_requires=[ "chardet", - "lxml" + lxml ], classifiers=[ "Environment :: Web Environment",