Compare commits

...

7 Commits
master ... dev

Author SHA1 Message Date
Yuri Baburov e8f86bdcf9 Several updates from dev version. 9 years ago
Yuri Baburov 40e430c27d Makefile updates 9 years ago
Yuri Baburov 0a082ff020 Fix for Mac OS X 10.10 9 years ago
Yuri Baburov 8048160d66 WIP: update to support python2 and python3 9 years ago
Yuri Baburov 71294f094f Encoding improvements 10 years ago
Yuri Baburov 5855beb32a WIP; Backported features from stable branch 10 years ago
Yuri Baburov ae1f1adfff Switched to use python logging module. Added xpath option (undocumented yet). 10 years ago

@ -1,9 +1,9 @@
# Makefile to help automate tasks
WD := $(shell pwd)
PY := bin/python
PIP := bin/pip
PEP8 := bin/pep8
NOSE := bin/nosetests
PY := .env/bin/python
PIP := .env/bin/pip
PEP8 := .env/bin/pep8
NOSE := .env/bin/nosetests
# ###########
@ -24,14 +24,14 @@ all: venv develop
venv: bin/python
bin/python:
virtualenv .
virtualenv .env
.PHONY: clean_venv
clean_venv:
rm -rf bin include lib local man
rm -rf .env
develop: lib/python*/site-packages/bookie-api.egg-link
lib/python*/site-packages/bookie-api.egg-link:
develop: .env/lib/python*/site-packages/readability-lxml.egg-link
.env/lib/python*/site-packages/readability-lxml.egg-link:
$(PY) setup.py develop

@ -1,15 +1,20 @@
# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
# -*- encoding: utf-8 -*-
# strip out a set of nuisance html attributes that can mess up rendering
# in RSS feeds
import re
from lxml.html.clean import Cleaner
bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on*']
bad_attrs = ['width', 'height', 'style',
'[-a-z]*color', 'background[-a-z]*', 'on*']
single_quoted = "'[^']+'"
double_quoted = '"[^"]+"'
non_space = '[^ "\'>]+'
htmlstrip = re.compile("<" # open
"([^>]+) " # prefix
"(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
'= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
htmlstrip = re.compile("<" # open
"([^>]+) " # prefix
"(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
'= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
"([^>]*)" # postfix
">" # end
, re.I)
@ -20,13 +25,15 @@ def clean_attributes(html):
return html
def normalize_spaces(s):
if not s: return ''
if not s:
return ''
"""replace any sequence of whitespace
characters with a single space"""
return ' '.join(s.split())
html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
style=True, links=True, meta=False, add_nofollow=False,
page_structure=False, processing_instructions=True, embedded=False,
frames=False, forms=False, annoying_tags=False, remove_tags=None,
page_structure=False, processing_instructions=True,
embedded=False, frames=False, forms=False,
annoying_tags=False, remove_tags=None,
remove_unknown_tags=False, safe_attrs_only=False)
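
A minimal sketch (not part of the diff) of how the attribute stripper and whitespace helper above are typically called, assuming the module is importable as readability.cleaners, as the relative imports later in this compare suggest:

    from readability.cleaners import clean_attributes, normalize_spaces

    snippet = '<div style="color:red" width="100"><p>Hello   world</p></div>'
    print(clean_attributes(snippet))          # same markup with the nuisance attributes stripped
    print(normalize_spaces('Hello   world'))  # -> 'Hello world'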

@ -1,25 +1,62 @@
def save_to_file(text, filename):
f = open(filename, 'wt')
f.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
f.write(text.encode('utf-8'))
f.close()
import re
uids = {}
def describe(node, depth=2):
uids = {}
RE_COLLAPSE_WHITESPACES = re.compile('\s+', re.U)
def open_in_browser(html):
"""
Open the HTML document in a web browser, saving it to a temporary
file to open it. Note that this does not delete the file after
use. This is mainly meant for debugging.
"""
import os
import webbrowser
import tempfile
handle, fn = tempfile.mkstemp(suffix='.html')
f = os.fdopen(handle, 'wb')
try:
f.write("<meta charset='UTF-8' />")
f.write(html.encode('utf-8'))
finally:
# we leak the file itself here, but we should at least close it
f.close()
url = 'file://' + fn.replace(os.path.sep, '/')
webbrowser.open(url)
return url
def describe_node(node):
if node is None:
return ''
if not hasattr(node, 'tag'):
return "[%s]" % type(node)
name = node.tag
if node.get('id', ''): name += '#'+node.get('id')
if node.get('class', ''):
name += '.' + node.get('class').replace(' ','.')
if node.get('id', ''):
name += '#' + node.get('id')
if node.get('class', ''):
name += '.' + node.get('class').replace(' ', '.')
if name[:4] in ['div#', 'div.']:
name = name[3:]
if name in ['tr', 'td', 'div', 'p']:
if not node in uids:
uid = uids[node] = len(uids)+1
else:
uid = uids.get(node)
name += "%02d" % (uid)
if depth and node.getparent() is not None:
return name+' - '+describe(node.getparent(), depth-1)
uid = uids.get(node)
if uid is None:
uid = uids[node] = len(uids) + 1
name += "{%02d}" % uid
return name
def describe(node, depth=2):
#return repr(NodeRepr(node))
parent = ''
if depth and node.getparent() is not None:
parent = describe(node.getparent(), depth=depth - 1)
return parent + '/' + describe_node(node)
def text_content(elem, length=40):
content = RE_COLLAPSE_WHITESPACES.sub(' ', elem.text_content().replace('\r', ''))
if len(content) < length:
return content
return content[:length] + '...'
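
Illustrative only: how the new debug helpers above might be exercised, assuming they live in readability.debug (the main module imports them as "from debug import describe, text_content, open_in_browser"):

    import lxml.html
    from readability.debug import describe, open_in_browser, text_content

    doc = lxml.html.fromstring('<div id="main"><p class="body">Some article text here.</p></div>')
    p = doc.find('.//p')
    print(describe(p))       # short path-like description of the node and its ancestors
    print(text_content(p))   # whitespace-collapsed text, truncated to ~40 characters
    # open_in_browser(lxml.html.tostring(doc))  # would write a temp .html file and open it in a browser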

@ -1,48 +1,60 @@
import re
import chardet
import logging
log = logging.getLogger(__name__)
RE_CHARSET = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', re.I)
RE_PRAGMA = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', re.I)
RE_XML = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
CHARSETS = {
'big5': 'big5hkscs',
'gb2312': 'gb18030',
'ascii': 'utf-8',
'maccyrillic': 'cp1251',
'win1251': 'cp1251',
'win-1251': 'cp1251',
'windows-1251': 'cp1251',
}
def fix_charset(encoding):
"""Overrides encoding when charset declaration
or charset determination is a subset of a larger
charset. Created because of issues with Chinese websites"""
encoding = encoding.lower()
return CHARSETS.get(encoding, encoding)
def get_encoding(page):
# Regex for XML and HTML Meta charset declaration
charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
declared_encodings = (charset_re.findall(page) +
pragma_re.findall(page) +
xml_re.findall(page))
# Try any declared encodings
if len(declared_encodings) > 0:
for declared_encoding in declared_encodings:
try:
page.decode(custom_decode(declared_encoding))
return custom_decode(declared_encoding)
except UnicodeDecodeError:
pass
declared_encodings = (RE_CHARSET.findall(page) +
RE_PRAGMA.findall(page) +
RE_XML.findall(page))
log.debug("Document has the following encodings: %s" % declared_encodings)
# Try declared encodings, if any
for declared_encoding in declared_encodings:
encoding = fix_charset(declared_encoding)
try:
page.decode(encoding)
log.info('Using encoding "%s"' % encoding)
return encoding
except UnicodeDecodeError:
log.info('Encoding "%s", specified in the document as "%s" '
'didn\'t work' % (encoding, declared_encoding))
# Fallback to chardet if declared encodings fail
text = re.sub('</?[^>]*>\s*', ' ', page)
enc = 'utf-8'
if not text.strip() or len(text) < 10:
return enc # can't guess
log.debug("Can't guess encoding because text is too short")
return enc
res = chardet.detect(text)
enc = res['encoding']
enc = fix_charset(res['encoding'])
log.info('Trying encoding "%s" guessed '
'with confidence %.2f' % (enc, res['confidence']))
#print '->', enc, "%.2f" % res['confidence']
enc = custom_decode(enc)
return enc
def custom_decode(encoding):
"""Overrides encoding when charset declaration
or charset determination is a subset of a larger
charset. Created because of issues with Chinese websites"""
encoding = encoding.lower()
alternates = {
'big5': 'big5hkscs',
'gb2312': 'gb18030',
'ascii': 'utf-8',
'MacCyrillic': 'cp1251',
}
if encoding in alternates:
return alternates[encoding]
else:
return encoding
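
A rough sketch of how get_encoding() above is meant to be used on raw bytes, mirroring what build_doc() does later in this compare (the filename is a placeholder):

    from readability.encoding import get_encoding

    raw = open('page.html', 'rb').read()    # raw bytes, encoding unknown
    enc = get_encoding(raw) or 'utf-8'      # declared charset first, chardet as a fallback
    text = raw.decode(enc, 'replace')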

@ -3,28 +3,36 @@ from encoding import get_encoding
from lxml.html import tostring
import logging
import lxml.html
import re, sys
import re
log = logging.getLogger(__name__)
utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
def lxml_fromstring(doc):
return lxml.html.document_fromstring(doc, parser=utf8_parser)
def build_doc(page):
if isinstance(page, unicode):
enc = None
page_unicode = page
unicode_page = page
else:
enc = get_encoding(page) or 'utf-8'
page_unicode = page.decode(enc, 'replace')
doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
unicode_page = page.decode(enc, 'replace')
doc = lxml_fromstring(unicode_page.encode('utf-8', 'replace').replace('\r', ''))
return doc, enc
def js_re(src, pattern, flags, repl):
return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
def normalize_entities(cur_title):
entities = {
u'\u2014':'-',
u'\u2013':'-',
u'\u2014': '-',
u'\u2013': '-',
u'&mdash;': '-',
u'&ndash;': '-',
u'\u00A0': ' ',
@ -38,9 +46,11 @@ def normalize_entities(cur_title):
return cur_title
def norm_title(title):
return normalize_entities(normalize_spaces(title))
def get_title(doc):
title = doc.find('.//title')
if title is None or len(title.text) == 0:
@ -48,12 +58,19 @@ def get_title(doc):
return norm_title(title.text)
def add_match(collection, text, orig):
text = norm_title(text)
if len(text.split()) >= 2 and len(text) >= 15:
if text.replace('"', '') in orig.replace('"', ''):
collection.add(text)
TITLE_CSS_HEURISTICS = ['#title', '#head', '#heading', '.pageTitle',
'.news_title', '.title', '.head', '.heading',
'.contentheading', '.small_header_red']
def shorten_title(doc):
title = doc.find('.//title')
if title is None or title.text is None or len(title.text) == 0:
@ -70,7 +87,7 @@ def shorten_title(doc):
if e.text_content():
add_match(candidates, e.text_content(), orig)
for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
for item in TITLE_CSS_HEURISTICS:
for e in doc.cssselect(item):
if e.text:
add_match(candidates, e.text, orig)
@ -102,13 +119,16 @@ def shorten_title(doc):
return title
def get_body(doc):
[ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
for elem in doc.xpath('.//script | .//link | .//style'):
elem.drop_tree()
raw_html = unicode(tostring(doc.body or doc))
cleaned = clean_attributes(raw_html)
try:
#BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
return cleaned
except Exception: #FIXME find the equivalent lxml error
#logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
except Exception: # FIXME find the equivalent lxml error
log.error("cleaning broken html content: "
"%s\n---------\n%s" % (raw_html, cleaned))
return raw_html
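
Sketch of the title helpers touched above (internal helpers of the package, shown only for orientation; module path assumed to be readability.htmls):

    from readability.htmls import build_doc, get_title, shorten_title

    page = '<html><head><title>Example Story - Example Site</title></head><body><h1>Example Story</h1></body></html>'
    doc, enc = build_doc(page)
    print(get_title(doc))       # the entity-normalized <title> text
    print(shorten_title(doc))   # heuristically trimmed title, e.g. without the site-name suffix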

@ -3,23 +3,21 @@ import logging
import re
import sys
from collections import defaultdict
from lxml.etree import tostring
from lxml.etree import tounicode
from lxml.html import document_fromstring
from lxml.html import fragment_fromstring
from cleaners import clean_attributes
from cleaners import html_cleaner
from htmls import build_doc
from htmls import get_body
from htmls import get_title
from htmls import shorten_title
logging.basicConfig(level=logging.INFO)
log = logging.getLogger()
from .cleaners import clean_attributes
from .cleaners import html_cleaner
from .htmls import build_doc
from .htmls import get_body
from .htmls import get_title
from .htmls import shorten_title
from encoding import get_encoding
from debug import describe, text_content, open_in_browser
log = logging.getLogger(__file__)
REGEXES = {
'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
@ -41,21 +39,6 @@ class Unparseable(ValueError):
pass
def describe(node, depth=1):
if not hasattr(node, 'tag'):
return "[%s]" % type(node)
name = node.tag
if node.get('id', ''):
name += '#' + node.get('id')
if node.get('class', ''):
name += '.' + node.get('class').replace(' ', '.')
if name[:4] in ['div#', 'div.']:
name = name[3:]
if depth and node.getparent() is not None:
return name + ' - ' + describe(node.getparent(), depth - 1)
return name
def to_int(x):
if not x:
return None
@ -68,43 +51,50 @@ def to_int(x):
def clean(text):
text = re.sub('[ \t]+', ' ', text)
text = re.sub('\s*\n\s*', '\n', text)
text = re.sub('[ \t]{2,}', ' ', text)
return text.strip()
def text_length(i):
return len(clean(i.text_content() or ""))
regexp_type = type(re.compile('hello, world'))
def compile_pattern(elements):
if not elements:
return None
if isinstance(elements, regexp_type):
return elements
if isinstance(elements, basestring):
if isinstance(elements, _basestring):
elements = elements.split(',')
return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
class Document:
"""Class to build a etree document out of html."""
TEXT_LENGTH_THRESHOLD = 25
RETRY_LENGTH = 250
def __init__(self, input, positive_keywords=None, negative_keywords=None, **options):
def __init__(self, input, positive_keywords=None, negative_keywords=None,
**options):
"""Generate the document
:param input: string of the html content.
kwargs:
- attributes:
- debug: output debug messages
- min_text_length:
- retry_length:
- url: will allow adjusting links to be absolute
- positive_keywords: the list of positive search patterns in classes and ids, for example: ["news-item", "block"]
- negative_keywords: the list of negative search patterns in classes and ids, for example: ["mysidebar", "related", "ads"]
- positive_keywords: the list of positive search patterns in
classes and ids, for example: ["news-item", "block"]
- negative_keywords: the list of negative
search patterns in classes
and ids, for example: ["mysidebar", "related", "ads"]
Also positive_keywords and negative_keywords could be a regexp.
"""
self.input = input
@ -127,6 +117,11 @@ class Document:
doc.make_links_absolute(base_href, resolve_base_href=True)
else:
doc.resolve_base_href()
if self.options.get('xpath'):
root = doc.getroottree()
for i in doc.getiterator():
#print root.getpath(i)
i.attrib['x'] = root.getpath(i)
return doc
def content(self):
@ -139,7 +134,7 @@ class Document:
return shorten_title(self._html(True))
def get_clean_html(self):
return clean_attributes(tounicode(self.html))
return clean_attributes(tounicode(self.html))
def summary(self, html_partial=False):
"""Generate the summary of the html docuemnt
@ -165,18 +160,18 @@ class Document:
if best_candidate:
article = self.get_article(candidates, best_candidate,
html_partial=html_partial)
html_partial=html_partial)
else:
if ruthless:
log.debug("ruthless removal did not work. ")
log.info("ruthless removal did not work. ")
ruthless = False
self.debug(
log.info(
("ended up stripping too much - "
"going for a safer _parse"))
"going for a safer parse"))
# try again
continue
else:
log.debug(
log.info(
("Ruthless and lenient parsing did not work. "
"Returning raw html"))
article = self.html.find('body')
@ -194,7 +189,7 @@ class Document:
continue
else:
return cleaned_article
except StandardError, e:
except Exception as e:
log.exception('error getting summary: ')
raise Unparseable(str(e)), None, sys.exc_info()[2]
@ -219,7 +214,8 @@ class Document:
append = True
sibling_key = sibling # HashableElement(sibling)
if sibling_key in candidates and \
candidates[sibling_key]['content_score'] >= sibling_score_threshold:
candidates[sibling_key]['content_score'] >= \
sibling_score_threshold:
append = True
if sibling.tag == "p":
@ -230,8 +226,8 @@ class Document:
if node_length > 80 and link_density < 0.25:
append = True
elif node_length <= 80 \
and link_density == 0 \
and re.search('\.( |$)', node_content):
and link_density == 0 \
and re.search('\.( |$)', node_content):
append = True
if append:
@ -241,21 +237,26 @@ class Document:
output.append(sibling)
else:
output.getchildren()[0].getchildren()[0].append(sibling)
#if output is not None:
# output.append(best_elem)
# if output is not None:
# output.append(best_elem)
return output
def select_best_candidate(self, candidates):
sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
if not candidates:
return None
sorted_candidates = sorted(
candidates.values(),
key=lambda x: x['content_score'],
reverse=True
)
for candidate in sorted_candidates[:5]:
elem = candidate['elem']
self.debug("Top 5 : %6.3f %s" % (
log.info("Top 5 : %6.3f %s" % (
candidate['content_score'],
describe(elem)))
if len(sorted_candidates) == 0:
return None
best_candidate = sorted_candidates[0]
return best_candidate
@ -292,7 +293,8 @@ class Document:
candidates[parent_node] = self.score_node(parent_node)
ordered.append(parent_node)
if grand_parent_node is not None and grand_parent_node not in candidates:
if grand_parent_node is not None and \
grand_parent_node not in candidates:
candidates[grand_parent_node] = self.score_node(
grand_parent_node)
ordered.append(grand_parent_node)
@ -315,11 +317,8 @@ class Document:
candidate = candidates[elem]
ld = self.get_link_density(elem)
score = candidate['content_score']
self.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (
score,
describe(elem),
ld,
score * (1 - ld)))
log.debug("Branch %6.3f %s link density %.3f -> %6.3f" % (
score, describe(elem), ld, score * (1 - ld)))
candidate['content_score'] *= (1 - ld)
return candidates
@ -334,16 +333,20 @@ class Document:
if REGEXES['positiveRe'].search(feature):
weight += 25
if self.positive_keywords and self.positive_keywords.search(feature):
if self.positive_keywords and self.positive_keywords.search(
feature):
weight += 25
if self.negative_keywords and self.negative_keywords.search(feature):
if self.negative_keywords and self.negative_keywords.search(
feature):
weight -= 25
if self.positive_keywords and self.positive_keywords.match('tag-'+e.tag):
if self.positive_keywords and self.positive_keywords.match(
'tag-' + e.tag):
weight += 25
if self.negative_keywords and self.negative_keywords.match('tag-'+e.tag):
if self.negative_keywords and self.negative_keywords.match(
'tag-' + e.tag):
weight -= 25
return weight
@ -365,8 +368,7 @@ class Document:
}
def debug(self, *a):
if self.options.get('debug', False):
log.debug(*a)
log.warn("debug: " + a[0], *a[1:])
def remove_unlikely_candidates(self):
for elem in self.html.iter():
@ -375,22 +377,22 @@ class Document:
continue
#self.debug(s)
if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag not in ['html', 'body']:
self.debug("Removing unlikely candidate - %s" % describe(elem))
log.debug("Removing unlikely candidate - %s" % describe(elem))
elem.drop_tree()
def transform_misused_divs_into_paragraphs(self):
for elem in self.tags(self.html, 'div'):
# transform <div>s that do not contain other block elements into
# <p>s
#FIXME: The current implementation ignores all descendants that
# FIXME: The current implementation ignores all descendants that
# are not direct children of elem
# This results in incorrect results in case there is an <img>
# buried within an <a> for example
if not REGEXES['divToPElementsRe'].search(
unicode(''.join(map(tostring, list(elem))))):
#self.debug("Altering %s to p" % (describe(elem)))
# self.debug("Altering %s to p" % describe(elem))
elem.tag = "p"
#print "Fixed element "+describe(elem)
# self.debug("Fixed element "+describe(elem))
for elem in self.tags(self.html, 'div'):
if elem.text and elem.text.strip():
@ -398,7 +400,7 @@ class Document:
p.text = elem.text
elem.text = None
elem.insert(0, p)
#print "Appended "+tounicode(p)+" to "+describe(elem)
# print "Appended "+tounicode(p)+" to "+describe(elem)
for pos, child in reversed(list(enumerate(elem))):
if child.tail and child.tail.strip():
@ -406,9 +408,9 @@ class Document:
p.text = child.tail
child.tail = None
elem.insert(pos + 1, p)
#print "Inserted "+tounicode(p)+" to "+describe(elem)
# print "Inserted "+tounicode(p)+" to "+describe(elem)
if child.tag == 'br':
#print 'Dropped <br> at '+describe(elem)
# print 'Dropped <br> at '+describe(elem)
child.drop_tree()
def tags(self, node, *tag_names):
@ -422,14 +424,15 @@ class Document:
yield e
def sanitize(self, node, candidates):
MIN_LEN = self.options.get('min_text_length',
self.TEXT_LENGTH_THRESHOLD)
MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
if self.class_weight(header) < 0 or \
self.get_link_density(header) > 0.33:
header.drop_tree()
for elem in self.tags(node, "form", "iframe", "textarea"):
elem.drop_tree()
allowed = {}
# Conditionally clean <table>s, <ul>s, and <div>s
for el in self.reverse_tags(node, "table", "ul", "div"):
@ -438,13 +441,13 @@ class Document:
weight = self.class_weight(el)
if el in candidates:
content_score = candidates[el]['content_score']
#print '!',el, '-> %6.3f' % content_score
# print '!',el, '-> %6.3f' % content_score
else:
content_score = 0
tag = el.tag
if weight + content_score < 0:
self.debug("Cleaned %s with score %6.3f and weight %-3s" %
log.info("Removed %s with score %6.3f and weight %-3s" %
(describe(el), content_score, weight, ))
el.drop_tree()
elif el.text_content().count(",") < 10:
@ -452,6 +455,7 @@ class Document:
for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
counts[kind] = len(el.findall('.//%s' % kind))
counts["li"] -= 100
counts["input"] -= len(el.findall('.//input[@type="hidden"]'))
# Count the text length excluding any surrounding whitespace
content_length = text_length(el)
@ -459,31 +463,36 @@ class Document:
parent_node = el.getparent()
if parent_node is not None:
if parent_node in candidates:
content_score = candidates[parent_node]['content_score']
content_score = candidates[
parent_node]['content_score']
else:
content_score = 0
#if parent_node is not None:
#pweight = self.class_weight(parent_node) + content_score
#pname = describe(parent_node)
#else:
#pweight = 0
#pname = "no parent"
# if parent_node is not None:
# pweight = self.class_weight(parent_node) + content_score
# pname = describe(parent_node)
# else:
# pweight = 0
# pname = "no parent"
to_remove = False
reason = ""
#if el.tag == 'div' and counts["img"] >= 1:
# continue
if counts["p"] and counts["img"] > counts["p"]:
reason = "too many images (%s)" % counts["img"]
# if el.tag == 'div' and counts["img"] >= 1:
# continue
if content_length and counts["img"] * 100 >= content_length:
reason = "too many images (%s) for text " % counts["img"]
to_remove = True
elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
elif counts["li"] > counts["p"] \
and tag != "ul" and tag != "ol":
reason = "more <li>s than <p>s"
to_remove = True
elif counts["input"] > (counts["p"] / 3):
reason = "less than 3x <p>s than <input>s"
to_remove = True
elif content_length < (MIN_LEN) and (counts["img"] == 0 or counts["img"] > 2):
reason = "too short content length %s without a single image" % content_length
elif content_length < MIN_LEN and not counts["img"]:
reason = "too short content length %s and no images" % content_length
to_remove = True
elif content_length < MIN_LEN and counts["img"] > 2:
reason = "too short content length %s and too much images" % content_length
to_remove = True
elif weight < 25 and link_density > 0.2:
reason = "too many links %.3f for its weight %s" % (
@ -496,6 +505,9 @@ class Document:
elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
reason = "<embed>s with too short content length, or too many <embed>s"
to_remove = True
elif not content_length:
reason = "no content"
to_remove = True
# if el.tag == 'div' and counts['img'] >= 1 and to_remove:
# imgs = el.findall('.//img')
# valid_img = False
@ -523,35 +535,38 @@ class Document:
#self.debug(sib.text_content())
sib_content_length = text_length(sib)
if sib_content_length:
i =+ 1
i += 1
siblings.append(sib_content_length)
if i == x:
if i >= x:
break
for sib in el.itersiblings(preceding=True):
#self.debug(sib.text_content())
sib_content_length = text_length(sib)
if sib_content_length:
j =+ 1
j += 1
siblings.append(sib_content_length)
if j == x:
if j >= x:
break
#self.debug(str(siblings))
if siblings and sum(siblings) > 1000:
to_remove = False
self.debug("Allowing %s" % describe(el))
log.info("Allowing %s" % describe(el))
for desnode in self.tags(el, "table", "ul", "div"):
allowed[desnode] = True
if to_remove:
self.debug("Cleaned %6.3f %s with weight %s cause it has %s." %
(content_score, describe(el), weight, reason))
log.info("Cleaned %s (score=%6.3f, weight=%s) cause it has %s: %s" %
(describe(el), content_score, weight, reason, text_content(el)))
#print tounicode(el)
#self.debug("pname %s pweight %.3f" %(pname, pweight))
el.drop_tree()
else:
log.info("Not cleaned %s of length %s: %s" %
(describe(el), content_length, text_content(el)))
for el in ([node] + [n for n in node.iter()]):
if not self.options.get('attributes', None):
#el.attrib = {} #FIXME:Checkout the effects of disabling this
# el.attrib = {} #FIXME:Checkout the effects of disabling this
pass
self.html = node
@ -584,12 +599,23 @@ class HashableElement():
def __getattr__(self, tag):
return getattr(self.node, tag)
VERBOSITY = {
1: logging.WARNING,
2: logging.INFO,
3: logging.DEBUG
}
def main():
from optparse import OptionParser
parser = OptionParser(usage="%prog: [options] [file]")
parser.add_option('-v', '--verbose', action='store_true')
parser.add_option('-v', '--verbose', action='count', default=0)
parser.add_option('-b', '--browser', default=None, action='store_true', help="open in browser")
parser.add_option('-l', '--log', default=None, help="use filename for logs (appended)")
parser.add_option('-u', '--url', default=None, help="use URL instead of a local file")
parser.add_option('-s', '--show-xpath', default=None, help="show xpath")
parser.add_option('-x', '--xpath', default=None, help="use xpath")
parser.add_option('-t', '--support-text', default=None, help="use this support text")
parser.add_option('-p', '--positive-keywords', default=None, help="positive keywords (separated with comma)", action='store')
parser.add_option('-n', '--negative-keywords', default=None, help="negative keywords (separated with comma)", action='store')
(options, args) = parser.parse_args()
@ -598,20 +624,33 @@ def main():
parser.print_help()
sys.exit(1)
if options.verbose:
logging.basicConfig(level=VERBOSITY[options.verbose], filename=options.log,
format='%(asctime)s: %(levelname)s: %(message)s (at %(filename)s: %(lineno)d)')
file = None
if options.url:
import urllib
file = urllib.urlopen(options.url)
else:
file = open(args[0], 'rt')
enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING
html = file.read() # bytes object
encoding = get_encoding(html)
html = html.decode(encoding)
try:
print Document(file.read(),
debug=options.verbose,
url=options.url,
positive_keywords = options.positive_keywords,
negative_keywords = options.negative_keywords,
).summary().encode(enc, 'replace')
doc = Document(html, url=options.url,
positive_keywords=options.positive_keywords,
negative_keywords=options.negative_keywords)
if options.browser:
result = 'Title: ' + doc.short_title() + '<br/>' + doc.summary()
open_in_browser(result)
else:
# XXX: a hack, better to set PYTHONIOENCODING explicitly
output_encoding = sys.__stdout__.encoding or 'utf-8'
print 'Title:', doc.short_title().encode(output_encoding, 'replace')
print doc.summary().encode(output_encoding, 'replace')
finally:
file.close()
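
Putting the readability.py changes above together, a sketch of the library's main entry point as it looks on this branch (constructor signature and keyword examples taken from the diff; module path assumed to be readability.readability; the input file is a placeholder):

    from readability.readability import Document

    html = open('article.html', 'rb').read()   # raw bytes; the encoding is detected internally
    doc = Document(html,
                   positive_keywords=['news-item', 'block'],
                   negative_keywords=['mysidebar', 'related', 'ads'])
    print(doc.short_title())
    print(doc.summary())   # cleaned HTML fragment containing the main article content
    # The new, still undocumented xpath option stamps every node with its
    # xpath in an 'x' attribute: Document(html, xpath=True)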

@ -6,13 +6,13 @@ lxml_requirement = "lxml"
if sys.platform == 'darwin':
import platform
mac_ver = platform.mac_ver()[0]
if mac_ver < '10.9':
print "Using lxml<2.4"
if int(mac_ver.split('.')[1]) < 9:
print "Using lxml<2.4 for Mac OS X < 10.9"
lxml_requirement = "lxml<2.4"
setup(
name="readability-lxml",
version="0.3.0.3",
version="0.5.0.3",
author="Yuri Baburov",
author_email="burchik@gmail.com",
description="fast python port of arc90's readability tool",
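
Why the Mac OS X version check above changed (a worked example, not part of the diff): comparing version strings lexicographically breaks once the minor version reaches two digits, which is exactly the 10.10 case this branch fixes.

    '10.10' < '10.9'                  # True  - string comparison, wrong answer for versions
    int('10.10'.split('.')[1]) < 9    # False - comparing the minor component numerically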
