parent
2fab5ffa6b
commit
ae1f1adfff
@ -1,25 +1,62 @@
|
||||
def save_to_file(text, filename):
    """Save *text* to *filename* as a UTF-8 encoded HTML file.

    A ``<meta>`` charset declaration is written first so that browsers
    decode the file correctly.

    :param text: unicode HTML document body
    :param filename: path of the file to (over)write
    """
    # Open in binary mode: we write already-encoded UTF-8 bytes ourselves.
    # (The original used text mode 'wt' while writing encoded bytes, and
    # never closed the file on error.)
    with open(filename, 'wb') as f:
        f.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />'.encode('utf-8'))
        f.write(text.encode('utf-8'))
|
||||
import re
|
||||
|
||||
# Registry mapping an element -> small sequential integer uid.  Module-level
# so the numbering assigned by describe_node() stays stable across calls.
uids = {}
|
||||
# Collapses any run of whitespace into a single space (unicode-aware);
# used by text_content().
# NOTE: a stale, truncated duplicate of describe() and of the module-level
# `uids` dict (old-version diff residue) has been removed here -- the real
# definitions live elsewhere in this file.
RE_COLLAPSE_WHITESPACES = re.compile(r'\s+', re.U)
||||
def open_in_browser(html):
    """
    Open the HTML document in a web browser, saving it to a temporary
    file to open it. Note that this does not delete the file after
    use. This is mainly meant for debugging.

    :param html: unicode HTML document to display
    :returns: the ``file://`` URL of the temporary file
    """
    import os
    import webbrowser
    import tempfile
    handle, fn = tempfile.mkstemp(suffix='.html')
    # The file is opened in binary mode, so everything written must be
    # bytes -- encode the charset declaration too (the original wrote it
    # as a plain string, which fails on a binary file in Python 3).
    f = os.fdopen(handle, 'wb')
    try:
        f.write("<meta charset='UTF-8' />".encode('utf-8'))
        f.write(html.encode('utf-8'))
    finally:
        # we leak the file itself here, but we should at least close it
        f.close()
    url = 'file://' + fn.replace(os.path.sep, '/')
    webbrowser.open(url)
    return url
|
||||
|
||||
|
||||
def describe_node(node):
    """Return a short CSS-selector-like description of *node*.

    Examples: ``p``, ``foo#main``, ``bar.content`` -- a leading ``div``
    is dropped when an id/class follows (``#main`` instead of ``div#main``).
    Bare generic tags (tr/td/div/p) additionally get a per-document
    sequential uid such as ``p{01}`` so repeated nodes can be told apart.

    NOTE: two stale old-version branches (a duplicated id/class line and a
    uid block that referenced an undefined ``depth`` and would have raised
    NameError) were diff residue and are removed.

    :param node: an lxml element (or None, or any non-element object)
    :returns: description string; '' for None
    """
    if node is None:
        return ''
    if not hasattr(node, 'tag'):
        # Not an element (comment, PI, arbitrary object): show its type.
        return "[%s]" % type(node)
    name = node.tag
    if node.get('id', ''):
        name += '#' + node.get('id')
    if node.get('class', ''):
        name += '.' + node.get('class').replace(' ', '.')
    if name[:4] in ['div#', 'div.']:
        # 'div' is the default container tag; the id/class alone is clearer.
        name = name[3:]
    if name in ['tr', 'td', 'div', 'p']:
        # Bare generic tags are ambiguous; append a stable sequential uid.
        uid = uids.get(node)
        if uid is None:
            uid = uids[node] = len(uids) + 1
        name += "{%02d}" % uid
    return name
|
||||
|
||||
|
||||
def describe(node, depth=2):
    """Describe *node* as ``ancestor/.../node``, walking up at most
    *depth* ancestors via ``getparent()``.

    :param node: an lxml element
    :param depth: how many ancestor levels to include (0 = none)
    :returns: slash-separated description string
    """
    prefix = ''
    if depth:
        ancestor = node.getparent()
        if ancestor is not None:
            prefix = describe(ancestor, depth=depth - 1)
    return prefix + '/' + describe_node(node)
|
||||
|
||||
|
||||
def text_content(elem, length=40):
    """Whitespace-collapsed text content of *elem*, truncated to *length*
    characters (a '...' suffix marks truncation).

    :param elem: an lxml element providing ``text_content()``
    :param length: maximum number of characters to keep
    :returns: collapsed (and possibly truncated) text
    """
    raw = elem.text_content().replace('\r', '')
    collapsed = RE_COLLAPSE_WHITESPACES.sub(' ', raw)
    if len(collapsed) >= length:
        return collapsed[:length] + '...'
    return collapsed
|
||||
|
@ -1,48 +1,58 @@
|
||||
import re
|
||||
import chardet
|
||||
import logging
|
||||
|
||||
# Module logger.  NOTE(review): the name misspells 'readability' -- left
# unchanged because external logging configuration may key on this exact
# string; confirm downstream usage before renaming.
log = logging.getLogger('readbility.encoding')
|
||||
|
||||
|
||||
# HTML5-style declaration: <meta charset="..."> (case-insensitive).
RE_CHARSET = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', re.I)
|
||||
# HTTP-equiv pragma: <meta ... content="...;charset=..."> (case-insensitive).
RE_PRAGMA = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', re.I)
|
||||
# XML prolog: <?xml ... encoding="..."?> (anchored at document start).
RE_XML = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
|
||||
|
||||
# Too-narrow charsets mapped to a compatible superset (used by fix_charset).
# Keys MUST be lowercase: fix_charset lowercases its input before lookup --
# the original 'MacCyrillic' key was therefore unreachable.
CHARSETS = {
    'big5': 'big5hkscs',
    'gb2312': 'gb18030',
    'ascii': 'utf-8',
    'maccyrillic': 'cp1251',  # was 'MacCyrillic': never matched after .lower()
}
|
||||
|
||||
|
||||
def fix_charset(encoding):
    """Overrides encoding when charset declaration
    or charset determination is a subset of a larger
    charset. Created because of issues with Chinese websites

    :param encoding: declared/detected encoding name (any case)
    :returns: the replacement encoding, or the lowercased input unchanged
    """
    normalized = encoding.lower()
    return CHARSETS.get(normalized, normalized)
|
||||
|
||||
|
||||
def get_encoding(page):
    """Guess the character encoding of *page* (a raw byte string).

    Encodings declared in the document (meta charset, http-equiv pragma,
    XML prolog) are tried first; the first one that decodes the page
    cleanly wins.  Otherwise chardet is run on the tag-stripped text.

    NOTE: this reconstructs the new version of the function; the stale
    old-version residue (duplicate local regexes and declared-encoding
    loop, a Python 2 ``print`` statement, ``custom_decode`` calls, an
    unreachable early ``return`` in the short-text branch, and a
    duplicate ``enc = res['encoding']`` assignment) is removed.

    :param page: byte string of the document
    :returns: an encoding name; defaults to 'utf-8' when there is too
        little text to guess from
    """
    declared_encodings = (RE_CHARSET.findall(page) +
                          RE_PRAGMA.findall(page) +
                          RE_XML.findall(page))

    log.debug("Document has the following encodings: %s" % declared_encodings)

    # Try declared encodings, if any
    for declared_encoding in declared_encodings:
        encoding = fix_charset(declared_encoding)
        try:
            page.decode(encoding)
            log.info('Using encoding "%s"' % encoding)
            return encoding
        except UnicodeDecodeError:
            log.info('Encoding "%s", specified in the document as "%s" '
                     'didn\'t work' % (encoding, declared_encoding))

    # Fallback to chardet if declared encodings fail.  Strip markup first
    # so the detector only sees real text, not ASCII-heavy tags.
    text = re.sub(r'</?[^>]*>\s*', ' ', page)
    enc = 'utf-8'
    if not text.strip() or len(text) < 10:
        log.debug("Can't guess encoding because text is too short")
        return enc
    res = chardet.detect(text)
    enc = fix_charset(res['encoding'])
    log.info('Trying encoding "%s" guessed '
             'with confidence %.2f' % (enc, res['confidence']))
    return enc
|
||||
|
||||
def custom_decode(encoding):
    """Overrides encoding when charset declaration
    or charset determination is a subset of a larger
    charset. Created because of issues with Chinese websites

    :param encoding: declared/detected encoding name (any case)
    :returns: the replacement encoding, or the lowercased input unchanged
    """
    encoding = encoding.lower()
    # Keys must be lowercase to match the lowercased input; the original
    # 'MacCyrillic' key could never match and cp1251 was never applied.
    alternates = {
        'big5': 'big5hkscs',
        'gb2312': 'gb18030',
        'ascii': 'utf-8',
        'maccyrillic': 'cp1251',
    }
    return alternates.get(encoding, encoding)
|
Loading…
Reference in New Issue