python-readability/readability/htmls.py

from lxml.html import tostring
import logging
import lxml.html
import re, sys

from .cleaners import normalize_spaces, clean_attributes
from .encoding import get_encoding

# Shared lxml HTML parser configured for UTF-8 input; build_doc() passes it
# to document_fromstring() after re-encoding the page as UTF-8.
utf8_parser = lxml.html.HTMLParser(encoding='utf-8')

# Python 2 compatibility: rebind `str` to `unicode` so that the
# `isinstance(page, str)` check and `str(...)` calls in this module refer to
# the text type on both major versions.
if sys.version_info[0] == 2:
    str = unicode

def build_doc(page):
    """Parse ``page`` into an lxml HTML document.

    ``page`` may be text (``str``, which is ``unicode`` on Python 2 through
    the module-level alias) or an encoded byte string. Byte input is decoded
    with the encoding reported by ``get_encoding``, falling back to UTF-8,
    and undecodable bytes are replaced instead of raising.

    Returns a ``(doc, enc)`` tuple. ``enc`` is the encoding used to decode
    ``page``, or ``None`` when ``page`` was already text.
    """
    if not isinstance(page, str):
        enc = get_encoding(page) or 'utf-8'
        text = page.decode(enc, 'replace')
    else:
        enc, text = None, page
    # The shared parser is configured for UTF-8, so always feed it UTF-8 bytes.
    utf8_bytes = text.encode('utf-8', 'replace')
    return lxml.html.document_fromstring(utf8_bytes, parser=utf8_parser), enc

def js_re(src, pattern, flags, repl):
    return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))


def normalize_entities(cur_title):
    """Replace typographic characters and a few entity strings with ASCII.

    Em and en dashes (as characters or as the literal ``&mdash;`` /
    ``&ndash;`` strings) become ``-``, a non-breaking space becomes a plain
    space, and guillemets or the literal ``&quot;`` string become ``"``.

    Returns the title with all such occurrences replaced.
    """
    replacements = (
        (u'\u2014', '-'),
        (u'\u2013', '-'),
        (u'&mdash;', '-'),
        (u'&ndash;', '-'),
        (u'\u00A0', ' '),
        (u'\u00AB', '"'),
        (u'\u00BB', '"'),
        (u'&quot;', '"'),
    )
    for needle, plain in replacements:
        # Only touch the string when the needle is actually present.
        if needle not in cur_title:
            continue
        cur_title = cur_title.replace(needle, plain)

    return cur_title

def norm_title(title):
    """Return ``title`` run through ``normalize_spaces`` and then ``normalize_entities``."""
    spaced = normalize_spaces(title)
    return normalize_entities(spaced)

def get_title(doc):
    """Return the normalized text of the document's ``<title>`` element.

    Returns the placeholder ``'[no-title]'`` when the document has no
    ``<title>`` element or the element has no text.
    """
    node = doc.find('.//title')
    # Compare against None explicitly; only the text is tested for emptiness.
    raw_text = None if node is None else node.text
    if not raw_text:
        return '[no-title]'

    return norm_title(raw_text)

def add_match(collection, text, orig):
    """Add normalized ``text`` to ``collection`` if it is a plausible title.

    The normalized text is added only when it has at least two words, is at
    least 15 characters long, and (ignoring double quotes) occurs inside
    ``orig``. ``collection`` is mutated in place; nothing is returned.
    """
    candidate = norm_title(text)
    # Reject fragments that are too short to be a title.
    if len(candidate.split()) < 2 or len(candidate) < 15:
        return
    if candidate.replace('"', '') in orig.replace('"', ''):
        collection.add(candidate)

def shorten_title(doc):
    """Return a shortened form of the document's ``<title>`` text.

    Returns ``''`` when there is no ``<title>`` element or it has no text.
    Otherwise the normalized title is shortened in one of two ways:

    * If text taken from an ``h1``/``h2``/``h3`` element, or from an element
      matching one of a fixed list of title-like id/class selectors, is
      accepted by ``add_match`` against the normalized title, the longest
      accepted text is used.
    * Otherwise the title is split on the first of ``' | '``, ``' - '``,
      ``' :: '`` or ``' / '`` that yields a first or last part of at least
      four words; if none does, it is split on ``': '``.

    If the result is not between 16 and 149 characters long, the full
    normalized title is returned instead.
    """
    title_node = doc.find('.//title')
    raw_text = None if title_node is None else title_node.text
    if not raw_text:
        return ''

    full_title = norm_title(raw_text)

    # Gather every element that might carry the headline: headings first,
    # then elements with title-like ids/classes.
    elements = []
    for path in ('.//h1', './/h2', './/h3'):
        elements.extend(doc.iterfind(path))
    for selector in ('#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red'):
        elements.extend(doc.cssselect(selector))

    candidates = set()
    for element in elements:
        # Try both the element's own leading text and its full text content.
        if element.text:
            add_match(candidates, element.text, full_title)
        whole_text = element.text_content()
        if whole_text:
            add_match(candidates, whole_text, full_title)

    if candidates:
        # Longest accepted candidate wins.
        shortened = sorted(candidates, key=len)[-1]
    else:
        shortened = full_title
        for delimiter in (' | ', ' - ', ' :: ', ' / '):
            if delimiter not in full_title:
                continue
            pieces = full_title.split(delimiter)
            if len(pieces[0].split()) >= 4:
                shortened = pieces[0]
                break
            if len(pieces[-1].split()) >= 4:
                shortened = pieces[-1]
                break
        else:
            # No delimiter produced a usable part; fall back to a colon split.
            if ': ' in full_title:
                last_piece = full_title.split(': ')[-1]
                if len(last_piece.split()) >= 4:
                    shortened = last_piece
                else:
                    shortened = full_title.split(': ', 1)[1]

    if 15 < len(shortened) < 150:
        return shortened

    return full_title

def get_body(doc):
    """Serialize the document body to text with scripts, links and styles removed.

    Every ``<script>``, ``<link>`` and ``<style>`` element is dropped from
    ``doc`` in place. Then ``doc.body`` (or ``doc`` itself when the body is
    falsy) is serialized and passed through ``clean_attributes``.

    Returns:
        The cleaned HTML markup as text.
    """
    # Plain loop: drop_tree() is called only for its side effect on the tree.
    for elem in doc.xpath('.//script | .//link | .//style'):
        elem.drop_tree()

    # NOTE(review): `doc.body or doc` relies on lxml element truthiness (an
    # element without children is falsy) -- confirm the fallback to `doc`
    # is intended for a childless body.
    serialized = tostring(doc.body or doc)
    if isinstance(serialized, bytes):
        # tostring() returns bytes by default; on Python 3 str(bytes) would
        # yield the "b'...'" repr rather than the markup, so decode instead.
        raw_html = serialized.decode('utf-8')
    else:
        raw_html = str(serialized)

    # FIXME: the cleaned markup is not re-parsed to check that
    # clean_attributes() left it well-formed. The former try/except fallback
    # to raw_html was unreachable because nothing inside the try could raise.
    return clean_attributes(raw_html)
Improved title shortener method, and added it to the Document class. 13 years ago			`from lxml.html import tostring`
			`import logging`
			`import lxml.html`
Released v 0.3, and uploaded to the pypi. 11 years ago			`import re, sys`
Improved title shortener method, and added it to the Document class. 13 years ago
Updated package links for Python 2.7 and Python 3 support 9 years ago			`from .cleaners import normalize_spaces, clean_attributes`
			`from .encoding import get_encoding`

Improved title shortener method, and added it to the Document class. 13 years ago			`utf8_parser = lxml.html.HTMLParser(encoding='utf-8')`

Adds Python 3.4 support. Code now supports Python 2.6, 2.7 and 3.4. Python 3.3 isn't supported because of some issues with the parser and the difference between old and new `raise` syntax. 9 years ago			`if sys.version_info[0] == 2:`
			`str = unicode`

Improved title shortener method, and added it to the Document class. 13 years ago			`def build_doc(page):`
Adds Python 3.4 support. Code now supports Python 2.6, 2.7 and 3.4. Python 3.3 isn't supported because of some issues with the parser and the difference between old and new `raise` syntax. 9 years ago			`if isinstance(page, str):`
Quickfix for #41 11 years ago			`enc = None`
Sorted out unicode issues, thanks to Lee Semel. 13 years ago			`page_unicode = page`
Allow passing unicode objects 13 years ago			`else:`
Minor fix in encoding guessing. Claiming it v0.3.0.1 11 years ago			`enc = get_encoding(page) or 'utf-8'`
			`page_unicode = page.decode(enc, 'replace')`
Sorted out unicode issues, thanks to Lee Semel. 13 years ago			`doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)`
Minor fix in encoding guessing. Claiming it v0.3.0.1 11 years ago			`return doc, enc`
Improved title shortener method, and added it to the Document class. 13 years ago
			`def js_re(src, pattern, flags, repl):`
			`return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))`


			`def normalize_entities(cur_title):`
			`entities = {`
			`u'\u2014':'-',`
			`u'\u2013':'-',`
			`u'&mdash;': '-',`
			`u'&ndash;': '-',`
			`u'\u00A0': ' ',`
			`u'\u00AB': '"',`
			`u'\u00BB': '"',`
			`u'&quot;': '"',`
			`}`
Adds Python 3.4 support. Code now supports Python 2.6, 2.7 and 3.4. Python 3.3 isn't supported because of some issues with the parser and the difference between old and new `raise` syntax. 9 years ago			`for c, r in list(entities.items()):`
Improved title shortener method, and added it to the Document class. 13 years ago			`if c in cur_title:`
			`cur_title = cur_title.replace(c, r)`

			`return cur_title`

			`def norm_title(title):`
			`return normalize_entities(normalize_spaces(title))`

			`def get_title(doc):`
readability.htmls: some docs do not have title elem 12 years ago			`title = doc.find('.//title')`
Fixes #53 9 years ago			`if title is None or title.text is None or len(title.text) == 0:`
Improved title shortener method, and added it to the Document class. 13 years ago			`return '[no-title]'`
readability.htmls: some docs do not have title elem 12 years ago
			`return norm_title(title.text)`
Improved title shortener method, and added it to the Document class. 13 years ago
			`def add_match(collection, text, orig):`
			`text = norm_title(text)`
			`if len(text.split()) >= 2 and len(text) >= 15:`
			`if text.replace('"', '') in orig.replace('"', ''):`
			`collection.add(text)`

			`def shorten_title(doc):`
readability.htmls: some docs do not have title elem 12 years ago			`title = doc.find('.//title')`
Added check on title.text to avoid a TypeError on None. 12 years ago			`if title is None or title.text is None or len(title.text) == 0:`
Improved title shortener method, and added it to the Document class. 13 years ago			`return ''`
readability.htmls: some docs do not have title elem 12 years ago
			`title = orig = norm_title(title.text)`
Improved title shortener method, and added it to the Document class. 13 years ago
			`candidates = set()`

			`for item in ['.//h1', './/h2', './/h3']:`
			`for e in list(doc.iterfind(item)):`
			`if e.text:`
			`add_match(candidates, e.text, orig)`
			`if e.text_content():`
			`add_match(candidates, e.text_content(), orig)`

			`for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']:`
			`for e in doc.cssselect(item):`
			`if e.text:`
			`add_match(candidates, e.text, orig)`
			`if e.text_content():`
			`add_match(candidates, e.text_content(), orig)`
readability.htmls: some docs do not have title elem 12 years ago
Improved title shortener method, and added it to the Document class. 13 years ago			`if candidates:`
			`title = sorted(candidates, key=len)[-1]`
			`else:`
			`for delimiter in [' | ', ' - ', ' :: ', ' / ']:`
			`if delimiter in title:`
			`parts = orig.split(delimiter)`
			`if len(parts[0].split()) >= 4:`
			`title = parts[0]`
			`break`
			`elif len(parts[-1].split()) >= 4:`
			`title = parts[-1]`
			`break`
			`else:`
			`if ': ' in title:`
			`parts = orig.split(': ')`
			`if len(parts[-1].split()) >= 4:`
			`title = parts[-1]`
			`else:`
			`title = orig.split(': ', 1)[1]`

			`if not 15 < len(title) < 150:`
			`return orig`

			`return title`

			`def get_body(doc):`
			`[ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]`
Adds Python 3.4 support. Code now supports Python 2.6, 2.7 and 3.4. Python 3.3 isn't supported because of some issues with the parser and the difference between old and new `raise` syntax. 9 years ago			`raw_html = str(tostring(doc.body or doc))`
Improved title shortener method, and added it to the Document class. 13 years ago			`cleaned = clean_attributes(raw_html)`
			`try:`
			`#BeautifulSoup(cleaned) #FIXME do we really need to try loading it?`
			`return cleaned`
			`except Exception: #FIXME find the equivalent lxml error`
Released v 0.3, and uploaded to the pypi. 11 years ago			`#logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))`
Improved title shortener method, and added it to the Document class. 13 years ago			`return raw_html`