diff --git a/readability/readability.py b/readability/readability.py
deleted file mode 100644
index 9826210..0000000
--- a/readability/readability.py
+++ /dev/null
@@ -1,528 +0,0 @@
-#!/usr/bin/env python
-from cleaners import html_cleaner, clean_attributes
-from collections import defaultdict
-from htmls import build_doc, get_body, get_title, shorten_title
-from lxml.etree import tostring, tounicode
-from lxml.html import fragment_fromstring, document_fromstring
-from lxml.html import builder as B
-import logging
-import re
-import sys
-
-logging.basicConfig(level=logging.INFO)
-
-REGEXES = {
- 'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter',re.I),
- 'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow',re.I),
- 'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story',re.I),
- 'negativeRe': re.compile('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget',re.I),
- 'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)',re.I),
- #'replaceBrsRe': re.compile('(
]*>[ \n\r\t]*){2,}',re.I),
- #'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
- #'trimRe': re.compile('^\s+|\s+$/'),
- #'normalizeRe': re.compile('\s{2,}/'),
- #'killBreaksRe': re.compile('(
(\s| ?)*){1,}/'),
- #'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
- #skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
-}
-
-def describe(node, depth=1):
- if not hasattr(node, 'tag'):
- return "[%s]" % type(node)
- name = node.tag
- if node.get('id', ''): name += '#'+node.get('id')
- if node.get('class', ''):
- name += '.' + node.get('class').replace(' ','.')
- if name[:4] in ['div#', 'div.']:
- name = name[3:]
- if depth and node.getparent() is not None:
- return name+' - '+describe(node.getparent(), depth-1)
- return name
-
-def to_int(x):
- if not x: return None
- x = x.strip()
- if x.endswith('px'):
- return int(x[:-2])
- if x.endswith('em'):
- return int(x[:-2]) * 12
- return int(x)
-
-def clean(text):
- text = re.sub('\s*\n\s*', '\n', text)
- text = re.sub('[ \t]{2,}', ' ', text)
- return text.strip()
-
-def text_length(i):
- return len(clean(i.text_content() or ""))
-
-class Unparseable(ValueError):
- pass
-
-class Summary:
- '''
- The type of object returned by Document.summary(). This includes the
- confidence level we have in our summary. If this is low (<35), our summary
- may not be valid, though we did our best.
- '''
-
- def __init__(self, confidence, html):
- self.confidence = confidence
- self.html = html
-
-class Document:
- TEXT_LENGTH_THRESHOLD = 25
- RETRY_LENGTH = 250
-
- def __init__(self, input, **options):
- self.input = input
- self.options = defaultdict(lambda: None)
- for k, v in options.items():
- self.options[k] = v
- self.html = None
-
- def _html(self, force=False):
- if force or self.html is None:
- self.html = self._parse(self.input)
- return self.html
-
- def _parse(self, input):
- doc = build_doc(input)
- doc = html_cleaner.clean_html(doc)
- base_href = self.options['url']
- if base_href:
- doc.make_links_absolute(base_href, resolve_base_href=True)
- else:
- doc.resolve_base_href()
- return doc
-
- def content(self):
- return get_body(self._html(True))
-
- def title(self):
- return get_title(self._html(True))
-
- def short_title(self):
- return shorten_title(self._html(True))
-
- def summary(self):
- try:
- ruthless = True
- while True:
- self._html(True)
-
- for i in self.tags(self.html, 'script', 'style'):
- i.drop_tree()
- for i in self.tags(self.html, 'body'):
- i.set('id', 'readabilityBody')
- if ruthless:
- self.remove_unlikely_candidates()
- self.transform_misused_divs_into_paragraphs()
- candidates = self.score_paragraphs()
-
- best_candidate = self.select_best_candidate(candidates)
- if best_candidate:
- confidence = best_candidate['content_score']
- article = self.get_article(candidates, best_candidate)
- else:
- if ruthless:
- logging.debug("ruthless removal did not work. ")
- ruthless = False
- self.debug("ended up stripping too much - going for a safer _parse")
- # try again
- continue
- else:
- logging.debug("Ruthless and lenient parsing did not work. Returning raw html")
- confidence = 0;
- article = self.html.find('body') or self.html
-
- unicode_cleaned_article = self.sanitize(article, candidates)
- cleaned_doc = fragment_fromstring(unicode_cleaned_article)
- cleaned_article = tostring(cleaned_doc)
-
- of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH)
- if ruthless and not of_acceptable_length:
- ruthless = False
- continue # try again
- else:
- return Summary(confidence, cleaned_article)
- except StandardError, e:
- #logging.exception('error getting summary: ' + str(traceback.format_exception(*sys.exc_info())))
- logging.exception('error getting summary: ' )
- raise Unparseable(str(e)), None, sys.exc_info()[2]
-
- def get_article(self, candidates, best_candidate):
- # Now that we have the top candidate, look through its siblings for content that might also be related.
- # Things like preambles, content split by ads that we removed, etc.
-
- sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
- article = B.DIV()
- article.attrib['id'] = 'article'
- best_elem = best_candidate['elem']
- for sibling in best_elem.getparent().getchildren():
- #if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text
- append = False
- if sibling is best_elem:
- append = True
- sibling_key = sibling #HashableElement(sibling)
- if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
- append = True
-
- if sibling.tag == "p":
- link_density = self.get_link_density(sibling)
- node_content = sibling.text or ""
- node_length = len(node_content)
-
- if node_length > 80 and link_density < 0.25:
- append = True
- elif node_length < 80 and link_density == 0 and re.search('\.( |$)', node_content):
- append = True
-
- if append:
- article.append(sibling)
-
- #if article is not None:
- # article.append(best_elem)
- return article
-
- def select_best_candidate(self, candidates):
- sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
- for candidate in sorted_candidates[:5]:
- elem = candidate['elem']
- self.debug("Top 5 : %6.3f %s" % (candidate['content_score'], describe(elem)))
-
- if len(sorted_candidates) == 0:
- return None
-
- best_candidate = sorted_candidates[0]
- return best_candidate
-
-
- def get_link_density(self, elem):
- link_length = 0
- for i in elem.findall(".//a"):
- link_length += text_length(i)
- #if len(elem.findall(".//div") or elem.findall(".//p")):
- # link_length = link_length
- total_length = text_length(elem)
- return float(link_length) / max(total_length, 1)
-
- def score_paragraphs(self, ):
- MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
- candidates = {}
- #self.debug(str([describe(node) for node in self.tags(self.html, "div")]))
-
- ordered = []
- for elem in self.tags(self.html, "p", "pre", "td"):
- parent_node = elem.getparent()
- if parent_node is None:
- continue
- grand_parent_node = parent_node.getparent()
-
- inner_text = clean(elem.text_content() or "")
- inner_text_len = len(inner_text)
-
- # If this paragraph is less than 25 characters, don't even count it.
- if inner_text_len < MIN_LEN:
- continue
-
- if parent_node not in candidates:
- candidates[parent_node] = self.score_node(parent_node)
- ordered.append(parent_node)
-
- if grand_parent_node is not None and grand_parent_node not in candidates:
- candidates[grand_parent_node] = self.score_node(grand_parent_node)
- ordered.append(grand_parent_node)
-
- content_score = 1
- content_score += len(inner_text.split(','))
- content_score += min((inner_text_len / 100), 3)
- #if elem not in candidates:
- # candidates[elem] = self.score_node(elem)
-
- #WTF? candidates[elem]['content_score'] += content_score
- candidates[parent_node]['content_score'] += content_score
- if grand_parent_node is not None:
- candidates[grand_parent_node]['content_score'] += content_score / 2.0
-
- # Scale the final candidates score based on link density. Good content should have a
- # relatively small link density (5% or less) and be mostly unaffected by this operation.
- for elem in ordered:
- candidate = candidates[elem]
- ld = self.get_link_density(elem)
- score = candidate['content_score']
- self.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (score, describe(elem), ld, score*(1-ld)))
- candidate['content_score'] *= (1 - ld)
-
- return candidates
-
- def class_weight(self, e):
- weight = 0
- if e.get('class', None):
- if REGEXES['negativeRe'].search(e.get('class')):
- weight -= 25
-
- if REGEXES['positiveRe'].search(e.get('class')):
- weight += 25
-
- if e.get('id', None):
- if REGEXES['negativeRe'].search(e.get('id')):
- weight -= 25
-
- if REGEXES['positiveRe'].search(e.get('id')):
- weight += 25
-
- return weight
-
- def score_node(self, elem):
- content_score = self.class_weight(elem)
- name = elem.tag.lower()
- if name == "div":
- content_score += 5
- elif name in ["pre", "td", "blockquote"]:
- content_score += 3
- elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form"]:
- content_score -= 3
- elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th"]:
- content_score -= 5
- return {
- 'content_score': content_score,
- 'elem': elem
- }
-
- def debug(self, *a):
- #if self.options['debug']:
- logging.debug(*a)
-
- def remove_unlikely_candidates(self):
- for elem in self.html.iter():
- s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
- #self.debug(s)
- if (REGEXES['unlikelyCandidatesRe'].search(s) and
- (not REGEXES['okMaybeItsACandidateRe'].search(s)) and
- elem.tag != 'body' and
- elem.getparent() is not None
- ):
- self.debug("Removing unlikely candidate - %s" % describe(elem))
- elem.drop_tree()
-
- def transform_misused_divs_into_paragraphs(self):
- for elem in self.tags(self.html, 'div'):
- # transform
s - if not REGEXES['divToPElementsRe'].search(unicode(''.join(map(tostring, list(elem))))): - #self.debug("Altering %s to p" % (describe(elem))) - elem.tag = "p" - #print "Fixed element "+describe(elem) - - for elem in self.tags(self.html, 'div'): - if elem.text and elem.text.strip(): - p = fragment_fromstring('
') - p.text = elem.text - elem.text = None - elem.insert(0, p) - #print "Appended "+tounicode(p)+" to "+describe(elem) - - for pos, child in reversed(list(enumerate(elem))): - if child.tail and child.tail.strip(): - p = fragment_fromstring('') - p.text = child.tail - child.tail = None - elem.insert(pos + 1, p) - #print "Inserted "+tounicode(p)+" to "+describe(elem) - if child.tag == 'br': - #print 'Dropped