#!/usr/bin/env python
import logging
import re
import sys
import urlparse

import urlfetch

from collections import namedtuple
from lxml.etree import tostring
from lxml.etree import tounicode
from lxml.html import document_fromstring
from lxml.html import fragment_fromstring

from cleaners import clean_attributes
from cleaners import html_cleaner
from htmls import build_doc
from htmls import get_body
from htmls import get_title
from htmls import shorten_title

logging.basicConfig(level=logging.INFO)
log = logging.getLogger()

PAGE_CLASS = 'article-page'

REGEXES = {
    'unlikelyCandidatesRe': re.compile(
        ('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|'
         'shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|'
         'tweet|twitter'), re.I),
    'okMaybeItsACandidateRe': re.compile(
        'and|article|body|column|main|shadow', re.I),
    'positiveRe': re.compile(
        ('article|body|content|entry|hentry|main|page|pagination|post|text|'
         'blog|story'), re.I),
    'negativeRe': re.compile(
        ('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|'
         'outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|'
         'tool|widget'), re.I),
    'extraneous': re.compile(
        (r'print|archive|comment|discuss|e[\-]?mail|share|reply|all|login'
         '|sign|single'), re.I),
    'divToPElementsRe': re.compile(
        '<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
    # Match: next, continue, >, >>, but not >|, as those usually mean last.
    'nextLink': re.compile(r'(next|weiter|continue|>[^\|]$)', re.I),
    # Match: prev, earlier, old, new, <.
    'prevLink': re.compile(r'(prev|earl|old|new|<)', re.I),
    'page': re.compile(r'pag(e|ing|inat)', re.I),
    'firstLast': re.compile(r'(first|last)', re.I)
    #'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}', re.I),
    #'replaceFontsRe': re.compile('<(\/?)font[^>]*>', re.I),
    #'trimRe': re.compile('^\s+|\s+$/'),
    #'normalizeRe': re.compile('\s{2,}/'),
    #'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
    #'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
    #skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
}


class Unparseable(ValueError):
    pass


# The Summary is a namedtuple because it is more memory efficient and does
# not need to be mutable.
Summary = namedtuple('Summary', ['html', 'confidence', 'title', 'short_title'])


def describe(node, depth=1):
    if not hasattr(node, 'tag'):
        return "[%s]" % type(node)
    name = node.tag
    if node.get('id', ''):
        name += '#' + node.get('id')
    if node.get('class', ''):
        name += '.' + node.get('class').replace(' ', '.')
    if name[:4] in ['div#', 'div.']:
        name = name[3:]
    if depth and node.getparent() is not None:
        return name + ' - ' + describe(node.getparent(), depth - 1)
    return name


def to_int(x):
    if not x:
        return None
    x = x.strip()
    if x.endswith('px'):
        return int(x[:-2])
    if x.endswith('em'):
        return int(x[:-2]) * 12
    return int(x)


def clean(text):
    text = re.sub(r'\s*\n\s*', '\n', text)
    text = re.sub(r'[ \t]{2,}', ' ', text)
    return text.strip()


def text_length(i):
    return len(clean(i.text_content() or ""))


def tags(node, *tag_names):
    for tag_name in tag_names:
        for e in node.findall('.//%s' % tag_name):
            yield e


def class_weight(e):
    weight = 0
    if e.get('class', None):
        if REGEXES['negativeRe'].search(e.get('class')):
            weight -= 25
        if REGEXES['positiveRe'].search(e.get('class')):
            weight += 25
    if e.get('id', None):
        if REGEXES['negativeRe'].search(e.get('id')):
            weight -= 25
        if REGEXES['positiveRe'].search(e.get('id')):
            weight += 25
    return weight


def score_node(elem):
    content_score = class_weight(elem)
    name = elem.tag.lower()
    if name == "div":
        content_score += 5
    elif name in ["pre", "td", "blockquote"]:
        content_score += 3
    elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form"]:
        content_score -= 3
    elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th"]:
        content_score -= 5
    return {
        'content_score': content_score,
        'elem': elem
    }
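

# Illustrative sketch of the scoring helpers above. The markup and the
# expected values are illustrative assumptions derived from the REGEXES
# patterns, not fixtures shipped with the library.
def _example_score_node():
    # 'entry' matches positiveRe (+25) and a <div> adds +5, so the first
    # score should come out around 30; 'comment' matches negativeRe (-25),
    # so the second should land around -20.
    post = fragment_fromstring('<div class="entry">some text</div>')
    comment = fragment_fromstring('<div class="comment">some text</div>')
    return (score_node(post)['content_score'],
            score_node(comment)['content_score'])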


def transform_misused_divs_into_paragraphs(doc):
    for elem in tags(doc, 'div'):
        # transform <div>s that do not contain other block elements into <p>s
        if not REGEXES['divToPElementsRe'].search(
                unicode(''.join(map(tostring, list(elem))))):
            # log.debug("Altering %s to p" % (describe(elem)))
            elem.tag = "p"
            #print "Fixed element "+describe(elem)

    for elem in tags(doc, 'div'):
        if elem.text and elem.text.strip():
            p = fragment_fromstring('<p/>')
            p.text = elem.text
            elem.text = None
            elem.insert(0, p)
            # log.debug("Appended %s to %s" % (tounicode(p), describe(elem)))
            #print "Appended "+tounicode(p)+" to "+describe(elem)

        for pos, child in reversed(list(enumerate(elem))):
            if child.tail and child.tail.strip():
                p = fragment_fromstring('<p/>')
                p.text = child.tail
                child.tail = None
                elem.insert(pos + 1, p)
                # log.debug("Inserted %s to %s" % (
                #     tounicode(p),
                #     describe(elem)))
                #print "Inserted "+tounicode(p)+" to "+describe(elem)
            if child.tag == 'br':
                #print 'Dropped <br> at '+describe(elem)
                child.drop_tree()


def remove_unlikely_candidates(doc):
    for elem in doc.iter():
        s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
        #log.debug(s)
        if (REGEXES['unlikelyCandidatesRe'].search(s)
                and (not REGEXES['okMaybeItsACandidateRe'].search(s))
                and elem.tag != 'body'
                and elem.getparent() is not None):
            # log.debug("Removing unlikely candidate - %s" % describe(elem))
            elem.drop_tree()


def get_link_density(elem):
    link_length = 0
    for i in elem.findall(".//a"):
        link_length += text_length(i)
    #if len(elem.findall(".//div") or elem.findall(".//p")):
    #    link_length = link_length
    total_length = text_length(elem)
    return float(link_length) / max(total_length, 1)
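

# Illustrative sketch of the two helpers above. A <div> holding only inline
# text should be rewritten to a <p>, while a <div> that already contains a
# block element is left alone. The markup is an illustrative assumption.
def _example_transform_divs():
    frag = fragment_fromstring(
        '<div>'
        '<div>only inline text</div>'
        '<div><p>already has a block child</p></div>'
        '</div>')
    transform_misused_divs_into_paragraphs(frag)
    # get_link_density compares anchor text to total text; a nav-like element
    # made mostly of links scores well above the 0.2-0.5 thresholds used
    # later during sanitizing.
    nav = fragment_fromstring(
        '<div><a href="#">menu menu menu menu</a> tail</div>')
    return tounicode(frag), get_link_density(nav)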


def score_paragraphs(doc, options):
    candidates = {}
    #log.debug(str([describe(node) for node in tags(doc, "div")]))
    ordered = []
    for elem in tags(doc, "p", "pre", "td"):
        # log.debug('Scoring %s' % describe(elem))
        parent_node = elem.getparent()
        if parent_node is None:
            continue
        grand_parent_node = parent_node.getparent()

        inner_text = clean(elem.text_content() or "")
        inner_text_len = len(inner_text)

        # If this paragraph is less than the minimum text length, don't even
        # count it.
        if inner_text_len < options['min_text_length']:
            continue

        if parent_node not in candidates:
            candidates[parent_node] = score_node(parent_node)
            ordered.append(parent_node)
        if grand_parent_node is not None and grand_parent_node not in candidates:
            candidates[grand_parent_node] = score_node(grand_parent_node)
            ordered.append(grand_parent_node)

        content_score = 1
        content_score += len(inner_text.split(','))
        content_score += min((inner_text_len / 100), 3)

        #if elem not in candidates:
        #    candidates[elem] = score_node(elem)
        #WTF? candidates[elem]['content_score'] += content_score
        candidates[parent_node]['content_score'] += content_score
        if grand_parent_node is not None:
            candidates[grand_parent_node]['content_score'] += content_score / 2.0

    # Scale the final candidates score based on link density. Good content
    # should have a relatively small link density (5% or less) and be mostly
    # unaffected by this operation.
    for elem in ordered:
        candidate = candidates[elem]
        ld = get_link_density(elem)
        score = candidate['content_score']
        # log.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (
        #     score,
        #     describe(elem),
        #     ld,
        #     score * (1 - ld)))
        candidate['content_score'] *= (1 - ld)

    return candidates


def select_best_candidate(candidates):
    sorted_candidates = sorted(candidates.values(),
                               key=lambda x: x['content_score'],
                               reverse=True)
    for candidate in sorted_candidates[:5]:
        elem = candidate['elem']
        log.debug("Top 5 : %6.3f %s" % (
            candidate['content_score'],
            describe(elem)))
    if len(sorted_candidates) == 0:
        return None
    best_candidate = sorted_candidates[0]
    return best_candidate


def reverse_tags(node, *tag_names):
    for tag_name in tag_names:
        for e in reversed(node.findall('.//%s' % tag_name)):
            yield e
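

# Illustrative sketch of the scoring pipeline above. Both long paragraphs sit
# in div#main, so that div should accumulate the highest score and be the
# candidate returned by select_best_candidate. The HTML and the options dict
# are illustrative assumptions; only 'min_text_length' is read here.
def _example_score_and_select():
    para = 'A long enough paragraph of plausible article text. ' * 3
    html = ('<html><body>'
            '<div id="main"><p>' + para + '</p><p>' + para + '</p></div>'
            '<div id="nav"><p>short</p></div>'
            '</body></html>')
    doc = document_fromstring(html)
    candidates = score_paragraphs(doc, {'min_text_length': 25})
    best = select_best_candidate(candidates)
    return describe(best['elem'])  # expected to describe the div#main node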


def sanitize(node, candidates, options):
    for header in tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
        if class_weight(header) < 0 or get_link_density(header) > 0.33:
            header.drop_tree()

    for elem in tags(node, "form", "iframe", "textarea"):
        elem.drop_tree()

    allowed = {}
    # Conditionally clean <table>s, <ul>s, and <div>s
    for el in reverse_tags(node, "table", "ul", "div"):
        if el in allowed:
            continue
        weight = class_weight(el)
        if el in candidates:
            content_score = candidates[el]['content_score']
            #print '!',el, '-> %6.3f' % content_score
        else:
            content_score = 0
        tag = el.tag

        if weight + content_score < 0:
            # log.debug("Cleaned %s with score %6.3f and weight %-3s" %
            #     (describe(el), content_score, weight, ))
            el.drop_tree()
        elif el.text_content().count(",") < 10:
            counts = {}
            for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
                counts[kind] = len(el.findall('.//%s' % kind))
            counts["li"] -= 100

            # Count the text length excluding any surrounding whitespace
            content_length = text_length(el)
            link_density = get_link_density(el)
            parent_node = el.getparent()
            if parent_node is not None:
                if parent_node in candidates:
                    content_score = candidates[parent_node]['content_score']
                else:
                    content_score = 0
            #if parent_node is not None:
            #    pweight = class_weight(parent_node) + content_score
            #    pname = describe(parent_node)
            #else:
            #    pweight = 0
            #    pname = "no parent"
            to_remove = False
            reason = ""

            #if el.tag == 'div' and counts["img"] >= 1:
            #    continue
            if counts["p"] and counts["img"] > counts["p"]:
                reason = "too many images (%s)" % counts["img"]
                to_remove = True
            elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
                reason = "more <li>s than <p>s"
                to_remove = True
            elif counts["input"] > (counts["p"] / 3):
                reason = "less than 3x <p>s than <input>s"
                to_remove = True
            elif content_length < options['min_text_length'] and \
                    (counts["img"] == 0 or counts["img"] > 2):
                reason = ("too short content length %s without a single image"
                          % content_length)
                to_remove = True
            elif weight < 25 and link_density > 0.2:
                reason = ("too many links %.3f for its weight %s"
                          % (link_density, weight))
                to_remove = True
            elif weight >= 25 and link_density > 0.5:
                reason = ("too many links %.3f for its weight %s"
                          % (link_density, weight))
                to_remove = True
            elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
                reason = ("<embed>s with too short content length, or too "
                          "many <embed>s")
                to_remove = True

            # Don't really understand what this is doing. Originally the i/j
            # were assigned with "=+", which just sets the value to 1; that
            # was presumably meant to be "+=". But they are then compared to
            # x, which is hard-coded to 1, so each loop below only ever
            # collects one sibling. Will have to investigate when we get to
            # testing more pages.
            # find x non empty preceding and succeeding siblings
            i, j = 0, 0
            x = 1
            siblings = []
            for sib in el.itersiblings():
                #log.debug(sib.text_content())
                sib_content_length = text_length(sib)
                if sib_content_length:
                    i += 1
                    siblings.append(sib_content_length)
                    if i == x:
                        break
            for sib in el.itersiblings(preceding=True):
                #log.debug(sib.text_content())
                sib_content_length = text_length(sib)
                if sib_content_length:
                    j += 1
                    siblings.append(sib_content_length)
                    if j == x:
                        break
            #log.debug(str(siblings))
            if siblings and sum(siblings) > 1000:
                to_remove = False
                log.debug("Allowing %s" % describe(el))
                for desnode in tags(el, "table", "ul", "div"):
                    allowed[desnode] = True

            if to_remove:
                # log.debug("Cleaned %6.3f %s with weight %s cause it has %s." %
                #     (content_score, describe(el), weight, reason))
                #print tounicode(el)
                #log.debug("pname %s pweight %.3f" % (pname, pweight))
                el.drop_tree()

    # for el in ([node] + [n for n in node.iter()]):
    #     if not (self.options['attributes']):
    #         #el.attrib = {}  #FIXME: Checkout the effects of disabling this
    #         pass

    return clean_attributes(tounicode(node))


def get_raw_article(candidates, best_candidate, enclose_with_html_tag=True):
    # Now that we have the top candidate, look through its siblings for
    # content that might also be related. Things like preambles, content
    # split by ads that we removed, etc.
    sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
    if enclose_with_html_tag:
        output = document_fromstring('<div/>')
        output.getchildren()[0].attrib['id'] = 'page'
    else:
        output = fragment_fromstring('<div/>')
        output.attrib['id'] = 'page'
    best_elem = best_candidate['elem']
    if best_elem.getparent() is not None:
        for sibling in best_elem.getparent().getchildren():
            # if isinstance(sibling, NavigableString): continue
            # (in lxml there is no concept of simple text nodes)
            append = False
            if sibling is best_elem:
                append = True
            sibling_key = sibling  # HashableElement(sibling)

            # Print out sibling information for debugging.
            if sibling_key in candidates:
                sibling_candidate = candidates[sibling_key]
                log.debug("Sibling: %6.3f %s" % (
                    sibling_candidate['content_score'],
                    describe(sibling)))
            else:
                log.debug("Sibling: %s" % describe(sibling))

            if sibling_key in candidates and \
                    candidates[sibling_key]['content_score'] >= sibling_score_threshold:
                append = True

            if sibling.tag == "p":
                link_density = get_link_density(sibling)
                node_content = sibling.text or ""
                node_length = len(node_content)

                if node_length > 80 and link_density < 0.25:
                    append = True
                elif node_length < 80 and link_density == 0 and \
                        re.search(r'\.( |$)', node_content):
                    append = True

            if append:
                # We don't want to append directly to output, but the div
                # in html->body->div
                if enclose_with_html_tag:
                    if sibling.tag == 'body':
                        for elem in sibling.getchildren():
                            output.getchildren()[0].getchildren()[0].append(elem)
                    else:
                        output.getchildren()[0].getchildren()[0].append(sibling)
                else:
                    output.append(sibling)
    else:
        output = best_elem
    return output
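

# Illustrative sketch of the usual flow: score first, then sanitize the
# winning element. The markup and option values are illustrative assumptions;
# the link-heavy div.sidebar carries a negative class weight and should
# normally be dropped, while the long paragraph survives.
def _example_sanitize():
    text = 'Readable article text with enough length to be scored. ' * 5
    html = ('<html><body><div>'
            '<p>' + text + '</p>'
            '<div class="sidebar"><a href="#">one</a> <a href="#">two</a></div>'
            '</div></body></html>')
    doc = document_fromstring(html)
    options = {'min_text_length': 25, 'retry_length': 250}
    candidates = score_paragraphs(doc, options)
    best = select_best_candidate(candidates)
    return sanitize(best['elem'], candidates, options)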


def get_article(doc, options, enclose_with_html_tag=True):
    try:
        ruthless = True
        while True:
            for i in tags(doc, 'script', 'style'):
                i.drop_tree()
            for i in tags(doc, 'body'):
                i.set('id', 'readabilityBody')
            if ruthless:
                remove_unlikely_candidates(doc)
            transform_misused_divs_into_paragraphs(doc)
            candidates = score_paragraphs(doc, options)

            best_candidate = select_best_candidate(candidates)
            if best_candidate:
                confidence = best_candidate['content_score']
                article = get_raw_article(
                    candidates,
                    best_candidate,
                    enclose_with_html_tag=enclose_with_html_tag)
            else:
                if ruthless:
                    log.debug("ruthless removal did not work. ")
                    ruthless = False
                    log.debug("ended up stripping too much - "
                              "going for a safer parse")
                    # try again
                    continue
                else:
                    log.debug("Ruthless and lenient parsing did not work. "
                              "Returning raw html")
                    return Summary(None, 0, '', '')

            cleaned_article = sanitize(article, candidates, options)
            of_acceptable_length = \
                len(cleaned_article or '') >= options['retry_length']
            if ruthless and not of_acceptable_length:
                ruthless = False
                # try again
                continue
            else:
                return Summary(confidence=confidence,
                               html=cleaned_article,
                               short_title=shorten_title(doc),
                               title=get_title(doc))
    except StandardError as e:
        log.exception('error getting summary: ')
        raise Unparseable(str(e)), None, sys.exc_info()[2]
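

# Illustrative sketch: get_article() ties the pipeline together and returns a
# Summary namedtuple. The HTML and option values here are illustrative
# assumptions chosen to clear the default retry_length.
def _example_get_article():
    para = 'A reasonably long sentence of article body text for scoring. ' * 5
    html = ('<html><body>'
            '<div id="content"><p>' + para + '</p><p>' + para + '</p></div>'
            '<div id="footer"><a href="/about">about</a></div>'
            '</body></html>')
    doc = document_fromstring(html)
    summary = get_article(doc, {'min_text_length': 25, 'retry_length': 250})
    # summary.html holds the cleaned article markup; summary.confidence is
    # the winning candidate's score.
    return summary.confidence, summary.html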


def clean_segment_extension(segments, index, segment):
    if segment.find('.') == -1:
        return segment
    else:
        split_segment = segment.split('.')
        possible_type = split_segment[1]
        has_non_alpha = re.search(r'[^a-zA-Z]', possible_type)
        if has_non_alpha:
            return segment
        else:
            return split_segment[0]


def clean_segment_ewcms(segments, index, segment):
    """
    EW-CMS specific segment cleaning. Quoth the original source:
        "EW-CMS specific segment replacement. Ugly.
        Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html"
    """
    return segment.replace(',00', '')


def clean_segment_page_number(segments, index, segment):
    # If our first or second segment has anything looking like a page number,
    # remove it.
    if index >= (len(segments) - 2):
        pattern = r'((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$'
        cleaned = re.sub(pattern, '', segment, flags=re.IGNORECASE)
        if cleaned == '':
            return None
        else:
            return cleaned
    else:
        return segment


def clean_segment_number(segments, index, segment):
    # If this is purely a number, and it's the first or second segment, it's
    # probably a page number. Remove it.
    if index >= (len(segments) - 2) and re.search(r'^\d{1,2}$', segment):
        return None
    else:
        return segment


def clean_segment_index(segments, index, segment):
    if index == (len(segments) - 1) and segment.lower() == 'index':
        return None
    else:
        return segment


def clean_segment_short(segments, index, segment):
    # It is not clear to me what this is accomplishing. The original
    # readability source just says:
    #
    #   "If our first or second segment is smaller than 3 characters, and the
    #   first segment was purely alphas, remove it."
    #
    # However, the code actually checks to make sure that there are no alphas
    # in the segment, rather than checking for purely alphas.
    alphas = re.search(r'[a-z]', segments[-1], re.IGNORECASE)
    if index >= (len(segments) - 2) and len(segment) < 3 and not alphas:
        return None
    else:
        return segment


def clean_segment(segments, index, segment):
    """
    Cleans a single segment of a URL to find the base URL. The base URL is
    used as a reference when evaluating URLs that might be next-page links.
    Returns a cleaned segment string or None, if the segment should be
    omitted entirely from the base URL.
    """
    funcs = [
        clean_segment_extension,
        clean_segment_ewcms,
        clean_segment_page_number,
        clean_segment_number,
        clean_segment_index,
        clean_segment_short
    ]
    cleaned_segment = segment
    for func in funcs:
        if cleaned_segment is None:
            break
        cleaned_segment = func(segments, index, cleaned_segment)
    return cleaned_segment


def filter_none(seq):
    return [x for x in seq if x is not None]


def clean_segments(segments):
    cleaned = [clean_segment(segments, i, s) for i, s in enumerate(segments)]
    return filter_none(cleaned)


def find_base_url(url):
    if url is None:
        return None
    parts = urlparse.urlsplit(url)
    segments = parts.path.split('/')
    cleaned_segments = clean_segments(segments)
    new_path = '/'.join(cleaned_segments)
    new_parts = (parts.scheme, parts.netloc, new_path, '', '')
    base_url = urlparse.urlunsplit(new_parts)
    log.debug('url: %s' % url)
    log.debug('base_url: %s' % base_url)
    return base_url
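

# Illustrative sketch: find_base_url() strips trailing page-number style
# segments so candidate next-page links can be compared against a common
# prefix. The URL and expected result are illustrative assumptions; this one
# should reduce to 'http://www.example.com/story/my-title'.
def _example_find_base_url():
    return find_base_url('http://www.example.com/story/my-title_p2')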


class NextPageCandidate():
    '''
    An object that tracks a single href that is a candidate for the location
    of the next page. Note that this is distinct from the candidates used
    when trying to find the elements containing the article.
    '''
    def __init__(self, link_text, href):
        self.link_text = link_text
        self.href = href
        self.score = 0


def same_domain(lhs, rhs):
    split_lhs = urlparse.urlsplit(lhs)
    split_rhs = urlparse.urlsplit(rhs)
    if split_lhs.netloc == '' or split_rhs.netloc == '':
        return True
    else:
        return split_lhs.netloc == split_rhs.netloc


def strip_trailing_slash(s):
    return re.sub(r'/$', '', s)


def eval_href(parsed_urls, url, base_url, link):
    raw_href = link.get('href')
    if raw_href is None:
        return None, None, False

    href = strip_trailing_slash(raw_href)
    # log.debug('evaluating next page link: %s' % href)

    # If we've already seen this page, ignore it.
    if href == base_url or href == url or href in parsed_urls:
        log.debug('rejecting %s: already seen page' % href)
        return raw_href, href, False

    # If it's on a different domain, skip it.
    if url is not None and not same_domain(url, href):
        # log.debug('rejecting %s: different domain' % href)
        return raw_href, href, False

    return raw_href, href, True


def eval_link_text(link):
    link_text = clean(link.text_content() or '')
    if REGEXES['extraneous'].search(link_text) or len(link_text) > 25:
        return link_text, False
    else:
        return link_text, True


def find_or_create_page_candidate(candidates, href, link_text):
    '''
    Finds or creates a candidate page object for a next-page href. If one
    exists already, which happens if there are multiple links with the same
    href, it is just returned. This returns the tuple (candidate, created).
    '''
    if href in candidates:
        return candidates[href], False
    else:
        candidate = NextPageCandidate(link_text, href)
        candidates[href] = candidate
        return candidate, True
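

# Illustrative sketch of the two small checks used while screening next-page
# links: relative hrefs (empty netloc) count as same-domain, and anchors
# whose text is long or "extraneous" (share, print, ...) are rejected. The
# URLs and markup are illustrative assumptions.
def _example_link_checks():
    same = same_domain('http://example.com/a', 'http://example.com/b')
    different = same_domain('http://example.com/a', 'http://other.org/b')
    relative = same_domain('http://example.com/a', '/page/2')
    link = fragment_fromstring('<a href="/page/2">2</a>')
    text, ok = eval_link_text(link)  # expected: ('2', True)
    return same, different, relative, text, ok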


def eval_possible_next_page_link(parsed_urls, url, base_url, candidates, link):
    raw_href, href, ok = eval_href(parsed_urls, url, base_url, link)
    if not ok:
        return

    link_text, ok = eval_link_text(link)
    if not ok:
        return

    # If the leftovers of the URL after removing the base URL don't contain
    # any digits, it's certainly not a next page link.
    if base_url is not None:
        href_leftover = href.replace(base_url, '')
        if not re.search(r'\d', href_leftover):
            return

    candidate, created = find_or_create_page_candidate(
        candidates,
        href,
        link_text)
    if not created:
        candidate.link_text += ' | ' + link_text

    link_class_name = link.get('class') or ''
    link_id = link.get('id') or ''
    link_data = ' '.join([link_text, link_class_name, link_id])
    # log.debug('link: %s' % tostring(link))
    log.debug('link_data: %s' % link_data)

    if base_url is not None and href.find(base_url) != 0:
        log.debug('no base_url')
        candidate.score -= 25

    if REGEXES['nextLink'].search(link_data):
        log.debug('link_data nextLink regex match')
        candidate.score += 50

    if REGEXES['page'].search(link_data):
        log.debug('link_data page regex match')
        candidate.score += 25

    if REGEXES['firstLast'].search(link_data):
        # If we already matched on "next", last is probably fine. If we
        # didn't, then it's bad. Penalize.
        if not REGEXES['nextLink'].search(candidate.link_text):
            log.debug('link_data matched last but not next')
            candidate.score -= 65

    neg_re = REGEXES['negativeRe']
    ext_re = REGEXES['extraneous']
    if neg_re.search(link_data) or ext_re.search(link_data):
        log.debug('link_data negative/extraneous regex match')
        candidate.score -= 50

    if REGEXES['prevLink'].search(link_data):
        log.debug('link_data prevLink match')
        candidate.score -= 200

    parent = link.getparent()
    positive_node_match = False
    negative_node_match = False
    while parent is not None:
        parent_class = parent.get('class') or ''
        parent_id = parent.get('id') or ''
        parent_class_and_id = ' '.join([parent_class, parent_id])
        if not positive_node_match:
            if REGEXES['page'].search(parent_class_and_id):
                log.debug('positive ancestor match')
                positive_node_match = True
                candidate.score += 25
        if not negative_node_match:
            if REGEXES['negativeRe'].search(parent_class_and_id):
                if not REGEXES['positiveRe'].search(parent_class_and_id):
                    log.debug('negative ancestor match')
                    negative_node_match = True
                    candidate.score -= 25
        parent = parent.getparent()

    if REGEXES['page'].search(href):
        log.debug('href regex match')
        candidate.score += 25

    if REGEXES['extraneous'].search(href):
        log.debug('extraneous regex match')
        candidate.score -= 15

    try:
        link_text_as_int = int(link_text)
        log.debug('link_text looks like %d' % link_text_as_int)
        # Punish 1 since we're either already there, or it's probably before
        # what we want anyway.
        if link_text_as_int == 1:
            candidate.score -= 10
        else:
            candidate.score += max(0, 10 - link_text_as_int)
    except ValueError:
        pass

    log.debug('final score is %d' % candidate.score)


def find_next_page_url(parsed_urls, url, elem):
    links = tags(elem, 'a')
    base_url = find_base_url(url)
    # candidates is a mapping from URLs to NextPageCandidate objects that
    # represent information used to determine if a URL points to the next
    # page in the article.
    candidates = {}
    for link in links:
        eval_possible_next_page_link(
            parsed_urls,
            url,
            base_url,
            candidates,
            link)
    top_page = None
    for page_url, page in candidates.items():
        log.debug('next page score of %s: %s' % (page_url, page.score))
        if 50 <= page.score and (not top_page or top_page.score < page.score):
            top_page = page

    if top_page:
        log.debug('next page link found: %s' % top_page.href)
        parsed_urls.add(top_page.href)
        return top_page.href
    else:
        return None


def page_id(i):
    return 'page-%d' % (i + 1)


def make_page_elem(page_index, elem):
    elem.attrib['id'] = page_id(page_index)
    elem.attrib['class'] = PAGE_CLASS


def first_paragraph(elem):
    paragraphs = elem.xpath('.//p')
    logging.debug('len(paragraphs) is %d' % len(paragraphs))
    if len(paragraphs) > 0:
        return paragraphs[0]
    else:
        return None


def is_suspected_duplicate(doc, page_doc):
    page_p = first_paragraph(page_doc)
    if page_p is None:
        return False
    pages = doc.xpath('//*[contains(@class, $name)]', name=PAGE_CLASS)
    for existing_page in pages:
        existing_page_p = first_paragraph(existing_page)
        if existing_page_p is not None:
            page_p_content = page_p.xpath('string()')
            existing_page_p_content = existing_page_p.xpath('string()')
            if page_p_content == existing_page_p_content:
                return True
    return False


def append_next_page(parsed_urls, page_index, page_url, doc, options):
    logging.debug('appending next page: %s' % page_url)
    fetcher = options['urlfetch']
    html = fetcher.urlread(page_url)
    orig_page_doc = parse(html, page_url)
    next_page_url = find_next_page_url(parsed_urls, page_url, orig_page_doc)
    page_article = get_article(orig_page_doc, options)
    log.debug('Appending ' + str(page_article))
    if page_article.html:
        page_doc = fragment_fromstring(page_article.html)
        make_page_elem(page_index, page_doc)
        if not is_suspected_duplicate(doc, page_doc):
            # page_doc is a singular element containing the page article
            # elements. We want to add its children to the main article
            # document to which we are appending a page.
            if doc.tag == 'html':
                children = doc.getchildren()
                if children[0].tag == 'head':
                    for elem in page_doc:
                        doc.getchildren()[1].append(elem)
                else:
                    for elem in page_doc:
                        doc.getchildren()[0].append(elem)
            else:
                for elem in page_doc:
                    doc.append(elem)
            doc.append(page_doc)
            if next_page_url is not None:
                append_next_page(
                    parsed_urls,
                    page_index + 1,
                    next_page_url,
                    doc,
                    options)


def parse(input, url):
    raw_doc = build_doc(input)
    doc = html_cleaner.clean_html(raw_doc)
    log.debug('parse url: %s', url)
    if url:
        log.debug('making links absolute')
        doc.make_links_absolute(url, resolve_base_href=True)
    else:
        doc.resolve_base_href()
    return doc
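

# Illustrative sketch of next-page detection: a single "Next" link whose href
# extends the base URL with a digit scores 50 via the nextLink regex and is
# returned as the next page. The URLs, markup and expected result are
# illustrative assumptions.
def _example_find_next_page():
    doc = document_fromstring(
        '<html><body>'
        '<a href="http://example.com/article/2">Next &gt;</a>'
        '</body></html>')
    # Expected to return 'http://example.com/article/2' and record it in the
    # parsed_urls set so it is not fetched twice.
    return find_next_page_url(set(), 'http://example.com/article/1', doc)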


class Document:
    """Class to build an etree document out of html."""
    TEXT_LENGTH_THRESHOLD = 25
    RETRY_LENGTH = 250

    def __init__(self, input_doc, **options):
        """Generate the document

        :param input_doc: string of the html content.

        kwargs:
            - attributes:
            - debug: output debug messages
            - min_text_length:
            - multipage: should we check for page 2/3 of article and build
              them together?
            - retry_length:
            - url: will allow adjusting links to be absolute

        """
        if input_doc is None:
            raise ValueError('You must supply a document to process.')
        self.input_doc = input_doc
        self.options = options
        self.options['urlfetch'] = self.options.get(
            'urlfetch', urlfetch.UrlFetch())
        self.options['min_text_length'] = self.options.get(
            'min_text_length', self.TEXT_LENGTH_THRESHOLD)
        self.options['retry_length'] = self.options.get(
            'retry_length', self.RETRY_LENGTH)
        self._html = None

    @property
    def html(self):
        """The parsed html document from the input"""
        if not self._html:
            self._html = parse(self.input_doc, self.options.get('url'))
        return self._html

    def content(self):
        return get_body(self.html)

    def summary_with_metadata(self, enclose_with_html_tag=True):
        """Parse the input content and return a Summary object

        :param enclose_with_html_tag: Bool do you want a full document or
            just the <div> html partial.

        def summary(self):
            doc = self._html(True)
            parsed_urls = set()
            url = self.options['url']
            if url is not None:
                parsed_urls.add(url)
            next_page_url = find_next_page_url(parsed_urls, url, doc)
            page_0 = get_article(doc, self.options)
            page_0_doc = fragment_fromstring(page_0.html)
            page_index = 0
            make_page_elem(page_index, page_0_doc)
            article_doc = B.DIV(page_0_doc)
            article_doc.attrib['id'] = 'article'
            if next_page_url is not None:
                append_next_page(
                    parsed_urls,
                    page_index + 1,
                    next_page_url,
                    article_doc,
                    self.options)
            return Summary(page_0.confidence, tostring(article_doc))
        """
        summary = self._summary(enclose_with_html_tag=enclose_with_html_tag)
        # For this call return the raw Summary object.
        return summary

    def summary(self, enclose_with_html_tag=True):
        """Generate the summary of the html document

        :param enclose_with_html_tag: Bool do you want a full document or
            just the <div> html partial.
        """
        summary = self._summary(enclose_with_html_tag=enclose_with_html_tag)
        # Only return the html to stay consistent with the backwards-
        # compatible api.
        return summary.html

    def _summary(self, enclose_with_html_tag=True):
        # the first page parsed into an ElementTree element
        doc = self.html

        # the set of urls we've processed so far
        parsed_urls = set()
        url = self.options.get('url', None)
        if url is not None:
            parsed_urls.add(url)

        # check the current doc for a next page if requested
        if self.options.get('multipage', False):
            next_page_url = find_next_page_url(parsed_urls, url, doc)
            page_0 = get_article(doc, self.options)
            page_0_doc = fragment_fromstring(page_0.html)
            page_index = 0
            make_page_elem(page_index, page_0_doc)

            if enclose_with_html_tag:
                output = document_fromstring('<div/>')
                output.getchildren()[0].attrib['id'] = 'article'
                output.getchildren()[0].append(page_0_doc)
            else:
                output = fragment_fromstring('<div/>')
                output.attrib['id'] = 'article'
                output.append(page_0_doc)

            if next_page_url is not None:
                append_next_page(
                    parsed_urls,
                    page_index + 1,
                    next_page_url,
                    output,
                    self.options)

            return Summary(tostring(output), page_0.confidence,
                           short_title=shorten_title(output),
                           title=get_title(output))

        return get_article(doc, self.options,
                           enclose_with_html_tag=enclose_with_html_tag)
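

# Illustrative usage sketch of the public entry point, the Document class.
# Reading a local HTML file path from argv is an assumption for demonstration
# only; pass url=... to make links absolute and multipage=True to stitch
# subsequent pages onto the summary.
if __name__ == '__main__':
    if len(sys.argv) > 1:
        raw_html = open(sys.argv[1]).read()
        document = Document(raw_html, multipage=False)
        result = document.summary_with_metadata(enclose_with_html_tag=False)
        sys.stdout.write((result.title or u'').encode('utf-8') + '\n')
        sys.stdout.write((result.html or u'').encode('utf-8') + '\n')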