chore: cleaned up python and other unneeded comments

pull/5/head
Adam Pash 8 years ago
parent bf13b38a9b
commit cbd0636dcf

@@ -1 +0,0 @@
[Deleted line: raw Wikipedia HTML from the "Brihadeeswarar Temple fire" article (the fire and rescue operations, plus the Aftermath, Notes, and References sections), removed in this commit.]

@@ -17,16 +17,9 @@ describe('extractCleanNode(article, { $, cleanConditionally, title } })', () =>
};
const bestNode = extractBestNode($, opts);
// let result = $.html(bestNode);
// // console.log(result)
// // console.log(result.length)
const cleanNode = extractCleanNode(bestNode, { $, opts });
// result = $.html(cleanNode);
// // console.log(result.length)
// // console.log(result)
// // console.log(bestNode.html())
assert.equal($(cleanNode).text().length, 2834);
});
});

@@ -19,8 +19,5 @@ describe('extractBestNode($, flags)', () => {
const bestNode = extractBestNode($, opts);
assert(typeof bestNode, 'object');
// console.log(bestNode.html())
// assert.equal($(bestNode).text().length, 3652)
});
});

@@ -1,7 +1,6 @@
import assert from 'assert';
import cheerio from 'cheerio';
// import HTML from './fixtures/html'
import GenericDekExtractor from './extractor';
describe('GenericDekExtractor', () => {

@@ -67,382 +67,3 @@ const Resource = {
};
export default Resource;
// def __init__(self, url, parse_non_2xx=False, response=None):
// """ Create a Resource.
//
// :param url: The URL for the document we should retrieve.
// :param parse_non_2xx: If True, attempt to parse non-200 level
// resources. If False, raise a RetrievalFailed
// based exception. Default is False.
// :param response: If not None, use as the response rather than
// attempting to fetch it ourselves. Expects a
// requests.models.Response object.
// """
// self.url = url
// self.parse_non_2xx = parse_non_2xx
//
// if response:
// self.response = response
// else:
// self.response = self._fetch_resource()
// Iris: Human-friendly content extraction.
// import logging
// import lxml
// import re
// import requests
// import socket
//
// from django.conf import settings
// from lxml.etree import XPathEvaluator
// from lxml.html.clean import Cleaner
// from urlparse import urlparse
//
// from utils.dom import extract_by_selector as ebs, convert_lazy_loaded_images
// from utils.dom.attribmap import AttribMap
// from utils.statsd import stats
// from utils.text import is_text
// from utils.html import get_charset_from_html, strip_content_encodings
//
// from . import exceptions
//
// logger = logging.getLogger(__name__)
//
// # Hosts that are allowed to use embeds and iframes. We should be very
// # restrictive with this and only include top-tier video sites.
// host_whitelist = ['www.youtube.com', 'www.vimeo.com']
//
// # The number of seconds to attempt to fetch a resource before timing out.
// FETCH_TIMEOUT = 10
//
// cleaner = Cleaner(
// style=True,
// page_structure=False,
// meta=False,
// add_nofollow=False, # done by hand
// remove_unknown_tags=False,
// links=False,
// host_whitelist=host_whitelist)
//
//
//
// class Resource(object):
// """ A Resource is a wrapper class for an HTTP resource. Provides
// functionality to fetch a resource as well as a handful of shortcut
// methods to run xpath efficiently on HTML, etc.
//
// Uses requests and lxml internally for fetching and querying.
// """
//
//
// def __init__(self, url, parse_non_2xx=False, response=None):
// """ Create a Resource.
//
// :param url: The URL for the document we should retrieve.
// :param parse_non_2xx: If True, attempt to parse non-200 level
// resources. If False, raise a RetrievalFailed
// based exception. Default is False.
// :param response: If not None, use as the response rather than
// attempting to fetch it ourselves. Expects a
// requests.models.Response object.
// """
// self.url = url
// self.parse_non_2xx = parse_non_2xx
//
// if response:
// self.response = response
// else:
// self.response = self._fetch_resource()
//
// def __unicode__(self):
// return u'<Resource ({0})>'.format(self.url)
//
// def __repr__(self):
// return "<Resource ({0})>".format(self.url)
//
// @classmethod
// def fabricate(kls, url, content, headers=None):
// """ Given a URL and some content, create a fake Resource that looks
// as though it has already fetched the content. Useful for using
// Resource objects without having to do a GET.
// """
//
// if type(content) != unicode:
// raise TypeError("Provided content must be unicode.")
//
// if headers is None:
// headers = {}
//
// try:
// utf8_content = content.encode('utf-8', 'strict')
// except UnicodeDecodeError:
// logger.warning("Unable to encode content for url %s. Content "
// "should be unicode and encodeable at this point.")
// utf8_content = content.encode('utf-8', 'replace')
//
// mocked_response_dict = {
// "cookies": {},
// "_content": utf8_content,
// "headers": dict({
// "content-length": len(content),
// "accept-ranges": "bytes",
// "vary": "Accept-Encoding,Cookie",
// "server": "Apache/2.2.21",
// "content-type": "text/html; charset=UTF-8"
// }, **headers),
// "url": url,
// "status_code": 200,
// "_content_consumed": False,
// "request": None,
// "raw": None,
// "error": None,
// "config": {
// "decode_unicode": True,
// "pool_connections": 10,
// "verbose": None,
// "keep_alive": True,
// "max_retries": 0,
// "base_headers": {
// "Accept-Encoding": "identity, deflate, compress, gzip",
// "Accept": "|)}>#*",
// "User-Agent": "python-requests/0.8.1"
// },
// "pool_maxsize": 10,
// "safe_mode": False,
// "max_redirects": 30
// },
// "history": []
// }
// mocked_response = requests.Response()
// for k, v in mocked_response_dict.items():
// setattr(mocked_response, k, v)
//
// return Resource(
// url = url,
// response = mocked_response
// )
//
//
// @property
// def url(self):
// return self._url
//
//
// @url.setter
// def url(self, value):
// parsed_url = urlparse(value)
// if parsed_url.scheme not in ('http', 'https'):
// raise ValueError("Resource only allows HTTP and HTTPS urls.")
//
// if not parsed_url.netloc:
// raise ValueError("Relative URLs are not allowed.")
//
// self._url = value
//
// _parsed_url = None
// @property
// def parsed_url(self):
// if self._parsed_url is None:
// self._parsed_url = urlparse(self.url)
// return self._parsed_url
//
// @property
// def status_code(self):
// return self.response.status_code
//
//
// _content = None
// @property
// def content(self):
// """Return the content for a resource. Always returns unicode.
//
// """
// if self._content is None:
// # Requests that come in without content-type encoding headers will
// # default to iso-8859-1, which could be wrong
// if (self.response.encoding and
// self.response.encoding.lower() == 'iso-8859-1'):
// # Dont send unicode, because it could have been decoded wrong
// # by an incorrect content-type guess.
// encoding = get_charset_from_html(self.response.content) or 'iso-8859-1'
//
// if encoding != self.response.encoding:
// # First, try to use the encoding we found in the markup
// try:
// self._content = self.response.content.decode(encoding)
// except (LookupError, UnicodeDecodeError):
// stats.increment(
// 'iris.resource.encoding.encoding_mismatch')
// # That encoding might be wrong though, so if it is, use
// # the one it reported since they could have the wrong
// # one set in the markup. eg. sending the content over
// # as iso but declaring it to be utf-8 like gq.com does.
// # We may also end up with an invalid encoding type, at
// # which point we should also just use the request
// # encoding and replace silently.
// self._content = self.response.content.decode(
// self.response.encoding, 'replace')
// else:
// # If the encoding guess was right, just use the unicode
// self._content = self.response.text
//
// else:
// # Otherwise we trust the encoding
// self._content = self.response.text
//
// return self._content
//
//
// @property
// def content_type(self):
// return self.response.headers.get('content-type', '')
//
//
// @property
// def is_html(self):
// if 'html' in self.content_type:
// return True
//
// # Otherwise, just try parsing it and see if it succeeds
// try:
// return (self.doc is not None)
// except:
// return False
//
// @property
// def is_plaintext(self):
// if 'text/plain' in self.content_type:
// return True
//
// return False
//
// @property
// def is_image(self):
// if 'image' in self.content_type:
// return True
//
// return False
//
// @property
// def is_pdf(self):
// if 'pdf' in self.content_type:
// return True
//
// return False
//
// _lxml_doc = None
// @property
// def doc(self):
// if self._lxml_doc is None:
// self._generate_lxml_doc()
//
// return self._lxml_doc
//
// _docxp = None
// @property
// def docxp(self):
// """ Generate an XPath Evaluator for this doc. """
// if self._docxp is None:
// self._docxp = XPathEvaluator(self.doc)
//
// return self._docxp
//
// _redocxp = None
// @property
// def redocxp(self):
// """ Generate an XPath Evaluator for this doc, that includes the RE
// namespace for regular expression matching.
//
// """
// if self._redocxp is None:
// _rens = {'re':'http://exslt.org/regular-expressions'}
// self._redocxp = XPathEvaluator(self.doc, namespaces=_rens)
//
// return self._redocxp
//
// def _generate_lxml_doc(self):
// # First check if we have a text based resource
// if (not 'html' in self.content_type and
// not 'text' in self.content_type and
// not is_text(self.content[:512])):
// raise ValueError("Content does not appear to be text.")
//
//
// # Remove useless carriage returns which get parsed as &#13; otherwise
// content = re.sub(r'(\n\r|\r\n)', '\n', self.content)
//
// # Dont pass any content encodings into lxml, it is dumb about them
// content = strip_content_encodings(content)
//
// self._lxml_doc = lxml.html.fromstring(content)
//
//
//
//
// if len(self._lxml_doc.getchildren()) == 0:
// stats.increment('iris.resource.encoding.no_children')
// raise ValueError("No children, likely a bad parse.")
//
//
// # Sometimes, lxml (or BeautifulSoup) will wrap the whole document
// # in an extra html tag. This screws up a whole bunch of things in
// # the parsing process. If this is the case, reset the doc to the
// # ACTUAL root of the doc.
// # Sample cases:
// # * Strange Doctype causing issues: http://bit.ly/IATz0B
// # * Messy markup causing double HTML tags: http://bit.ly/IGOq4o
// # Also check for a body inside of our internal HTML tag, to determine
// # that it's not just a junk HTML tag sibling at the bottom of the
// # doc or something.
// internal_html_tag = self._lxml_doc.find('html')
// if (internal_html_tag is not None and
// len(internal_html_tag.xpath('.//body')) > 0):
// self._lxml_doc = internal_html_tag
//
// self._normalize_meta_tags()
//
// self._lxml_doc.make_links_absolute(self.url)
//
// # Convert any lazy loaded images into normal images before clean_html
// # which will strip all other attributes
// self._lxml_doc = convert_lazy_loaded_images(self._lxml_doc)
//
// # Clean the doc of anything malicious.
// self._lxml_doc = cleaner.clean_html(self._lxml_doc)
//
// # Manually nofollow links so that we don't clobber rel author
// # Workaround for https://bugs.launchpad.net/lxml/+bug/971754
// for a in self.docxp('//a'):
// if a.attrib.get('rel', None):
// rel_attribs = set(a.attrib['rel'].split())
// rel_attribs.add('nofollow')
// a.attrib['rel'] = ' '.join(rel_attribs)
// else:
// a.attrib['rel'] = 'nofollow'
//
// # Re-relativize anchor links
// anchor_link_xpath = ("//a[starts-with(@href, '%s#')]" %
// self.url.replace("'", "%27"))
// for link in self.docxp(anchor_link_xpath):
// link.attrib['href'] = link.attrib['href'].replace(self.url, '')
//
//
// _attrib_map = None
// @property
// def attrib_map(self):
// """ Create an AttribMap object for fast checking of class/id existence
// in the document. Used in association with extract_by_selector.
//
// """
// if self._attrib_map is None:
// self._attrib_map = AttribMap(self.doc)
//
// return self._attrib_map
//
//
// def extract_by_selector(self, selector):
// " Shortcut to run extract_by_selector on our doc with our AttribMap. "
// return ebs(self.doc, selector, self.attrib_map, self.docxp)
//
//
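The removed Python comments above document the original Iris Resource class: it wraps a URL, fetches it unless a ready-made response is passed in, and treats non-2xx responses as failures unless parse_non_2xx is set. A rough JavaScript sketch of that behavior, offered only for context; the createResource name and the use of a global fetch are assumptions, not the Resource object exported above.

async function createResource(url, response = null) {
  if (response) {
    // Mirrors the Python `response=` kwarg: use the supplied response as-is.
    return { url, response };
  }

  const res = await fetch(url); // assumes a global fetch implementation
  if (!res.ok) {
    // Mirrors parse_non_2xx=False: non-2xx responses are treated as failures.
    throw new Error(`Retrieval failed for ${url} (status ${res.status})`);
  }

  return { url, response: res, content: await res.text() };
}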

@@ -4,8 +4,8 @@ import { convertNodeTo } from 'utils/dom';
// by the title extractor instead. If there's less than 3 of them (<3),
// strip them. Otherwise, turn 'em into H2s.
export default function cleanHOnes(article, $) {
// const hOnes = $.find('h1')
const $hOnes = $('h1', article);
if ($hOnes.length < 3) {
$hOnes.each((index, node) => $(node).remove());
} else {
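The comment at the top of this hunk states the rule cleanHOnes applies: h1s are assumed to have been picked up by the title extractor, so fewer than three are stripped, while three or more are demoted to h2s. A standalone sketch of that rule using cheerio directly; demoteOrStripH1s is an illustrative name, not the repo's export.

import cheerio from 'cheerio';

function demoteOrStripH1s(html) {
  const $ = cheerio.load(html);
  const $hOnes = $('h1');

  if ($hOnes.length < 3) {
    // Fewer than three h1s: treat them as stray titles and drop them.
    $hOnes.each((index, node) => $(node).remove());
  } else {
    // Three or more: keep the content but demote each h1 to an h2.
    $hOnes.each((index, node) => {
      $(node).replaceWith(`<h2>${$(node).html()}</h2>`);
    });
  }

  return $.html();
}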

@@ -6,7 +6,7 @@ function convertDivs($) {
$('div').each((index, div) => {
const $div = $(div);
const convertable = $div.children(DIV_TO_P_BLOCK_TAGS).length === 0;
// .not(DIV_TO_P_BLOCK_TAGS).length === 0;
if (convertable) {
convertNodeTo($div, $, 'p');
}

@@ -12,6 +12,5 @@ function absolutize($, rootUrl, attr, $content) {
export default function makeLinksAbsolute($content, $, url) {
['href', 'src'].forEach(attr => absolutize($, url, attr, $content));
// console.log($content.html())
return $content;
}

@@ -3,10 +3,6 @@ import {
CANDIDATES_BLACKLIST,
} from './constants';
// ## NOTES:
// This is a working first pass, but if/when we start optimizing
// this is a good candidate. - AP
export default function stripUnlikelyCandidates($) {
// Loop through the provided document and remove any non-link nodes
// that are unlikely candidates for article content.
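The notes in this hunk describe the strategy: loop over the document and remove non-link nodes whose class or id marks them as unlikely article content (navigation, comments, ads, and so on). A rough sketch of that idea; the regex below is illustrative and is not the repo's CANDIDATES_BLACKLIST.

import cheerio from 'cheerio';

const UNLIKELY_CANDIDATES = /comment|community|disqus|foot|header|menu|nav|sidebar|sponsor/i;

function stripUnlikelyCandidates($) {
  $('*').not('a').each((index, node) => {
    const $node = $(node);
    const classAndId = `${$node.attr('class') || ''} ${$node.attr('id') || ''}`;

    if (UNLIKELY_CANDIDATES.test(classAndId)) {
      $node.remove();
    }
  });

  return $;
}

// Usage: const $ = cheerio.load(html); stripUnlikelyCandidates($);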

@@ -1,4 +1,3 @@
// extremely simple url validation as a first step
export default function validateUrl({ hostname }) {
// If this isn't a valid url, return an error message
