remove dead code

pull/405/head
Björgvin Ragnarsson 7 years ago committed by Gijs
parent b3fde168cb
commit c3ff1a2d2c

@@ -31,7 +31,6 @@ function Readability(uri, doc, options) {
this._uri = uri;
this._doc = doc;
this._biggestFrame = false;
this._articleTitle = null;
this._articleByline = null;
this._articleDir = null;
@@ -40,7 +39,6 @@ function Readability(uri, doc, options) {
this._debug = !!options.debug;
this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
this._maxPages = options.maxPages || this.DEFAULT_MAX_PAGES;
this._wordThreshold = options.wordThreshold || this.DEFAULT_WORD_THRESHOLD;
// Start with all flags set
@@ -48,17 +46,6 @@ function Readability(uri, doc, options) {
this.FLAG_WEIGHT_CLASSES |
this.FLAG_CLEAN_CONDITIONALLY;
// The list of pages we've parsed in this call of readability,
// for autopaging. As a key store for easier searching.
this._parsedPages = {};
// A list of the ETag headers of pages we've parsed; if a new page's ETag
// matches one of these, we know it's a duplicate.
this._pageETags = {};
// Make an AJAX request for each page and append it to the document.
this._curPageNum = 1;
var logEl;
// Control whether log messages are sent to the console
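For context on what this hunk strips out: `_parsedPages` and `_pageETags` were plain objects used as membership sets, keyed by cleaned page URL and by ETag header respectively, so the autopager could skip duplicate pages in constant time. A minimal sketch of that pattern (the `PageTracker` and `seenPage` names are hypothetical, not from Readability):

```js
// Plain objects as sets: the removed bookkeeping, condensed.
function PageTracker() {
  this.parsedPages = {}; // e.g. {"http://example.com/story/2": true}
  this.pageETags = {};   // e.g. {"\"33a64df5\"": 1}
}

PageTracker.prototype.seenPage = function(url, eTag) {
  if (url in this.parsedPages || (eTag && eTag in this.pageETags))
    return true; // duplicate page, skip it
  this.parsedPages[url] = true;
  if (eTag)
    this.pageETags[eTag] = 1;
  return false;
};
```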
@@ -104,10 +91,6 @@ Readability.prototype = {
// tight the competition is among candidates.
DEFAULT_N_TOP_CANDIDATES: 5,
// The maximum number of pages to loop through before we call
// it quits and just show a link.
DEFAULT_MAX_PAGES: 5,
// Element tags to score by default.
DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
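That last context line is easy to misread, so for reference, it simply builds the array of tag names that get scored:

```js
// Unchanged by this commit; shown only to spell out the one-liner.
"section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(",");
// → ["SECTION", "H2", "H3", "H4", "H5", "H6", "P", "TD", "PRE"]
```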
@@ -1037,24 +1020,22 @@ Readability.prototype = {
if (this._debug)
this.log("Article content post-prep: " + articleContent.innerHTML);
if (this._curPageNum === 1) {
if (neededToCreateTopCandidate) {
// We already created a fake div thing, and there wouldn't have been any siblings left
// for the previous loop, so there's no point trying to create a new div, and then
// move all the children over. Just assign IDs and class names here. No need to append
// because that already happened anyway.
topCandidate.id = "readability-page-1";
topCandidate.className = "page";
} else {
var div = doc.createElement("DIV");
div.id = "readability-page-1";
div.className = "page";
var children = articleContent.childNodes;
while (children.length) {
div.appendChild(children[0]);
}
articleContent.appendChild(div);
if (neededToCreateTopCandidate) {
// We already created a fake div thing, and there wouldn't have been any siblings left
// for the previous loop, so there's no point trying to create a new div, and then
// move all the children over. Just assign IDs and class names here. No need to append
// because that already happened anyway.
topCandidate.id = "readability-page-1";
topCandidate.className = "page";
} else {
var div = doc.createElement("DIV");
div.id = "readability-page-1";
div.className = "page";
var children = articleContent.childNodes;
while (children.length) {
div.appendChild(children[0]);
}
articleContent.appendChild(div);
}
if (this._debug)
@@ -1315,363 +1296,6 @@ Readability.prototype = {
return linkLength / textLength;
},
/**
* Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.
*
* @author Dan Lacy
* @return string the base url
**/
_findBaseUrl: function() {
var uri = this._uri;
var noUrlParams = uri.path.split("?")[0];
var urlSlashes = noUrlParams.split("/").reverse();
var cleanedSegments = [];
var possibleType = "";
for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i += 1) {
var segment = urlSlashes[i];
// Split off and save anything that looks like a file type.
if (segment.indexOf(".") !== -1) {
possibleType = segment.split(".")[1];
// If the type isn't alpha-only, it's probably not actually a file extension.
if (!possibleType.match(/[^a-zA-Z]/))
segment = segment.split(".")[0];
}
// If our first or second segment has anything looking like a page number, remove it.
if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0)))
segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "");
var del = false;
// If this is purely a number, and it's the first or second segment,
// it's probably a page number. Remove it.
if (i < 2 && segment.match(/^\d{1,2}$/))
del = true;
// If this is the first segment and it's just "index", remove it.
if (i === 0 && segment.toLowerCase() === "index")
del = true;
// If our first or second segment is smaller than 3 characters,
// and the first segment contains no letters, remove it.
if (i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i))
del = true;
// If it's not marked for deletion, push it to cleanedSegments.
if (!del)
cleanedSegments.push(segment);
}
// This is our final, cleaned, base article URL.
return uri.scheme + "://" + uri.host + cleanedSegments.reverse().join("/");
},
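To make the removed heuristic concrete, here is a simplified standalone sketch of what `_findBaseUrl` does; `findBaseUrlSketch` and the example URL are made up, and the "index"/short-segment edge cases are omitted:

```js
// Simplified sketch: strip the query string, an alpha-only file extension,
// and page-number-ish suffixes from the last two path segments.
function findBaseUrlSketch(scheme, host, path) {
  var segments = path.split("?")[0].split("/").reverse();
  var cleaned = [];
  for (var i = 0; i < segments.length; i++) {
    var segment = segments[i];
    // "my-story_p2.html" → "my-story_p2" (only if the extension is alpha-only).
    if (segment.indexOf(".") !== -1 && !segment.split(".")[1].match(/[^a-zA-Z]/))
      segment = segment.split(".")[0];
    // Strip "_p2", "-3", etc. from the last two segments of the path.
    if (i < 2)
      segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "");
    // A bare one- or two-digit segment near the end is probably a page number.
    if (i < 2 && segment.match(/^\d{1,2}$/))
      continue;
    cleaned.push(segment);
  }
  return scheme + "://" + host + cleaned.reverse().join("/");
}

// findBaseUrlSketch("http", "example.com", "/articles/2012/my-story_p2.html?ref=rss")
// → "http://example.com/articles/2012/my-story"
```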
/**
* Look for any paging links that may occur within the document.
*
* @param body
* @return object (array)
**/
_findNextPageLink: function(elem) {
var uri = this._uri;
var possiblePages = {};
var allLinks = elem.getElementsByTagName('a');
var articleBaseUrl = this._findBaseUrl();
// Loop through all links, looking for hints that they may be next-page links.
// Things like having "page" in their textContent, className or id, or being a child
// of a node with a page-y className or id.
//
// Also possible: levenshtein distance? longest common subsequence?
//
// After we do that, assign each page a score, and pick the one with the highest score.
for (var i = 0, il = allLinks.length; i < il; i += 1) {
var link = allLinks[i];
var linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '');
// If we've already seen this page, ignore it.
if (linkHref === "" ||
linkHref === articleBaseUrl ||
linkHref === uri.spec ||
linkHref in this._parsedPages) {
continue;
}
// If it's on a different domain, skip it.
if (uri.host !== linkHref.split(/\/+/g)[1])
continue;
var linkText = this._getInnerText(link);
// If the linkText looks like it's not the next page, skip it.
if (linkText.match(this.REGEXPS.extraneous) || linkText.length > 25)
continue;
// If the leftovers of the URL after removing the base URL don't contain
// any digits, it's certainly not a next page link.
var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');
if (!linkHrefLeftover.match(/\d/))
continue;
if (!(linkHref in possiblePages)) {
possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref};
} else {
possiblePages[linkHref].linkText += ' | ' + linkText;
}
var linkObj = possiblePages[linkHref];
// If the articleBaseUrl isn't part of this URL, penalize this link. It could
// still be the link, but the odds are lower.
// Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
if (linkHref.indexOf(articleBaseUrl) !== 0)
linkObj.score -= 25;
var linkData = linkText + ' ' + link.className + ' ' + link.id;
if (linkData.match(this.REGEXPS.nextLink))
linkObj.score += 50;
if (linkData.match(/pag(e|ing|inat)/i))
linkObj.score += 25;
if (linkData.match(/(first|last)/i)) {
// -65 is enough to negate any bonuses gotten from a > or » in the text,
// If we already matched on "next", last is probably fine.
// If we didn't, then it's bad. Penalize.
if (!linkObj.linkText.match(this.REGEXPS.nextLink))
linkObj.score -= 65;
}
if (linkData.match(this.REGEXPS.negative) || linkData.match(this.REGEXPS.extraneous))
linkObj.score -= 50;
if (linkData.match(this.REGEXPS.prevLink))
linkObj.score -= 200;
// If a parentNode contains page or paging or paginat
var parentNode = link.parentNode;
var positiveNodeMatch = false;
var negativeNodeMatch = false;
while (parentNode) {
var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id;
if (!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) {
positiveNodeMatch = true;
linkObj.score += 25;
}
if (!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(this.REGEXPS.negative)) {
// If this is just something like "footer", give it a negative.
// If it's something like "body-and-footer", leave it be.
if (!parentNodeClassAndId.match(this.REGEXPS.positive)) {
linkObj.score -= 25;
negativeNodeMatch = true;
}
}
parentNode = parentNode.parentNode;
}
// If the URL looks like it has paging in it, add to the score.
// Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34
if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i))
linkObj.score += 25;
// If the URL matches the extraneous regexp, give a slight decrease.
if (linkHref.match(this.REGEXPS.extraneous))
linkObj.score -= 15;
/**
* Minor punishment to anything that doesn't match our current URL.
* NOTE: I'm finding this to cause more harm than good where something is exactly 50 points.
* Dan, can you show me a counterexample where this is necessary?
* if (linkHref.indexOf(window.location.href) !== 0) {
* linkObj.score -= 1;
* }
**/
// If the link text can be parsed as a number, give it a minor bonus, with a slight
// bias towards lower numbered pages. This is so that pages that might not have 'next'
// in their text can still get scored, and sorted properly by score.
var linkTextAsNumber = parseInt(linkText, 10);
if (linkTextAsNumber) {
// Punish 1 since we're either already there, or it's probably
// before what we want anyways.
if (linkTextAsNumber === 1) {
linkObj.score -= 10;
} else {
linkObj.score += Math.max(0, 10 - linkTextAsNumber);
}
}
}
// Loop through all of our possible pages from above and find our top
// candidate for the next page URL. Require at least a score of 50, which
// is a relatively high confidence that this page is the next link.
var topPage = null;
for (var page in possiblePages) {
if (possiblePages.hasOwnProperty(page)) {
if (possiblePages[page].score >= 50 &&
(!topPage || topPage.score < possiblePages[page].score))
topPage = possiblePages[page];
}
}
var nextHref = null;
if (topPage) {
nextHref = topPage.href.replace(/\/$/, '');
this.log('NEXT PAGE IS ' + nextHref);
this._parsedPages[nextHref] = true;
}
return nextHref;
},
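As a rough illustration of how the removed scorer rates a candidate, consider a hypothetical link on `http://example.com/story` (the markup and tally below are made up, and assume no page-y or negative ancestors):

```js
// <a class="pagination-next" href="http://example.com/story/page/2">Next »</a>
//
//  +50  linkData matches REGEXPS.nextLink ("next" appears in the text)
//  +25  linkData matches /pag(e|ing|inat)/i (className "pagination-next")
//  +25  the href matches /(page|paging)/i
// ----
//  100  well past the >= 50 cutoff, so this link becomes topPage
```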
_successfulRequest: function(request) {
return (request.status >= 200 && request.status < 300) ||
request.status === 304 ||
(request.status === 0 && request.responseText);
},
_ajax: function(url, options) {
var request = new XMLHttpRequest();
function respondToReadyState(readyState) {
if (request.readyState === 4) {
if (this._successfulRequest(request)) {
if (options.success)
options.success(request);
} else if (options.error) {
options.error(request);
}
}
}
if (typeof options === 'undefined')
options = {};
// Bind the handler so this._successfulRequest resolves when the XHR fires it.
request.onreadystatechange = respondToReadyState.bind(this);
request.open('get', url, true);
request.setRequestHeader('Accept', 'text/html');
try {
request.send(options.postBody);
} catch (e) {
if (options.error)
options.error();
}
return request;
},
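Callers of the removed `_ajax` helper used a familiar success/error callback shape. A hypothetical call site from inside a Readability method (the URL is a placeholder):

```js
this._ajax("http://example.com/story/page/2", {
  success: function(request) {
    // Fires for 2xx, 304, or a status-0 response that still has a body.
    console.log("fetched " + request.responseText.length + " bytes");
  },
  error: function(request) {
    console.log("failed to fetch next page");
  }
});
```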
_appendNextPage: function(nextPageLink) {
var doc = this._doc;
this._curPageNum += 1;
var articlePage = doc.createElement("DIV");
articlePage.id = 'readability-page-' + this._curPageNum;
articlePage.className = 'page';
articlePage.innerHTML = '<p class="page-separator" title="Page ' + this._curPageNum + '">&sect;</p>';
doc.getElementById("readability-content").appendChild(articlePage);
if (this._curPageNum > this._maxPages) {
var nextPageMarkup = "<div style='text-align: center'><a href='" + nextPageLink + "'>View Next Page</a></div>";
articlePage.innerHTML = articlePage.innerHTML + nextPageMarkup;
return;
}
// Now that we've built the article page DOM element, get the page content
// asynchronously and load the cleaned content into the div we created for it.
(function(pageUrl, thisPage) {
this._ajax(pageUrl, {
success: function(r) {
// First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page.
var eTag = r.getResponseHeader('ETag');
if (eTag) {
if (eTag in this._pageETags) {
this.log("Exact duplicate page found via ETag. Aborting.");
articlePage.style.display = 'none';
return;
}
this._pageETags[eTag] = 1;
}
// TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.
var page = doc.createElement("DIV");
// Do some preprocessing to our HTML to make it ready for appending.
// - Remove any script tags. Swap and reswap newlines with a unicode
// character because multiline regex doesn't work in javascript.
// - Turn any noscript tags into divs so that we can parse them. This
// allows us to find any next page links hidden via javascript.
// - Turn all double br's into p's - was handled by prepDocument in the original view.
// Maybe in the future abstract out prepDocument to work for both the original document
// and AJAX-added pages.
var responseHtml = r.responseText.replace(/\n/g, '\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
responseHtml = responseHtml.replace(/\n/g, '\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
responseHtml = responseHtml.replace(/\uffff/g, '\n').replace(/<(\/?)noscript/gi, '<$1div');
responseHtml = responseHtml.replace(this.REGEXPS.replaceFonts, '<$1span>');
page.innerHTML = responseHtml;
this._replaceBrs(page);
// Reset all flags for the next page, as they will search through it and
// disable as necessary at the end of grabArticle.
this._flags = 0x1 | 0x2 | 0x4;
var secondNextPageLink = this._findNextPageLink(page);
// NOTE: if we end up supporting _appendNextPage(), we'll need to
// change this call to be async
var content = this._grabArticle(page);
if (!content) {
this.log("No content found in page to append. Aborting.");
return;
}
// Anti-duplicate mechanism. Essentially, get the first paragraph of our new page.
// Compare it against all of the previous documents we've gotten. If a previous
// document contains exactly the innerHTML of this first paragraph, it's probably a duplicate.
var firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null;
if (firstP && firstP.innerHTML.length > 100) {
for (var i = 1; i <= this._curPageNum; i += 1) {
var rPage = doc.getElementById('readability-page-' + i);
if (rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) {
this.log('Duplicate of page ' + i + ' - skipping.');
articlePage.style.display = 'none';
this._parsedPages[pageUrl] = true;
return;
}
}
}
this._removeScripts(content);
thisPage.innerHTML = thisPage.innerHTML + content.innerHTML;
// After the page has rendered, post process the content. This delay is necessary because,
// in webkit at least, offsetWidth is not set in time to determine image width. We have to
// wait a little bit for reflow to finish before we can fix floating images.
setTimeout((function() {
this._postProcessContent(thisPage);
}).bind(this), 500);
if (secondNextPageLink)
this._appendNextPage(secondNextPageLink);
}
});
}).bind(this)(nextPageLink, articlePage);
},
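The duplicate detection above combines two signals: ETag matching and first-paragraph comparison. A compact condensation of the ETag half (`isDuplicateByETag` is a hypothetical name):

```js
function isDuplicateByETag(request, pageETags) {
  var eTag = request.getResponseHeader("ETag");
  if (!eTag)
    return false;       // no ETag; fall back to the first-paragraph check
  if (eTag in pageETags)
    return true;        // exact duplicate page, hide it
  pageETags[eTag] = 1;  // remember it for the pages that follow
  return false;
}
```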
/**
* Get an elements class/id weight. Uses regular expressions to tell if this
* element looks good or bad.
@@ -1954,10 +1578,6 @@ Readability.prototype = {
return (this._flags & flag) > 0;
},
_addFlag: function(flag) {
this._flags = this._flags | flag;
},
_removeFlag: function(flag) {
this._flags = this._flags & ~flag;
},
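For reference, the flag helpers that survive this commit treat `_flags` as a bitmask; the constructor sets the same three flags that the removed `_appendNextPage` reset with `0x1 | 0x2 | 0x4`. A quick illustration of the pattern:

```js
var FLAG_STRIP_UNLIKELYS = 0x1,
    FLAG_WEIGHT_CLASSES = 0x2,
    FLAG_CLEAN_CONDITIONALLY = 0x4;

// Start with all flags set, as the constructor does.
var flags = FLAG_STRIP_UNLIKELYS | FLAG_WEIGHT_CLASSES | FLAG_CLEAN_CONDITIONALLY;

flags = flags & ~FLAG_WEIGHT_CLASSES;                 // _removeFlag
console.log((flags & FLAG_WEIGHT_CLASSES) > 0);       // false (_flagIsActive)
console.log((flags & FLAG_CLEAN_CONDITIONALLY) > 0);  // true
```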
@@ -2048,16 +1668,6 @@ Readability.prototype = {
// Remove script tags from the document.
this._removeScripts(this._doc);
// FIXME: Disabled multi-page article support for now as it
// needs more work on infrastructure.
// Make sure this document is added to the list of parsed pages first,
// so we don't double up on the first page.
// this._parsedPages[uri.spec.replace(/\/$/, '')] = true;
// Pull out any possible next page link first.
// var nextPageLink = this._findNextPageLink(doc.body);
this._prepDocument();
var metadata = this._getArticleMetadata();
@@ -2071,14 +1681,6 @@ Readability.prototype = {
this._postProcessContent(articleContent);
// if (nextPageLink) {
// // Append any additional pages after a small timeout so that people
// // can start reading without having to wait for this to finish processing.
// setTimeout((function() {
// this._appendNextPage(nextPageLink);
// }).bind(this), 500);
// }
// If we haven't found an excerpt in the article's metadata, use the article's
// first paragraph as the excerpt. This is used for displaying a preview of
// the article's content.

@@ -198,11 +198,6 @@ describe("Readability API", function() {
expect(new Readability({}, {}, {nbTopCandidates: 42})._nbTopCandidates).eql(42);
});
it("should accept a maxPages option", function() {
expect(new Readability({}, {})._maxPages).eql(5);
expect(new Readability({}, {}, {maxPages: 42})._maxPages).eql(42);
});
it("should accept a maxElemsToParse option", function() {
expect(new Readability({}, {})._maxElemsToParse).eql(0);
expect(new Readability({}, {}, {maxElemsToParse: 42})._maxElemsToParse).eql(42);
