Split off isProbablyReaderable implementation

5 years ago · 2620542dd1
parent 8c41d92560
commit 2620542dd1
6 changed files with 134 additions and 68 deletions
--- a/README.md
+++ b/README.md
@ -45,6 +45,15 @@ var documentClone = document.cloneNode(true);
 var article = new Readability(documentClone).parse();
 ```

+## What's Readability-readerable?
+
+It's a quick-and-dirty way of figuring out if it's plausible that the contents of a given
+document are suitable for processing with Readability. It is likely to produce both false
+positives and false negatives. The reason it exists is to avoid bogging down a time-sensitive
+process (like loading and showing the user a webpage) with the complex logic in the core of
+Readability. Improvements to its logic (while not deteriorating its performance) are very
+welcome.
+
 ## Tests

 Please run [eslint](http://eslint.org/) as a first check that your changes adhere to our style guidelines.
--- a/Readability-readerable.js
+++ b/Readability-readerable.js
@ -0,0 +1,97 @@
+/* eslint-env es6:false */
+/* globals exports */
+/*
+ * Copyright (c) 2010 Arc90 Inc
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This code is heavily based on Arc90's readability.js (1.7.1) script
+ * available at: http://code.google.com/p/arc90labs-readability
+ */
+
+var REGEXPS = {
+  // NOTE: These two regular expressions are duplicated in Readability.js; please keep both copies in sync.
+  // They are tested against a node's "className id" string: unlikelyCandidates rejects the node unless okMaybeItsACandidate also matches.
+  unlikelyCandidates: /-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
+  okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
+};
+
+function isNodeVisible(node) {
+  // Default visibility test: a node is treated as hidden when node.style.display is "none" or it has a "hidden" attribute. node.style is null-checked to deal with SVG and MathML nodes.
+  return (!node.style || node.style.display != "none") && !node.hasAttribute("hidden");
+}
+
+/**
+ * Decides whether or not the document is reader-able without parsing the whole thing.
+ *
+ * Candidate nodes are every <p> and <pre> element plus every <div> that has a <br> as a
+ * direct child. Candidates that are not visible, that look like unlikely content (judged by
+ * class name and id), that match "li p", or whose trimmed text is shorter than 140
+ * characters are ignored. Each remaining candidate adds sqrt(length - 140) to a running
+ * score, and scanning stops as soon as that score exceeds 20.
+ *
+ * @param doc The document to inspect; it is queried with querySelectorAll().
+ * @param isVisible Optional callback that takes a node and returns whether it is visible.
+ *                  Falls back to isNodeVisible when omitted or falsy.
+ * @return boolean Whether or not we suspect Readability.parse() will succeed at returning an article object.
+ */
+function isProbablyReaderable(doc, isVisible) {
+  if (!isVisible) {
+    isVisible = isNodeVisible;
+  }
+
+  var nodes = doc.querySelectorAll("p, pre");
+
+  // Get <div> nodes which have <br> node(s) and append them into the `nodes` variable.
+  // Some articles' DOM structures might look like
+  // <div>
+  //   Sentences<br>
+  //   <br>
+  //   Sentences<br>
+  // </div>
+  var brNodes = doc.querySelectorAll("div > br");
+  if (brNodes.length) {
+    var set = new Set(nodes);
+    [].forEach.call(brNodes, function(node) {
+      set.add(node.parentNode);
+    });
+    nodes = Array.from(set); // The Set keeps each node once, so a <div> with several <br> children is only scored once.
+  }
+
+  var score = 0;
+  // This is a little cheeky, we use the accumulator 'score' to decide what to return from
+  // this callback: some() stops iterating the first time the callback returns true.
+  return [].some.call(nodes, function(node) {
+    if (!isVisible(node))
+      return false;
+
+    var matchString = node.className + " " + node.id;
+    if (REGEXPS.unlikelyCandidates.test(matchString) &&
+        !REGEXPS.okMaybeItsACandidate.test(matchString)) {
+      return false;
+    }
+
+    if (node.matches("li p")) { // Skip paragraphs nested inside list items.
+      return false;
+    }
+
+    var textContentLength = node.textContent.trim().length;
+    if (textContentLength < 140) { // Too short to contribute to the score.
+      return false;
+    }
+
+    score += Math.sqrt(textContentLength - 140);
+
+    if (score > 20) {
+      return true;
+    }
+    return false;
+  });
+}
+
+if (typeof exports === "object") { // Only export when an `exports` object exists (e.g. when the file is loaded via require()).
+  exports.isProbablyReaderable = isProbablyReaderable;
+}
--- a/Readability.js
+++ b/Readability.js
@ -1708,65 +1708,6 @@ Readability.prototype = {
    return (!node.style || node.style.display != "none") && !node.hasAttribute("hidden");
  },

-  /**
-   * Decides whether or not the document is reader-able without parsing the whole thing.
-   *
-   * @return boolean Whether or not we suspect parse() will suceeed at returning an article object.
-   */
-  isProbablyReaderable: function(helperIsVisible) {
-    var nodes = this._getAllNodesWithTag(this._doc, ["p", "pre"]);
-
-    // Get <div> nodes which have <br> node(s) and append them into the `nodes` variable.
-    // Some articles' DOM structures might look like
-    // <div>
-    //   Sentences<br>
-    //   <br>
-    //   Sentences<br>
-    // </div>
-    var brNodes = this._getAllNodesWithTag(this._doc, ["div > br"]);
-    if (brNodes.length) {
-      var set = new Set();
-      [].forEach.call(brNodes, function(node) {
-        set.add(node.parentNode);
-      });
-      nodes = [].concat.apply(Array.from(set), nodes);
-    }
-
-    if (!helperIsVisible) {
-      helperIsVisible = this._isProbablyVisible;
-    }
-
-    var score = 0;
-    // This is a little cheeky, we use the accumulator 'score' to decide what to return from
-    // this callback:
-    return this._someNode(nodes, function(node) {
-      if (helperIsVisible && !helperIsVisible(node))
-        return false;
-      var matchString = node.className + " " + node.id;
-
-      if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
-          !this.REGEXPS.okMaybeItsACandidate.test(matchString)) {
-        return false;
-      }
-
-      if (node.matches && node.matches("li p")) {
-        return false;
-      }
-
-      var textContentLength = node.textContent.trim().length;
-      if (textContentLength < 140) {
-        return false;
-      }
-
-      score += Math.sqrt(textContentLength - 140);
-
-      if (score > 20) {
-        return true;
-      }
-      return false;
-    });
-  },
-
  /**
   * Runs readability.
   *
--- a/package.json
+++ b/package.json
@ -1,6 +1,6 @@
 {
  "name": "readability",
-  "version": "0.1.0",
+  "version": "0.2.0",
  "description": "A standalone version of the readability library used for Firefox Reader View.",
  "main": "Readability.js",
  "scripts": {
--- a/test/test-isProbablyReaderable.js
+++ b/test/test-isProbablyReaderable.js
@ -0,0 +1,27 @@
+var jsdom = require("jsdom").jsdom;
+var chai = require("chai");
+chai.config.includeStack = true;
+var expect = chai.expect;
+
+var testPages = require("./utils").getTestPages();
+var readabilityCheck = require("../Readability-readerable.js");
+
+describe("isProbablyReaderable - test pages", function() { // Registers one nested suite per entry in testPages.
+  testPages.forEach(function(testPage) {
+    var uri = "http://fakehost/test/page.html";
+    describe(testPage.dir, function() {
+      var doc = jsdom(testPage.source, { // Build a DOM from the page source, with external resource fetching/processing switched off.
+        url: uri,
+        features: {
+          FetchExternalResources: false,
+          ProcessExternalResources: false,
+        },
+      });
+      var expected = testPage.expectedMetadata.readerable; // Expected verdict recorded in the page's metadata.
+      it("The result should " + (expected ? "" : "not ") + "be readerable", function() {
+        expect(readabilityCheck.isProbablyReaderable(doc)).eql(expected);
+      });
+    });
+  });
+});
+
--- a/test/test-readability.js
+++ b/test/test-readability.js
@ -61,11 +61,7 @@ function runTestsWithItems(label, domGenerationFn, source, expectedContent, expe
        // Provide one class name to preserve, which we know appears in a few
        // of the test documents.
        var myReader = new Readability(doc, { classesToPreserve: ["caption"] });
-        // Needs querySelectorAll function to test isProbablyReaderable method.
-        // jsdom implements querySelector but JSDOMParser doesn't.
-        var readerable = label === "jsdom" ? myReader.isProbablyReaderable() : null;
        result = myReader.parse();
-        result.readerable = readerable;
      } catch (err) {
        throw reformatError(err);
      }
@ -174,10 +170,6 @@ function runTestsWithItems(label, domGenerationFn, source, expectedContent, expe
    expectedMetadata.dir && it("should extract expected direction", function() {
      expect(expectedMetadata.dir).eql(result.dir);
    });
-
-    label === "jsdom" && it("should probably be readerable", function() {
-      expect(expectedMetadata.readerable).eql(result.readerable);
-    });
  });
 }