Improve URL handling in JSDOMParser and Readability.js

This change ups the required node version to 7.0 because it relies on the builtin url module.

We now pass a url when constructing a jsdom document or JSDOMParser document.
Because this is an API change, I'm increasing the package version.

Ultimately, I would like to remove the  argument from the readability constructor. It should
use the documentURI from the document it is passed.
pull/394/merge
Gijs Kruitbosch 6 years ago committed by Gijs
parent 834672ef86
commit d598baf02b

@ -1,7 +1,7 @@
language: node_js
sudo: false
node_js:
- '6.5'
- '7.0'
script:
- npm run lint
- npm run test

@ -553,7 +553,8 @@
},
};
var Document = function () {
var Document = function (url) {
this.documentURI = url;
this.styleSheets = [];
this.childNodes = [];
this.children = [];
@ -593,6 +594,20 @@
node.textContent = text;
return node;
},
get baseURI() {
if (!this.hasOwnProperty("_baseURI")) {
this._baseURI = this.documentURI;
var baseElements = this.getElementsByTagName("base");
var href = baseElements[0] && baseElements[0].getAttribute("href");
if (href) {
try {
this._baseURI = (new URL(href, this._baseURI)).href;
} catch (ex) {/* Just fall back to documentURI */}
}
}
return this._baseURI;
},
};
var Element = function (tag) {
@ -1111,9 +1126,9 @@
/**
* Parses an HTML string and returns a JS implementation of the Document.
*/
parse: function (html) {
parse: function (html, url) {
this.html = html;
var doc = this.doc = new Document();
var doc = this.doc = new Document(url);
this.readChildren(doc);
// If this is an HTML document, remove root-level children except for the

@ -1,5 +1,6 @@
var path = require("path");
var fs = require("fs");
var url = require("url");
// We want to load Readability and JSDOMParser, which aren't set up as commonjs libraries,
// and so we need to do some hocus-pocus with 'vm' to import them on a separate scope
@ -14,6 +15,7 @@ var scopeContext = {};
// in the scope we're using:
scopeContext.dump = console.log;
scopeContext.console = console;
scopeContext.URL = url.URL;
// Actually load files. NB: if either of the files has parse errors,
// node is dumb and shows you a syntax error *at this callsite* . Don't try to find

@ -1,6 +1,6 @@
{
"name": "readability",
"version": "0.0.1",
"version": "0.1.0",
"description": "A standalone version of the readability library used for Firefox Reader View.",
"main": "Readability.js",
"scripts": {
@ -20,7 +20,7 @@
"url": "https://github.com/mozilla/readability/issues"
},
"engines": {
"node" : ">=6.5"
"node" : ">=7.0"
},
"homepage": "https://github.com/mozilla/readability",
"devDependencies": {

@ -9,7 +9,7 @@ var BASETESTCASE = '<html><body><p>Some text and <a class="someclass" href="#">a
'<div id="foo">With a <script>With &lt; fancy " characters in it because' +
'</script> that is fun.<span>And another node to make it harder</span></div><form><input type="text"/><input type="number"/>Here\'s a form</form></body></html>';
var baseDoc = new JSDOMParser().parse(BASETESTCASE);
var baseDoc = new JSDOMParser().parse(BASETESTCASE, "http://fakehost/");
describe("Test JSDOM functionality", function() {
function nodeExpect(actual, expected) {
@ -37,6 +37,11 @@ describe("Test JSDOM functionality", function() {
});
it("should have basic URI information", function() {
expect(baseDoc.documentURI, "http://fakehost/");
expect(baseDoc.baseURI, "http://fakehost/");
});
it("should deal with script tags", function() {
// Check our script parsing worked:
var scripts = baseDoc.getElementsByTagName("script");
@ -300,3 +305,18 @@ describe("Recovery from self-closing tags that have close tags", function() {
expect(doc.firstChild.firstChild.firstChild.localName).eql("p");
});
});
describe("baseURI parsing", function() {
it("should handle various types of relative and absolute base URIs", function() {
function checkBase(base, expectedResult) {
var html = "<html><head><base href='" + base + "'></base></head><body/></html>";
var doc = new JSDOMParser().parse(html, "http://fakehost/some/dir/");
expect(doc.baseURI).eql(expectedResult);
}
checkBase("relative/path", "http://fakehost/some/dir/relative/path");
checkBase("/path", "http://fakehost/path");
checkBase("http://absolute/", "http://absolute/");
checkBase("//absolute/path", "http://absolute/path");
});
});

@ -229,6 +229,7 @@ describe("Test pages", function() {
runTestsWithItems("jsdom", function(source) {
var doc = jsdom(source, {
url: uri.spec,
features: {
FetchExternalResources: false,
ProcessExternalResources: false
@ -240,7 +241,7 @@ describe("Test pages", function() {
runTestsWithItems("JSDOMParser", function(source) {
var parser = new JSDOMParser();
var doc = parser.parse(source);
var doc = parser.parse(source, uri.spec);
if (parser.errorState) {
console.error("Parsing this DOM caused errors:", parser.errorState);
return null;

Loading…
Cancel
Save