Merge pull request #226 from snshn/base-tag-option

Add base URL option
pull/227/head
Sunshine 3 years ago committed by GitHub
commit 8ad252868e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -54,6 +54,7 @@ The guide can be found [here](docs/containers.md)
---------------------------------------------------
## Options
- `-b`: Use custom base URL
- `-c`: Exclude CSS
- `-e`: Ignore network errors
- `-f`: Omit frames

@ -0,0 +1,40 @@
# 8. Base Tag
Date: 2020-12-25
## Status
Accepted
## Context
HTML documents may contain a `base` tag, which influences the resolution of anchor links and relative URLs, as well as of dynamically loaded resources.
Sometimes, in order to make certain saved documents function closer to how they operate while being served from a remote server, the `base` tag specifying the source page's URL may need to be added to the document.
There can be only one such tag. If multiple `base` tags are present, only the first encountered tag ends up being used.
## Decision
Adding the `base` tag should be optional — saved documents should not contain the `base` tag unless it was specified by the user, or the document originally had the `base` tag in it.
The `href` attribute value of the original `base` tag should be used for resolving the document's relative links instead of the document's own URL (precisely the way browsers do it).
## Consequences
#### If the base tag does not exist in the source document
- If the base tag does not exist in the source document
- With base URL option provided
- use the specified base URL value to retrieve assets, add a `base` tag with that value to the document
- Without base URL option provided
- download document as usual, do not add base tag
- If the base tag already exists in the source document
- With base URL option provided
- we overwrite the original base URL before retrieving assets, keep new base URL value in the document
- Without base URL option provided:
- use the base URL from the original document to retrieve assets, keep original base URL value in the document
The program will gain the ability to retrieve remote assets for non-remote sources (such as data URLs and local files).
The program will gain the ability to get rid of existing base tag values (by providing an empty one).

File diff suppressed because it is too large Load Diff

@ -9,12 +9,12 @@ use std::process;
use std::time::Duration;
use monolith::html::{
add_base_tag, add_favicon, has_base_tag, has_favicon, html_to_dom, metadata_tag,
add_favicon, create_metadata_tag, get_base_url, has_favicon, html_to_dom, set_base_url,
stringify_document, walk_and_embed_assets,
};
use monolith::opts::Options;
use monolith::url::{
data_to_data_url, data_url_to_data, is_data_url, is_file_url, is_http_url, resolve_url,
data_to_data_url, is_data_url, is_file_url, is_http_url, parse_data_url, resolve_url,
};
use monolith::utils::retrieve_asset;
@ -52,7 +52,7 @@ fn main() {
let options = Options::from_args();
let original_target: &str = &options.target;
let target_url: &str;
let base_url;
let mut base_url: String;
let mut dom;
// Pre-process the input
@ -64,7 +64,9 @@ fn main() {
// Determine exact target URL
if target.clone().len() == 0 {
eprintln!("No target specified");
if !options.silent {
eprintln!("No target specified");
}
process::exit(1);
} else if is_http_url(target.clone()) || is_data_url(target.clone()) {
target_url = target.as_str();
@ -72,7 +74,9 @@ fn main() {
target_url = target.as_str();
} else if path.exists() {
if !path.is_file() {
eprintln!("Local target is not a file: {}", original_target);
if !options.silent {
eprintln!("Local target is not a file: {}", original_target);
}
process::exit(1);
}
target.insert_str(0, if cfg!(windows) { "file:///" } else { "file://" });
@ -111,11 +115,16 @@ fn main() {
.build()
.expect("Failed to initialize HTTP client");
// At this stage we assume that the base URL is the same as the target URL
base_url = str!(target_url);
// Retrieve target document
if is_file_url(target_url) || is_http_url(target_url) {
match retrieve_asset(&mut cache, &client, target_url, target_url, &options, 0) {
Ok((data, final_url, _media_type)) => {
base_url = final_url;
if options.base_url.clone().unwrap_or(str!()).is_empty() {
base_url = final_url
}
dom = html_to_dom(&String::from_utf8_lossy(&data));
}
Err(_) => {
@ -126,23 +135,40 @@ fn main() {
}
}
} else if is_data_url(target_url) {
let (media_type, data): (String, Vec<u8>) = data_url_to_data(target_url);
let (media_type, data): (String, Vec<u8>) = parse_data_url(target_url);
if !media_type.eq_ignore_ascii_case("text/html") {
eprintln!("Unsupported data URL media type");
if !options.silent {
eprintln!("Unsupported data URL media type");
}
process::exit(1);
}
base_url = str!(target_url);
dom = html_to_dom(&String::from_utf8_lossy(&data));
} else {
process::exit(1);
}
// Use custom base URL if specified, read and use what's in the DOM otherwise
if !options.base_url.clone().unwrap_or(str!()).is_empty() {
if is_data_url(options.base_url.clone().unwrap()) {
if !options.silent {
eprintln!("Data URLs cannot be used as base URL");
}
process::exit(1);
} else {
base_url = options.base_url.clone().unwrap();
}
} else {
if let Some(existing_base_url) = get_base_url(&dom.document) {
base_url = resolve_url(target_url, existing_base_url).unwrap();
}
}
// Embed remote assets
walk_and_embed_assets(&mut cache, &client, &base_url, &dom.document, &options, 0);
// Take care of BASE tag
if is_http_url(base_url.clone()) && !has_base_tag(&dom.document) {
dom = add_base_tag(&dom.document, base_url.clone());
// Update or add new BASE tag to reroute network requests and hash-links in the final document
if let Some(new_base_url) = options.base_url.clone() {
dom = set_base_url(&dom.document, new_base_url);
}
// Request and embed /favicon.ico (unless it's already linked in the document)
@ -172,7 +198,7 @@ fn main() {
// Add metadata tag
if !options.no_metadata {
let metadata_comment: String = metadata_tag(&base_url);
let metadata_comment: String = create_metadata_tag(&base_url);
result.insert_str(0, &metadata_comment);
if metadata_comment.len() > 0 {
result.insert_str(metadata_comment.len(), "\n");

@ -2,20 +2,21 @@ use clap::{App, Arg};
#[derive(Default)]
pub struct Options {
pub target: String,
pub base_url: Option<String>,
pub no_css: bool,
pub ignore_errors: bool,
pub no_fonts: bool,
pub no_frames: bool,
pub no_fonts: bool,
pub no_images: bool,
pub isolate: bool,
pub no_js: bool,
pub insecure: bool,
pub isolate: bool,
pub no_metadata: bool,
pub output: String,
pub silent: bool,
pub timeout: u64,
pub user_agent: String,
pub no_metadata: bool,
pub target: String,
}
const ASCII: &str = " \
@ -37,14 +38,8 @@ impl Options {
.version(crate_version!())
.author(crate_authors!("\n"))
.about(format!("{}\n{}", ASCII, crate_description!()).as_str())
.arg(
Arg::with_name("target")
.required(true)
.takes_value(true)
.index(1)
.help("URL or file path"),
)
// .args_from_usage("-a, --no-audio 'Removes audio sources'")
.args_from_usage("-b, --base-url=[http://localhost/] 'Use custom base URL'")
.args_from_usage("-c, --no-css 'Removes CSS'")
.args_from_usage("-e, --ignore-errors 'Ignore network errors'")
.args_from_usage("-f, --no-frames 'Removes frames and iframes'")
@ -53,12 +48,19 @@ impl Options {
.args_from_usage("-I, --isolate 'Cuts off document from the Internet'")
.args_from_usage("-j, --no-js 'Removes JavaScript'")
.args_from_usage("-k, --insecure 'Allows invalid X.509 (TLS) certificates'")
.args_from_usage("-M, --no-metadata 'Excludes metadata information from the document'")
.args_from_usage("-M, --no-metadata 'Excludes timestamp and source information'")
.args_from_usage("-o, --output=[document.html] 'Write output to <file>'")
.args_from_usage("-s, --silent 'Suppresses verbosity'")
.args_from_usage("-t, --timeout=[60] 'Adjust network request timeout'")
.args_from_usage("-u, --user-agent=[Firefox] 'Set custom User-Agent string'")
// .args_from_usage("-v, --no-video 'Removes video sources'")
.arg(
Arg::with_name("target")
.required(true)
.takes_value(true)
.index(1)
.help("URL or file path"),
)
.get_matches();
let mut options: Options = Options::default();
@ -67,6 +69,9 @@ impl Options {
.value_of("target")
.expect("please set target")
.to_string();
if let Some(base_url) = app.value_of("base-url") {
options.base_url = Some(str!(base_url));
}
options.no_css = app.is_present("no-css");
options.ignore_errors = app.is_present("ignore-errors");
options.no_frames = app.is_present("no-frames");

@ -0,0 +1,123 @@
// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗
// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝
// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗
// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod passing {
    use assert_cmd::prelude::*;
    use std::env;
    use std::process::Command;

    /// `-b <url>` on a document without a `base` tag: the tag is added.
    #[test]
    fn add_new_when_provided() -> Result<(), Box<dyn std::error::Error>> {
        let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?;
        let out = cmd
            .arg("-M")
            .arg("-b")
            .arg("http://localhost:8000/")
            .arg("data:text/html,Hello%2C%20World!")
            .output()?;

        // STDOUT should contain the newly added base URL
        assert_eq!(
            std::str::from_utf8(&out.stdout)?,
            "<html><head>\
            <base href=\"http://localhost:8000/\"></base>\
            </head><body>Hello, World!</body></html>\n"
        );

        // STDERR should be empty
        assert_eq!(std::str::from_utf8(&out.stderr)?, "");

        // The exit code should be 0
        out.assert().code(0);

        Ok(())
    }

    /// No `-b` option on a document that has a `base` tag: the tag is kept as is.
    #[test]
    fn keep_existing_when_none_provided() -> Result<(), Box<dyn std::error::Error>> {
        let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?;
        let out = cmd
            .arg("-M")
            .arg("data:text/html,<base href=\"http://localhost:8000/\" />Hello%2C%20World!")
            .output()?;

        // STDOUT should still contain the document's original base URL
        assert_eq!(
            std::str::from_utf8(&out.stdout)?,
            "<html><head>\
            <base href=\"http://localhost:8000/\">\
            </head><body>Hello, World!</body></html>\n"
        );

        // STDERR should be empty
        assert_eq!(std::str::from_utf8(&out.stderr)?, "");

        // The exit code should be 0
        out.assert().code(0);

        Ok(())
    }

    /// `-b <url>` on a document that has a `base` tag: the `href` is replaced.
    #[test]
    fn override_existing_when_provided() -> Result<(), Box<dyn std::error::Error>> {
        let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?;
        let out = cmd
            .arg("-M")
            .arg("-b")
            .arg("http://localhost/")
            .arg("data:text/html,<base href=\"http://localhost:8000/\" />Hello%2C%20World!")
            .output()?;

        // STDOUT should contain the base URL given on the command line,
        // not the one from the source document
        assert_eq!(
            std::str::from_utf8(&out.stdout)?,
            "<html><head>\
            <base href=\"http://localhost/\">\
            </head><body>Hello, World!</body></html>\n"
        );

        // STDERR should be empty
        assert_eq!(std::str::from_utf8(&out.stderr)?, "");

        // The exit code should be 0
        out.assert().code(0);

        Ok(())
    }

    /// `-b ""` on a document that has a `base` tag: the `href` is blanked out.
    #[test]
    fn remove_existing_when_empty_provided() -> Result<(), Box<dyn std::error::Error>> {
        let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?;
        let out = cmd
            .arg("-M")
            .arg("-b")
            .arg("")
            .arg("data:text/html,<base href=\"http://localhost:8000/\" />Hello%2C%20World!")
            .output()?;

        // STDOUT should contain a base tag whose href is empty
        assert_eq!(
            std::str::from_utf8(&out.stdout)?,
            "<html><head>\
            <base href=\"\">\
            </head><body>Hello, World!</body></html>\n"
        );

        // STDERR should be empty
        assert_eq!(std::str::from_utf8(&out.stderr)?, "");

        // The exit code should be 0
        out.assert().code(0);

        Ok(())
    }
}

@ -337,7 +337,7 @@ mod passing {
<title>Local HTML file</title>\n \
<link rel=\"stylesheet\" type=\"text/css\">\n \
<link rel=\"stylesheet\" type=\"text/css\">\n</head>\n\n<body>\n \
<img alt=\"\" src=\"{empty_image}\">\n \
<img src=\"{empty_image}\" alt=\"\">\n \
<a href=\"file://local-file.html/\">Tricky href</a>\n \
<a href=\"https://github.com/Y2Z/monolith\">Remote URL</a>\n \
<script></script>\n\n\n\n\
@ -399,7 +399,7 @@ mod passing {
<title>Local HTML file</title>\n \
<link rel=\"stylesheet\" type=\"text/css\">\n \
<link rel=\"stylesheet\" type=\"text/css\">\n</head>\n\n<body>\n \
<img alt=\"\" src=\"{empty_image}\">\n \
<img src=\"{empty_image}\" alt=\"\">\n \
<a href=\"file://local-file.html/\">Tricky href</a>\n \
<a href=\"https://github.com/Y2Z/monolith\">Remote URL</a>\n \
<script></script>\n\n\n\n\

@ -0,0 +1,2 @@
mod base_url;
mod basic;

@ -11,7 +11,7 @@ mod passing {
#[test]
fn empty_input_sha256() {
assert!(html::has_proper_integrity(
assert!(html::check_integrity(
"".as_bytes(),
"sha256-47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU="
));
@ -19,7 +19,7 @@ mod passing {
#[test]
fn sha256() {
assert!(html::has_proper_integrity(
assert!(html::check_integrity(
"abcdef0123456789".as_bytes(),
"sha256-9EWAHgy4mSYsm54hmDaIDXPKLRsLnBX7lZyQ6xISNOM="
));
@ -27,7 +27,7 @@ mod passing {
#[test]
fn sha384() {
assert!(html::has_proper_integrity(
assert!(html::check_integrity(
"abcdef0123456789".as_bytes(),
"sha384-gc9l7omltke8C33bedgh15E12M7RrAQa5t63Yb8APlpe7ZhiqV23+oqiulSJl3Kw"
));
@ -35,7 +35,7 @@ mod passing {
#[test]
fn sha512() {
assert!(html::has_proper_integrity(
assert!(html::check_integrity(
"abcdef0123456789".as_bytes(),
"sha512-zG5B88cYMqcdiMi9gz0XkOFYw2BpjeYdn5V6+oFrMgSNjRpqL7EF8JEwl17ztZbK3N7I/tTwp3kxQbN1RgFBww=="
));
@ -55,20 +55,17 @@ mod failing {
#[test]
fn empty_hash() {
assert!(!html::has_proper_integrity(
"abcdef0123456789".as_bytes(),
""
));
assert!(!html::check_integrity("abcdef0123456789".as_bytes(), ""));
}
#[test]
fn empty_input_empty_hash() {
assert!(!html::has_proper_integrity("".as_bytes(), ""));
assert!(!html::check_integrity("".as_bytes(), ""));
}
#[test]
fn sha256() {
assert!(!html::has_proper_integrity(
assert!(!html::check_integrity(
"abcdef0123456789".as_bytes(),
"sha256-badhash"
));
@ -76,7 +73,7 @@ mod failing {
#[test]
fn sha384() {
assert!(!html::has_proper_integrity(
assert!(!html::check_integrity(
"abcdef0123456789".as_bytes(),
"sha384-badhash"
));
@ -84,7 +81,7 @@ mod failing {
#[test]
fn sha512() {
assert!(!html::has_proper_integrity(
assert!(!html::check_integrity(
"abcdef0123456789".as_bytes(),
"sha512-badhash"
));

@ -14,7 +14,7 @@ mod passing {
fn isolated() {
let mut options = Options::default();
options.isolate = true;
let csp_content = html::csp(&options);
let csp_content = html::compose_csp(&options);
assert_eq!(csp_content, "default-src 'unsafe-inline' data:;");
}
@ -23,7 +23,7 @@ mod passing {
fn no_css() {
let mut options = Options::default();
options.no_css = true;
let csp_content = html::csp(&options);
let csp_content = html::compose_csp(&options);
assert_eq!(csp_content, "style-src 'none';");
}
@ -32,7 +32,7 @@ mod passing {
fn no_fonts() {
let mut options = Options::default();
options.no_fonts = true;
let csp_content = html::csp(&options);
let csp_content = html::compose_csp(&options);
assert_eq!(csp_content, "font-src 'none';");
}
@ -41,7 +41,7 @@ mod passing {
fn no_frames() {
let mut options = Options::default();
options.no_frames = true;
let csp_content = html::csp(&options);
let csp_content = html::compose_csp(&options);
assert_eq!(csp_content, "frame-src 'none'; child-src 'none';");
}
@ -50,7 +50,7 @@ mod passing {
fn no_js() {
let mut options = Options::default();
options.no_js = true;
let csp_content = html::csp(&options);
let csp_content = html::compose_csp(&options);
assert_eq!(csp_content, "script-src 'none';");
}
@ -59,7 +59,7 @@ mod passing {
fn no_images() {
let mut options = Options::default();
options.no_images = true;
let csp_content = html::csp(&options);
let csp_content = html::compose_csp(&options);
assert_eq!(csp_content, "img-src data:;");
}
@ -73,7 +73,7 @@ mod passing {
options.no_frames = true;
options.no_js = true;
options.no_images = true;
let csp_content = html::csp(&options);
let csp_content = html::compose_csp(&options);
assert_eq!(csp_content, "default-src 'unsafe-inline' data:; style-src 'none'; font-src 'none'; frame-src 'none'; child-src 'none'; script-src 'none'; img-src data:;");
}

@ -15,7 +15,7 @@ mod passing {
fn http_url() {
let url = "http://192.168.1.1/";
let timestamp = Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true);
let metadata_comment: String = html::metadata_tag(url);
let metadata_comment: String = html::create_metadata_tag(url);
assert_eq!(
metadata_comment,
@ -33,7 +33,7 @@ mod passing {
fn file_url() {
let url = "file:///home/monolith/index.html";
let timestamp = Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true);
let metadata_comment: String = html::metadata_tag(url);
let metadata_comment: String = html::create_metadata_tag(url);
assert_eq!(
metadata_comment,
@ -50,7 +50,7 @@ mod passing {
fn data_url() {
let url = "data:text/html,Hello%2C%20World!";
let timestamp = Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true);
let metadata_comment: String = html::metadata_tag(url);
let metadata_comment: String = html::create_metadata_tag(url);
assert_eq!(
metadata_comment,
@ -77,6 +77,6 @@ mod failing {
#[test]
fn empty_string() {
assert_eq!(html::metadata_tag(""), "");
assert_eq!(html::create_metadata_tag(""), "");
}
}

@ -0,0 +1,104 @@
// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗
// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝
// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗
// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod passing {
use crate::html;
// A document with a single `base` tag: `get_base_url` is expected to return
// that tag's `href` value exactly as written in the markup.
#[test]
fn present() {
let html = "<!doctype html>
<html>
<head>
<base href=\"https://musicbrainz.org\" />
</head>
<body>
</body>
</html>";
let dom = html::html_to_dom(&html);
assert_eq!(
html::get_base_url(&dom.document),
Some(str!("https://musicbrainz.org"))
);
}
// A document with two `base` tags: the expected result is the `href` of the
// first tag; the second one is not reported.
#[test]
fn multiple_tags() {
let html = "<!doctype html>
<html>
<head>
<base href=\"https://www.discogs.com/\" />
<base href=\"https://musicbrainz.org\" />
</head>
<body>
</body>
</html>";
let dom = html::html_to_dom(&html);
assert_eq!(
html::get_base_url(&dom.document),
Some(str!("https://www.discogs.com/"))
);
}
}
// ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗
// ██╔════╝██╔══██╗██║██║ ██║████╗ ██║██╔════╝
// █████╗ ███████║██║██║ ██║██╔██╗ ██║██║ ███╗
// ██╔══╝ ██╔══██║██║██║ ██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║██║███████╗██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚═╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod failing {
use crate::html;
// No `base` tag anywhere in the document: `get_base_url` is expected to
// return `None`.
#[test]
fn absent() {
let html = "<!doctype html>
<html>
<head>
</head>
<body>
</body>
</html>";
let dom = html::html_to_dom(&html);
assert_eq!(html::get_base_url(&dom.document), None);
}
// A `base` tag without an `href` attribute: expected to be treated the same
// as no base URL at all (`None`).
#[test]
fn no_href() {
let html = "<!doctype html>
<html>
<head>
<base />
</head>
<body>
</body>
</html>";
let dom = html::html_to_dom(&html);
assert_eq!(html::get_base_url(&dom.document), None);
}
// A `base` tag whose `href` is present but empty: the expected result is
// `Some` of an empty string, which is distinct from the `None` of the
// missing-attribute case above.
#[test]
fn empty_href() {
let html = "<!doctype html>
<html>
<head>
<base href=\"\" />
</head>
<body>
</body>
</html>";
let dom = html::html_to_dom(&html);
assert_eq!(html::get_base_url(&dom.document), Some(str!()));
}
}

@ -0,0 +1,54 @@
// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗
// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝
// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗
// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod passing {
    use html5ever::rcdom::{Handle, NodeData};

    use crate::html;

    /// A `div` carrying two `style` attributes (differing only in case) must
    /// report the first one, and `body` must report no `class` attribute.
    #[test]
    fn div_two_style_attributes() {
        let html = "<!doctype html><html><head></head><body><DIV STYLE=\"color: blue;\" style=\"display: none;\"></div></body></html>";
        let dom = html::html_to_dom(&html);
        let mut count = 0;

        // Visits every node depth-first, bumping `visited` once per node and
        // checking attributes on the elements of interest before descending.
        fn test_walk(node: &Handle, visited: &mut i8) {
            *visited += 1;

            if let NodeData::Element { name, .. } = &node.data {
                let tag = name.local.as_ref().to_string();
                match tag.as_str() {
                    "body" => assert_eq!(html::get_node_attr(node, "class"), None),
                    "div" => assert_eq!(
                        html::get_node_attr(node, "style"),
                        Some(str!("color: blue;"))
                    ),
                    _ => {}
                }
            }

            // Only the document root and elements are descended into
            match &node.data {
                NodeData::Document | NodeData::Element { .. } => {
                    for child in node.children.borrow().iter() {
                        test_walk(child, visited);
                    }
                }
                _ => (),
            }
        }

        test_walk(&dom.document, &mut count);

        assert_eq!(count, 6);
    }
}

@ -12,7 +12,7 @@ mod passing {
use crate::html;
#[test]
fn get_node_name() {
fn parent_node_names() {
let html = "<!doctype html><html><HEAD></HEAD><body><div><P></P></div></body></html>";
let dom = html::html_to_dom(&html);
let mut count = 0;

@ -1,10 +1,13 @@
mod add_favicon;
mod csp;
mod check_integrity;
mod compose_csp;
mod create_metadata_tag;
mod embed_srcset;
mod get_base_url;
mod get_node_attr;
mod get_node_name;
mod has_favicon;
mod has_proper_integrity;
mod is_icon;
mod metadata_tag;
mod set_node_attr;
mod stringify_document;
mod walk_and_embed_assets;

@ -0,0 +1,105 @@
// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗
// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝
// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗
// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod passing {
    use html5ever::rcdom::{Handle, NodeData};

    use crate::html;

    /// Exercises overwriting, removing and re-adding `lang` on `html`, and
    /// adding a previously absent `style` attribute on `body`.
    #[test]
    fn html_lang_and_body_style() {
        let html = "<!doctype html><html lang=\"en\"><head></head><body></body></html>";
        let dom = html::html_to_dom(&html);
        let mut count = 0;

        // Depth-first walk; `visited` counts every node that gets touched
        fn test_walk(node: &Handle, visited: &mut i8) {
            *visited += 1;

            if let NodeData::Element { name, .. } = &node.data {
                let tag = name.local.as_ref().to_string();
                match tag.as_str() {
                    "html" => {
                        assert_eq!(html::get_node_attr(node, "lang"), Some(str!("en")));

                        // Overwrite the existing value
                        html::set_node_attr(node, "lang", Some(str!("de")));
                        assert_eq!(html::get_node_attr(node, "lang"), Some(str!("de")));

                        // `None` removes the attribute
                        html::set_node_attr(node, "lang", None);
                        assert_eq!(html::get_node_attr(node, "lang"), None);

                        // An empty string is stored as an empty value
                        html::set_node_attr(node, "lang", Some(str!("")));
                        assert_eq!(html::get_node_attr(node, "lang"), Some(str!("")));
                    }
                    "body" => {
                        assert_eq!(html::get_node_attr(node, "style"), None);

                        html::set_node_attr(node, "style", Some(str!("display: none;")));
                        assert_eq!(
                            html::get_node_attr(node, "style"),
                            Some(str!("display: none;"))
                        );
                    }
                    _ => {}
                }
            }

            // Only the document root and elements are descended into
            match &node.data {
                NodeData::Document | NodeData::Element { .. } => {
                    for child in node.children.borrow().iter() {
                        test_walk(child, visited);
                    }
                }
                _ => (),
            }
        }

        test_walk(&dom.document, &mut count);

        assert_eq!(count, 5);
    }

    /// A `body` with a duplicated `background` attribute reports the first
    /// value, and reports nothing once the attribute is set to `None`.
    #[test]
    fn body_background() {
        let html = "<!doctype html><html lang=\"en\"><head></head><body background=\"1\" background=\"2\"></body></html>";
        let dom = html::html_to_dom(&html);
        let mut count = 0;

        // Depth-first walk; `visited` counts every node that gets touched
        fn test_walk(node: &Handle, visited: &mut i8) {
            *visited += 1;

            if let NodeData::Element { name, .. } = &node.data {
                let tag = name.local.as_ref().to_string();
                if tag == "body" {
                    assert_eq!(html::get_node_attr(node, "background"), Some(str!("1")));

                    html::set_node_attr(node, "background", None);
                    assert_eq!(html::get_node_attr(node, "background"), None);
                }
            }

            // Only the document root and elements are descended into
            match &node.data {
                NodeData::Document | NodeData::Element { .. } => {
                    for child in node.children.borrow().iter() {
                        test_walk(child, visited);
                    }
                }
                _ => (),
            }
        }

        test_walk(&dom.document, &mut count);

        assert_eq!(count, 5);
    }
}

@ -319,8 +319,8 @@ mod passing {
buf.iter().map(|&c| c as char).collect::<String>(),
"<html>\
<head>\
<meta>\
<meta>\
<meta http-equiv=\"disabled by monolith (Refresh)\" value=\"20\">\
<meta http-equiv=\"disabled by monolith (Location)\" value=\"https://freebsd.org\">\
</head>\
<body></body>\
</html>"

@ -1,12 +1,12 @@
mod clean_url;
mod data_to_data_url;
mod data_url_to_data;
mod decode_url;
mod file_url_to_fs_path;
mod get_url_fragment;
mod is_data_url;
mod is_file_url;
mod is_http_url;
mod parse_data_url;
mod resolve_url;
mod url_has_protocol;
mod url_with_fragment;

@ -11,7 +11,7 @@ mod passing {
#[test]
fn parse_text_html_base64() {
let (media_type, data) = url::data_url_to_data("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg==");
let (media_type, data) = url::parse_data_url("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg==");
assert_eq!(media_type, "text/html");
assert_eq!(
@ -22,7 +22,7 @@ mod passing {
#[test]
fn parse_text_html_utf8() {
let (media_type, data) = url::data_url_to_data(
let (media_type, data) = url::parse_data_url(
"data:text/html;utf8,Work expands so as to fill the time available for its completion",
);
@ -35,7 +35,7 @@ mod passing {
#[test]
fn parse_text_html_plaintext() {
let (media_type, data) = url::data_url_to_data(
let (media_type, data) = url::parse_data_url(
"data:text/html,Work expands so as to fill the time available for its completion",
);
@ -48,7 +48,7 @@ mod passing {
#[test]
fn parse_text_html_charset_utf_8_between_two_whitespaces() {
let (media_type, data) = url::data_url_to_data(" data:text/html;charset=utf-8,Work expands so as to fill the time available for its completion ");
let (media_type, data) = url::parse_data_url(" data:text/html;charset=utf-8,Work expands so as to fill the time available for its completion ");
assert_eq!(media_type, "text/html");
assert_eq!(
@ -60,7 +60,7 @@ mod passing {
#[test]
fn parse_text_css_url_encoded() {
let (media_type, data) =
url::data_url_to_data("data:text/css,div{background-color:%23000}");
url::parse_data_url("data:text/css,div{background-color:%23000}");
assert_eq!(media_type, "text/css");
assert_eq!(String::from_utf8_lossy(&data), "div{background-color:#000}");
@ -68,7 +68,7 @@ mod passing {
#[test]
fn parse_no_media_type_base64() {
let (media_type, data) = url::data_url_to_data("data:;base64,dGVzdA==");
let (media_type, data) = url::parse_data_url("data:;base64,dGVzdA==");
assert_eq!(media_type, "");
assert_eq!(String::from_utf8_lossy(&data), "test");
@ -76,7 +76,7 @@ mod passing {
#[test]
fn parse_no_media_type_no_encoding() {
let (media_type, data) = url::data_url_to_data("data:;,test%20test");
let (media_type, data) = url::parse_data_url("data:;,test%20test");
assert_eq!(media_type, "");
assert_eq!(String::from_utf8_lossy(&data), "test test");
@ -96,7 +96,7 @@ mod failing {
#[test]
fn just_word_data() {
let (media_type, data) = url::data_url_to_data("data");
let (media_type, data) = url::parse_data_url("data");
assert_eq!(media_type, "");
assert_eq!(String::from_utf8_lossy(&data), "");

@ -33,45 +33,6 @@ pub fn data_to_data_url(media_type: &str, data: &[u8], url: &str) -> String {
format!("data:{};base64,{}", media_type, base64::encode(data))
}
/// Splits a `data:` URL into its media type and decoded payload.
///
/// The part of the URL path before the first comma is split on `;`: the first
/// item is taken as the media type, and of the remaining items the last one
/// equal (ignoring ASCII case) to `base64`, `utf8` or `charset=UTF-8` is
/// remembered as the encoding. The part after the comma is passed through
/// `decode_url`; when the encoding is `base64` the result is additionally
/// base64-decoded, with an undecodable payload yielding an empty vector.
///
/// Input that cannot be parsed as a URL is handled like the empty data URL
/// `data:,`, so it produces an empty media type and empty data.
pub fn data_url_to_data<T: AsRef<str>>(url: T) -> (String, Vec<u8>) {
    // Build the fallback URL only when parsing actually fails
    let parsed_url: Url = Url::parse(url.as_ref())
        .unwrap_or_else(|_| Url::parse("data:,").expect("`data:,` is a valid URL"));
    let path: &str = parsed_url.path();

    // Slice on the byte offset returned by `find`; without a comma the whole
    // path is metadata and the payload is empty.
    let (meta_data, raw_data) = match path.find(',') {
        Some(comma_loc) => (&path[..comma_loc], &path[comma_loc + 1..]),
        None => (path, ""),
    };
    let text: String = decode_url(raw_data.to_string());

    // No manual item counter, so an arbitrary number of `;`-separated items
    // cannot overflow anything.
    let mut meta_data_items = meta_data.split(';');
    let media_type: String = meta_data_items.next().unwrap_or("").to_string();
    let encoding: &str = meta_data_items
        .filter(|item| {
            item.eq_ignore_ascii_case("base64")
                || item.eq_ignore_ascii_case("utf8")
                || item.eq_ignore_ascii_case("charset=UTF-8")
        })
        .last()
        .unwrap_or("");

    let data: Vec<u8> = if encoding.eq_ignore_ascii_case("base64") {
        base64::decode(&text).unwrap_or_default()
    } else {
        text.into_bytes()
    };

    (media_type, data)
}
pub fn decode_url(input: String) -> String {
let input: String = input.replace("+", "%2B");
@ -138,6 +99,45 @@ pub fn is_http_url<T: AsRef<str>>(url: T) -> bool {
.unwrap_or(false)
}
/// Splits a `data:` URL into its media type and decoded payload.
///
/// The part of the URL path before the first comma is split on `;`: the first
/// item is taken as the media type, and of the remaining items the last one
/// equal (ignoring ASCII case) to `base64`, `utf8` or `charset=UTF-8` is
/// remembered as the encoding. The part after the comma is passed through
/// `decode_url`; when the encoding is `base64` the result is additionally
/// base64-decoded, with an undecodable payload yielding an empty vector.
///
/// Input that cannot be parsed as a URL is handled like the empty data URL
/// `data:,`, so it produces an empty media type and empty data.
pub fn parse_data_url<T: AsRef<str>>(url: T) -> (String, Vec<u8>) {
    // Build the fallback URL only when parsing actually fails
    let parsed_url: Url = Url::parse(url.as_ref())
        .unwrap_or_else(|_| Url::parse("data:,").expect("`data:,` is a valid URL"));
    let path: &str = parsed_url.path();

    // Slice on the byte offset returned by `find`; without a comma the whole
    // path is metadata and the payload is empty.
    let (meta_data, raw_data) = match path.find(',') {
        Some(comma_loc) => (&path[..comma_loc], &path[comma_loc + 1..]),
        None => (path, ""),
    };
    let text: String = decode_url(raw_data.to_string());

    // No manual item counter, so an arbitrary number of `;`-separated items
    // cannot overflow anything.
    let mut meta_data_items = meta_data.split(';');
    let media_type: String = meta_data_items.next().unwrap_or("").to_string();
    let encoding: &str = meta_data_items
        .filter(|item| {
            item.eq_ignore_ascii_case("base64")
                || item.eq_ignore_ascii_case("utf8")
                || item.eq_ignore_ascii_case("charset=UTF-8")
        })
        .last()
        .unwrap_or("");

    let data: Vec<u8> = if encoding.eq_ignore_ascii_case("base64") {
        base64::decode(&text).unwrap_or_default()
    } else {
        text.into_bytes()
    };

    (media_type, data)
}
pub fn resolve_url<T: AsRef<str>, U: AsRef<str>>(from: T, to: U) -> Result<String, ParseError> {
let result = if is_http_url(to.as_ref()) {
to.as_ref().to_string()

@ -5,7 +5,7 @@ use std::fs;
use std::path::Path;
use crate::opts::Options;
use crate::url::{clean_url, data_url_to_data, file_url_to_fs_path, is_data_url, is_file_url};
use crate::url::{clean_url, file_url_to_fs_path, is_data_url, is_file_url, parse_data_url};
const INDENT: &str = " ";
@ -83,7 +83,7 @@ pub fn retrieve_asset(
}
if is_data_url(&url) {
let (media_type, data) = data_url_to_data(url);
let (media_type, data) = parse_data_url(url);
Ok((data, url.to_string(), media_type))
} else if is_file_url(&url) {
// Check if parent_url is also file:///

Loading…
Cancel
Save