add support for working with local assets

pull/135/head
Sunshine 4 years ago
parent a653bbe7d4
commit 59a8be493d
No known key found for this signature in database
GPG Key ID: B80CA68703CD8AB1

3
.gitignore vendored

@ -4,6 +4,3 @@
# These are backup files generated by rustfmt # These are backup files generated by rustfmt
**/*.rs.bk **/*.rs.bk
# Exclude accidental HTML files
*.html

@ -11,8 +11,8 @@ rust:
- beta - beta
- nightly - nightly
services: git:
- docker autocrlf: false # don't mangle LF into CRLF on windows
before_script: before_script:
- rustup component add rustfmt - rustup component add rustfmt

@ -1,7 +1,7 @@
use crate::http::retrieve_asset;
use crate::js::attr_is_event_handler; use crate::js::attr_is_event_handler;
use crate::utils::{ use crate::utils::{
data_to_data_url, is_http_url, resolve_css_imports, resolve_url, url_has_protocol, data_to_data_url, is_http_url, resolve_css_imports, resolve_url, retrieve_asset,
url_has_protocol,
}; };
use html5ever::interface::QualName; use html5ever::interface::QualName;
use html5ever::parse_document; use html5ever::parse_document;
@ -133,6 +133,7 @@ pub fn walk_and_embed_assets(
let (favicon_data_url, _) = retrieve_asset( let (favicon_data_url, _) = retrieve_asset(
cache, cache,
client, client,
&url,
&href_full_url, &href_full_url,
true, true,
"", "",
@ -156,6 +157,7 @@ pub fn walk_and_embed_assets(
let replacement_text = match retrieve_asset( let replacement_text = match retrieve_asset(
cache, cache,
client, client,
&url,
&href_full_url, &href_full_url,
false, false,
"text/css", "text/css",
@ -167,6 +169,7 @@ pub fn walk_and_embed_assets(
client, client,
&css_data, &css_data,
true, true,
&url,
&href_full_url, &href_full_url,
opt_no_images, opt_no_images,
opt_silent, opt_silent,
@ -231,6 +234,7 @@ pub fn walk_and_embed_assets(
retrieve_asset( retrieve_asset(
cache, cache,
client, client,
&url,
&abs_src, &abs_src,
true, true,
"", "",
@ -278,6 +282,7 @@ pub fn walk_and_embed_assets(
retrieve_asset( retrieve_asset(
cache, cache,
client, client,
&url,
&abs_src, &abs_src,
true, true,
"", "",
@ -311,6 +316,7 @@ pub fn walk_and_embed_assets(
let (source_data_url, _) = retrieve_asset( let (source_data_url, _) = retrieve_asset(
cache, cache,
client, client,
&url,
&srcset_full_url, &srcset_full_url,
true, true,
"", "",
@ -375,6 +381,7 @@ pub fn walk_and_embed_assets(
let (js_data_url, _) = retrieve_asset( let (js_data_url, _) = retrieve_asset(
cache, cache,
client, client,
&url,
&src_full_url, &src_full_url,
true, true,
"application/javascript", "application/javascript",
@ -401,6 +408,7 @@ pub fn walk_and_embed_assets(
tendril.as_ref(), tendril.as_ref(),
false, false,
&url, &url,
&url,
opt_no_images, opt_no_images,
opt_silent, opt_silent,
); );
@ -444,6 +452,7 @@ pub fn walk_and_embed_assets(
let (frame_data, frame_final_url) = retrieve_asset( let (frame_data, frame_final_url) = retrieve_asset(
cache, cache,
client, client,
&url,
&src_full_url, &src_full_url,
false, false,
"text/html", "text/html",
@ -488,6 +497,7 @@ pub fn walk_and_embed_assets(
let (poster_data_url, _) = retrieve_asset( let (poster_data_url, _) = retrieve_asset(
cache, cache,
client, client,
&url,
&poster_full_url, &poster_full_url,
true, true,
"", "",
@ -528,6 +538,7 @@ pub fn walk_and_embed_assets(
attribute.value.as_ref(), attribute.value.as_ref(),
false, false,
&url, &url,
&url,
opt_no_images, opt_no_images,
opt_silent, opt_silent,
); );

@ -1,68 +0,0 @@
use crate::utils::{clean_url, data_to_data_url, is_data_url};
use reqwest::blocking::Client;
use reqwest::header::CONTENT_TYPE;
use std::collections::HashMap;
/// Retrieves the asset located at `url`, using `cache` to avoid repeated requests.
///
/// Data URLs are handed back unchanged without touching the cache or the network.
/// Any other URL is first looked up in `cache` under `clean_url(url)`; on a miss the
/// asset is requested through `client` and the result is stored in `cache` under
/// `clean_url` of the URL the response was actually served from.
///
/// # Arguments
///
/// * `cache` - previously retrieved assets, keyed by cleaned URL
/// * `client` - blocking HTTP client used when the asset is not cached
/// * `url` - address of the asset to retrieve
/// * `as_data_url` - when `true` the response body is converted into a data URL,
///   otherwise it is returned as text
/// * `mime` - media type for the data URL; when empty, the response's
///   `Content-Type` header is used if it is present and readable
/// * `opt_silent` - when `true`, no progress lines are written to stderr
///
/// # Returns
///
/// A `(data, final_url)` pair. For data URLs and cache hits `final_url` is `url`
/// itself; for fresh requests it is the URL reported by the response.
///
/// # Errors
///
/// Returns the `reqwest::Error` produced while sending the request or while
/// reading the response body.
pub fn retrieve_asset(
    cache: &mut HashMap<String, String>,
    client: &Client,
    url: &str,
    as_data_url: bool,
    mime: &str,
    opt_silent: bool,
) -> Result<(String, String), reqwest::Error> {
    // Data URLs already carry their payload, nothing to fetch
    if is_data_url(&url) {
        return Ok((url.to_string(), url.to_string()));
    }

    // NOTE(review): lookups use the requested URL while entries are stored under the
    // response URL, so a request that got redirected is not served from the cache
    // when repeated -- confirm this is intended.
    let cache_key = clean_url(&url);
    if let Some(data) = cache.get(&cache_key) {
        // url is in cache
        if !opt_silent {
            eprintln!("{} (from cache)", &url);
        }
        return Ok((data.to_string(), url.to_string()));
    }

    // url not in cache, we request it
    let mut response = client.get(url).send()?;
    let res_url = response.url().to_string();

    if !opt_silent {
        if url == res_url {
            eprintln!("{}", &url);
        } else {
            eprintln!("{} -> {}", &url, &res_url);
        }
    }

    let new_cache_key = clean_url(&res_url);

    if as_data_url {
        // Convert response into a byte array
        let mut data: Vec<u8> = vec![];
        response.copy_to(&mut data)?;

        // Attempt to obtain MIME type by reading the Content-Type header
        let mimetype = if mime == "" {
            response
                .headers()
                .get(CONTENT_TYPE)
                .and_then(|header| header.to_str().ok())
                .unwrap_or(&mime)
        } else {
            mime
        };

        let data_url = data_to_data_url(&mimetype, &data);
        // insert in cache
        cache.insert(new_cache_key, data_url.clone());
        Ok((data_url, res_url))
    } else {
        // Propagate body read/decode failures to the caller instead of panicking
        let content = response.text()?;
        // insert in cache
        cache.insert(new_cache_key, content.clone());
        Ok((content, res_url))
    }
}

@ -1,7 +1,7 @@
const JS_DOM_EVENT_ATTRS: &[&str] = &[ const JS_DOM_EVENT_ATTRS: &[&str] = &[
// From WHATWG HTML spec 8.1.5.2 'Event handlers on elements, Document objects, and Window objects': // From WHATWG HTML spec 8.1.5.2 "Event handlers on elements, Document objects, and Window objects":
// https://html.spec.whatwg.org/#event-handlers-on-elements,-document-objects,-and-window-objects // https://html.spec.whatwg.org/#event-handlers-on-elements,-document-objects,-and-window-objects
// https://html.spec.whatwg.org/#attributes-3 (table 'List of event handler content attributes') // https://html.spec.whatwg.org/#attributes-3 (table "List of event handler content attributes")
// Global event handlers // Global event handlers
"onabort", "onabort",

@ -5,7 +5,6 @@ extern crate lazy_static;
mod macros; mod macros;
pub mod html; pub mod html;
pub mod http;
pub mod js; pub mod js;
pub mod utils; pub mod utils;

@ -6,19 +6,20 @@ mod macros;
use crate::args::AppArgs; use crate::args::AppArgs;
use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets}; use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets};
use monolith::http::retrieve_asset; use monolith::utils::{data_url_to_text, is_data_url, is_file_url, is_http_url, retrieve_asset};
use monolith::utils::{data_url_to_text, is_data_url, is_http_url};
use reqwest::blocking::Client; use reqwest::blocking::Client;
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT}; use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
use std::collections::HashMap; use std::collections::HashMap;
use std::fs::File; use std::env;
use std::fs;
use std::io::{self, Error, Write}; use std::io::{self, Error, Write};
use std::path::Path;
use std::process; use std::process;
use std::time::Duration; use std::time::Duration;
enum Output { enum Output {
Stdout(io::Stdout), Stdout(io::Stdout),
File(File), File(fs::File),
} }
impl Output { impl Output {
@ -26,7 +27,7 @@ impl Output {
if file_path.is_empty() { if file_path.is_empty() {
Ok(Output::Stdout(io::stdout())) Ok(Output::Stdout(io::stdout()))
} else { } else {
Ok(Output::File(File::create(file_path)?)) Ok(Output::File(fs::File::create(file_path)?))
} }
} }
@ -46,16 +47,41 @@ impl Output {
fn main() { fn main() {
let app_args = AppArgs::get(); let app_args = AppArgs::get();
let target_url: &str = app_args.url_target.as_str(); let mut original_target: String = app_args.url_target.clone();
let target_url: &str;
let base_url; let base_url;
let dom; let dom;
if !is_http_url(target_url) && !is_data_url(target_url) { // Pre-process the input
eprintln!( let cwd_normalized: String =
"Only HTTP(S) or data URLs are supported but got: {}", str!(env::current_dir().unwrap().to_str().unwrap()).replace("\\", "/");
&target_url let path = Path::new(original_target.as_str());
); let path_is_relative: bool = path.is_relative();
if original_target.clone().len() == 0 {
eprintln!("No target specified");
process::exit(1); process::exit(1);
} else if is_http_url(original_target.clone()) || is_data_url(original_target.clone()) {
target_url = original_target.as_str();
} else if is_file_url(original_target.clone()) {
target_url = original_target.as_str();
} else if path.exists() {
if !path.is_file() {
eprintln!("Local target is not a file: {}", original_target);
process::exit(1);
}
original_target.insert_str(0, if cfg!(windows) { "file:///" } else { "file://" });
original_target = original_target.replace("\\", "/");
if path_is_relative {
original_target.insert_str(if cfg!(windows) { 8 } else { 7 }, &cwd_normalized);
original_target.insert_str(
if cfg!(windows) { 8 } else { 7 } + &cwd_normalized.len(),
"/",
);
}
target_url = original_target.as_str();
} else {
original_target.insert_str(0, "http://");
target_url = original_target.as_str();
} }
let mut output = Output::new(&app_args.output).expect("Could not prepare output"); let mut output = Output::new(&app_args.output).expect("Could not prepare output");
@ -81,21 +107,26 @@ fn main() {
.expect("Failed to initialize HTTP client"); .expect("Failed to initialize HTTP client");
// Retrieve root document // Retrieve root document
if is_http_url(target_url) { if is_file_url(target_url) || is_http_url(target_url) {
let (data, final_url) = let (data, final_url) = retrieve_asset(
retrieve_asset(&mut cache, &client, target_url, false, "", app_args.silent) &mut cache,
.expect("Could not retrieve assets in HTML"); &client,
target_url,
target_url,
false,
"",
app_args.silent,
)
.expect("Could not retrieve target document");
base_url = final_url; base_url = final_url;
dom = html_to_dom(&data); dom = html_to_dom(&data);
} else if is_data_url(target_url) { } else if is_data_url(target_url) {
let text: String = data_url_to_text(target_url); let text: String = data_url_to_text(target_url);
if text.len() == 0 { if text.len() == 0 {
eprintln!("Unsupported data URL input"); eprintln!("Unsupported data URL input");
process::exit(1); process::exit(1);
} }
base_url = str!(target_url);
base_url = str!();
dom = html_to_dom(&text); dom = html_to_dom(&text);
} else { } else {
process::exit(1); process::exit(1);

@ -1,4 +1,5 @@
use assert_cmd::prelude::*; use assert_cmd::prelude::*;
use std::env;
use std::process::Command; use std::process::Command;
#[test] #[test]
@ -22,9 +23,9 @@ fn print_version() -> Result<(), Box<dyn std::error::Error>> {
} }
#[test] #[test]
fn bad_input() -> Result<(), Box<dyn std::error::Error>> { fn bad_input_empty_target() -> Result<(), Box<dyn std::error::Error>> {
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?;
let out = cmd.arg("kernel.org").output().unwrap(); let out = cmd.arg("").output().unwrap();
// STDOUT should be empty // STDOUT should be empty
assert_eq!(std::str::from_utf8(&out.stdout).unwrap(), ""); assert_eq!(std::str::from_utf8(&out.stdout).unwrap(), "");
@ -32,7 +33,7 @@ fn bad_input() -> Result<(), Box<dyn std::error::Error>> {
// STDERR should contain error description // STDERR should contain error description
assert_eq!( assert_eq!(
std::str::from_utf8(&out.stderr).unwrap(), std::str::from_utf8(&out.stderr).unwrap(),
"Only HTTP(S) or data URLs are supported but got: kernel.org\n" "No target specified\n"
); );
// The exit code should be 1 // The exit code should be 1
@ -73,7 +74,9 @@ fn isolate_data_url() -> Result<(), Box<dyn std::error::Error>> {
// STDOUT should contain isolated HTML // STDOUT should contain isolated HTML
assert_eq!( assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(), std::str::from_utf8(&out.stdout).unwrap(),
"<html><head><meta http-equiv=\"Content-Security-Policy\" content=\"default-src 'unsafe-inline' data:;\"></meta></head><body>Hello, World!</body></html>\n" "<html><head>\
<meta http-equiv=\"Content-Security-Policy\" content=\"default-src 'unsafe-inline' data:;\"></meta>\
</head><body>Hello, World!</body></html>\n"
); );
// STDERR should be empty // STDERR should be empty
@ -97,7 +100,10 @@ fn remove_css_from_data_url() -> Result<(), Box<dyn std::error::Error>> {
// STDOUT should contain HTML with no CSS // STDOUT should contain HTML with no CSS
assert_eq!( assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(), std::str::from_utf8(&out.stdout).unwrap(),
"<html><head><meta http-equiv=\"Content-Security-Policy\" content=\"style-src 'none';\"></meta><style></style></head><body>Hello</body></html>\n" "<html><head>\
<meta http-equiv=\"Content-Security-Policy\" content=\"style-src 'none';\"></meta>\
<style></style>\
</head><body>Hello</body></html>\n"
); );
// STDERR should be empty // STDERR should be empty
@ -121,7 +127,9 @@ fn remove_frames_from_data_url() -> Result<(), Box<dyn std::error::Error>> {
// STDOUT should contain HTML with no iframes // STDOUT should contain HTML with no iframes
assert_eq!( assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(), std::str::from_utf8(&out.stdout).unwrap(),
"<html><head><meta http-equiv=\"Content-Security-Policy\" content=\"frame-src 'none';child-src 'none';\"></meta></head><body><iframe src=\"\"></iframe>Hi</body></html>\n" "<html><head>\
<meta http-equiv=\"Content-Security-Policy\" content=\"frame-src 'none';child-src 'none';\"></meta>\
</head><body><iframe src=\"\"></iframe>Hi</body></html>\n"
); );
// STDERR should be empty // STDERR should be empty
@ -145,7 +153,15 @@ fn remove_images_from_data_url() -> Result<(), Box<dyn std::error::Error>> {
// STDOUT should contain HTML with no images // STDOUT should contain HTML with no images
assert_eq!( assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(), std::str::from_utf8(&out.stdout).unwrap(),
"<html><head><meta http-equiv=\"Content-Security-Policy\" content=\"img-src data:;\"></meta></head><body><img src=\"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=\">Hi</body></html>\n" "<html>\
<head>\
<meta http-equiv=\"Content-Security-Policy\" content=\"img-src data:;\"></meta>\
</head>\
<body>\
<img src=\"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=\">\
Hi\
</body>\
</html>\n"
); );
// STDERR should be empty // STDERR should be empty
@ -169,7 +185,203 @@ fn remove_js_from_data_url() -> Result<(), Box<dyn std::error::Error>> {
// STDOUT should contain HTML with no JS // STDOUT should contain HTML with no JS
assert_eq!( assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(), std::str::from_utf8(&out.stdout).unwrap(),
"<html><head><meta http-equiv=\"Content-Security-Policy\" content=\"script-src 'none';\"></meta><script></script></head><body>Hi</body></html>\n" "<html>\
<head>\
<meta http-equiv=\"Content-Security-Policy\" content=\"script-src 'none';\"></meta>\
<script></script></head>\
<body>Hi</body>\
</html>\n"
);
// STDERR should be empty
assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), "");
// The exit code should be 0
out.assert().code(0);
Ok(())
}
/// Runs the binary against a relative path to the local HTML fixture and checks
/// the produced document, the list of retrieved file URLs on stderr, and the exit code.
#[test]
fn local_file_target_input() -> Result<(), Box<dyn std::error::Error>> {
    let mut runner = Command::cargo_bin(env!("CARGO_PKG_NAME"))?;
    let working_dir: String =
        str!(env::current_dir().unwrap().to_str().unwrap()).replace("\\", "/");

    // Relative path to the fixture, spelled with the platform's separator
    let relative_target: &str = if cfg!(windows) {
        "src\\tests\\data\\local-file.html"
    } else {
        "src/tests/data/local-file.html"
    };
    let out = runner.arg(relative_target).output().unwrap();

    let scheme_prefix: &str = if cfg!(windows) { "file:///" } else { "file://" };

    // STDOUT should contain HTML from the local file
    let stdout_text = std::str::from_utf8(&out.stdout).unwrap();
    assert_eq!(
        stdout_text,
        "<!DOCTYPE html><html lang=\"en\"><head>\n \
        <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n \
        <title>Local HTML file</title>\n \
        <link href=\"data:text/css;base64,Ym9keSB7CiAgICBiYWNrZ3JvdW5kLWNvbG9yOiAjMDAwOwogICAgY29sb3I6ICNmZmY7Cn0K\" rel=\"stylesheet\" type=\"text/css\">\n \
        <link href=\"data:text/css;base64,\" rel=\"stylesheet\" type=\"text/css\">\n</head>\n\n<body>\n \
        <img alt=\"\" src=\"\">\n \
        <a href=\"file://local-file.html/\">Tricky href</a>\n \
        <a href=\"https://github.com/Y2Z/monolith\">Remote URL</a>\n \
        <script src=\"data:application/javascript;base64,ZG9jdW1lbnQuYm9keS5zdHlsZS5iYWNrZ3JvdW5kQ29sb3IgPSAiZ3JlZW4iOwpkb2N1bWVudC5ib2R5LnN0eWxlLmNvbG9yID0gInJlZCI7Cg==\"></script>\n\n\n\n\
        </body></html>\n"
    );

    // STDERR should contain list of retrieved file URLs, one line per file, in this order
    let expected_stderr: String = ["local-file.html", "local-style.css", "local-script.js"]
        .iter()
        .map(|name| format!("{}{}/src/tests/data/{}\n", scheme_prefix, working_dir, name))
        .collect();
    let stderr_text = std::str::from_utf8(&out.stderr).unwrap();
    assert_eq!(stderr_text, expected_stderr);

    // The exit code should be 0
    out.assert().code(0);

    Ok(())
}
/// Runs the binary with the `-jciI` flags against an absolute path to the local HTML
/// fixture and checks the produced document, stderr (only the target file is listed),
/// and the exit code.
#[test]
fn local_file_target_input_absolute_target_path() -> Result<(), Box<dyn std::error::Error>> {
    // Query the working directory once; both the argument and the expected
    // stderr line are derived from it
    let cwd = env::current_dir().unwrap();
    let cwd_normalized: String = str!(cwd.to_str().unwrap()).replace("\\", "/");

    // Absolute path to the fixture, assembled with the platform's separator
    let target_path = cwd.join("src").join("tests").join("data").join("local-file.html");

    let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?;
    let out = cmd.arg("-jciI").arg(&target_path).output().unwrap();
    let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" };

    // STDOUT should contain HTML from the local file
    assert_eq!(
        std::str::from_utf8(&out.stdout).unwrap(),
        "<!DOCTYPE html><html lang=\"en\"><head>\
        <meta http-equiv=\"Content-Security-Policy\" content=\"default-src 'unsafe-inline' data:; style-src 'none'; script-src 'none'; img-src data:;\"></meta>\n \
        <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n \
        <title>Local HTML file</title>\n \
        <link href=\"\" rel=\"stylesheet\" type=\"text/css\">\n \
        <link href=\"\" rel=\"stylesheet\" type=\"text/css\">\n</head>\n\n<body>\n \
        <img alt=\"\" src=\"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=\">\n \
        <a href=\"file://local-file.html/\">Tricky href</a>\n \
        <a href=\"https://github.com/Y2Z/monolith\">Remote URL</a>\n \
        <script src=\"\"></script>\n\n\n\n\
        </body></html>\n"
    );

    // STDERR should contain only the target file
    assert_eq!(
        std::str::from_utf8(&out.stderr).unwrap(),
        format!(
            "{file}{cwd}/src/tests/data/local-file.html\n",
            file = file_url_protocol,
            cwd = cwd_normalized,
        )
    );

    // The exit code should be 0
    out.assert().code(0);

    Ok(())
}
/// Runs the binary with the `-cji` flags against a `file:` URL pointing at the local
/// HTML fixture and checks the produced document, stderr, and the exit code.
#[test]
fn local_file_url_target_input() -> Result<(), Box<dyn std::error::Error>> {
    let mut runner = Command::cargo_bin(env!("CARGO_PKG_NAME"))?;
    let working_dir = env::current_dir().unwrap();
    let scheme_prefix: &str = if cfg!(windows) { "file:///" } else { "file://" };

    // The very same URL is passed as the target and expected back on stderr
    let target_url: String = if cfg!(windows) {
        format!(
            "{}{}\\src\\tests\\data\\local-file.html",
            scheme_prefix,
            working_dir.to_str().unwrap(),
        )
    } else {
        format!(
            "{}{}/src/tests/data/local-file.html",
            scheme_prefix,
            working_dir.to_str().unwrap(),
        )
    };

    let out = runner.arg("-cji").arg(&target_url).output().unwrap();

    // STDOUT should contain HTML from the local file
    let stdout_text = std::str::from_utf8(&out.stdout).unwrap();
    assert_eq!(
        stdout_text,
        "<!DOCTYPE html><html lang=\"en\"><head>\
        <meta http-equiv=\"Content-Security-Policy\" content=\"style-src 'none'; script-src 'none'; img-src data:;\"></meta>\n \
        <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n \
        <title>Local HTML file</title>\n \
        <link href=\"\" rel=\"stylesheet\" type=\"text/css\">\n \
        <link href=\"\" rel=\"stylesheet\" type=\"text/css\">\n</head>\n\n<body>\n \
        <img alt=\"\" src=\"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=\">\n \
        <a href=\"file://local-file.html/\">Tricky href</a>\n \
        <a href=\"https://github.com/Y2Z/monolith\">Remote URL</a>\n \
        <script src=\"\"></script>\n\n\n\n\
        </body></html>\n"
    );

    // STDERR should contain list of retrieved file URLs
    let stderr_text = std::str::from_utf8(&out.stderr).unwrap();
    assert_eq!(stderr_text, format!("{}\n", target_url));

    // The exit code should be 0
    out.assert().code(0);

    Ok(())
}
#[test]
fn security_disallow_local_assets_within_data_url_targets() -> Result<(), Box<dyn std::error::Error>>
{
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?;
let out = cmd
.arg("data:text/html,%3Cscript%20src=\"src/tests/data/local-script.js\"%3E%3C/script%3E")
.output()
.unwrap();
// STDOUT should contain HTML with no JS in it
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
"<html><head><script src=\"\"></script></head><body></body></html>\n"
); );
// STDERR should be empty // STDERR should be empty

@ -0,0 +1,19 @@
<!doctype html>
<html lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>Local HTML file</title>
<link href="local-style.css" rel="stylesheet" type="text/css" />
<link href="local-style-does-not-exist.css" rel="stylesheet" type="text/css" />
</head>
<body>
<img src="monolith.png" alt="" />
<a href="//local-file.html">Tricky href</a>
<a href="https://github.com/Y2Z/monolith">Remote URL</a>
<script src="local-script.js"></script>
</body>
</html>

@ -0,0 +1,2 @@
document.body.style.backgroundColor = "green";
document.body.style.color = "red";

@ -0,0 +1,4 @@
body {
background-color: #000;
color: #fff;
}

@ -1,25 +0,0 @@
use crate::http::retrieve_asset;
use reqwest::blocking::Client;
use std::collections::HashMap;
/// `retrieve_asset` must hand a data URL back untouched, both as the data and as the
/// final URL, whether or not an explicit MIME type is requested.
#[test]
fn test_retrieve_asset() {
    let data_url = "data:text/html;base64,...";
    let mut cache = HashMap::new();
    let client = Client::new();

    // First without a MIME type, then with an explicit one
    for mime in ["", "image/png"].iter() {
        let (data, final_url) =
            retrieve_asset(&mut cache, &client, data_url, true, mime, false).unwrap();

        assert_eq!(&data, data_url);
        assert_eq!(&final_url, data_url);
    }
}

@ -1,5 +1,4 @@
mod cli; mod cli;
mod html; mod html;
mod http;
mod js; mod js;
mod utils; mod utils;

@ -1,14 +1,14 @@
use crate::utils::{ use crate::utils;
clean_url, data_to_data_url, data_url_to_text, detect_mimetype, is_data_url, is_http_url, use reqwest::blocking::Client;
resolve_url, url_has_protocol, use std::collections::HashMap;
}; use std::env;
use url::ParseError; use url::ParseError;
#[test] #[test]
fn test_data_to_data_url() { fn data_to_data_url() {
let mime = "application/javascript"; let mime = "application/javascript";
let data = "var word = 'hello';\nalert(word);\n"; let data = "var word = 'hello';\nalert(word);\n";
let datauri = data_to_data_url(mime, data.as_bytes()); let datauri = utils::data_to_data_url(mime, data.as_bytes());
assert_eq!( assert_eq!(
&datauri, &datauri,
"data:application/javascript;base64,dmFyIHdvcmQgPSAnaGVsbG8nOwphbGVydCh3b3JkKTsK" "data:application/javascript;base64,dmFyIHdvcmQgPSAnaGVsbG8nOwphbGVydCh3b3JkKTsK"
@ -16,90 +16,124 @@ fn test_data_to_data_url() {
} }
#[test] #[test]
fn test_detect_mimetype() { fn detect_mimetype() {
// image // Image
assert_eq!(detect_mimetype(b"GIF87a"), "image/gif"); assert_eq!(utils::detect_mimetype(b"GIF87a"), "image/gif");
assert_eq!(detect_mimetype(b"GIF89a"), "image/gif"); assert_eq!(utils::detect_mimetype(b"GIF89a"), "image/gif");
assert_eq!(detect_mimetype(b"\xFF\xD8\xFF"), "image/jpeg"); assert_eq!(utils::detect_mimetype(b"\xFF\xD8\xFF"), "image/jpeg");
assert_eq!(detect_mimetype(b"\x89PNG\x0D\x0A\x1A\x0A"), "image/png"); assert_eq!(
assert_eq!(detect_mimetype(b"<?xml "), "image/svg+xml"); utils::detect_mimetype(b"\x89PNG\x0D\x0A\x1A\x0A"),
assert_eq!(detect_mimetype(b"<svg "), "image/svg+xml"); "image/png"
assert_eq!(detect_mimetype(b"RIFF....WEBPVP8 "), "image/webp"); );
assert_eq!(detect_mimetype(b"\x00\x00\x01\x00"), "image/x-icon"); assert_eq!(utils::detect_mimetype(b"<?xml "), "image/svg+xml");
// audio assert_eq!(utils::detect_mimetype(b"<svg "), "image/svg+xml");
assert_eq!(detect_mimetype(b"ID3"), "audio/mpeg"); assert_eq!(utils::detect_mimetype(b"RIFF....WEBPVP8 "), "image/webp");
assert_eq!(detect_mimetype(b"\xFF\x0E"), "audio/mpeg"); assert_eq!(utils::detect_mimetype(b"\x00\x00\x01\x00"), "image/x-icon");
assert_eq!(detect_mimetype(b"\xFF\x0F"), "audio/mpeg");
assert_eq!(detect_mimetype(b"OggS"), "audio/ogg"); // Audio
assert_eq!(detect_mimetype(b"RIFF....WAVEfmt "), "audio/wav"); assert_eq!(utils::detect_mimetype(b"ID3"), "audio/mpeg");
assert_eq!(detect_mimetype(b"fLaC"), "audio/x-flac"); assert_eq!(utils::detect_mimetype(b"\xFF\x0E"), "audio/mpeg");
// video assert_eq!(utils::detect_mimetype(b"\xFF\x0F"), "audio/mpeg");
assert_eq!(detect_mimetype(b"RIFF....AVI LIST"), "video/avi"); assert_eq!(utils::detect_mimetype(b"OggS"), "audio/ogg");
assert_eq!(detect_mimetype(b"....ftyp"), "video/mp4"); assert_eq!(utils::detect_mimetype(b"RIFF....WAVEfmt "), "audio/wav");
assert_eq!(detect_mimetype(b"\x00\x00\x01\x0B"), "video/mpeg"); assert_eq!(utils::detect_mimetype(b"fLaC"), "audio/x-flac");
assert_eq!(detect_mimetype(b"....moov"), "video/quicktime");
assert_eq!(detect_mimetype(b"\x1A\x45\xDF\xA3"), "video/webm"); // Video
assert_eq!(utils::detect_mimetype(b"RIFF....AVI LIST"), "video/avi");
assert_eq!(utils::detect_mimetype(b"....ftyp"), "video/mp4");
assert_eq!(utils::detect_mimetype(b"\x00\x00\x01\x0B"), "video/mpeg");
assert_eq!(utils::detect_mimetype(b"....moov"), "video/quicktime");
assert_eq!(utils::detect_mimetype(b"\x1A\x45\xDF\xA3"), "video/webm");
} }
#[test] #[test]
fn test_url_has_protocol() { fn url_has_protocol() {
// passing // Passing
assert_eq!( assert_eq!(
url_has_protocol("mailto:somebody@somewhere.com?subject=hello"), utils::url_has_protocol("mailto:somebody@somewhere.com?subject=hello"),
true true
); );
assert_eq!(url_has_protocol("tel:5551234567"), true); assert_eq!(utils::url_has_protocol("tel:5551234567"), true);
assert_eq!( assert_eq!(
url_has_protocol("ftp:user:password@some-ftp-server.com"), utils::url_has_protocol("ftp:user:password@some-ftp-server.com"),
true true
); );
assert_eq!(url_has_protocol("javascript:void(0)"), true); assert_eq!(utils::url_has_protocol("javascript:void(0)"), true);
assert_eq!(url_has_protocol("http://news.ycombinator.com"), true); assert_eq!(utils::url_has_protocol("http://news.ycombinator.com"), true);
assert_eq!(url_has_protocol("https://github.com"), true); assert_eq!(utils::url_has_protocol("https://github.com"), true);
assert_eq!( assert_eq!(
url_has_protocol("MAILTO:somebody@somewhere.com?subject=hello"), utils::url_has_protocol("MAILTO:somebody@somewhere.com?subject=hello"),
true true
); );
// failing
// Failing
assert_eq!(
utils::url_has_protocol("//some-hostname.com/some-file.html"),
false
);
assert_eq!( assert_eq!(
url_has_protocol("//some-hostname.com/some-file.html"), utils::url_has_protocol("some-hostname.com/some-file.html"),
false false
); );
assert_eq!(url_has_protocol("some-hostname.com/some-file.html"), false); assert_eq!(utils::url_has_protocol("/some-file.html"), false);
assert_eq!(url_has_protocol("/some-file.html"), false); assert_eq!(utils::url_has_protocol(""), false);
assert_eq!(url_has_protocol(""), false); }
#[test]
fn is_file_url() {
// Passing
assert!(utils::is_file_url(
"file:///home/user/Websites/my-website/index.html"
));
assert!(utils::is_file_url(
"file:///C:/Documents%20and%20Settings/user/Websites/my-website/assets/images/logo.png"
));
assert!(utils::is_file_url(
"file:\\\\\\home\\user\\Websites\\my-website\\index.html"
));
// Failing
assert!(!utils::is_file_url("//kernel.org"));
assert!(!utils::is_file_url("./index.html"));
assert!(!utils::is_file_url("some-local-page.htm"));
assert!(!utils::is_file_url("https://1.2.3.4:80/www/index.html"));
assert!(!utils::is_file_url(
"data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h"
));
} }
#[test] #[test]
fn test_is_http_url() { fn is_http_url() {
// passing // Passing
assert!(is_http_url("https://www.rust-lang.org/")); assert!(utils::is_http_url("https://www.rust-lang.org/"));
assert!(is_http_url("http://kernel.org")); assert!(utils::is_http_url("http://kernel.org"));
// failing assert!(utils::is_http_url("http:\\\\freebsd.org\\"));
assert!(!is_http_url("//kernel.org"));
assert!(!is_http_url("./index.html")); // Failing
assert!(!is_http_url("some-local-page.htm")); assert!(!utils::is_http_url("//kernel.org"));
assert!(!is_http_url("ftp://1.2.3.4/www/index.html")); assert!(!utils::is_http_url("./index.html"));
assert!(!is_http_url( assert!(!utils::is_http_url("some-local-page.htm"));
assert!(!utils::is_http_url("ftp://1.2.3.4/www/index.html"));
assert!(!utils::is_http_url(
"data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h" "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h"
)); ));
} }
#[test] #[test]
fn test_resolve_url() -> Result<(), ParseError> { fn resolve_url() -> Result<(), ParseError> {
let resolved_url = resolve_url("https://www.kernel.org", "../category/signatures.html")?; let resolved_url = utils::resolve_url("https://www.kernel.org", "../category/signatures.html")?;
assert_eq!( assert_eq!(
resolved_url.as_str(), resolved_url.as_str(),
"https://www.kernel.org/category/signatures.html" "https://www.kernel.org/category/signatures.html"
); );
let resolved_url = resolve_url("https://www.kernel.org", "category/signatures.html")?; let resolved_url = utils::resolve_url("https://www.kernel.org", "category/signatures.html")?;
assert_eq!( assert_eq!(
resolved_url.as_str(), resolved_url.as_str(),
"https://www.kernel.org/category/signatures.html" "https://www.kernel.org/category/signatures.html"
); );
let resolved_url = resolve_url( let resolved_url = utils::resolve_url(
"saved_page.htm", "saved_page.htm",
"https://www.kernel.org/category/signatures.html", "https://www.kernel.org/category/signatures.html",
)?; )?;
@ -108,7 +142,7 @@ fn test_resolve_url() -> Result<(), ParseError> {
"https://www.kernel.org/category/signatures.html" "https://www.kernel.org/category/signatures.html"
); );
let resolved_url = resolve_url( let resolved_url = utils::resolve_url(
"https://www.kernel.org", "https://www.kernel.org",
"//www.kernel.org/theme/images/logos/tux.png", "//www.kernel.org/theme/images/logos/tux.png",
)?; )?;
@ -117,7 +151,7 @@ fn test_resolve_url() -> Result<(), ParseError> {
"https://www.kernel.org/theme/images/logos/tux.png" "https://www.kernel.org/theme/images/logos/tux.png"
); );
let resolved_url = resolve_url( let resolved_url = utils::resolve_url(
"https://www.kernel.org", "https://www.kernel.org",
"//another-host.org/theme/images/logos/tux.png", "//another-host.org/theme/images/logos/tux.png",
)?; )?;
@ -126,7 +160,7 @@ fn test_resolve_url() -> Result<(), ParseError> {
"https://another-host.org/theme/images/logos/tux.png" "https://another-host.org/theme/images/logos/tux.png"
); );
let resolved_url = resolve_url( let resolved_url = utils::resolve_url(
"https://www.kernel.org/category/signatures.html", "https://www.kernel.org/category/signatures.html",
"/theme/images/logos/tux.png", "/theme/images/logos/tux.png",
)?; )?;
@ -135,7 +169,7 @@ fn test_resolve_url() -> Result<(), ParseError> {
"https://www.kernel.org/theme/images/logos/tux.png" "https://www.kernel.org/theme/images/logos/tux.png"
); );
let resolved_url = resolve_url( let resolved_url = utils::resolve_url(
"https://www.w3schools.com/html/html_iframe.asp", "https://www.w3schools.com/html/html_iframe.asp",
"default.asp", "default.asp",
)?; )?;
@ -144,7 +178,7 @@ fn test_resolve_url() -> Result<(), ParseError> {
"https://www.w3schools.com/html/default.asp" "https://www.w3schools.com/html/default.asp"
); );
let resolved_url = resolve_url( let resolved_url = utils::resolve_url(
"data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h", "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h",
"https://www.kernel.org/category/signatures.html", "https://www.kernel.org/category/signatures.html",
)?; )?;
@ -153,62 +187,197 @@ fn test_resolve_url() -> Result<(), ParseError> {
"https://www.kernel.org/category/signatures.html" "https://www.kernel.org/category/signatures.html"
); );
let resolved_url = resolve_url( let resolved_url = utils::resolve_url(
"data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h", "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h",
"//www.w3schools.com/html/html_iframe.asp", "//www.w3schools.com/html/html_iframe.asp",
) )
.unwrap_or(str!()); .unwrap_or(str!());
assert_eq!(resolved_url.as_str(), ""); assert_eq!(resolved_url.as_str(), "");
let resolved_url = utils::resolve_url(
"file:///home/user/Websites/my-website/index.html",
"assets/images/logo.png",
)
.unwrap_or(str!());
assert_eq!(
resolved_url.as_str(),
"file:///home/user/Websites/my-website/assets/images/logo.png"
);
let resolved_url = utils::resolve_url(
"file:\\\\\\home\\user\\Websites\\my-website\\index.html",
"assets\\images\\logo.png",
)
.unwrap_or(str!());
assert_eq!(
resolved_url.as_str(),
"file:///home/user/Websites/my-website/assets/images/logo.png"
);
Ok(()) Ok(())
} }
#[test] #[test]
fn test_is_data_url() { fn is_data_url() {
// passing // Passing
assert!(is_data_url( assert!(utils::is_data_url(
"data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h" "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h"
)); ));
// failing
assert!(!is_data_url("https://kernel.org")); // Failing
assert!(!is_data_url("//kernel.org")); assert!(!utils::is_data_url("https://kernel.org"));
assert!(!is_data_url("")); assert!(!utils::is_data_url("//kernel.org"));
assert!(!utils::is_data_url(""));
} }
#[test] #[test]
fn test_clean_url() { fn clean_url() {
assert_eq!( assert_eq!(
clean_url("https://somewhere.com/font.eot#iefix"), utils::clean_url("https://somewhere.com/font.eot#iefix"),
"https://somewhere.com/font.eot" "https://somewhere.com/font.eot"
); );
assert_eq!( assert_eq!(
clean_url("https://somewhere.com/font.eot#"), utils::clean_url("https://somewhere.com/font.eot#"),
"https://somewhere.com/font.eot" "https://somewhere.com/font.eot"
); );
assert_eq!( assert_eq!(
clean_url("https://somewhere.com/font.eot?#"), utils::clean_url("https://somewhere.com/font.eot?#"),
"https://somewhere.com/font.eot" "https://somewhere.com/font.eot"
); );
} }
#[test] #[test]
fn test_data_url_to_text() { fn data_url_to_text() {
assert_eq!( assert_eq!(
data_url_to_text("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg=="), utils::data_url_to_text("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg=="),
"Work expands so as to fill the time available for its completion" "Work expands so as to fill the time available for its completion"
); );
assert_eq!( assert_eq!(
data_url_to_text( utils::data_url_to_text(
"data:text/html;utf8,Work expands so as to fill the time available for its completion" "data:text/html;utf8,Work expands so as to fill the time available for its completion"
), ),
"Work expands so as to fill the time available for its completion" "Work expands so as to fill the time available for its completion"
); );
assert_eq!( assert_eq!(
data_url_to_text( utils::data_url_to_text(
"data:text/html,Work expands so as to fill the time available for its completion" "data:text/html,Work expands so as to fill the time available for its completion"
), ),
"Work expands so as to fill the time available for its completion" "Work expands so as to fill the time available for its completion"
); );
assert_eq!(
utils::data_url_to_text(
" data:text/html;charset=utf-8,Work expands so as to fill the time available for its completion "
),
"Work expands so as to fill the time available for its completion"
);
}
#[test]
fn decode_url() {
    // Percent-encoded multi-byte UTF-8 sequences decode back to the original text
    assert_eq!(
        utils::decode_url(str!(
            "%E6%A4%9C%E3%83%92%E3%83%A0%E8%A7%A3%E5%A1%97%E3%82%83%E3%83%83%20%3D%20%E3%82%B5"
        )),
        "検ヒム解塗ゃッ = サ"
    );

    // Each "%20" decodes to a space and the literal space in between is kept,
    // so the result consists of exactly three spaces
    assert_eq!(utils::decode_url(str!("%20 %20")), "   ");
}
#[test]
fn retrieve_asset() {
    let cache = &mut HashMap::new();
    let client = Client::new();
    let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" };

    // If both source and target are data URLs, the result must contain the target data URL;
    // the media type parameter must not influence data URLs either
    for mime in ["", "image/png"].iter() {
        let (data, final_url) = utils::retrieve_asset(
            cache,
            &client,
            "data:text/html;base64,SoUrCe",
            "data:text/html;base64,TaRgEt",
            true,
            mime,
            false,
        )
        .unwrap();
        assert_eq!(&data, "data:text/html;base64,TaRgEt");
        assert_eq!(&final_url, "data:text/html;base64,TaRgEt");
    }

    // Inclusion of local assets should not be allowed from data URL sources,
    // nor from remote sources
    for parent in ["data:text/html;base64,SoUrCe", "https://kernel.org/"].iter() {
        let (data, final_url) = utils::retrieve_asset(
            cache,
            &client,
            parent,
            "file:///etc/passwd",
            true,
            "",
            false,
        )
        .unwrap();
        assert_eq!(&data, "");
        assert_eq!(&final_url, "");
    }

    // Inclusion of local assets from local sources should be allowed
    let cwd = env::current_dir().unwrap();
    let local_page_url = format!(
        "{file}{cwd}/src/tests/data/local-file.html",
        file = file_url_protocol,
        cwd = cwd.to_str().unwrap()
    );
    let local_script_url = format!(
        "{file}{cwd}/src/tests/data/local-script.js",
        file = file_url_protocol,
        cwd = cwd.to_str().unwrap()
    );
    let (data, final_url) = utils::retrieve_asset(
        cache,
        &client,
        &local_page_url,
        &local_script_url,
        true,
        "application/javascript",
        false,
    )
    .unwrap();
    assert_eq!(&data, "data:application/javascript;base64,ZG9jdW1lbnQuYm9keS5zdHlsZS5iYWNrZ3JvdW5kQ29sb3IgPSAiZ3JlZW4iOwpkb2N1bWVudC5ib2R5LnN0eWxlLmNvbG9yID0gInJlZCI7Cg==");
    assert_eq!(&final_url, &local_script_url);
}

@ -1,8 +1,10 @@
use crate::http::retrieve_asset; use base64;
use base64::{decode, encode};
use regex::Regex; use regex::Regex;
use reqwest::blocking::Client; use reqwest::blocking::Client;
use reqwest::header::CONTENT_TYPE;
use std::collections::HashMap; use std::collections::HashMap;
use std::fs;
use std::path::Path;
use url::{form_urlencoded, ParseError, Url}; use url::{form_urlencoded, ParseError, Url};
/// This monster of a regex is used to match any kind of URL found in CSS. /// This monster of a regex is used to match any kind of URL found in CSS.
@ -71,7 +73,7 @@ pub fn data_to_data_url(mime: &str, data: &[u8]) -> String {
} else { } else {
mime.to_string() mime.to_string()
}; };
format!("data:{};base64,{}", mimetype, encode(data)) format!("data:{};base64,{}", mimetype, base64::encode(data))
} }
pub fn detect_mimetype(data: &[u8]) -> String { pub fn detect_mimetype(data: &[u8]) -> String {
@ -95,6 +97,12 @@ pub fn is_data_url<T: AsRef<str>>(url: T) -> bool {
.unwrap_or(false) .unwrap_or(false)
} }
/// Reports whether `url` parses as an absolute URL whose scheme is `file`.
///
/// Input that cannot be parsed as a URL yields `false`.
pub fn is_file_url<T: AsRef<str>>(url: T) -> bool {
    match Url::parse(url.as_ref()) {
        Ok(parsed) => parsed.scheme() == "file",
        Err(_) => false,
    }
}
pub fn is_http_url<T: AsRef<str>>(url: T) -> bool { pub fn is_http_url<T: AsRef<str>>(url: T) -> bool {
Url::parse(url.as_ref()) Url::parse(url.as_ref())
.and_then(|u| Ok(u.scheme() == "http" || u.scheme() == "https")) .and_then(|u| Ok(u.scheme() == "http" || u.scheme() == "https"))
@ -118,6 +126,7 @@ pub fn resolve_css_imports(
client: &Client, client: &Client,
css_string: &str, css_string: &str,
as_data_url: bool, as_data_url: bool,
parent_url: &str,
href: &str, href: &str,
opt_no_images: bool, opt_no_images: bool,
opt_silent: bool, opt_silent: bool,
@ -127,12 +136,12 @@ pub fn resolve_css_imports(
for link in REGEX_CSS_URL.captures_iter(&css_string) { for link in REGEX_CSS_URL.captures_iter(&css_string) {
let target_link = link.name("url").unwrap().as_str(); let target_link = link.name("url").unwrap().as_str();
// Determine the type of link // Determine linked asset type
let is_stylesheet = link.name("stylesheet").is_some(); let is_stylesheet = link.name("stylesheet").is_some();
let is_font = link.name("font").is_some(); let is_font = link.name("font").is_some();
let is_image = !is_stylesheet && !is_font; let is_image = !is_stylesheet && !is_font;
// Generate absolute URL for content // Generate absolute URL for the content
let embedded_url = match resolve_url(href, target_link) { let embedded_url = match resolve_url(href, target_link) {
Ok(url) => url, Ok(url) => url,
Err(_) => continue, // Malformed URL Err(_) => continue, // Malformed URL
@ -144,8 +153,9 @@ pub fn resolve_css_imports(
retrieve_asset( retrieve_asset(
cache, cache,
client, client,
&parent_url,
&embedded_url, &embedded_url,
false, // Formating as data URL will be done later false, // Formatting as data URL will be done later
"text/css", // Expect CSS "text/css", // Expect CSS
opt_silent, opt_silent,
) )
@ -155,6 +165,7 @@ pub fn resolve_css_imports(
client, client,
&content, &content,
true, // Finally, convert to a data URL true, // Finally, convert to a data URL
&parent_url,
&embedded_url, &embedded_url,
opt_no_images, opt_no_images,
opt_silent, opt_silent,
@ -165,6 +176,7 @@ pub fn resolve_css_imports(
retrieve_asset( retrieve_asset(
cache, cache,
client, client,
&parent_url,
&embedded_url, &embedded_url,
true, // Format as data URL true, // Format as data URL
"", // Unknown MIME type "", // Unknown MIME type
@ -186,10 +198,11 @@ pub fn resolve_css_imports(
let replacement = format!("\"{}\"", &content); let replacement = format!("\"{}\"", &content);
let dest = link.name("to_repl").unwrap(); let dest = link.name("to_repl").unwrap();
let offset = resolved_css.len() - css_string.len(); if resolved_css.len() > css_string.len() {
let target_range = (dest.start() + offset)..(dest.end() + offset); let offset = resolved_css.len() - css_string.len();
let target_range = (dest.start() + offset)..(dest.end() + offset);
resolved_css.replace_range(target_range, &replacement); resolved_css.replace_range(target_range, &replacement);
}
} }
if as_data_url { if as_data_url {
@ -222,20 +235,7 @@ pub fn data_url_to_text<T: AsRef<str>>(url: T) -> String {
let meta_data: String = path.chars().take(comma_loc).collect(); let meta_data: String = path.chars().take(comma_loc).collect();
let raw_data: String = path.chars().skip(comma_loc + 1).collect(); let raw_data: String = path.chars().skip(comma_loc + 1).collect();
let data: String = form_urlencoded::parse(raw_data.as_bytes()) let data: String = decode_url(raw_data);
.map(|(key, val)| {
[
key.to_string(),
if val.to_string().len() == 0 {
str!()
} else {
str!('=')
},
val.to_string(),
]
.concat()
})
.collect();
let meta_data_items: Vec<&str> = meta_data.split(';').collect(); let meta_data_items: Vec<&str> = meta_data.split(';').collect();
let mut mime_type: &str = ""; let mut mime_type: &str = "";
@ -259,7 +259,7 @@ pub fn data_url_to_text<T: AsRef<str>>(url: T) -> String {
if mime_type.eq_ignore_ascii_case("text/html") { if mime_type.eq_ignore_ascii_case("text/html") {
if encoding.eq_ignore_ascii_case("base64") { if encoding.eq_ignore_ascii_case("base64") {
String::from_utf8(decode(&data).unwrap_or(vec![])).unwrap_or(str!()) String::from_utf8(base64::decode(&data).unwrap_or(vec![])).unwrap_or(str!())
} else { } else {
data data
} }
@ -267,3 +267,114 @@ pub fn data_url_to_text<T: AsRef<str>>(url: T) -> String {
str!() str!()
} }
} }
/// Decodes every percent-encoded (`%XX`) sequence found in `input`.
///
/// Only `%` followed by two hexadecimal digits is rewritten; all other bytes are
/// copied through untouched. In particular `+`, `&` and `=` keep their literal
/// meaning: the input is treated as a URL path / data URL payload rather than as
/// form data, so a form-data parser (which splits on `&`/`=` and turns `+` into a
/// space) is deliberately not used here.
///
/// Malformed escapes (such as `%zz` or a trailing `%2`) are left as they are, and
/// decoded bytes that do not form valid UTF-8 are replaced with U+FFFD.
pub fn decode_url(input: String) -> String {
    /// Numeric value of an ASCII hexadecimal digit, if `byte` is one.
    fn hex_value(byte: u8) -> Option<u8> {
        match byte {
            b'0'..=b'9' => Some(byte - b'0'),
            b'a'..=b'f' => Some(byte - b'a' + 10),
            b'A'..=b'F' => Some(byte - b'A' + 10),
            _ => None,
        }
    }

    let bytes = input.as_bytes();
    // Decoding never grows the data, so the input length is a safe upper bound
    let mut decoded: Vec<u8> = Vec::with_capacity(bytes.len());
    let mut pos = 0;

    while pos < bytes.len() {
        if bytes[pos] == b'%' {
            let high = bytes.get(pos + 1).copied().and_then(hex_value);
            let low = bytes.get(pos + 2).copied().and_then(hex_value);

            if let (Some(high), Some(low)) = (high, low) {
                decoded.push((high << 4) | low);
                pos += 3;
                continue;
            }
        }

        // Ordinary byte, or a '%' that does not start a valid escape
        decoded.push(bytes[pos]);
        pos += 1;
    }

    String::from_utf8_lossy(&decoded).into_owned()
}
pub fn retrieve_asset(
cache: &mut HashMap<String, String>,
client: &Client,
parent_url: &str,
url: &str,
as_data_url: bool,
mime: &str,
opt_silent: bool,
) -> Result<(String, String), reqwest::Error> {
if url.len() == 0 {
return Ok((str!(), str!()));
}
let cache_key = clean_url(&url);
if is_data_url(&url) {
Ok((url.to_string(), url.to_string()))
} else if is_file_url(&url) {
// Check if parent_url is also file:///
// (if not then we don't download/embed the asset)
if !is_file_url(&parent_url) {
return Ok((str!(), str!()));
}
let cutoff = if cfg!(windows) { 8 } else { 7 };
let fs_file_path: String = decode_url(url.to_string()[cutoff..].to_string());
let path = Path::new(&fs_file_path);
if path.exists() {
if !opt_silent {
eprintln!("{}", &url);
}
if as_data_url {
let data_url: String = data_to_data_url(&mime, &fs::read(&fs_file_path).unwrap());
Ok((data_url, url.to_string()))
} else {
let data: String = fs::read_to_string(&fs_file_path).expect(url);
Ok((data, url.to_string()))
}
} else {
Ok((str!(), url.to_string()))
}
} else {
if cache.contains_key(&cache_key) {
// URL is in cache
if !opt_silent {
eprintln!("{} (from cache)", &url);
}
let data = cache.get(&cache_key).unwrap();
Ok((data.to_string(), url.to_string()))
} else {
// URL not in cache, we request it
let mut response = client.get(url).send()?;
let res_url = response.url().to_string();
if !opt_silent {
if url == res_url {
eprintln!("{}", &url);
} else {
eprintln!("{} -> {}", &url, &res_url);
}
}
let new_cache_key = clean_url(&res_url);
if as_data_url {
// Convert response into a byte array
let mut data: Vec<u8> = vec![];
response.copy_to(&mut data)?;
// Attempt to obtain MIME type by reading the Content-Type header
let mimetype = if mime == "" {
response
.headers()
.get(CONTENT_TYPE)
.and_then(|header| header.to_str().ok())
.unwrap_or(&mime)
} else {
mime
};
let data_url = data_to_data_url(&mimetype, &data);
// Add to cache
cache.insert(new_cache_key, data_url.clone());
Ok((data_url, res_url))
} else {
let content = response.text().unwrap();
// Add to cache
cache.insert(new_cache_key, content.clone());
Ok((content, res_url))
}
}
}
}

Loading…
Cancel
Save