From 59a8be493df70bfe575c52fde8eb73972bcfbf77 Mon Sep 17 00:00:00 2001 From: Sunshine Date: Sun, 8 Mar 2020 15:31:42 -0400 Subject: [PATCH] add support for working with local assets --- .gitignore | 3 - .travis.yml | 4 +- src/html.rs | 15 +- src/http.rs | 68 ------- src/js.rs | 4 +- src/lib.rs | 1 - src/main.rs | 67 +++++-- src/tests/cli.rs | 228 ++++++++++++++++++++++- src/tests/data/local-file.html | 19 ++ src/tests/data/local-script.js | 2 + src/tests/data/local-style.css | 4 + src/tests/http.rs | 25 --- src/tests/mod.rs | 1 - src/tests/utils.rs | 325 +++++++++++++++++++++++++-------- src/utils.rs | 161 +++++++++++++--- 15 files changed, 694 insertions(+), 233 deletions(-) delete mode 100644 src/http.rs create mode 100644 src/tests/data/local-file.html create mode 100644 src/tests/data/local-script.js create mode 100644 src/tests/data/local-style.css delete mode 100644 src/tests/http.rs diff --git a/.gitignore b/.gitignore index 70905ef..f2e972d 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,3 @@ # These are backup files generated by rustfmt **/*.rs.bk - -# Exclude accidental HTML files -*.html diff --git a/.travis.yml b/.travis.yml index 2b6fb9a..7c0fd84 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,8 +11,8 @@ rust: - beta - nightly -services: -- docker +git: + autocrlf: false # don't mangle LF into CRLF on windows before_script: - rustup component add rustfmt diff --git a/src/html.rs b/src/html.rs index 106f264..c0e291d 100644 --- a/src/html.rs +++ b/src/html.rs @@ -1,7 +1,7 @@ -use crate::http::retrieve_asset; use crate::js::attr_is_event_handler; use crate::utils::{ - data_to_data_url, is_http_url, resolve_css_imports, resolve_url, url_has_protocol, + data_to_data_url, is_http_url, resolve_css_imports, resolve_url, retrieve_asset, + url_has_protocol, }; use html5ever::interface::QualName; use html5ever::parse_document; @@ -133,6 +133,7 @@ pub fn walk_and_embed_assets( let (favicon_data_url, _) = retrieve_asset( cache, client, + &url, &href_full_url, true, "", @@ -156,6 +157,7 @@ pub fn walk_and_embed_assets( let replacement_text = match retrieve_asset( cache, client, + &url, &href_full_url, false, "text/css", @@ -167,6 +169,7 @@ pub fn walk_and_embed_assets( client, &css_data, true, + &url, &href_full_url, opt_no_images, opt_silent, @@ -231,6 +234,7 @@ pub fn walk_and_embed_assets( retrieve_asset( cache, client, + &url, &abs_src, true, "", @@ -278,6 +282,7 @@ pub fn walk_and_embed_assets( retrieve_asset( cache, client, + &url, &abs_src, true, "", @@ -311,6 +316,7 @@ pub fn walk_and_embed_assets( let (source_data_url, _) = retrieve_asset( cache, client, + &url, &srcset_full_url, true, "", @@ -375,6 +381,7 @@ pub fn walk_and_embed_assets( let (js_data_url, _) = retrieve_asset( cache, client, + &url, &src_full_url, true, "application/javascript", @@ -401,6 +408,7 @@ pub fn walk_and_embed_assets( tendril.as_ref(), false, &url, + &url, opt_no_images, opt_silent, ); @@ -444,6 +452,7 @@ pub fn walk_and_embed_assets( let (frame_data, frame_final_url) = retrieve_asset( cache, client, + &url, &src_full_url, false, "text/html", @@ -488,6 +497,7 @@ pub fn walk_and_embed_assets( let (poster_data_url, _) = retrieve_asset( cache, client, + &url, &poster_full_url, true, "", @@ -528,6 +538,7 @@ pub fn walk_and_embed_assets( attribute.value.as_ref(), false, &url, + &url, opt_no_images, opt_silent, ); diff --git a/src/http.rs b/src/http.rs deleted file mode 100644 index d458c34..0000000 --- a/src/http.rs +++ /dev/null @@ -1,68 +0,0 @@ -use crate::utils::{clean_url, data_to_data_url, is_data_url}; -use reqwest::blocking::Client; -use reqwest::header::CONTENT_TYPE; -use std::collections::HashMap; - -pub fn retrieve_asset( - cache: &mut HashMap, - client: &Client, - url: &str, - as_data_url: bool, - mime: &str, - opt_silent: bool, -) -> Result<(String, String), reqwest::Error> { - let cache_key = clean_url(&url); - - if is_data_url(&url) { - Ok((url.to_string(), url.to_string())) - } else { - if cache.contains_key(&cache_key) { - // url is in cache - if !opt_silent { - eprintln!("{} (from cache)", &url); - } - let data = cache.get(&cache_key).unwrap(); - Ok((data.to_string(), url.to_string())) - } else { - // url not in cache, we request it - let mut response = client.get(url).send()?; - let res_url = response.url().to_string(); - - if !opt_silent { - if url == res_url { - eprintln!("{}", &url); - } else { - eprintln!("{} -> {}", &url, &res_url); - } - } - - let new_cache_key = clean_url(&res_url); - - if as_data_url { - // Convert response into a byte array - let mut data: Vec = vec![]; - response.copy_to(&mut data)?; - - // Attempt to obtain MIME type by reading the Content-Type header - let mimetype = if mime == "" { - response - .headers() - .get(CONTENT_TYPE) - .and_then(|header| header.to_str().ok()) - .unwrap_or(&mime) - } else { - mime - }; - let data_url = data_to_data_url(&mimetype, &data); - // insert in cache - cache.insert(new_cache_key, data_url.clone()); - Ok((data_url, res_url)) - } else { - let content = response.text().unwrap(); - // insert in cache - cache.insert(new_cache_key, content.clone()); - Ok((content, res_url)) - } - } - } -} diff --git a/src/js.rs b/src/js.rs index b8d9f2d..428c4ef 100644 --- a/src/js.rs +++ b/src/js.rs @@ -1,7 +1,7 @@ const JS_DOM_EVENT_ATTRS: &[&str] = &[ - // From WHATWG HTML spec 8.1.5.2 'Event handlers on elements, Document objects, and Window objects': + // From WHATWG HTML spec 8.1.5.2 "Event handlers on elements, Document objects, and Window objects": // https://html.spec.whatwg.org/#event-handlers-on-elements,-document-objects,-and-window-objects - // https://html.spec.whatwg.org/#attributes-3 (table 'List of event handler content attributes') + // https://html.spec.whatwg.org/#attributes-3 (table "List of event handler content attributes") // Global event handlers "onabort", diff --git a/src/lib.rs b/src/lib.rs index 51c768b..348c4ff 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,7 +5,6 @@ extern crate lazy_static; mod macros; pub mod html; -pub mod http; pub mod js; pub mod utils; diff --git a/src/main.rs b/src/main.rs index 83fc37e..22147bf 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,19 +6,20 @@ mod macros; use crate::args::AppArgs; use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets}; -use monolith::http::retrieve_asset; -use monolith::utils::{data_url_to_text, is_data_url, is_http_url}; +use monolith::utils::{data_url_to_text, is_data_url, is_file_url, is_http_url, retrieve_asset}; use reqwest::blocking::Client; use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT}; use std::collections::HashMap; -use std::fs::File; +use std::env; +use std::fs; use std::io::{self, Error, Write}; +use std::path::Path; use std::process; use std::time::Duration; enum Output { Stdout(io::Stdout), - File(File), + File(fs::File), } impl Output { @@ -26,7 +27,7 @@ impl Output { if file_path.is_empty() { Ok(Output::Stdout(io::stdout())) } else { - Ok(Output::File(File::create(file_path)?)) + Ok(Output::File(fs::File::create(file_path)?)) } } @@ -46,16 +47,41 @@ impl Output { fn main() { let app_args = AppArgs::get(); - let target_url: &str = app_args.url_target.as_str(); + let mut original_target: String = app_args.url_target.clone(); + let target_url: &str; let base_url; let dom; - if !is_http_url(target_url) && !is_data_url(target_url) { - eprintln!( - "Only HTTP(S) or data URLs are supported but got: {}", - &target_url - ); + // Pre-process the input + let cwd_normalized: String = + str!(env::current_dir().unwrap().to_str().unwrap()).replace("\\", "/"); + let path = Path::new(original_target.as_str()); + let path_is_relative: bool = path.is_relative(); + if original_target.clone().len() == 0 { + eprintln!("No target specified"); process::exit(1); + } else if is_http_url(original_target.clone()) || is_data_url(original_target.clone()) { + target_url = original_target.as_str(); + } else if is_file_url(original_target.clone()) { + target_url = original_target.as_str(); + } else if path.exists() { + if !path.is_file() { + eprintln!("Local target is not a file: {}", original_target); + process::exit(1); + } + original_target.insert_str(0, if cfg!(windows) { "file:///" } else { "file://" }); + original_target = original_target.replace("\\", "/"); + if path_is_relative { + original_target.insert_str(if cfg!(windows) { 8 } else { 7 }, &cwd_normalized); + original_target.insert_str( + if cfg!(windows) { 8 } else { 7 } + &cwd_normalized.len(), + "/", + ); + } + target_url = original_target.as_str(); + } else { + original_target.insert_str(0, "http://"); + target_url = original_target.as_str(); } let mut output = Output::new(&app_args.output).expect("Could not prepare output"); @@ -81,21 +107,26 @@ fn main() { .expect("Failed to initialize HTTP client"); // Retrieve root document - if is_http_url(target_url) { - let (data, final_url) = - retrieve_asset(&mut cache, &client, target_url, false, "", app_args.silent) - .expect("Could not retrieve assets in HTML"); + if is_file_url(target_url) || is_http_url(target_url) { + let (data, final_url) = retrieve_asset( + &mut cache, + &client, + target_url, + target_url, + false, + "", + app_args.silent, + ) + .expect("Could not retrieve target document"); base_url = final_url; dom = html_to_dom(&data); } else if is_data_url(target_url) { let text: String = data_url_to_text(target_url); - if text.len() == 0 { eprintln!("Unsupported data URL input"); process::exit(1); } - - base_url = str!(); + base_url = str!(target_url); dom = html_to_dom(&text); } else { process::exit(1); diff --git a/src/tests/cli.rs b/src/tests/cli.rs index 9a6aa82..d0d0b90 100644 --- a/src/tests/cli.rs +++ b/src/tests/cli.rs @@ -1,4 +1,5 @@ use assert_cmd::prelude::*; +use std::env; use std::process::Command; #[test] @@ -22,9 +23,9 @@ fn print_version() -> Result<(), Box> { } #[test] -fn bad_input() -> Result<(), Box> { +fn bad_input_empty_target() -> Result<(), Box> { let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; - let out = cmd.arg("kernel.org").output().unwrap(); + let out = cmd.arg("").output().unwrap(); // STDOUT should be empty assert_eq!(std::str::from_utf8(&out.stdout).unwrap(), ""); @@ -32,7 +33,7 @@ fn bad_input() -> Result<(), Box> { // STDERR should contain error description assert_eq!( std::str::from_utf8(&out.stderr).unwrap(), - "Only HTTP(S) or data URLs are supported but got: kernel.org\n" + "No target specified\n" ); // The exit code should be 1 @@ -73,7 +74,9 @@ fn isolate_data_url() -> Result<(), Box> { // STDOUT should contain isolated HTML assert_eq!( std::str::from_utf8(&out.stdout).unwrap(), - "Hello, World!\n" + "\ +\ +Hello, World!\n" ); // STDERR should be empty @@ -97,7 +100,10 @@ fn remove_css_from_data_url() -> Result<(), Box> { // STDOUT should contain HTML with no CSS assert_eq!( std::str::from_utf8(&out.stdout).unwrap(), - "Hello\n" + "\ +\ +\ +Hello\n" ); // STDERR should be empty @@ -121,7 +127,9 @@ fn remove_frames_from_data_url() -> Result<(), Box> { // STDOUT should contain HTML with no iframes assert_eq!( std::str::from_utf8(&out.stdout).unwrap(), - "Hi\n" + "\ +\ +Hi\n" ); // STDERR should be empty @@ -145,7 +153,15 @@ fn remove_images_from_data_url() -> Result<(), Box> { // STDOUT should contain HTML with no images assert_eq!( std::str::from_utf8(&out.stdout).unwrap(), - "Hi\n" + "\ +\ +\ +\ +\ +\ +Hi\ +\ +\n" ); // STDERR should be empty @@ -169,7 +185,203 @@ fn remove_js_from_data_url() -> Result<(), Box> { // STDOUT should contain HTML with no JS assert_eq!( std::str::from_utf8(&out.stdout).unwrap(), - "Hi\n" + "\ +\ +\ +\ +Hi\ +\n" + ); + + // STDERR should be empty + assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); + + // The exit code should be 0 + out.assert().code(0); + + Ok(()) +} + +#[test] +fn local_file_target_input() -> Result<(), Box> { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; + let cwd_normalized: String = + str!(env::current_dir().unwrap().to_str().unwrap()).replace("\\", "/"); + let out = cmd + .arg(if cfg!(windows) { + "src\\tests\\data\\local-file.html" + } else { + "src/tests/data/local-file.html" + }) + .output() + .unwrap(); + let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" }; + + // STDOUT should contain HTML from the local file + assert_eq!( + std::str::from_utf8(&out.stdout).unwrap(), + "\n \ +\n \ +Local HTML file\n \ +\n \ +\n\n\n\n \ +\"\"\n \ +Tricky href\n \ +Remote URL\n \ +\n\n\n\n\ +\n" + ); + + // STDERR should contain list of retrieved file URLs + assert_eq!( + std::str::from_utf8(&out.stderr).unwrap(), + format!( + "{file}{cwd}/src/tests/data/local-file.html\n\ +{file}{cwd}/src/tests/data/local-style.css\n\ +{file}{cwd}/src/tests/data/local-script.js\n", + file = file_url_protocol, + cwd = cwd_normalized + ) + ); + + // The exit code should be 0 + out.assert().code(0); + + Ok(()) +} + +#[test] +fn local_file_target_input_absolute_target_path() -> Result<(), Box> { + let cwd = env::current_dir().unwrap(); + let cwd_normalized: String = + str!(env::current_dir().unwrap().to_str().unwrap()).replace("\\", "/"); + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; + let out = cmd + .arg("-jciI") + .arg(if cfg!(windows) { + format!( + "{cwd}\\src\\tests\\data\\local-file.html", + cwd = cwd.to_str().unwrap() + ) + } else { + format!( + "{cwd}/src/tests/data/local-file.html", + cwd = cwd.to_str().unwrap() + ) + }) + .output() + .unwrap(); + let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" }; + + // STDOUT should contain HTML from the local file + assert_eq!( + std::str::from_utf8(&out.stdout).unwrap(), + "\ +\n \ +\n \ +Local HTML file\n \ +\n \ +\n\n\n\n \ +\"\"\n \ +Tricky href\n \ +Remote URL\n \ +\n\n\n\n\ +\n" + ); + + // STDERR should contain only the target file + let cwd = env::current_dir().unwrap(); + assert_eq!( + std::str::from_utf8(&out.stderr).unwrap(), + format!( + "{file}{cwd}/src/tests/data/local-file.html\n", + file = file_url_protocol, + cwd = cwd_normalized, + ) + ); + + // The exit code should be 0 + out.assert().code(0); + + Ok(()) +} + +#[test] +fn local_file_url_target_input() -> Result<(), Box> { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; + let cwd = env::current_dir().unwrap(); + let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" }; + let out = cmd + .arg("-cji") + .arg(if cfg!(windows) { + format!( + "{file}{cwd}\\src\\tests\\data\\local-file.html", + file = file_url_protocol, + cwd = cwd.to_str().unwrap(), + ) + } else { + format!( + "{file}{cwd}/src/tests/data/local-file.html", + file = file_url_protocol, + cwd = cwd.to_str().unwrap(), + ) + }) + .output() + .unwrap(); + + // STDOUT should contain HTML from the local file + assert_eq!( + std::str::from_utf8(&out.stdout).unwrap(), + "\ +\n \ +\n \ +Local HTML file\n \ +\n \ +\n\n\n\n \ +\"\"\n \ +Tricky href\n \ +Remote URL\n \ +\n\n\n\n\ +\n" + ); + + // STDERR should contain list of retrieved file URLs + assert_eq!( + std::str::from_utf8(&out.stderr).unwrap(), + if cfg!(windows) { + format!( + "{file}{cwd}\\src\\tests\\data\\local-file.html\n", + file = file_url_protocol, + cwd = cwd.to_str().unwrap(), + ) + } else { + format!( + "{file}{cwd}/src/tests/data/local-file.html\n", + file = file_url_protocol, + cwd = cwd.to_str().unwrap(), + ) + } + ); + + // The exit code should be 0 + out.assert().code(0); + + Ok(()) +} + +#[test] +fn security_disallow_local_assets_within_data_url_targets() -> Result<(), Box> +{ + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; + let out = cmd + .arg("data:text/html,%3Cscript%20src=\"src/tests/data/local-script.js\"%3E%3C/script%3E") + .output() + .unwrap(); + + // STDOUT should contain HTML with no JS in it + assert_eq!( + std::str::from_utf8(&out.stdout).unwrap(), + "\n" ); // STDERR should be empty diff --git a/src/tests/data/local-file.html b/src/tests/data/local-file.html new file mode 100644 index 0000000..e74e9c5 --- /dev/null +++ b/src/tests/data/local-file.html @@ -0,0 +1,19 @@ + + + + + + + Local HTML file + + + + + + + Tricky href + Remote URL + + + + diff --git a/src/tests/data/local-script.js b/src/tests/data/local-script.js new file mode 100644 index 0000000..cf2d4cb --- /dev/null +++ b/src/tests/data/local-script.js @@ -0,0 +1,2 @@ +document.body.style.backgroundColor = "green"; +document.body.style.color = "red"; diff --git a/src/tests/data/local-style.css b/src/tests/data/local-style.css new file mode 100644 index 0000000..28142a9 --- /dev/null +++ b/src/tests/data/local-style.css @@ -0,0 +1,4 @@ +body { + background-color: #000; + color: #fff; +} diff --git a/src/tests/http.rs b/src/tests/http.rs deleted file mode 100644 index a3fec5f..0000000 --- a/src/tests/http.rs +++ /dev/null @@ -1,25 +0,0 @@ -use crate::http::retrieve_asset; -use reqwest::blocking::Client; -use std::collections::HashMap; - -#[test] -fn test_retrieve_asset() { - let cache = &mut HashMap::new(); - let client = Client::new(); - let (data, final_url) = - retrieve_asset(cache, &client, "data:text/html;base64,...", true, "", false).unwrap(); - assert_eq!(&data, "data:text/html;base64,..."); - assert_eq!(&final_url, "data:text/html;base64,..."); - - let (data, final_url) = retrieve_asset( - cache, - &client, - "data:text/html;base64,...", - true, - "image/png", - false, - ) - .unwrap(); - assert_eq!(&data, "data:text/html;base64,..."); - assert_eq!(&final_url, "data:text/html;base64,..."); -} diff --git a/src/tests/mod.rs b/src/tests/mod.rs index 0051cfc..6b77599 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -1,5 +1,4 @@ mod cli; mod html; -mod http; mod js; mod utils; diff --git a/src/tests/utils.rs b/src/tests/utils.rs index f1169c6..fece56c 100644 --- a/src/tests/utils.rs +++ b/src/tests/utils.rs @@ -1,14 +1,14 @@ -use crate::utils::{ - clean_url, data_to_data_url, data_url_to_text, detect_mimetype, is_data_url, is_http_url, - resolve_url, url_has_protocol, -}; +use crate::utils; +use reqwest::blocking::Client; +use std::collections::HashMap; +use std::env; use url::ParseError; #[test] -fn test_data_to_data_url() { +fn data_to_data_url() { let mime = "application/javascript"; let data = "var word = 'hello';\nalert(word);\n"; - let datauri = data_to_data_url(mime, data.as_bytes()); + let datauri = utils::data_to_data_url(mime, data.as_bytes()); assert_eq!( &datauri, "data:application/javascript;base64,dmFyIHdvcmQgPSAnaGVsbG8nOwphbGVydCh3b3JkKTsK" @@ -16,90 +16,124 @@ fn test_data_to_data_url() { } #[test] -fn test_detect_mimetype() { - // image - assert_eq!(detect_mimetype(b"GIF87a"), "image/gif"); - assert_eq!(detect_mimetype(b"GIF89a"), "image/gif"); - assert_eq!(detect_mimetype(b"\xFF\xD8\xFF"), "image/jpeg"); - assert_eq!(detect_mimetype(b"\x89PNG\x0D\x0A\x1A\x0A"), "image/png"); - assert_eq!(detect_mimetype(b" Result<(), ParseError> { - let resolved_url = resolve_url("https://www.kernel.org", "../category/signatures.html")?; +fn resolve_url() -> Result<(), ParseError> { + let resolved_url = utils::resolve_url("https://www.kernel.org", "../category/signatures.html")?; assert_eq!( resolved_url.as_str(), "https://www.kernel.org/category/signatures.html" ); - let resolved_url = resolve_url("https://www.kernel.org", "category/signatures.html")?; + let resolved_url = utils::resolve_url("https://www.kernel.org", "category/signatures.html")?; assert_eq!( resolved_url.as_str(), "https://www.kernel.org/category/signatures.html" ); - let resolved_url = resolve_url( + let resolved_url = utils::resolve_url( "saved_page.htm", "https://www.kernel.org/category/signatures.html", )?; @@ -108,7 +142,7 @@ fn test_resolve_url() -> Result<(), ParseError> { "https://www.kernel.org/category/signatures.html" ); - let resolved_url = resolve_url( + let resolved_url = utils::resolve_url( "https://www.kernel.org", "//www.kernel.org/theme/images/logos/tux.png", )?; @@ -117,7 +151,7 @@ fn test_resolve_url() -> Result<(), ParseError> { "https://www.kernel.org/theme/images/logos/tux.png" ); - let resolved_url = resolve_url( + let resolved_url = utils::resolve_url( "https://www.kernel.org", "//another-host.org/theme/images/logos/tux.png", )?; @@ -126,7 +160,7 @@ fn test_resolve_url() -> Result<(), ParseError> { "https://another-host.org/theme/images/logos/tux.png" ); - let resolved_url = resolve_url( + let resolved_url = utils::resolve_url( "https://www.kernel.org/category/signatures.html", "/theme/images/logos/tux.png", )?; @@ -135,7 +169,7 @@ fn test_resolve_url() -> Result<(), ParseError> { "https://www.kernel.org/theme/images/logos/tux.png" ); - let resolved_url = resolve_url( + let resolved_url = utils::resolve_url( "https://www.w3schools.com/html/html_iframe.asp", "default.asp", )?; @@ -144,7 +178,7 @@ fn test_resolve_url() -> Result<(), ParseError> { "https://www.w3schools.com/html/default.asp" ); - let resolved_url = resolve_url( + let resolved_url = utils::resolve_url( "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h", "https://www.kernel.org/category/signatures.html", )?; @@ -153,62 +187,197 @@ fn test_resolve_url() -> Result<(), ParseError> { "https://www.kernel.org/category/signatures.html" ); - let resolved_url = resolve_url( + let resolved_url = utils::resolve_url( "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h", "//www.w3schools.com/html/html_iframe.asp", ) .unwrap_or(str!()); assert_eq!(resolved_url.as_str(), ""); + let resolved_url = utils::resolve_url( + "file:///home/user/Websites/my-website/index.html", + "assets/images/logo.png", + ) + .unwrap_or(str!()); + assert_eq!( + resolved_url.as_str(), + "file:///home/user/Websites/my-website/assets/images/logo.png" + ); + + let resolved_url = utils::resolve_url( + "file:\\\\\\home\\user\\Websites\\my-website\\index.html", + "assets\\images\\logo.png", + ) + .unwrap_or(str!()); + assert_eq!( + resolved_url.as_str(), + "file:///home/user/Websites/my-website/assets/images/logo.png" + ); + Ok(()) } #[test] -fn test_is_data_url() { - // passing - assert!(is_data_url( +fn is_data_url() { + // Passing + assert!(utils::is_data_url( "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h" )); - // failing - assert!(!is_data_url("https://kernel.org")); - assert!(!is_data_url("//kernel.org")); - assert!(!is_data_url("")); + + // Failing + assert!(!utils::is_data_url("https://kernel.org")); + assert!(!utils::is_data_url("//kernel.org")); + assert!(!utils::is_data_url("")); } #[test] -fn test_clean_url() { +fn clean_url() { assert_eq!( - clean_url("https://somewhere.com/font.eot#iefix"), + utils::clean_url("https://somewhere.com/font.eot#iefix"), "https://somewhere.com/font.eot" ); assert_eq!( - clean_url("https://somewhere.com/font.eot#"), + utils::clean_url("https://somewhere.com/font.eot#"), "https://somewhere.com/font.eot" ); assert_eq!( - clean_url("https://somewhere.com/font.eot?#"), + utils::clean_url("https://somewhere.com/font.eot?#"), "https://somewhere.com/font.eot" ); } #[test] -fn test_data_url_to_text() { +fn data_url_to_text() { assert_eq!( - data_url_to_text("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg=="), + utils::data_url_to_text("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg=="), "Work expands so as to fill the time available for its completion" ); assert_eq!( - data_url_to_text( + utils::data_url_to_text( "data:text/html;utf8,Work expands so as to fill the time available for its completion" ), "Work expands so as to fill the time available for its completion" ); assert_eq!( - data_url_to_text( + utils::data_url_to_text( "data:text/html,Work expands so as to fill the time available for its completion" ), "Work expands so as to fill the time available for its completion" ); + + assert_eq!( + utils::data_url_to_text( + " data:text/html;charset=utf-8,Work expands so as to fill the time available for its completion " + ), + "Work expands so as to fill the time available for its completion" + ); +} + +#[test] +fn decode_url() { + assert_eq!( + utils::decode_url(str!( + "%E6%A4%9C%E3%83%92%E3%83%A0%E8%A7%A3%E5%A1%97%E3%82%83%E3%83%83%20%3D%20%E3%82%B5" + )), + "検ヒム解塗ゃッ = サ" + ); + + assert_eq!(utils::decode_url(str!("%20 %20")), " "); +} + +#[test] +fn retrieve_asset() { + let cache = &mut HashMap::new(); + let client = Client::new(); + + let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" }; + + // If both source and target are data URLs, + // ensure the result contains target data URL + let (data, final_url) = utils::retrieve_asset( + cache, + &client, + "data:text/html;base64,SoUrCe", + "data:text/html;base64,TaRgEt", + true, + "", + false, + ) + .unwrap(); + assert_eq!(&data, "data:text/html;base64,TaRgEt"); + assert_eq!(&final_url, "data:text/html;base64,TaRgEt"); + + // Media type parameter should not influence data URLs + let (data, final_url) = utils::retrieve_asset( + cache, + &client, + "data:text/html;base64,SoUrCe", + "data:text/html;base64,TaRgEt", + true, + "image/png", + false, + ) + .unwrap(); + assert_eq!(&data, "data:text/html;base64,TaRgEt"); + assert_eq!(&final_url, "data:text/html;base64,TaRgEt"); + + // Inclusion of local assets from data URL sources should not be allowed + let (data, final_url) = utils::retrieve_asset( + cache, + &client, + "data:text/html;base64,SoUrCe", + "file:///etc/passwd", + true, + "", + false, + ) + .unwrap(); + assert_eq!(&data, ""); + assert_eq!(&final_url, ""); + + // Inclusion of local assets from remote sources should not be allowed + let (data, final_url) = utils::retrieve_asset( + cache, + &client, + "https://kernel.org/", + "file:///etc/passwd", + true, + "", + false, + ) + .unwrap(); + assert_eq!(&data, ""); + assert_eq!(&final_url, ""); + + // Inclusion of local assets from local sources should be allowed + let cwd = env::current_dir().unwrap(); + let (data, final_url) = utils::retrieve_asset( + cache, + &client, + &format!( + "{file}{cwd}/src/tests/data/local-file.html", + file = file_url_protocol, + cwd = cwd.to_str().unwrap() + ), + &format!( + "{file}{cwd}/src/tests/data/local-script.js", + file = file_url_protocol, + cwd = cwd.to_str().unwrap() + ), + true, + "application/javascript", + false, + ) + .unwrap(); + assert_eq!(&data, "data:application/javascript;base64,ZG9jdW1lbnQuYm9keS5zdHlsZS5iYWNrZ3JvdW5kQ29sb3IgPSAiZ3JlZW4iOwpkb2N1bWVudC5ib2R5LnN0eWxlLmNvbG9yID0gInJlZCI7Cg=="); + assert_eq!( + &final_url, + &format!( + "{file}{cwd}/src/tests/data/local-script.js", + file = file_url_protocol, + cwd = cwd.to_str().unwrap() + ) + ); } diff --git a/src/utils.rs b/src/utils.rs index c04f9e9..fb38116 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,8 +1,10 @@ -use crate::http::retrieve_asset; -use base64::{decode, encode}; +use base64; use regex::Regex; use reqwest::blocking::Client; +use reqwest::header::CONTENT_TYPE; use std::collections::HashMap; +use std::fs; +use std::path::Path; use url::{form_urlencoded, ParseError, Url}; /// This monster of a regex is used to match any kind of URL found in CSS. @@ -71,7 +73,7 @@ pub fn data_to_data_url(mime: &str, data: &[u8]) -> String { } else { mime.to_string() }; - format!("data:{};base64,{}", mimetype, encode(data)) + format!("data:{};base64,{}", mimetype, base64::encode(data)) } pub fn detect_mimetype(data: &[u8]) -> String { @@ -95,6 +97,12 @@ pub fn is_data_url>(url: T) -> bool { .unwrap_or(false) } +pub fn is_file_url>(url: T) -> bool { + Url::parse(url.as_ref()) + .and_then(|u| Ok(u.scheme() == "file")) + .unwrap_or(false) +} + pub fn is_http_url>(url: T) -> bool { Url::parse(url.as_ref()) .and_then(|u| Ok(u.scheme() == "http" || u.scheme() == "https")) @@ -118,6 +126,7 @@ pub fn resolve_css_imports( client: &Client, css_string: &str, as_data_url: bool, + parent_url: &str, href: &str, opt_no_images: bool, opt_silent: bool, @@ -127,12 +136,12 @@ pub fn resolve_css_imports( for link in REGEX_CSS_URL.captures_iter(&css_string) { let target_link = link.name("url").unwrap().as_str(); - // Determine the type of link + // Determine linked asset type let is_stylesheet = link.name("stylesheet").is_some(); let is_font = link.name("font").is_some(); let is_image = !is_stylesheet && !is_font; - // Generate absolute URL for content + // Generate absolute URL for the content let embedded_url = match resolve_url(href, target_link) { Ok(url) => url, Err(_) => continue, // Malformed URL @@ -144,8 +153,9 @@ pub fn resolve_css_imports( retrieve_asset( cache, client, + &parent_url, &embedded_url, - false, // Formating as data URL will be done later + false, // Formatting as data URL will be done later "text/css", // Expect CSS opt_silent, ) @@ -155,6 +165,7 @@ pub fn resolve_css_imports( client, &content, true, // Finally, convert to a data URL + &parent_url, &embedded_url, opt_no_images, opt_silent, @@ -165,6 +176,7 @@ pub fn resolve_css_imports( retrieve_asset( cache, client, + &parent_url, &embedded_url, true, // Format as data URL "", // Unknown MIME type @@ -186,10 +198,11 @@ pub fn resolve_css_imports( let replacement = format!("\"{}\"", &content); let dest = link.name("to_repl").unwrap(); - let offset = resolved_css.len() - css_string.len(); - let target_range = (dest.start() + offset)..(dest.end() + offset); - - resolved_css.replace_range(target_range, &replacement); + if resolved_css.len() > css_string.len() { + let offset = resolved_css.len() - css_string.len(); + let target_range = (dest.start() + offset)..(dest.end() + offset); + resolved_css.replace_range(target_range, &replacement); + } } if as_data_url { @@ -222,20 +235,7 @@ pub fn data_url_to_text>(url: T) -> String { let meta_data: String = path.chars().take(comma_loc).collect(); let raw_data: String = path.chars().skip(comma_loc + 1).collect(); - let data: String = form_urlencoded::parse(raw_data.as_bytes()) - .map(|(key, val)| { - [ - key.to_string(), - if val.to_string().len() == 0 { - str!() - } else { - str!('=') - }, - val.to_string(), - ] - .concat() - }) - .collect(); + let data: String = decode_url(raw_data); let meta_data_items: Vec<&str> = meta_data.split(';').collect(); let mut mime_type: &str = ""; @@ -259,7 +259,7 @@ pub fn data_url_to_text>(url: T) -> String { if mime_type.eq_ignore_ascii_case("text/html") { if encoding.eq_ignore_ascii_case("base64") { - String::from_utf8(decode(&data).unwrap_or(vec![])).unwrap_or(str!()) + String::from_utf8(base64::decode(&data).unwrap_or(vec![])).unwrap_or(str!()) } else { data } @@ -267,3 +267,114 @@ pub fn data_url_to_text>(url: T) -> String { str!() } } + +pub fn decode_url(input: String) -> String { + form_urlencoded::parse(input.as_bytes()) + .map(|(key, val)| { + [ + key.to_string(), + if val.to_string().len() == 0 { + str!() + } else { + str!('=') + }, + val.to_string(), + ] + .concat() + }) + .collect() +} + +pub fn retrieve_asset( + cache: &mut HashMap, + client: &Client, + parent_url: &str, + url: &str, + as_data_url: bool, + mime: &str, + opt_silent: bool, +) -> Result<(String, String), reqwest::Error> { + if url.len() == 0 { + return Ok((str!(), str!())); + } + + let cache_key = clean_url(&url); + + if is_data_url(&url) { + Ok((url.to_string(), url.to_string())) + } else if is_file_url(&url) { + // Check if parent_url is also file:/// + // (if not then we don't download/embed the asset) + if !is_file_url(&parent_url) { + return Ok((str!(), str!())); + } + + let cutoff = if cfg!(windows) { 8 } else { 7 }; + let fs_file_path: String = decode_url(url.to_string()[cutoff..].to_string()); + let path = Path::new(&fs_file_path); + if path.exists() { + if !opt_silent { + eprintln!("{}", &url); + } + + if as_data_url { + let data_url: String = data_to_data_url(&mime, &fs::read(&fs_file_path).unwrap()); + Ok((data_url, url.to_string())) + } else { + let data: String = fs::read_to_string(&fs_file_path).expect(url); + Ok((data, url.to_string())) + } + } else { + Ok((str!(), url.to_string())) + } + } else { + if cache.contains_key(&cache_key) { + // URL is in cache + if !opt_silent { + eprintln!("{} (from cache)", &url); + } + let data = cache.get(&cache_key).unwrap(); + Ok((data.to_string(), url.to_string())) + } else { + // URL not in cache, we request it + let mut response = client.get(url).send()?; + let res_url = response.url().to_string(); + + if !opt_silent { + if url == res_url { + eprintln!("{}", &url); + } else { + eprintln!("{} -> {}", &url, &res_url); + } + } + + let new_cache_key = clean_url(&res_url); + + if as_data_url { + // Convert response into a byte array + let mut data: Vec = vec![]; + response.copy_to(&mut data)?; + + // Attempt to obtain MIME type by reading the Content-Type header + let mimetype = if mime == "" { + response + .headers() + .get(CONTENT_TYPE) + .and_then(|header| header.to_str().ok()) + .unwrap_or(&mime) + } else { + mime + }; + let data_url = data_to_data_url(&mimetype, &data); + // Add to cache + cache.insert(new_cache_key, data_url.clone()); + Ok((data_url, res_url)) + } else { + let content = response.text().unwrap(); + // Add to cache + cache.insert(new_cache_key, content.clone()); + Ok((content, res_url)) + } + } + } +}