From a6e891b3c5f1ed2b751bda0672019120e9ab1d30 Mon Sep 17 00:00:00 2001 From: Sunshine Date: Wed, 2 Jun 2021 03:41:41 -1000 Subject: [PATCH] add more tests --- README.md | 16 +- src/html.rs | 107 +++++----- src/tests/cli/base_url.rs | 2 +- src/tests/cli/data_url.rs | 50 +++-- src/tests/cli/noscript.rs | 32 ++- src/tests/cli/unusual_encodings.rs | 9 +- src/tests/html/walk_and_embed_assets.rs | 272 ++++++++++++++++++------ 7 files changed, 344 insertions(+), 144 deletions(-) diff --git a/README.md b/README.md index 8d07184..c6fafcd 100644 --- a/README.md +++ b/README.md @@ -79,11 +79,11 @@ or - `-j`: Exclude JavaScript - `-k`: Accept invalid X.509 (TLS) certificates - `-M`: Don't add timestamp and URL information - - `-n`: Extract contents of NOSCRIPT tags + - `-n`: Extract contents of NOSCRIPT elements - `-o`: Write output to `file` - `-s`: Be quiet - `-t`: Adjust `network request timeout` - - `-u`: Provide `custom User-Agent` + - `-u`: Provide custom `User-Agent` - `-v`: Exclude videos --------------------------------------------------- @@ -99,19 +99,15 @@ Please open an issue if something is wrong, that helps make this project better. --------------------------------------------------- ## Related projects - - `Monolith Chrome Extension`: https://github.com/rhysd/monolith-of-web - - `Pagesaver`: https://github.com/distributed-mind/pagesaver - - `Personal WayBack Machine`: https://github.com/popey/pwbm - - `Hako`: https://github.com/dmpop/hako + - Monolith Chrome Extension: https://github.com/rhysd/monolith-of-web + - Pagesaver: https://github.com/distributed-mind/pagesaver + - Personal WayBack Machine: https://github.com/popey/pwbm + - Hako: https://github.com/dmpop/hako --------------------------------------------------- ## License - - CC0-1.0 - -
To the extent possible under law, the author(s) have dedicated all copyright related and neighboring rights to this software to the public domain worldwide. This software is distributed without any warranty. diff --git a/src/html.rs b/src/html.rs index a1524e2..836f5cc 100644 --- a/src/html.rs +++ b/src/html.rs @@ -474,8 +474,9 @@ pub fn stringify_document(handle: &Handle, options: &Options) -> String { result = String::from_utf8(buf).unwrap(); } + // Unwrap NOSCRIPT elements if options.unwrap_noscript { - let noscript_re = Regex::new(r"<(?P/?noscript)>").unwrap(); + let noscript_re = Regex::new(r"<(?P/?noscript[^>]*)>").unwrap(); result = noscript_re.replace_all(&result, "").to_string(); } @@ -503,44 +504,39 @@ pub fn retrieve_and_embed_asset( depth + 1, ) { Ok((data, final_url, mut media_type)) => { - // Check integrity if it's a LINK or SCRIPT tag let node_name: &str = get_node_name(&node).unwrap(); - let mut ok_to_include: bool = true; + // Check integrity if it's a LINK or SCRIPT element + let mut ok_to_include: bool = true; if node_name == "link" || node_name == "script" { - let node_integrity_attr_value: Option = get_node_attr(node, "integrity"); - // Check integrity - if let Some(node_integrity_attr_value) = node_integrity_attr_value { + if let Some(node_integrity_attr_value) = get_node_attr(node, "integrity") { if !node_integrity_attr_value.is_empty() { ok_to_include = check_integrity(&data, &node_integrity_attr_value); } - } - // Wipe integrity attribute - set_node_attr(node, "integrity", None); + // Wipe the integrity attribute + set_node_attr(node, "integrity", None); + } } if ok_to_include { - if node_name == "link" { - let link_type: &str = determine_link_node_type(node); - // CSS LINK nodes requires special treatment - if link_type == "stylesheet" { - let css: String = embed_css( - cache, - client, - &final_url, - &String::from_utf8_lossy(&data), - options, - depth + 1, - ); - let css_data_url = create_data_url("text/css", css.as_bytes(), &final_url); - - set_node_attr(&node, attr_name, Some(css_data_url.to_string())); + if node_name == "link" && determine_link_node_type(node) == "stylesheet" { + // Stylesheet LINK elements require special treatment + let css: String = embed_css( + cache, + client, + &final_url, + &String::from_utf8_lossy(&data), + options, + depth + 1, + ); - return; // Do not fall through - } + // Create and embed data URL + let css_data_url = create_data_url("text/css", css.as_bytes(), &final_url); + set_node_attr(&node, attr_name, Some(css_data_url.to_string())); } else if node_name == "frame" || node_name == "iframe" { + // (I)FRAMEs are also quite different from conventional resources let frame_dom = html_to_dom(&String::from_utf8_lossy(&data)); walk_and_embed_assets( cache, @@ -559,30 +555,38 @@ pub fn retrieve_and_embed_asset( ) .unwrap(); + // Create and embed data URL let mut frame_data_url = create_data_url(&media_type, &frame_data, &final_url); - frame_data_url.set_fragment(resolved_url.fragment()); - set_node_attr(node, attr_name, Some(frame_data_url.to_string())); + } else { + // Every other type of element gets processed here + + // Parse media type for SCRIPT elements + if node_name == "script" { + if let Some(_) = get_node_attr(node, "src") { + if let Some(script_node_type_attr_value) = get_node_attr(node, "type") { + media_type = script_node_type_attr_value.to_string(); + } else { + // Fallback to default one if it's not specified + media_type = "application/javascript".to_string(); + } + } + } - return; // Do not fall through - } - - // Everything else - if node_name == "script" { - media_type = "application/javascript".to_string(); + // Create and embed data URL + let mut data_url = create_data_url(&media_type, &data, &final_url); + data_url.set_fragment(resolved_url.fragment()); + set_node_attr(node, attr_name, Some(data_url.to_string())); } - let mut data_url = create_data_url(&media_type, &data, &final_url); - data_url.set_fragment(resolved_url.fragment()); - set_node_attr(node, attr_name, Some(data_url.to_string())); } } Err(_) => { if resolved_url.scheme() == "http" || resolved_url.scheme() == "https" { - // Keep remote reference if unable to retrieve the asset + // Keep remote references if unable to retrieve the asset set_node_attr(node, attr_name, Some(resolved_url.to_string())); } else { - // Exclude non-remote URLs + // Remove local references if they can't be successfully embedded as data URLs set_node_attr(node, attr_name, None); } } @@ -645,7 +649,7 @@ pub fn walk_and_embed_assets( let link_type: &str = determine_link_node_type(node); if link_type == "icon" { - // Find and resolve this LINK node's href attribute + // Find and resolve LINK's href attribute if let Some(link_attr_href_value) = get_node_attr(node, "href") { if !options.no_images && !link_attr_href_value.is_empty() { retrieve_and_embed_asset( @@ -663,10 +667,12 @@ pub fn walk_and_embed_assets( } } } else if link_type == "stylesheet" { - // Find and resolve this LINK node's href attribute + // Resolve LINK's href attribute if let Some(link_attr_href_value) = get_node_attr(node, "href") { if options.no_css { set_node_attr(node, "href", None); + // Wipe integrity attribute + set_node_attr(node, "integrity", None); } else { if !link_attr_href_value.is_empty() { retrieve_and_embed_asset( @@ -916,14 +922,15 @@ pub fn walk_and_embed_assets( // Replace with empty JS call to preserve original behavior set_node_attr(node, "href", Some(str!("javascript:;"))); } - } else if anchor_attr_href_value.clone().starts_with('#') - || is_url_and_has_protocol(&anchor_attr_href_value.clone()) - { - // Don't touch mailto: links or hrefs which begin with a hash sign } else { - let href_full_url: Url = - resolve_url(document_url, &anchor_attr_href_value); - set_node_attr(node, "href", Some(href_full_url.to_string())); + // Don't touch mailto: links or hrefs which begin with a hash sign + if !anchor_attr_href_value.clone().starts_with('#') + && !is_url_and_has_protocol(&anchor_attr_href_value.clone()) + { + let href_full_url: Url = + resolve_url(document_url, &anchor_attr_href_value); + set_node_attr(node, "href", Some(href_full_url.to_string())); + } } } } @@ -937,6 +944,8 @@ pub fn walk_and_embed_assets( // Remove src attribute if script_attr_src != None { set_node_attr(node, "src", None); + // Wipe integrity attribute + set_node_attr(node, "integrity", None); } } else if !script_attr_src.clone().unwrap_or_default().is_empty() { retrieve_and_embed_asset( @@ -1081,7 +1090,7 @@ pub fn walk_and_embed_assets( ); // Get rid of original contents noscript_contents.clear(); - // Insert HTML containing embedded assets back into NOSCRIPT node + // Insert HTML containing embedded assets into NOSCRIPT node if let Some(html) = get_child_node_by_name(&noscript_contents_dom.document, "html") { diff --git a/src/tests/cli/base_url.rs b/src/tests/cli/base_url.rs index 9b07bf7..192e1e7 100644 --- a/src/tests/cli/base_url.rs +++ b/src/tests/cli/base_url.rs @@ -88,7 +88,7 @@ mod passing { } #[test] - fn remove_existing_when_empty_provided() { + fn set_existing_to_empty_when_empty_provided() { let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); let out = cmd .arg("-M") diff --git a/src/tests/cli/data_url.rs b/src/tests/cli/data_url.rs index 62e5bbe..280ed40 100644 --- a/src/tests/cli/data_url.rs +++ b/src/tests/cli/data_url.rs @@ -11,24 +11,6 @@ mod passing { use std::env; use std::process::Command; - #[test] - fn bad_input_data_url() { - let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); - let out = cmd.arg("data:,Hello%2C%20World!").output().unwrap(); - - // STDOUT should contain HTML - assert_eq!(std::str::from_utf8(&out.stdout).unwrap(), ""); - - // STDERR should contain error description - assert_eq!( - std::str::from_utf8(&out.stderr).unwrap(), - "Unsupported data URL media type\n" - ); - - // The exit code should be 1 - out.assert().code(1); - } - #[test] fn isolate_data_url() { let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); @@ -192,6 +174,38 @@ mod passing { // The exit code should be 0 out.assert().code(0); } +} + +// ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗ +// ██╔════╝██╔══██╗██║██║ ██║████╗ ██║██╔════╝ +// █████╗ ███████║██║██║ ██║██╔██╗ ██║██║ ███╗ +// ██╔══╝ ██╔══██║██║██║ ██║██║╚██╗██║██║ ██║ +// ██║ ██║ ██║██║███████╗██║██║ ╚████║╚██████╔╝ +// ╚═╝ ╚═╝ ╚═╝╚═╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ + +#[cfg(test)] +mod failing { + use assert_cmd::prelude::*; + use std::env; + use std::process::Command; + + #[test] + fn bad_input_data_url() { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + let out = cmd.arg("data:,Hello%2C%20World!").output().unwrap(); + + // STDOUT should contain HTML + assert_eq!(std::str::from_utf8(&out.stdout).unwrap(), ""); + + // STDERR should contain error description + assert_eq!( + std::str::from_utf8(&out.stderr).unwrap(), + "Unsupported data URL media type\n" + ); + + // The exit code should be 1 + out.assert().code(1); + } #[test] fn security_disallow_local_assets_within_data_url_targets() { diff --git a/src/tests/cli/noscript.rs b/src/tests/cli/noscript.rs index 7ba93ce..19ab674 100644 --- a/src/tests/cli/noscript.rs +++ b/src/tests/cli/noscript.rs @@ -130,7 +130,14 @@ mod passing { // STDOUT should contain HTML with no CSS assert_eq!( std::str::from_utf8(&out.stdout).unwrap(), - "\n\n" + "\ + \ + \ + \ + \ + \n\ + \ + \n" ); // STDERR should contain target HTML and embedded SVG files @@ -153,4 +160,27 @@ mod passing { // The exit code should be 0 out.assert().code(0); } + + #[test] + fn unwrap_noscript_contents_attr_data_url() { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + let out = cmd + .arg("-M") + .arg("-n") + .arg("data:text/html,") + .output() + .unwrap(); + + // STDOUT should contain unwrapped contents of NOSCRIPT element + assert_eq!( + std::str::from_utf8(&out.stdout).unwrap(), + "test\n" + ); + + // STDERR should be empty + assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); + + // The exit code should be 0 + out.assert().code(0); + } } diff --git a/src/tests/cli/unusual_encodings.rs b/src/tests/cli/unusual_encodings.rs index 4796cec..5ebd9ac 100644 --- a/src/tests/cli/unusual_encodings.rs +++ b/src/tests/cli/unusual_encodings.rs @@ -30,7 +30,14 @@ mod passing { // STDOUT should contain newly added base URL assert_eq!( std::str::from_utf8(&out.stdout).unwrap(), - "\n \n \n \n © Some Company\n \n\n\n" + "\ + \n \ + \n \ + \n \ + \n \ + © Some Company\n \ + \n\n\ + \n" ); // STDERR should contain only the target file diff --git a/src/tests/html/walk_and_embed_assets.rs b/src/tests/html/walk_and_embed_assets.rs index 855cc37..7e2ab83 100644 --- a/src/tests/html/walk_and_embed_assets.rs +++ b/src/tests/html/walk_and_embed_assets.rs @@ -87,10 +87,12 @@ mod passing { #[test] fn no_css() { - let html = "\ - \ - \ -
"; + let html = "\ + \ + \ + \ +
\ + "; let dom = html::html_to_dom(&html); let url: Url = Url::parse("http://localhost").unwrap(); let cache = &mut HashMap::new(); @@ -108,16 +110,18 @@ mod passing { assert_eq!( buf.iter().map(|&c| c as char).collect::(), - "\ - \ - \ - \ - \ - \ - \ -
\ - \ - " + "\ + \ + \ + \ + \ + \ + \ + \ +
\ + \ + \ + " ); } @@ -203,7 +207,15 @@ mod passing { assert_eq!( buf.iter().map(|&c| c as char).collect::(), - "" + "\ + \ + \ + \ + \ + \ + \ + \ + " ); } @@ -227,16 +239,25 @@ mod passing { assert_eq!( buf.iter().map(|&c| c as char).collect::(), - "" + "\ + \ + \ + \ + \ + \ + \ + " ); } #[test] fn no_js() { - let html = "
\ - \ - \ -
"; + let html = "\ +
\ + \ + \ +
\ + "; let dom = html::html_to_dom(&html); let url: Url = Url::parse("http://localhost").unwrap(); let cache = &mut HashMap::new(); @@ -254,52 +275,141 @@ mod passing { assert_eq!( buf.iter().map(|&c| c as char).collect::(), - "
\ -
" + "\ + \ + \ + \ +
\ + \ + \ +
\ + \ + \ + " ); } - // #[test] - // fn discards_integrity() { - // let html = "No integrity\ - // \ - // "; - // let dom = html::html_to_dom(&html); - // let url: Url = Url::parse("http://localhost").unwrap(); - // let cache = &mut HashMap::new(); - - // let mut options = Options::default(); - // options.no_css = true; - // options.no_frames = true; - // options.no_js = true; - // options.no_images = true; - // options.silent = true; - - // let client = Client::new(); - - // html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options, 0); - - // let mut buf: Vec = Vec::new(); - // serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); - - // assert_eq!( - // buf.iter().map(|&c| c as char).collect::(), - // "\ - // No integrity\ - // \ - // " - // ); - // } + #[test] + fn keeps_integrity_for_linked_assets() { + let html = "Has integrity\ + "; + let dom = html::html_to_dom(&html); + let url: Url = Url::parse("http://localhost").unwrap(); + let cache = &mut HashMap::new(); + + let mut options = Options::default(); + options.silent = true; + + let client = Client::new(); + + html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options, 0); + + let mut buf: Vec = Vec::new(); + serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); + + assert_eq!( + buf.iter().map(|&c| c as char).collect::(), + "\ + \ + \ + Has integrity\ + \ + \ + \ + \ + " + ); + } + + #[test] + fn discards_integrity_for_linked_assets_nojs_nocss() { + let html = "\ + No integrity\ + \ + \ + "; + let dom = html::html_to_dom(&html); + let url: Url = Url::parse("http://localhost").unwrap(); + let cache = &mut HashMap::new(); + + let mut options = Options::default(); + options.no_css = true; + options.no_js = true; + options.silent = true; + + let client = Client::new(); + + html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options, 0); + + let mut buf: Vec = Vec::new(); + serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); + + assert_eq!( + buf.iter().map(|&c| c as char).collect::(), + "\ + \ + \ + No integrity\ + \ + \ + \ + \ + \ + " + ); + } + + #[test] + fn discards_integrity_for_embedded_assets() { + let html = "\ + No integrity\ + \ + \ + "; + let dom = html::html_to_dom(&html); + let url: Url = Url::parse("http://localhost").unwrap(); + let cache = &mut HashMap::new(); + + let mut options = Options::default(); + options.no_css = true; + options.no_js = true; + options.silent = true; + + let client = Client::new(); + + html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options, 0); + + let mut buf: Vec = Vec::new(); + serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); + + assert_eq!( + buf.iter().map(|&c| c as char).collect::(), + "\ + \ + \ + No integrity\ + \ + \ + \ + \ + \ + \ + " + ); + } #[test] fn removes_unwanted_meta_tags() { - let html = "\ - \ - \ - \ - \ - \ - "; + let html = "\ + \ + \ + \ + \ + \ + \ + \ + \ + "; let dom = html::html_to_dom(&html); let url: Url = Url::parse("http://localhost").unwrap(); let cache = &mut HashMap::new(); @@ -320,19 +430,22 @@ mod passing { assert_eq!( buf.iter().map(|&c| c as char).collect::(), - "\ + "\ + \ \ \ \ \ - \ + \ + \ " ); } #[test] fn processes_noscript_tags() { - let html = "\ + let html = "\ + \ \