From 6f918f6c1cb1243097608ab5feb7ac622c535d59 Mon Sep 17 00:00:00 2001 From: Sunshine Date: Thu, 11 Mar 2021 18:18:39 -1000 Subject: [PATCH] make possible to unwrap NOSCRIPT nodes --- README.md | 1 + src/html.rs | 13 ++- src/opts.rs | 5 + src/tests/cli/local_files.rs | 4 +- src/tests/cli/mod.rs | 1 + src/tests/cli/noscript.rs | 164 ++++++++++++++++++++++++++++ src/tests/data/noscript/image.svg | 5 + src/tests/data/noscript/index.html | 1 + src/tests/data/noscript/nested.html | 1 + src/tests/data/noscript/script.html | 1 + 10 files changed, 190 insertions(+), 6 deletions(-) create mode 100644 src/tests/cli/noscript.rs create mode 100644 src/tests/data/noscript/image.svg create mode 100644 src/tests/data/noscript/index.html create mode 100644 src/tests/data/noscript/nested.html create mode 100644 src/tests/data/noscript/script.html diff --git a/README.md b/README.md index 91c2c75..0ac5ff3 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,7 @@ or - `-j`: Exclude JavaScript - `-k`: Accept invalid X.509 (TLS) certificates - `-M`: Don't add timestamp and URL information + - `-n`: Extract contents of NOSCRIPT tags - `-o`: Write output to `file` - `-s`: Be quiet - `-t`: Adjust `network request timeout` diff --git a/src/html.rs b/src/html.rs index 3f5e34d..17524fb 100644 --- a/src/html.rs +++ b/src/html.rs @@ -474,6 +474,11 @@ pub fn stringify_document(handle: &Handle, options: &Options) -> String { result = String::from_utf8(buf).unwrap(); } + if options.unwrap_noscript { + let noscript_re = Regex::new(r"<(?P/?noscript)>").unwrap(); + result = noscript_re.replace_all(&result, "").to_string(); + } + result } @@ -1060,11 +1065,11 @@ pub fn walk_and_embed_assets( for child_node in node.children.borrow_mut().iter_mut() { match child_node.data { NodeData::Text { ref contents } => { - // Get contents of the NOSCRIPT node + // Get contents of NOSCRIPT node let mut noscript_contents = contents.borrow_mut(); - // Parse contents of the NOSCRIPT node + // Parse contents of NOSCRIPT node as DOM let noscript_contents_dom: RcDom = html_to_dom(&noscript_contents); - // Embed assets within the NOSCRIPT node + // Embed assets of NOSCRIPT node contents walk_and_embed_assets( cache, client, @@ -1075,7 +1080,7 @@ pub fn walk_and_embed_assets( ); // Get rid of original contents noscript_contents.clear(); - // Insert HTML containing embedded assets into the NOSCRIPT node + // Insert HTML containing embedded assets back into NOSCRIPT node if let Some(html) = get_child_node_by_name(&noscript_contents_dom.document, "html") { diff --git a/src/opts.rs b/src/opts.rs index c1a2a2a..f20e82e 100644 --- a/src/opts.rs +++ b/src/opts.rs @@ -21,6 +21,7 @@ pub struct Options { pub no_video: bool, pub target: String, pub no_color: bool, + pub unwrap_noscript: bool, } const ASCII: &'static str = " \ @@ -55,6 +56,9 @@ impl Options { .args_from_usage("-j, --no-js 'Removes JavaScript'") .args_from_usage("-k, --insecure 'Allows invalid X.509 (TLS) certificates'") .args_from_usage("-M, --no-metadata 'Excludes timestamp and source information'") + .args_from_usage( + "-n, --unwrap-noscript 'Replaces NOSCRIPT elements with their contents'", + ) .args_from_usage("-o, --output=[document.html] 'Writes output to '") .args_from_usage("-s, --silent 'Suppresses verbosity'") .args_from_usage("-t, --timeout=[60] 'Adjusts network request timeout'") @@ -100,6 +104,7 @@ impl Options { } else { options.user_agent = Some(DEFAULT_USER_AGENT.to_string()); } + options.unwrap_noscript = app.is_present("unwrap-noscript"); options.no_video = app.is_present("no-video"); options.no_color = diff --git a/src/tests/cli/local_files.rs b/src/tests/cli/local_files.rs index 9603325..1ee2280 100644 --- a/src/tests/cli/local_files.rs +++ b/src/tests/cli/local_files.rs @@ -202,12 +202,12 @@ mod passing { format!( "\ {file_url_html}\n \ - {file_url_css}\n\ + {file_url_svg}\n\ ", file_url_html = Url::from_file_path(fs::canonicalize(&path_html).unwrap()) .unwrap() .into_string(), - file_url_css = Url::from_file_path(fs::canonicalize(&path_svg).unwrap()) + file_url_svg = Url::from_file_path(fs::canonicalize(&path_svg).unwrap()) .unwrap() .into_string(), ) diff --git a/src/tests/cli/mod.rs b/src/tests/cli/mod.rs index 55d69f6..aeda0db 100644 --- a/src/tests/cli/mod.rs +++ b/src/tests/cli/mod.rs @@ -2,4 +2,5 @@ mod base_url; mod basic; mod data_url; mod local_files; +mod noscript; mod unusual_encodings; diff --git a/src/tests/cli/noscript.rs b/src/tests/cli/noscript.rs new file mode 100644 index 0000000..6dde13c --- /dev/null +++ b/src/tests/cli/noscript.rs @@ -0,0 +1,164 @@ +// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗ +// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝ +// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗ +// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║ +// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝ +// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ + +#[cfg(test)] +mod passing { + use assert_cmd::prelude::*; + use std::env; + use std::fs; + use std::path::Path; + use std::process::Command; + use url::Url; + + #[test] + fn parse_noscript_contents() -> Result<(), Box> { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; + let path_html: &Path = Path::new("src/tests/data/noscript/index.html"); + let path_svg: &Path = Path::new("src/tests/data/noscript/image.svg"); + + let out = cmd.arg("-M").arg(path_html.as_os_str()).output().unwrap(); + + // STDOUT should contain HTML with no CSS + assert_eq!( + std::str::from_utf8(&out.stdout).unwrap(), + "\n\n" + ); + + // STDERR should contain target HTML and embedded SVG files + assert_eq!( + std::str::from_utf8(&out.stderr).unwrap(), + format!( + "\ + {file_url_html}\n \ + {file_url_svg}\n\ + ", + file_url_html = Url::from_file_path(fs::canonicalize(&path_html).unwrap()) + .unwrap() + .into_string(), + file_url_svg = Url::from_file_path(fs::canonicalize(&path_svg).unwrap()) + .unwrap() + .into_string(), + ) + ); + + // The exit code should be 0 + out.assert().code(0); + + Ok(()) + } + + #[test] + fn unwrap_noscript_contents() -> Result<(), Box> { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; + let path_html: &Path = Path::new("src/tests/data/noscript/index.html"); + let path_svg: &Path = Path::new("src/tests/data/noscript/image.svg"); + + let out = cmd.arg("-Mn").arg(path_html.as_os_str()).output().unwrap(); + + // STDOUT should contain HTML with no CSS + assert_eq!( + std::str::from_utf8(&out.stdout).unwrap(), + "\n\n" + ); + + // STDERR should contain target HTML and embedded SVG files + assert_eq!( + std::str::from_utf8(&out.stderr).unwrap(), + format!( + "\ + {file_url_html}\n \ + {file_url_svg}\n\ + ", + file_url_html = Url::from_file_path(fs::canonicalize(&path_html).unwrap()) + .unwrap() + .into_string(), + file_url_svg = Url::from_file_path(fs::canonicalize(&path_svg).unwrap()) + .unwrap() + .into_string(), + ) + ); + + // The exit code should be 0 + out.assert().code(0); + + Ok(()) + } + + #[test] + fn unwrap_noscript_contents_nested() -> Result<(), Box> { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; + let path_html: &Path = Path::new("src/tests/data/noscript/nested.html"); + let path_svg: &Path = Path::new("src/tests/data/noscript/image.svg"); + + let out = cmd.arg("-Mn").arg(path_html.as_os_str()).output().unwrap(); + + // STDOUT should contain HTML with no CSS + assert_eq!( + std::str::from_utf8(&out.stdout).unwrap(), + "

JS is not active

\n\n" + ); + + // STDERR should contain target HTML and embedded SVG files + assert_eq!( + std::str::from_utf8(&out.stderr).unwrap(), + format!( + "\ + {file_url_html}\n \ + {file_url_svg}\n\ + ", + file_url_html = Url::from_file_path(fs::canonicalize(&path_html).unwrap()) + .unwrap() + .into_string(), + file_url_svg = Url::from_file_path(fs::canonicalize(&path_svg).unwrap()) + .unwrap() + .into_string(), + ) + ); + + // The exit code should be 0 + out.assert().code(0); + + Ok(()) + } + + #[test] + fn unwrap_noscript_contents_with_script() -> Result<(), Box> { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; + let path_html: &Path = Path::new("src/tests/data/noscript/script.html"); + let path_svg: &Path = Path::new("src/tests/data/noscript/image.svg"); + + let out = cmd.arg("-Mn").arg(path_html.as_os_str()).output().unwrap(); + + // STDOUT should contain HTML with no CSS + assert_eq!( + std::str::from_utf8(&out.stdout).unwrap(), + "\n\n" + ); + + // STDERR should contain target HTML and embedded SVG files + assert_eq!( + std::str::from_utf8(&out.stderr).unwrap(), + format!( + "\ + {file_url_html}\n \ + {file_url_svg}\n\ + ", + file_url_html = Url::from_file_path(fs::canonicalize(&path_html).unwrap()) + .unwrap() + .into_string(), + file_url_svg = Url::from_file_path(fs::canonicalize(&path_svg).unwrap()) + .unwrap() + .into_string(), + ) + ); + + // The exit code should be 0 + out.assert().code(0); + + Ok(()) + } +} diff --git a/src/tests/data/noscript/image.svg b/src/tests/data/noscript/image.svg new file mode 100644 index 0000000..e181299 --- /dev/null +++ b/src/tests/data/noscript/image.svg @@ -0,0 +1,5 @@ + + + + SVG + diff --git a/src/tests/data/noscript/index.html b/src/tests/data/noscript/index.html new file mode 100644 index 0000000..cf7b61a --- /dev/null +++ b/src/tests/data/noscript/index.html @@ -0,0 +1 @@ + diff --git a/src/tests/data/noscript/nested.html b/src/tests/data/noscript/nested.html new file mode 100644 index 0000000..f7baf11 --- /dev/null +++ b/src/tests/data/noscript/nested.html @@ -0,0 +1 @@ + diff --git a/src/tests/data/noscript/script.html b/src/tests/data/noscript/script.html new file mode 100644 index 0000000..12842ce --- /dev/null +++ b/src/tests/data/noscript/script.html @@ -0,0 +1 @@ +