From 89ce5029b936adc4fc0588745f642d8312e51eee Mon Sep 17 00:00:00 2001 From: Sunshine Date: Thu, 1 Sep 2022 19:35:52 -0400 Subject: [PATCH] add option to blacklist/whitelist domains --- README.md | 2 + src/opts.rs | 19 ++- src/utils.rs | 69 ++++++++++- tests/utils/domain_is_within_domain.rs | 154 +++++++++++++++++++++++++ tests/utils/mod.rs | 1 + 5 files changed, 243 insertions(+), 2 deletions(-) create mode 100644 tests/utils/domain_is_within_domain.rs diff --git a/README.md b/README.md index 722a53f..049e80f 100644 --- a/README.md +++ b/README.md @@ -113,7 +113,9 @@ cat index.html | monolith -aIiFfcMv -b https://original.site/ - > result.html - `-b`: Use custom `base URL` - `-c`: Exclude CSS - `-C`: Save document using custom `charset` + - `-d`: Allow retrieving assets only from specified `domain(s)` - `-e`: Ignore network errors + - `-E`: Avoid retrieving assets located within specified domains - `-f`: Omit frames - `-F`: Exclude web fonts - `-i`: Remove images diff --git a/src/opts.rs b/src/opts.rs index f2618af..79c4174 100644 --- a/src/opts.rs +++ b/src/opts.rs @@ -1,4 +1,4 @@ -use clap::{App, Arg}; +use clap::{App, Arg, ArgAction}; use std::env; #[derive(Default)] @@ -7,7 +7,9 @@ pub struct Options { pub base_url: Option, pub no_css: bool, pub charset: Option, + pub domains: Option>, pub ignore_errors: bool, + pub exclude_domains: bool, pub no_frames: bool, pub no_fonts: bool, pub no_images: bool, @@ -50,7 +52,17 @@ impl Options { .args_from_usage("-b, --base-url=[http://localhost/] 'Sets custom base URL'") .args_from_usage("-c, --no-css 'Removes CSS'") .args_from_usage("-C, --charset=[UTF-8] 'Enforces custom encoding'") + .arg( + Arg::with_name("domains") + .short('d') + .long("domains") + .takes_value(true) + .value_name("DOMAINS") + .action(ArgAction::Append) + .help("Whitelist of domains"), + ) .args_from_usage("-e, --ignore-errors 'Ignore network errors'") + .args_from_usage("-E, --exclude-domains 'Treat specified domains as blacklist'") 
/// Checks whether `domain` lies within `domain_to_match_against`.
///
/// Both names are split into dot-separated labels and compared from the
/// right-most label towards the left, ignoring ASCII case. Trailing dots are
/// stripped from both names before the comparison.
///
/// Matching rules:
///
/// * An empty `domain_to_match_against` never matches, not even an empty
///   `domain`.
/// * A `domain_to_match_against` of exactly `"."` matches every `domain`.
/// * Without a leading dot (`"example.com"`) the domain must not have more
///   labels than the pattern, so sub-domains are rejected.
/// * With a leading dot (`".example.com"`) the named domain and all of its
///   sub-domains match.
/// * An empty label in the pattern (such as the one produced by the leading
///   dot) places no constraint on the label at the same position.
///
/// # Arguments
///
/// * `domain` - the host name being tested.
/// * `domain_to_match_against` - the pattern the host name is tested against.
///
/// # Returns
///
/// `true` when `domain` satisfies the pattern according to the rules above,
/// `false` otherwise.
pub fn domain_is_within_domain(domain: &str, domain_to_match_against: &str) -> bool {
    // An empty pattern matches nothing
    if domain_to_match_against.is_empty() {
        return false;
    }

    // A lone dot matches everything
    if domain_to_match_against == "." {
        return true;
    }

    // Only a pattern with a leading dot may match domains that carry more labels than it does
    let includes_sub_domains = domain_to_match_against.starts_with('.');

    // Walk both names label by label, right to left (top-level label first)
    let mut domain_labels = domain.trim_end_matches('.').rsplit('.');
    let pattern_labels = domain_to_match_against.trim_end_matches('.').rsplit('.');

    for pattern_label in pattern_labels {
        // Always advance the domain side so both iterators stay aligned by position
        let domain_label = domain_labels.next();

        // An empty pattern label accepts anything at this position, including a missing label
        if pattern_label.is_empty() {
            continue;
        }

        // A non-empty pattern label needs an equal (ASCII case-insensitive) domain label
        if !matches!(domain_label, Some(label) if pattern_label.eq_ignore_ascii_case(label)) {
            return false;
        }
    }

    // Labels left over on the domain side are acceptable only for a dotted pattern
    includes_sub_domains || domain_labels.next().is_none()
}
// ---------------------------------------------------------------------------
// PASSING: inputs for which `domain_is_within_domain` must return `true`
// ---------------------------------------------------------------------------

#[cfg(test)]
mod passing {
    use monolith::utils::domain_is_within_domain;

    /// A sub-domain matches a dotted pattern that names the same sub-domain.
    #[test]
    fn sub_domain_is_within_dotted_sub_domain() {
        let domain = "news.ycombinator.com";
        let pattern = ".news.ycombinator.com";

        assert!(domain_is_within_domain(domain, pattern));
    }

    /// A domain matches the dotted form of itself.
    #[test]
    fn domain_is_within_dotted_domain() {
        let domain = "ycombinator.com";
        let pattern = ".ycombinator.com";

        assert!(domain_is_within_domain(domain, pattern));
    }

    /// A sub-domain matches the dotted form of its parent domain.
    #[test]
    fn sub_domain_is_within_dotted_domain() {
        let domain = "news.ycombinator.com";
        let pattern = ".ycombinator.com";

        assert!(domain_is_within_domain(domain, pattern));
    }

    /// A sub-domain matches the dotted form of its top-level domain.
    #[test]
    fn sub_domain_is_within_dotted_top_level_domain() {
        let domain = "news.ycombinator.com";
        let pattern = ".com";

        assert!(domain_is_within_domain(domain, pattern));
    }

    /// A domain matches an undotted pattern identical to itself.
    #[test]
    fn domain_is_within_itself() {
        let domain = "ycombinator.com";
        let pattern = "ycombinator.com";

        assert!(domain_is_within_domain(domain, pattern));
    }

    /// A trailing dot on the domain does not prevent it from matching itself.
    #[test]
    fn domain_with_trailing_dot_is_within_itself() {
        let domain = "ycombinator.com.";
        let pattern = "ycombinator.com";

        assert!(domain_is_within_domain(domain, pattern));
    }

    /// A domain with a trailing dot matches the single-dot pattern.
    #[test]
    fn domain_with_trailing_dot_is_within_single_dot() {
        let domain = "ycombinator.com.";
        let pattern = ".";

        assert!(domain_is_within_domain(domain, pattern));
    }

    /// A plain domain matches the single-dot pattern.
    #[test]
    fn domain_matches_single_dot() {
        let domain = "ycombinator.com";
        let pattern = ".";

        assert!(domain_is_within_domain(domain, pattern));
    }

    /// A dotted domain matches the identical dotted pattern.
    #[test]
    fn dotted_domain_must_be_within_dotted_domain() {
        let domain = ".ycombinator.com";
        let pattern = ".ycombinator.com";

        assert!(domain_is_within_domain(domain, pattern));
    }

    /// An empty domain matches the single-dot pattern.
    #[test]
    fn empty_is_within_dot() {
        let domain = "";
        let pattern = ".";

        assert!(domain_is_within_domain(domain, pattern));
    }

    /// A single dot matches the single-dot pattern.
    #[test]
    fn both_dots() {
        let domain = ".";
        let pattern = ".";

        assert!(domain_is_within_domain(domain, pattern));
    }
}

// ---------------------------------------------------------------------------
// FAILING: inputs for which `domain_is_within_domain` must return `false`
// ---------------------------------------------------------------------------

#[cfg(test)]
mod failing {
    use monolith::utils::domain_is_within_domain;

    /// A sub-domain does not match the undotted form of its parent domain.
    #[test]
    fn sub_domain_must_not_be_within_domain() {
        let domain = "news.ycombinator.com";
        let pattern = "ycombinator.com";

        assert!(!domain_is_within_domain(domain, pattern));
    }

    /// A domain does not match the undotted form of its top-level domain.
    #[test]
    fn domain_must_not_be_within_top_level_domain() {
        let domain = "ycombinator.com";
        let pattern = "com";

        assert!(!domain_is_within_domain(domain, pattern));
    }

    /// Unrelated domains do not match each other.
    #[test]
    fn different_domains_must_not_be_within_one_another() {
        let domain = "news.ycombinator.com";
        let pattern = "kernel.org";

        assert!(!domain_is_within_domain(domain, pattern));
    }

    /// A sub-domain does not match a different, undotted top-level domain.
    #[test]
    fn sub_domain_is_not_within_wrong_top_level_domain() {
        let domain = "news.ycombinator.com";
        let pattern = "org";

        assert!(!domain_is_within_domain(domain, pattern));
    }

    /// A dotted domain does not match the undotted form of the same domain.
    #[test]
    fn dotted_domain_is_not_within_domain() {
        let domain = ".ycombinator.com";
        let pattern = "ycombinator.com";

        assert!(!domain_is_within_domain(domain, pattern));
    }

    /// A domain does not match the dotted form of a different domain.
    #[test]
    fn different_domain_is_not_within_dotted_domain() {
        let domain = "www.doodleoptimize.com";
        let pattern = ".ycombinator.com";

        assert!(!domain_is_within_domain(domain, pattern));
    }

    /// Nothing matches an empty pattern.
    #[test]
    fn no_domain_can_be_within_empty_domain() {
        let domain = "ycombinator.com";
        let pattern = "";

        assert!(!domain_is_within_domain(domain, pattern));
    }

    /// An empty domain does not match an empty pattern.
    #[test]
    fn both_can_not_be_empty() {
        let domain = "";
        let pattern = "";

        assert!(!domain_is_within_domain(domain, pattern));
    }
}