diff --git a/README.md b/README.md index 175f15a..588dbdb 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,7 @@ The guide can be found [here](docs/containers.md) --------------------------------------------------- ## Options + - `-b`: Use custom base URL - `-c`: Exclude CSS - `-e`: Ignore network errors - `-f`: Omit frames @@ -62,6 +63,7 @@ The guide can be found [here](docs/containers.md) - `-I`: Isolate the document - `-j`: Exclude JavaScript - `-k`: Accept invalid X.509 (TLS) certificates + - `-M`: Don’t add timestamp and source information - `-o`: Write output to file - `-s`: Be quiet - `-t`: Adjust network request timeout diff --git a/docs/arch/0008-base-tag.md b/docs/arch/0008-base-tag.md new file mode 100644 index 0000000..ce21919 --- /dev/null +++ b/docs/arch/0008-base-tag.md @@ -0,0 +1,27 @@ +# 8. Base Tag + +Date: 2020-11-22 + +## Status + +Accepted + +## Context + +HTML documents may contain `base` tag within `head`, which influences URL resolution prefix for anchor and relative links as well as dynamically loaded resources. Sometimes to make certain saved pages function closer to how they originally operated, the `base` tag specifying the source page's URL may need to be added to the document. + +## Decision + +Adding the `base` tag should be optional. Saved documents should not contain the `base` tag unless it was requested by the user, or unless the document originally had the `base` tag in it. Only documents downloaded from remote resources should be able to obtain a new `base` tag, existing `base` tags within documents saved from data URLs and local resources should be kept intact. +The existing `href` attribute's value of the original `base` tag should be used for resolving document's relative links instead of document's own URL. +There can be only one such tag. If multiple `base` tags are provided, only the first encountered tag will end up being used. 
+ +## Consequences + +In case the remote document had the `base` tag in it: + - By default: the `href` attribute should be resolved to a full URL if it's relative, kept empty in case it was empty or non-existent, all other attributes of that tag should be kept intact. + - If `base` tag was requested to be added: the existing `base` tag's `href` attribute should be set to page's full URL, all other attributes should be kept intact. + +In case the remote document didn't have the `base` tag in it: + - By default: no `base` tag is added to the document, it gets saved to disk without having one. + - If `base` tag was requested to be added: the added `base` tag should contain only one attribute `href`, equal to the remote URL of that HTML document. diff --git a/src/html.rs b/src/html.rs index d16cba4..7fadd65 100644 --- a/src/html.rs +++ b/src/html.rs @@ -6,7 +6,7 @@ use html5ever::rcdom::{Handle, NodeData, RcDom}; use html5ever::serialize::{serialize, SerializeOpts}; use html5ever::tendril::{format_tendril, Tendril, TendrilSink}; use html5ever::tree_builder::{Attribute, TreeSink}; -use html5ever::{local_name, namespace_url, ns}; +use html5ever::{local_name, namespace_url, ns, LocalName}; use reqwest::blocking::Client; use reqwest::Url; use sha2::{Digest, Sha256, Sha384, Sha512}; @@ -29,7 +29,7 @@ struct SrcSetItem<'a> { const ICON_VALUES: &[&str] = &["icon", "shortcut icon"]; -pub fn add_base_tag(document: &Handle, url: String) -> RcDom { +pub fn add_favicon(document: &Handle, favicon_data_url: String) -> RcDom { let mut buf: Vec = Vec::new(); serialize(&mut buf, document, SerializeOpts::default()) .expect("unable to serialize DOM into buffer"); @@ -37,55 +37,49 @@ pub fn add_base_tag(document: &Handle, url: String) -> RcDom { let mut dom = html_to_dom(&result); let doc = dom.get_document(); - let html = get_child_node_by_name(&doc, "html"); - let head = get_child_node_by_name(&html, "head"); - let favicon_node = dom.create_element( - QualName::new(None, ns!(), 
local_name!("base")), - vec![Attribute { - name: QualName::new(None, ns!(), local_name!("href")), - value: format_tendril!("{}", url), - }], - Default::default(), - ); - - // Insert BASE tag into HEAD - head.children.borrow_mut().push(favicon_node.clone()); + if let Some(html) = get_child_node_by_name(&doc, "html") { + if let Some(head) = get_child_node_by_name(&html, "head") { + let favicon_node = dom.create_element( + QualName::new(None, ns!(), local_name!("link")), + vec![ + Attribute { + name: QualName::new(None, ns!(), local_name!("rel")), + value: format_tendril!("icon"), + }, + Attribute { + name: QualName::new(None, ns!(), local_name!("href")), + value: format_tendril!("{}", favicon_data_url), + }, + ], + Default::default(), + ); + // Insert favicon LINK tag into HEAD + head.children.borrow_mut().push(favicon_node.clone()); + } + } dom } -pub fn add_favicon(document: &Handle, favicon_data_url: String) -> RcDom { - let mut buf: Vec = Vec::new(); - serialize(&mut buf, document, SerializeOpts::default()) - .expect("unable to serialize DOM into buffer"); - let result = String::from_utf8(buf).unwrap(); - - let mut dom = html_to_dom(&result); - let doc = dom.get_document(); - let html = get_child_node_by_name(&doc, "html"); - let head = get_child_node_by_name(&html, "head"); - let favicon_node = dom.create_element( - QualName::new(None, ns!(), local_name!("link")), - vec![ - Attribute { - name: QualName::new(None, ns!(), local_name!("rel")), - value: format_tendril!("icon"), - }, - Attribute { - name: QualName::new(None, ns!(), local_name!("href")), - value: format_tendril!("{}", favicon_data_url), - }, - ], - Default::default(), - ); - - // Insert favicon LINK tag into HEAD - head.children.borrow_mut().push(favicon_node.clone()); - - dom +pub fn check_integrity(data: &[u8], integrity: &str) -> bool { + if integrity.starts_with("sha256-") { + let mut hasher = Sha256::new(); + hasher.update(data); + base64::encode(hasher.finalize()) == integrity[7..] 
+ } else if integrity.starts_with("sha384-") { + let mut hasher = Sha384::new(); + hasher.update(data); + base64::encode(hasher.finalize()) == integrity[7..] + } else if integrity.starts_with("sha512-") { + let mut hasher = Sha512::new(); + hasher.update(data); + base64::encode(hasher.finalize()) == integrity[7..] + } else { + false + } } -pub fn csp(options: &Options) -> String { +pub fn compose_csp(options: &Options) -> String { let mut string_list = vec![]; if options.isolate { @@ -117,6 +111,42 @@ pub fn csp(options: &Options) -> String { string_list.join(" ") } +pub fn create_metadata_tag(url: &str) -> String { + let timestamp = Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true); + + // Safe to unwrap (we just put this through an HTTP request) + match Url::parse(url) { + Ok(mut clean_url) => { + clean_url.set_fragment(None); + + // Prevent credentials from getting into metadata + if is_http_url(url) { + // Only HTTP(S) URLs may feature credentials + clean_url.set_username("").unwrap(); + clean_url.set_password(None).unwrap(); + } + + if is_http_url(url) { + format!( + "", + &clean_url, + timestamp, + env!("CARGO_PKG_NAME"), + env!("CARGO_PKG_VERSION"), + ) + } else { + format!( + "", + timestamp, + env!("CARGO_PKG_NAME"), + env!("CARGO_PKG_VERSION"), + ) + } + } + Err(_) => str!(), + } +} + pub fn embed_srcset( cache: &mut HashMap>, client: &Client, @@ -188,15 +218,54 @@ pub fn embed_srcset( result } -fn get_child_node_by_name(handle: &Handle, node_name: &str) -> Handle { - let children = handle.children.borrow(); +pub fn find_base_node(node: &Handle) -> Option { + match node.data { + NodeData::Document => { + // Dig deeper + for child in node.children.borrow().iter() { + if let Some(base_node) = find_base_node(child) { + return Some(base_node); + } + } + } + NodeData::Element { ref name, .. 
} => { + match name.local.as_ref() { + "head" => { + return get_child_node_by_name(node, "base"); + } + _ => {} + } + + // Dig deeper + for child in node.children.borrow().iter() { + if let Some(base_node) = find_base_node(child) { + return Some(base_node); + } + } + } + _ => {} + } + + None +} + +pub fn get_base_url(handle: &Handle) -> Option { + if let Some(base_node) = find_base_node(handle) { + get_node_attr(&base_node, "href") + } else { + None + } +} + +pub fn get_child_node_by_name(parent: &Handle, node_name: &str) -> Option { + let children = parent.children.borrow(); let matching_children = children.iter().find(|child| match child.data { NodeData::Element { ref name, .. } => &*name.local == node_name, _ => false, }); match matching_children { - Some(node) => node.clone(), - _ => handle.clone(), + Some(node) => Some(node.clone()), + _ => None, } } @@ -207,77 +276,23 @@ pub fn get_node_name(node: &Handle) -> Option<&'_ str> { } } -pub fn get_parent_node(node: &Handle) -> Handle { - let parent = node.parent.take().clone(); - parent.and_then(|node| node.upgrade()).unwrap() -} - -pub fn has_proper_integrity(data: &[u8], integrity: &str) -> bool { - if integrity.starts_with("sha256-") { - let mut hasher = Sha256::new(); - hasher.update(data); - base64::encode(hasher.finalize()) == integrity[7..] - } else if integrity.starts_with("sha384-") { - let mut hasher = Sha384::new(); - hasher.update(data); - base64::encode(hasher.finalize()) == integrity[7..] - } else if integrity.starts_with("sha512-") { - let mut hasher = Sha512::new(); - hasher.update(data); - base64::encode(hasher.finalize()) == integrity[7..] - } else { - false - } -} - -pub fn has_base_tag(handle: &Handle) -> bool { - let mut found_base_tag: bool = false; - - match handle.data { - NodeData::Document => { - // Dig deeper - for child in handle.children.borrow().iter() { - if has_base_tag(child) { - found_base_tag = true; - break; - } - } - } - NodeData::Element { - ref name, - ref attrs, - .. 
- } => { - match name.local.as_ref() { - "base" => { - let attrs_mut = &mut attrs.borrow_mut(); - - for attr in attrs_mut.iter_mut() { - if &attr.name.local == "href" { - if !attr.value.trim().is_empty() { - found_base_tag = true; - break; - } - } - } - } - _ => {} - } - - if !found_base_tag { - // Dig deeper - for child in handle.children.borrow().iter() { - if has_base_tag(child) { - found_base_tag = true; - break; - } +pub fn get_node_attr(node: &Handle, attr_name: &str) -> Option { + match &node.data { + NodeData::Element { ref attrs, .. } => { + for attr in attrs.borrow().iter() { + if &*attr.name.local == attr_name { + return Some(str!(&*attr.value)); } } + None } - _ => {} + _ => None, } +} - found_base_tag +pub fn get_parent_node(child: &Handle) -> Handle { + let parent = child.parent.take().clone(); + parent.and_then(|node| node.upgrade()).unwrap() } pub fn has_favicon(handle: &Handle) -> bool { @@ -293,21 +308,12 @@ pub fn has_favicon(handle: &Handle) -> bool { } } } - NodeData::Element { - ref name, - ref attrs, - .. - } => { + NodeData::Element { ref name, .. 
} => { match name.local.as_ref() { "link" => { - let attrs_mut = &mut attrs.borrow_mut(); - - for attr in attrs_mut.iter_mut() { - if &attr.name.local == "rel" { - if is_icon(attr.value.trim()) { - found_favicon = true; - break; - } + if let Some(attr_value) = get_node_attr(handle, "rel") { + if is_icon(attr_value.trim()) { + found_favicon = true; } } } @@ -341,46 +347,82 @@ pub fn is_icon(attr_value: &str) -> bool { ICON_VALUES.contains(&attr_value.to_lowercase().as_str()) } -pub fn metadata_tag(url: &str) -> String { - let timestamp = Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true); +pub fn set_base_url(document: &Handle, desired_base_href: String) -> RcDom { + let mut buf: Vec = Vec::new(); + serialize(&mut buf, document, SerializeOpts::default()) + .expect("unable to serialize DOM into buffer"); + let result = String::from_utf8(buf).unwrap(); - // Safe to unwrap (we just put this through an HTTP request) - match Url::parse(url) { - Ok(mut clean_url) => { - clean_url.set_fragment(None); + let mut dom = html_to_dom(&result); + let doc = dom.get_document(); + if let Some(html_node) = get_child_node_by_name(&doc, "html") { + if let Some(head_node) = get_child_node_by_name(&html_node, "head") { + // Check if BASE node already exists in the DOM tree + if let Some(base_node) = get_child_node_by_name(&head_node, "base") { + set_node_attr(&base_node, "href", Some(desired_base_href)); + } else { + let base_node = dom.create_element( + QualName::new(None, ns!(), local_name!("base")), + vec![Attribute { + name: QualName::new(None, ns!(), local_name!("href")), + value: format_tendril!("{}", desired_base_href), + }], + Default::default(), + ); + + // Insert newly created BASE node into HEAD + head_node.children.borrow_mut().push(base_node.clone()); + } + } + } - // Prevent credentials from getting into metadata - if is_http_url(url) { - // Only HTTP(S) URLs may feature credentials - clean_url.set_username("").unwrap(); - clean_url.set_password(None).unwrap(); + dom +} 
+ +pub fn set_node_attr(node: &Handle, attr_name: &str, attr_value: Option) { + match &node.data { + NodeData::Element { ref attrs, .. } => { + let attrs_mut = &mut attrs.borrow_mut(); + let mut i = 0; + let mut found_existing_attr: bool = false; + + while i < attrs_mut.len() { + if &attrs_mut[i].name.local == attr_name { + found_existing_attr = true; + + if let Some(attr_value) = attr_value.clone() { + &attrs_mut[i].value.clear(); + &attrs_mut[i].value.push_slice(&attr_value.as_str()); + } else { + // Remove attr completely if attr_value is not defined + attrs_mut.remove(i); + continue; + } + } + + i += 1; } - if is_http_url(url) { - format!( - "", - &clean_url, - timestamp, - env!("CARGO_PKG_NAME"), - env!("CARGO_PKG_VERSION"), - ) - } else { - format!( - "", - timestamp, - env!("CARGO_PKG_NAME"), - env!("CARGO_PKG_VERSION"), - ) + if !found_existing_attr { + // Add new attribute (since originally the target node didn't have it) + if let Some(attr_value) = attr_value.clone() { + let name = LocalName::from(attr_name); + + attrs_mut.push(Attribute { + name: QualName::new(None, ns!(), name), + value: format_tendril!("{}", attr_value), + }); + } } } - Err(_) => str!(), - } + _ => {} + }; } pub fn stringify_document(handle: &Handle, options: &Options) -> String { let mut buf: Vec = Vec::new(); serialize(&mut buf, handle, SerializeOpts::default()) - .expect("unable to serialize DOM into buffer"); + .expect("Unable to serialize DOM into buffer"); let mut result = String::from_utf8(buf).unwrap(); @@ -398,33 +440,33 @@ pub fn stringify_document(handle: &Handle, options: &Options) -> String { let mut buf: Vec = Vec::new(); let mut dom = html_to_dom(&result); let doc = dom.get_document(); - let html = get_child_node_by_name(&doc, "html"); - let head = get_child_node_by_name(&html, "head"); - let csp_content: String = csp(options); - - let meta = dom.create_element( - QualName::new(None, ns!(), local_name!("meta")), - vec![ - Attribute { - name: QualName::new(None, ns!(), 
local_name!("http-equiv")), - value: format_tendril!("Content-Security-Policy"), - }, - Attribute { - name: QualName::new(None, ns!(), local_name!("content")), - value: format_tendril!("{}", csp_content), - }, - ], - Default::default(), - ); - // Note: the CSP meta-tag has to be prepended, never appended, - // since there already may be one defined in the document, - // and browsers don't allow re-defining them (for obvious reasons) - head.children.borrow_mut().reverse(); - head.children.borrow_mut().push(meta.clone()); - head.children.borrow_mut().reverse(); + if let Some(html) = get_child_node_by_name(&doc, "html") { + if let Some(head) = get_child_node_by_name(&html, "head") { + let meta = dom.create_element( + QualName::new(None, ns!(), local_name!("meta")), + vec![ + Attribute { + name: QualName::new(None, ns!(), local_name!("http-equiv")), + value: format_tendril!("Content-Security-Policy"), + }, + Attribute { + name: QualName::new(None, ns!(), local_name!("content")), + value: format_tendril!("{}", compose_csp(options)), + }, + ], + Default::default(), + ); + // Note: the CSP meta-tag has to be prepended, never appended, + // since there already may be one defined in the original document, + // and browsers don't allow re-defining them (for obvious reasons) + head.children.borrow_mut().reverse(); + head.children.borrow_mut().push(meta.clone()); + head.children.borrow_mut().reverse(); + } + } serialize(&mut buf, &doc, SerializeOpts::default()) - .expect("unable to serialize DOM into buffer"); + .expect("Unable to serialize DOM into buffer"); result = String::from_utf8(buf).unwrap(); } @@ -549,7 +591,7 @@ pub fn walk_and_embed_assets( )) => { // Check integrity if integrity.is_empty() - || has_proper_integrity(&link_href_data, &integrity) + || check_integrity(&link_href_data, &integrity) { let link_href_data_url = data_to_data_url( &link_href_media_type, @@ -622,7 +664,7 @@ pub fn walk_and_embed_assets( )) => { // Check integrity if integrity.is_empty() - || 
has_proper_integrity(&link_href_data, &integrity) + || check_integrity(&link_href_data, &integrity) { let css: String = embed_css( cache, @@ -690,7 +732,7 @@ pub fn walk_and_embed_assets( } "base" => { if is_http_url(url) { - // Ensure BASE href is a full URL, not a relative one + // Ensure the BASE node doesn't have a relative URL for attr in attrs_mut.iter_mut() { let attr_name: &str = &attr.name.local; if attr_name.eq_ignore_ascii_case("href") { @@ -858,74 +900,54 @@ pub fn walk_and_embed_assets( } } "input" => { - // Determine input type - let mut is_image_input: bool = false; - for attr in attrs_mut.iter_mut() { - let attr_name: &str = &attr.name.local; - if attr_name.eq_ignore_ascii_case("type") { - is_image_input = attr.value.to_string().eq_ignore_ascii_case("image"); - } - } - - if is_image_input { - let mut input_image_src: String = str!(); - let mut i = 0; - while i < attrs_mut.len() { - let attr_name: &str = &attrs_mut[i].name.local; - if attr_name.eq_ignore_ascii_case("src") { - input_image_src = str!(attrs_mut.remove(i).value.trim()); - } else { - i += 1; - } - } - - if options.no_images || input_image_src.is_empty() { - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("src")), - value: Tendril::from_slice(if input_image_src.is_empty() { - "" + if let Some(attr_value) = get_node_attr(node, "type") { + if attr_value.to_string().eq_ignore_ascii_case("image") { + let mut input_image_src: String = str!(); + let mut i = 0; + while i < attrs_mut.len() { + let attr_name: &str = &attrs_mut[i].name.local; + if attr_name.eq_ignore_ascii_case("src") { + input_image_src = str!(attrs_mut.remove(i).value.trim()); } else { - empty_image!() - }), - }); - } else { - let input_image_full_url = - resolve_url(&url, input_image_src).unwrap_or_default(); - let input_image_url_fragment = - get_url_fragment(input_image_full_url.clone()); - match retrieve_asset( - cache, - client, - &url, - &input_image_full_url, - options, - depth + 1, - ) { - Ok(( - 
input_image_data, - input_image_final_url, - input_image_media_type, - )) => { - let input_image_data_url = data_to_data_url( - &input_image_media_type, - &input_image_data, - &input_image_final_url, - ); - // Add data URL src attribute - let assembled_url: String = url_with_fragment( - input_image_data_url.as_str(), - input_image_url_fragment.as_str(), - ); - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("src")), - value: Tendril::from_slice(assembled_url.as_ref()), - }); + i += 1; } - Err(_) => { - // Keep remote reference if unable to retrieve the asset - if is_http_url(input_image_full_url.clone()) { + } + + if options.no_images || input_image_src.is_empty() { + attrs_mut.push(Attribute { + name: QualName::new(None, ns!(), local_name!("src")), + value: Tendril::from_slice(if input_image_src.is_empty() { + "" + } else { + empty_image!() + }), + }); + } else { + let input_image_full_url = + resolve_url(&url, input_image_src).unwrap_or_default(); + let input_image_url_fragment = + get_url_fragment(input_image_full_url.clone()); + match retrieve_asset( + cache, + client, + &url, + &input_image_full_url, + options, + depth + 1, + ) { + Ok(( + input_image_data, + input_image_final_url, + input_image_media_type, + )) => { + let input_image_data_url = data_to_data_url( + &input_image_media_type, + &input_image_data, + &input_image_final_url, + ); + // Add data URL src attribute let assembled_url: String = url_with_fragment( - input_image_full_url.as_str(), + input_image_data_url.as_str(), input_image_url_fragment.as_str(), ); attrs_mut.push(Attribute { @@ -933,6 +955,23 @@ pub fn walk_and_embed_assets( value: Tendril::from_slice(assembled_url.as_ref()), }); } + Err(_) => { + // Keep remote reference if unable to retrieve the asset + if is_http_url(input_image_full_url.clone()) { + let assembled_url: String = url_with_fragment( + input_image_full_url.as_str(), + input_image_url_fragment.as_str(), + ); + attrs_mut.push(Attribute { + name: 
QualName::new( + None, + ns!(), + local_name!("src"), + ), + value: Tendril::from_slice(assembled_url.as_ref()), + }); + } + } } } } @@ -1066,7 +1105,7 @@ pub fn walk_and_embed_assets( continue; } - // Don't touch email links or hrefs which begin with a hash sign + // Don't touch email links or hrefs which begin with a hash if attr_value.starts_with('#') || url_has_protocol(attr_value) { continue; } @@ -1109,7 +1148,7 @@ pub fn walk_and_embed_assets( Ok((script_data, script_final_url, _script_media_type)) => { // Only embed if we're able to validate integrity if script_integrity.is_empty() - || has_proper_integrity(&script_data, &script_integrity) + || check_integrity(&script_data, &script_integrity) { let script_data_url = data_to_data_url( "application/javascript", diff --git a/src/main.rs b/src/main.rs index 4930d6a..16174eb 100644 --- a/src/main.rs +++ b/src/main.rs @@ -9,12 +9,12 @@ use std::process; use std::time::Duration; use monolith::html::{ - add_base_tag, add_favicon, has_base_tag, has_favicon, html_to_dom, metadata_tag, + add_favicon, create_metadata_tag, get_base_url, has_favicon, html_to_dom, set_base_url, stringify_document, walk_and_embed_assets, }; use monolith::opts::Options; use monolith::url::{ - data_to_data_url, data_url_to_data, is_data_url, is_file_url, is_http_url, resolve_url, + data_to_data_url, is_data_url, is_file_url, is_http_url, parse_data_url, resolve_url, }; use monolith::utils::retrieve_asset; @@ -52,7 +52,7 @@ fn main() { let options = Options::from_args(); let original_target: &str = &options.target; let target_url: &str; - let base_url; + let mut base_url: String; let mut dom; // Pre-process the input @@ -64,7 +64,9 @@ fn main() { // Determine exact target URL if target.clone().len() == 0 { - eprintln!("No target specified"); + if !options.silent { + eprintln!("No target specified"); + } process::exit(1); } else if is_http_url(target.clone()) || is_data_url(target.clone()) { target_url = target.as_str(); @@ -72,7 +74,9 @@ fn 
main() { target_url = target.as_str(); } else if path.exists() { if !path.is_file() { - eprintln!("Local target is not a file: {}", original_target); + if !options.silent { + eprintln!("Local target is not a file: {}", original_target); + } process::exit(1); } target.insert_str(0, if cfg!(windows) { "file:///" } else { "file://" }); @@ -111,11 +115,16 @@ fn main() { .build() .expect("Failed to initialize HTTP client"); + // At this stage we assume that the base URL is the same as the target URL + base_url = str!(target_url); + // Retrieve target document if is_file_url(target_url) || is_http_url(target_url) { match retrieve_asset(&mut cache, &client, target_url, target_url, &options, 0) { Ok((data, final_url, _media_type)) => { - base_url = final_url; + if options.base_url.clone().unwrap_or(str!()).is_empty() { + base_url = final_url + } dom = html_to_dom(&String::from_utf8_lossy(&data)); } Err(_) => { @@ -126,23 +135,40 @@ fn main() { } } } else if is_data_url(target_url) { - let (media_type, data): (String, Vec) = data_url_to_data(target_url); + let (media_type, data): (String, Vec) = parse_data_url(target_url); if !media_type.eq_ignore_ascii_case("text/html") { - eprintln!("Unsupported data URL media type"); + if !options.silent { + eprintln!("Unsupported data URL media type"); + } process::exit(1); } - base_url = str!(target_url); dom = html_to_dom(&String::from_utf8_lossy(&data)); } else { process::exit(1); } + // Use custom base URL if specified, read and use what's in the DOM otherwise + if !options.base_url.clone().unwrap_or(str!()).is_empty() { + if is_data_url(options.base_url.clone().unwrap()) { + if !options.silent { + eprintln!("Data URLs cannot be used as base URL"); + } + process::exit(1); + } else { + base_url = options.base_url.clone().unwrap(); + } + } else { + if let Some(existing_base_url) = get_base_url(&dom.document) { + base_url = resolve_url(target_url, existing_base_url).unwrap(); + } + } + // Embed remote assets walk_and_embed_assets(&mut 
cache, &client, &base_url, &dom.document, &options, 0); - // Take care of BASE tag - if is_http_url(base_url.clone()) && !has_base_tag(&dom.document) { - dom = add_base_tag(&dom.document, base_url.clone()); + // Update or add new BASE tag to reroute network requests and hash-links in the final document + if let Some(new_base_url) = options.base_url.clone() { + dom = set_base_url(&dom.document, new_base_url); } // Request and embed /favicon.ico (unless it's already linked in the document) @@ -172,7 +198,7 @@ fn main() { // Add metadata tag if !options.no_metadata { - let metadata_comment: String = metadata_tag(&base_url); + let metadata_comment: String = create_metadata_tag(&base_url); result.insert_str(0, &metadata_comment); if metadata_comment.len() > 0 { result.insert_str(metadata_comment.len(), "\n"); diff --git a/src/opts.rs b/src/opts.rs index 81ae9e7..bfe69d7 100644 --- a/src/opts.rs +++ b/src/opts.rs @@ -2,20 +2,21 @@ use clap::{App, Arg}; #[derive(Default)] pub struct Options { - pub target: String, + pub base_url: Option, pub no_css: bool, pub ignore_errors: bool, - pub no_fonts: bool, pub no_frames: bool, + pub no_fonts: bool, pub no_images: bool, + pub isolate: bool, pub no_js: bool, pub insecure: bool, - pub isolate: bool, + pub no_metadata: bool, pub output: String, pub silent: bool, pub timeout: u64, pub user_agent: String, - pub no_metadata: bool, + pub target: String, } const ASCII: &str = " \ @@ -37,14 +38,8 @@ impl Options { .version(crate_version!()) .author(crate_authors!("\n")) .about(format!("{}\n{}", ASCII, crate_description!()).as_str()) - .arg( - Arg::with_name("target") - .required(true) - .takes_value(true) - .index(1) - .help("URL or file path"), - ) // .args_from_usage("-a, --no-audio 'Removes audio sources'") + .args_from_usage("-b, --base-url=[http://localhost/] 'Use custom base URL'") .args_from_usage("-c, --no-css 'Removes CSS'") .args_from_usage("-e, --ignore-errors 'Ignore network errors'") .args_from_usage("-f, --no-frames 
'Removes frames and iframes'") @@ -53,12 +48,19 @@ impl Options { .args_from_usage("-I, --isolate 'Cuts off document from the Internet'") .args_from_usage("-j, --no-js 'Removes JavaScript'") .args_from_usage("-k, --insecure 'Allows invalid X.509 (TLS) certificates'") - .args_from_usage("-M, --no-metadata 'Excludes metadata information from the document'") + .args_from_usage("-M, --no-metadata 'Excludes timestamp and source information'") .args_from_usage("-o, --output=[document.html] 'Write output to '") .args_from_usage("-s, --silent 'Suppresses verbosity'") .args_from_usage("-t, --timeout=[60] 'Adjust network request timeout'") .args_from_usage("-u, --user-agent=[Firefox] 'Set custom User-Agent string'") // .args_from_usage("-v, --no-video 'Removes video sources'") + .arg( + Arg::with_name("target") + .required(true) + .takes_value(true) + .index(1) + .help("URL or file path"), + ) .get_matches(); let mut options: Options = Options::default(); @@ -67,6 +69,9 @@ impl Options { .value_of("target") .expect("please set target") .to_string(); + if let Some(base_url) = app.value_of("base-url") { + options.base_url = Some(str!(base_url)); + } options.no_css = app.is_present("no-css"); options.ignore_errors = app.is_present("ignore-errors"); options.no_frames = app.is_present("no-frames"); diff --git a/src/tests/cli/base_url.rs b/src/tests/cli/base_url.rs new file mode 100644 index 0000000..7ba88d9 --- /dev/null +++ b/src/tests/cli/base_url.rs @@ -0,0 +1,123 @@ +// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗ +// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝ +// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗ +// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║ +// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝ +// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ + +#[cfg(test)] +mod passing { + use assert_cmd::prelude::*; + use std::env; + use std::process::Command; + + #[test] + fn add_new_when_provided() -> Result<(), Box> { + let mut cmd = 
Command::cargo_bin(env!("CARGO_PKG_NAME"))?; + let out = cmd + .arg("-M") + .arg("-b") + .arg("http://localhost:8000/") + .arg("data:text/html,Hello%2C%20World!") + .output() + .unwrap(); + + // STDOUT should contain newly added base URL + assert_eq!( + std::str::from_utf8(&out.stdout).unwrap(), + "\ + \ + Hello, World!\n" + ); + + // STDERR should be empty + assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); + + // The exit code should be 0 + out.assert().code(0); + + Ok(()) + } + + #[test] + fn keep_existing_when_none_provided() -> Result<(), Box> { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; + let out = cmd + .arg("-M") + .arg("data:text/html,Hello%2C%20World!") + .output() + .unwrap(); + + // STDOUT should contain newly added base URL + assert_eq!( + std::str::from_utf8(&out.stdout).unwrap(), + "\ + \ + Hello, World!\n" + ); + + // STDERR should be empty + assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); + + // The exit code should be 0 + out.assert().code(0); + + Ok(()) + } + + #[test] + fn override_existing_when_provided() -> Result<(), Box> { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; + let out = cmd + .arg("-M") + .arg("-b") + .arg("http://localhost/") + .arg("data:text/html,Hello%2C%20World!") + .output() + .unwrap(); + + // STDOUT should contain newly added base URL + assert_eq!( + std::str::from_utf8(&out.stdout).unwrap(), + "\ + \ + Hello, World!\n" + ); + + // STDERR should be empty + assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); + + // The exit code should be 0 + out.assert().code(0); + + Ok(()) + } + + #[test] + fn remove_existing_when_empty_provided() -> Result<(), Box> { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; + let out = cmd + .arg("-M") + .arg("-b") + .arg("") + .arg("data:text/html,Hello%2C%20World!") + .output() + .unwrap(); + + // STDOUT should contain newly added base URL + assert_eq!( + std::str::from_utf8(&out.stdout).unwrap(), + "\ + \ + Hello, World!\n" 
+ ); + + // STDERR should be empty + assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); + + // The exit code should be 0 + out.assert().code(0); + + Ok(()) + } +} diff --git a/src/tests/cli.rs b/src/tests/cli/basic.rs similarity index 100% rename from src/tests/cli.rs rename to src/tests/cli/basic.rs diff --git a/src/tests/cli/mod.rs b/src/tests/cli/mod.rs new file mode 100644 index 0000000..1c20743 --- /dev/null +++ b/src/tests/cli/mod.rs @@ -0,0 +1,2 @@ +mod base_url; +mod basic; diff --git a/src/tests/html/has_proper_integrity.rs b/src/tests/html/check_integrity.rs similarity index 85% rename from src/tests/html/has_proper_integrity.rs rename to src/tests/html/check_integrity.rs index 639bf45..121e412 100644 --- a/src/tests/html/has_proper_integrity.rs +++ b/src/tests/html/check_integrity.rs @@ -11,7 +11,7 @@ mod passing { #[test] fn empty_input_sha256() { - assert!(html::has_proper_integrity( + assert!(html::check_integrity( "".as_bytes(), "sha256-47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=" )); @@ -19,7 +19,7 @@ mod passing { #[test] fn sha256() { - assert!(html::has_proper_integrity( + assert!(html::check_integrity( "abcdef0123456789".as_bytes(), "sha256-9EWAHgy4mSYsm54hmDaIDXPKLRsLnBX7lZyQ6xISNOM=" )); @@ -27,7 +27,7 @@ mod passing { #[test] fn sha384() { - assert!(html::has_proper_integrity( + assert!(html::check_integrity( "abcdef0123456789".as_bytes(), "sha384-gc9l7omltke8C33bedgh15E12M7RrAQa5t63Yb8APlpe7ZhiqV23+oqiulSJl3Kw" )); @@ -35,7 +35,7 @@ mod passing { #[test] fn sha512() { - assert!(html::has_proper_integrity( + assert!(html::check_integrity( "abcdef0123456789".as_bytes(), "sha512-zG5B88cYMqcdiMi9gz0XkOFYw2BpjeYdn5V6+oFrMgSNjRpqL7EF8JEwl17ztZbK3N7I/tTwp3kxQbN1RgFBww==" )); @@ -55,20 +55,17 @@ mod failing { #[test] fn empty_hash() { - assert!(!html::has_proper_integrity( - "abcdef0123456789".as_bytes(), - "" - )); + assert!(!html::check_integrity("abcdef0123456789".as_bytes(), "")); } #[test] fn empty_input_empty_hash() { - 
assert!(!html::has_proper_integrity("".as_bytes(), "")); + assert!(!html::check_integrity("".as_bytes(), "")); } #[test] fn sha256() { - assert!(!html::has_proper_integrity( + assert!(!html::check_integrity( "abcdef0123456789".as_bytes(), "sha256-badhash" )); @@ -76,7 +73,7 @@ mod failing { #[test] fn sha384() { - assert!(!html::has_proper_integrity( + assert!(!html::check_integrity( "abcdef0123456789".as_bytes(), "sha384-badhash" )); @@ -84,7 +81,7 @@ mod failing { #[test] fn sha512() { - assert!(!html::has_proper_integrity( + assert!(!html::check_integrity( "abcdef0123456789".as_bytes(), "sha512-badhash" )); diff --git a/src/tests/html/csp.rs b/src/tests/html/compose_csp.rs similarity index 86% rename from src/tests/html/csp.rs rename to src/tests/html/compose_csp.rs index e4adde9..a05b450 100644 --- a/src/tests/html/csp.rs +++ b/src/tests/html/compose_csp.rs @@ -14,7 +14,7 @@ mod passing { fn isolated() { let mut options = Options::default(); options.isolate = true; - let csp_content = html::csp(&options); + let csp_content = html::compose_csp(&options); assert_eq!(csp_content, "default-src 'unsafe-inline' data:;"); } @@ -23,7 +23,7 @@ mod passing { fn no_css() { let mut options = Options::default(); options.no_css = true; - let csp_content = html::csp(&options); + let csp_content = html::compose_csp(&options); assert_eq!(csp_content, "style-src 'none';"); } @@ -32,7 +32,7 @@ mod passing { fn no_fonts() { let mut options = Options::default(); options.no_fonts = true; - let csp_content = html::csp(&options); + let csp_content = html::compose_csp(&options); assert_eq!(csp_content, "font-src 'none';"); } @@ -41,7 +41,7 @@ mod passing { fn no_frames() { let mut options = Options::default(); options.no_frames = true; - let csp_content = html::csp(&options); + let csp_content = html::compose_csp(&options); assert_eq!(csp_content, "frame-src 'none'; child-src 'none';"); } @@ -50,7 +50,7 @@ mod passing { fn no_js() { let mut options = Options::default(); options.no_js = 
true; - let csp_content = html::csp(&options); + let csp_content = html::compose_csp(&options); assert_eq!(csp_content, "script-src 'none';"); } @@ -59,7 +59,7 @@ mod passing { fn no_images() { let mut options = Options::default(); options.no_images = true; - let csp_content = html::csp(&options); + let csp_content = html::compose_csp(&options); assert_eq!(csp_content, "img-src data:;"); } @@ -73,7 +73,7 @@ mod passing { options.no_frames = true; options.no_js = true; options.no_images = true; - let csp_content = html::csp(&options); + let csp_content = html::compose_csp(&options); assert_eq!(csp_content, "default-src 'unsafe-inline' data:; style-src 'none'; font-src 'none'; frame-src 'none'; child-src 'none'; script-src 'none'; img-src data:;"); } diff --git a/src/tests/html/metadata_tag.rs b/src/tests/html/create_metadata_tag.rs similarity index 92% rename from src/tests/html/metadata_tag.rs rename to src/tests/html/create_metadata_tag.rs index cef13bf..ea59731 100644 --- a/src/tests/html/metadata_tag.rs +++ b/src/tests/html/create_metadata_tag.rs @@ -15,7 +15,7 @@ mod passing { fn http_url() { let url = "http://192.168.1.1/"; let timestamp = Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true); - let metadata_comment: String = html::metadata_tag(url); + let metadata_comment: String = html::create_metadata_tag(url); assert_eq!( metadata_comment, @@ -33,7 +33,7 @@ mod passing { fn file_url() { let url = "file:///home/monolith/index.html"; let timestamp = Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true); - let metadata_comment: String = html::metadata_tag(url); + let metadata_comment: String = html::create_metadata_tag(url); assert_eq!( metadata_comment, @@ -50,7 +50,7 @@ mod passing { fn data_url() { let url = "data:text/html,Hello%2C%20World!"; let timestamp = Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true); - let metadata_comment: String = html::metadata_tag(url); + let metadata_comment: String = html::create_metadata_tag(url); assert_eq!( 
metadata_comment, @@ -77,6 +77,6 @@ mod failing { #[test] fn empty_string() { - assert_eq!(html::metadata_tag(""), ""); + assert_eq!(html::create_metadata_tag(""), ""); } } diff --git a/src/tests/html/get_base_url.rs b/src/tests/html/get_base_url.rs new file mode 100644 index 0000000..a1b959c --- /dev/null +++ b/src/tests/html/get_base_url.rs @@ -0,0 +1,104 @@ +// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗ +// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝ +// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗ +// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║ +// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝ +// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ + +#[cfg(test)] +mod passing { + use crate::html; + + #[test] + fn present() { + let html = " + + + + + + +"; + let dom = html::html_to_dom(&html); + + assert_eq!( + html::get_base_url(&dom.document), + Some(str!("https://musicbrainz.org")) + ); + } + + #[test] + fn multiple_tags() { + let html = " + + + + + + + +"; + let dom = html::html_to_dom(&html); + + assert_eq!( + html::get_base_url(&dom.document), + Some(str!("https://www.discogs.com/")) + ); + } +} + +// ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗ +// ██╔════╝██╔══██╗██║██║ ██║████╗ ██║██╔════╝ +// █████╗ ███████║██║██║ ██║██╔██╗ ██║██║ ███╗ +// ██╔══╝ ██╔══██║██║██║ ██║██║╚██╗██║██║ ██║ +// ██║ ██║ ██║██║███████╗██║██║ ╚████║╚██████╔╝ +// ╚═╝ ╚═╝ ╚═╝╚═╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ + +#[cfg(test)] +mod failing { + use crate::html; + + #[test] + fn absent() { + let html = " + + + + + +"; + let dom = html::html_to_dom(&html); + + assert_eq!(html::get_base_url(&dom.document), None); + } + + #[test] + fn no_href() { + let html = " + + + + + + +"; + let dom = html::html_to_dom(&html); + + assert_eq!(html::get_base_url(&dom.document), None); + } + + #[test] + fn empty_href() { + let html = " + + + + + + +"; + let dom = html::html_to_dom(&html); + + assert_eq!(html::get_base_url(&dom.document), Some(str!())); + } +} diff --git 
a/src/tests/html/get_node_attr.rs b/src/tests/html/get_node_attr.rs new file mode 100644 index 0000000..a8b7448 --- /dev/null +++ b/src/tests/html/get_node_attr.rs @@ -0,0 +1,54 @@ +// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗ +// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝ +// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗ +// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║ +// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝ +// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ + +#[cfg(test)] +mod passing { + use html5ever::rcdom::{Handle, NodeData}; + + use crate::html; + + #[test] + fn div_two_style_attributes() { + let html = "
"; + let dom = html::html_to_dom(&html); + let mut count = 0; + + fn test_walk(node: &Handle, i: &mut i8) { + *i += 1; + + match &node.data { + NodeData::Document => { + // Dig deeper + for child in node.children.borrow().iter() { + test_walk(child, &mut *i); + } + } + NodeData::Element { ref name, .. } => { + let node_name = name.local.as_ref().to_string(); + + if node_name == "body" { + assert_eq!(html::get_node_attr(node, "class"), None); + } else if node_name == "div" { + assert_eq!( + html::get_node_attr(node, "style"), + Some(str!("color: blue;")) + ); + } + + for child in node.children.borrow().iter() { + test_walk(child, &mut *i); + } + } + _ => (), + }; + } + + test_walk(&dom.document, &mut count); + + assert_eq!(count, 6); + } +} diff --git a/src/tests/html/get_node_name.rs b/src/tests/html/get_node_name.rs index 47f339f..79da75e 100644 --- a/src/tests/html/get_node_name.rs +++ b/src/tests/html/get_node_name.rs @@ -12,7 +12,7 @@ mod passing { use crate::html; #[test] - fn get_node_name() { + fn parent_node_names() { let html = "

"; let dom = html::html_to_dom(&html); let mut count = 0; diff --git a/src/tests/html/mod.rs b/src/tests/html/mod.rs index a912338..b9576da 100644 --- a/src/tests/html/mod.rs +++ b/src/tests/html/mod.rs @@ -1,10 +1,13 @@ mod add_favicon; -mod csp; +mod check_integrity; +mod compose_csp; +mod create_metadata_tag; mod embed_srcset; +mod get_base_url; +mod get_node_attr; mod get_node_name; mod has_favicon; -mod has_proper_integrity; mod is_icon; -mod metadata_tag; +mod set_node_attr; mod stringify_document; mod walk_and_embed_assets; diff --git a/src/tests/html/set_node_attr.rs b/src/tests/html/set_node_attr.rs new file mode 100644 index 0000000..73e7766 --- /dev/null +++ b/src/tests/html/set_node_attr.rs @@ -0,0 +1,66 @@ +// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗ +// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝ +// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗ +// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║ +// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝ +// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ + +#[cfg(test)] +mod passing { + use html5ever::rcdom::{Handle, NodeData}; + + use crate::html; + + #[test] + fn html_lang_and_body_style() { + let html = ""; + let dom = html::html_to_dom(&html); + let mut count = 0; + + fn test_walk(node: &Handle, i: &mut i8) { + *i += 1; + + match &node.data { + NodeData::Document => { + // Dig deeper + for child in node.children.borrow().iter() { + test_walk(child, &mut *i); + } + } + NodeData::Element { ref name, .. 
} => { + let node_name = name.local.as_ref().to_string(); + + if node_name == "html" { + assert_eq!(html::get_node_attr(node, "lang"), Some(str!("en"))); + + html::set_node_attr(node, "lang", Some(str!("de"))); + assert_eq!(html::get_node_attr(node, "lang"), Some(str!("de"))); + + html::set_node_attr(node, "lang", None); + assert_eq!(html::get_node_attr(node, "lang"), None); + + html::set_node_attr(node, "lang", Some(str!(""))); + assert_eq!(html::get_node_attr(node, "lang"), Some(str!(""))); + } else if node_name == "body" { + assert_eq!(html::get_node_attr(node, "style"), None); + + html::set_node_attr(node, "style", Some(str!("display: none;"))); + assert_eq!( + html::get_node_attr(node, "style"), + Some(str!("display: none;")) + ); + } + + for child in node.children.borrow().iter() { + test_walk(child, &mut *i); + } + } + _ => (), + }; + } + + test_walk(&dom.document, &mut count); + + assert_eq!(count, 5); + } +} diff --git a/src/tests/url/mod.rs b/src/tests/url/mod.rs index 226c388..fe06cda 100644 --- a/src/tests/url/mod.rs +++ b/src/tests/url/mod.rs @@ -1,12 +1,12 @@ mod clean_url; mod data_to_data_url; -mod data_url_to_data; mod decode_url; mod file_url_to_fs_path; mod get_url_fragment; mod is_data_url; mod is_file_url; mod is_http_url; +mod parse_data_url; mod resolve_url; mod url_has_protocol; mod url_with_fragment; diff --git a/src/tests/url/data_url_to_data.rs b/src/tests/url/parse_data_url.rs similarity index 83% rename from src/tests/url/data_url_to_data.rs rename to src/tests/url/parse_data_url.rs index 2ad5437..589fc5e 100644 --- a/src/tests/url/data_url_to_data.rs +++ b/src/tests/url/parse_data_url.rs @@ -11,7 +11,7 @@ mod passing { #[test] fn parse_text_html_base64() { - let (media_type, data) = url::data_url_to_data("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg=="); + let (media_type, data) = 
url::parse_data_url("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg=="); assert_eq!(media_type, "text/html"); assert_eq!( @@ -22,7 +22,7 @@ mod passing { #[test] fn parse_text_html_utf8() { - let (media_type, data) = url::data_url_to_data( + let (media_type, data) = url::parse_data_url( "data:text/html;utf8,Work expands so as to fill the time available for its completion", ); @@ -35,7 +35,7 @@ mod passing { #[test] fn parse_text_html_plaintext() { - let (media_type, data) = url::data_url_to_data( + let (media_type, data) = url::parse_data_url( "data:text/html,Work expands so as to fill the time available for its completion", ); @@ -48,7 +48,7 @@ mod passing { #[test] fn parse_text_html_charset_utf_8_between_two_whitespaces() { - let (media_type, data) = url::data_url_to_data(" data:text/html;charset=utf-8,Work expands so as to fill the time available for its completion "); + let (media_type, data) = url::parse_data_url(" data:text/html;charset=utf-8,Work expands so as to fill the time available for its completion "); assert_eq!(media_type, "text/html"); assert_eq!( @@ -60,7 +60,7 @@ mod passing { #[test] fn parse_text_css_url_encoded() { let (media_type, data) = - url::data_url_to_data("data:text/css,div{background-color:%23000}"); + url::parse_data_url("data:text/css,div{background-color:%23000}"); assert_eq!(media_type, "text/css"); assert_eq!(String::from_utf8_lossy(&data), "div{background-color:#000}"); @@ -68,7 +68,7 @@ mod passing { #[test] fn parse_no_media_type_base64() { - let (media_type, data) = url::data_url_to_data("data:;base64,dGVzdA=="); + let (media_type, data) = url::parse_data_url("data:;base64,dGVzdA=="); assert_eq!(media_type, ""); assert_eq!(String::from_utf8_lossy(&data), "test"); @@ -76,7 +76,7 @@ mod passing { #[test] fn parse_no_media_type_no_encoding() { - let (media_type, data) = url::data_url_to_data("data:;,test%20test"); + let (media_type, data) = 
url::parse_data_url("data:;,test%20test"); assert_eq!(media_type, ""); assert_eq!(String::from_utf8_lossy(&data), "test test"); @@ -96,7 +96,7 @@ mod failing { #[test] fn just_word_data() { - let (media_type, data) = url::data_url_to_data("data"); + let (media_type, data) = url::parse_data_url("data"); assert_eq!(media_type, ""); assert_eq!(String::from_utf8_lossy(&data), ""); diff --git a/src/url.rs b/src/url.rs index e493ce1..5792fb6 100644 --- a/src/url.rs +++ b/src/url.rs @@ -33,45 +33,6 @@ pub fn data_to_data_url(media_type: &str, data: &[u8], url: &str) -> String { format!("data:{};base64,{}", media_type, base64::encode(data)) } -pub fn data_url_to_data>(url: T) -> (String, Vec) { - let parsed_url: Url = Url::parse(url.as_ref()).unwrap_or(Url::parse("data:,").unwrap()); - let path: String = parsed_url.path().to_string(); - let comma_loc: usize = path.find(',').unwrap_or(path.len()); - - let meta_data: String = path.chars().take(comma_loc).collect(); - let raw_data: String = path.chars().skip(comma_loc + 1).collect(); - - let text: String = decode_url(raw_data); - - let meta_data_items: Vec<&str> = meta_data.split(';').collect(); - let mut media_type: String = str!(); - let mut encoding: &str = ""; - - let mut i: i8 = 0; - for item in &meta_data_items { - if i == 0 { - media_type = str!(item); - } else { - if item.eq_ignore_ascii_case("base64") - || item.eq_ignore_ascii_case("utf8") - || item.eq_ignore_ascii_case("charset=UTF-8") - { - encoding = item; - } - } - - i = i + 1; - } - - let data: Vec = if encoding.eq_ignore_ascii_case("base64") { - base64::decode(&text).unwrap_or(vec![]) - } else { - text.as_bytes().to_vec() - }; - - (media_type, data) -} - pub fn decode_url(input: String) -> String { let input: String = input.replace("+", "%2B"); @@ -138,6 +99,45 @@ pub fn is_http_url>(url: T) -> bool { .unwrap_or(false) } +pub fn parse_data_url>(url: T) -> (String, Vec) { + let parsed_url: Url = Url::parse(url.as_ref()).unwrap_or(Url::parse("data:,").unwrap()); 
+ let path: String = parsed_url.path().to_string(); + let comma_loc: usize = path.find(',').unwrap_or(path.len()); + + let meta_data: String = path.chars().take(comma_loc).collect(); + let raw_data: String = path.chars().skip(comma_loc + 1).collect(); + + let text: String = decode_url(raw_data); + + let meta_data_items: Vec<&str> = meta_data.split(';').collect(); + let mut media_type: String = str!(); + let mut encoding: &str = ""; + + let mut i: i8 = 0; + for item in &meta_data_items { + if i == 0 { + media_type = str!(item); + } else { + if item.eq_ignore_ascii_case("base64") + || item.eq_ignore_ascii_case("utf8") + || item.eq_ignore_ascii_case("charset=UTF-8") + { + encoding = item; + } + } + + i = i + 1; + } + + let data: Vec = if encoding.eq_ignore_ascii_case("base64") { + base64::decode(&text).unwrap_or(vec![]) + } else { + text.as_bytes().to_vec() + }; + + (media_type, data) +} + pub fn resolve_url, U: AsRef>(from: T, to: U) -> Result { let result = if is_http_url(to.as_ref()) { to.as_ref().to_string() diff --git a/src/utils.rs b/src/utils.rs index 014ee6e..5fd76be 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -5,7 +5,7 @@ use std::fs; use std::path::Path; use crate::opts::Options; -use crate::url::{clean_url, data_url_to_data, file_url_to_fs_path, is_data_url, is_file_url}; +use crate::url::{clean_url, file_url_to_fs_path, is_data_url, is_file_url, parse_data_url}; const INDENT: &str = " "; @@ -83,7 +83,7 @@ pub fn retrieve_asset( } if is_data_url(&url) { - let (media_type, data) = data_url_to_data(url); + let (media_type, data) = parse_data_url(url); Ok((data, url.to_string(), media_type)) } else if is_file_url(&url) { // Check if parent_url is also file:///