From 15d98a7269db57e2852d20cb87d4381a1393c891 Mon Sep 17 00:00:00 2001 From: Sunshine Date: Sun, 22 Nov 2020 19:12:26 -1000 Subject: [PATCH 1/3] don't modify base url by default, add option for setting it --- README.md | 2 + docs/arch/0008-base-tag.md | 27 + src/html.rs | 543 ++++++++++-------- src/main.rs | 52 +- src/opts.rs | 29 +- src/tests/cli/base_url.rs | 123 ++++ src/tests/{cli.rs => cli/basic.rs} | 0 src/tests/cli/mod.rs | 2 + ...proper_integrity.rs => check_integrity.rs} | 21 +- src/tests/html/{csp.rs => compose_csp.rs} | 14 +- ...metadata_tag.rs => create_metadata_tag.rs} | 8 +- src/tests/html/get_base_url.rs | 104 ++++ src/tests/html/get_node_attr.rs | 54 ++ src/tests/html/get_node_name.rs | 2 +- src/tests/html/mod.rs | 9 +- src/tests/html/set_node_attr.rs | 66 +++ src/tests/url/mod.rs | 2 +- ...{data_url_to_data.rs => parse_data_url.rs} | 16 +- src/url.rs | 78 +-- src/utils.rs | 4 +- 20 files changed, 802 insertions(+), 354 deletions(-) create mode 100644 docs/arch/0008-base-tag.md create mode 100644 src/tests/cli/base_url.rs rename src/tests/{cli.rs => cli/basic.rs} (100%) create mode 100644 src/tests/cli/mod.rs rename src/tests/html/{has_proper_integrity.rs => check_integrity.rs} (85%) rename src/tests/html/{csp.rs => compose_csp.rs} (86%) rename src/tests/html/{metadata_tag.rs => create_metadata_tag.rs} (92%) create mode 100644 src/tests/html/get_base_url.rs create mode 100644 src/tests/html/get_node_attr.rs create mode 100644 src/tests/html/set_node_attr.rs rename src/tests/url/{data_url_to_data.rs => parse_data_url.rs} (83%) diff --git a/README.md b/README.md index 175f15a..588dbdb 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,7 @@ The guide can be found [here](docs/containers.md) --------------------------------------------------- ## Options + - `-b`: Use custom base URL - `-c`: Exclude CSS - `-e`: Ignore network errors - `-f`: Omit frames @@ -62,6 +63,7 @@ The guide can be found [here](docs/containers.md) - `-I`: Isolate the document - 
`-j`: Exclude JavaScript - `-k`: Accept invalid X.509 (TLS) certificates + - `-M`: Don’t add timestamp and source information - `-o`: Write output to file - `-s`: Be quiet - `-t`: Adjust network request timeout diff --git a/docs/arch/0008-base-tag.md b/docs/arch/0008-base-tag.md new file mode 100644 index 0000000..ce21919 --- /dev/null +++ b/docs/arch/0008-base-tag.md @@ -0,0 +1,27 @@ +# 8. Base Tag + +Date: 2020-11-22 + +## Status + +Accepted + +## Context + +HTML documents may contain `base` tag within `head`, which influences URL resolution prefix for anchor and relative links as well as dynamically loaded resources. Sometimes to make certain saved pages function closer to how they originally operated, the `base` tag specifying the source page's URL may need to be added to the document. + +## Decision + +Adding the `base` tag should be optional. Saved documents should not contain the `base` tag unless it was requested by the user, or unless the document originally had the `base` tag in it. Only documents downloaded from remote resources should be able to obtain a new `base` tag, existing `base` tags within documents saved from data URLs and local resources should be kept intact. +The existing `href` attribute's value of the original `base` tag should be used for resolving document's relative links instead of document's own URL. +There can be only one such tag. If multiple `base` tags are provided, only the first encountered tag will end up being used. + +## Consequences + +In case the remote document had the `base` tag in it: + - By default: the `href` attribute should be resolved to a full URL if it's relative, kept empty in case it was empty or non-existent, all other attributes of that tag should be kept intact. + - If `base` tag was requested to be added: the existing `base` tag's `href` attribute should be set to page's full URL, all other attributes should be kept intact. 
+ +In case the remote document didn't have the `base` tag in it: + - By default: no `base` tag is added to the document, it gets saved to disk without having one. + - If `base` tag was requested to be added: the added `base` tag should contain only one attribute `href`, equal to the remote URL of that HTML document. diff --git a/src/html.rs b/src/html.rs index d16cba4..7fadd65 100644 --- a/src/html.rs +++ b/src/html.rs @@ -6,7 +6,7 @@ use html5ever::rcdom::{Handle, NodeData, RcDom}; use html5ever::serialize::{serialize, SerializeOpts}; use html5ever::tendril::{format_tendril, Tendril, TendrilSink}; use html5ever::tree_builder::{Attribute, TreeSink}; -use html5ever::{local_name, namespace_url, ns}; +use html5ever::{local_name, namespace_url, ns, LocalName}; use reqwest::blocking::Client; use reqwest::Url; use sha2::{Digest, Sha256, Sha384, Sha512}; @@ -29,7 +29,7 @@ struct SrcSetItem<'a> { const ICON_VALUES: &[&str] = &["icon", "shortcut icon"]; -pub fn add_base_tag(document: &Handle, url: String) -> RcDom { +pub fn add_favicon(document: &Handle, favicon_data_url: String) -> RcDom { let mut buf: Vec = Vec::new(); serialize(&mut buf, document, SerializeOpts::default()) .expect("unable to serialize DOM into buffer"); @@ -37,55 +37,49 @@ pub fn add_base_tag(document: &Handle, url: String) -> RcDom { let mut dom = html_to_dom(&result); let doc = dom.get_document(); - let html = get_child_node_by_name(&doc, "html"); - let head = get_child_node_by_name(&html, "head"); - let favicon_node = dom.create_element( - QualName::new(None, ns!(), local_name!("base")), - vec![Attribute { - name: QualName::new(None, ns!(), local_name!("href")), - value: format_tendril!("{}", url), - }], - Default::default(), - ); - - // Insert BASE tag into HEAD - head.children.borrow_mut().push(favicon_node.clone()); + if let Some(html) = get_child_node_by_name(&doc, "html") { + if let Some(head) = get_child_node_by_name(&html, "head") { + let favicon_node = dom.create_element( + QualName::new(None, 
ns!(), local_name!("link")), + vec![ + Attribute { + name: QualName::new(None, ns!(), local_name!("rel")), + value: format_tendril!("icon"), + }, + Attribute { + name: QualName::new(None, ns!(), local_name!("href")), + value: format_tendril!("{}", favicon_data_url), + }, + ], + Default::default(), + ); + // Insert favicon LINK tag into HEAD + head.children.borrow_mut().push(favicon_node.clone()); + } + } dom } -pub fn add_favicon(document: &Handle, favicon_data_url: String) -> RcDom { - let mut buf: Vec = Vec::new(); - serialize(&mut buf, document, SerializeOpts::default()) - .expect("unable to serialize DOM into buffer"); - let result = String::from_utf8(buf).unwrap(); - - let mut dom = html_to_dom(&result); - let doc = dom.get_document(); - let html = get_child_node_by_name(&doc, "html"); - let head = get_child_node_by_name(&html, "head"); - let favicon_node = dom.create_element( - QualName::new(None, ns!(), local_name!("link")), - vec![ - Attribute { - name: QualName::new(None, ns!(), local_name!("rel")), - value: format_tendril!("icon"), - }, - Attribute { - name: QualName::new(None, ns!(), local_name!("href")), - value: format_tendril!("{}", favicon_data_url), - }, - ], - Default::default(), - ); - - // Insert favicon LINK tag into HEAD - head.children.borrow_mut().push(favicon_node.clone()); - - dom +pub fn check_integrity(data: &[u8], integrity: &str) -> bool { + if integrity.starts_with("sha256-") { + let mut hasher = Sha256::new(); + hasher.update(data); + base64::encode(hasher.finalize()) == integrity[7..] + } else if integrity.starts_with("sha384-") { + let mut hasher = Sha384::new(); + hasher.update(data); + base64::encode(hasher.finalize()) == integrity[7..] + } else if integrity.starts_with("sha512-") { + let mut hasher = Sha512::new(); + hasher.update(data); + base64::encode(hasher.finalize()) == integrity[7..] 
+ } else { + false + } } -pub fn csp(options: &Options) -> String { +pub fn compose_csp(options: &Options) -> String { let mut string_list = vec![]; if options.isolate { @@ -117,6 +111,42 @@ pub fn csp(options: &Options) -> String { string_list.join(" ") } +pub fn create_metadata_tag(url: &str) -> String { + let timestamp = Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true); + + // Safe to unwrap (we just put this through an HTTP request) + match Url::parse(url) { + Ok(mut clean_url) => { + clean_url.set_fragment(None); + + // Prevent credentials from getting into metadata + if is_http_url(url) { + // Only HTTP(S) URLs may feature credentials + clean_url.set_username("").unwrap(); + clean_url.set_password(None).unwrap(); + } + + if is_http_url(url) { + format!( + "", + &clean_url, + timestamp, + env!("CARGO_PKG_NAME"), + env!("CARGO_PKG_VERSION"), + ) + } else { + format!( + "", + timestamp, + env!("CARGO_PKG_NAME"), + env!("CARGO_PKG_VERSION"), + ) + } + } + Err(_) => str!(), + } +} + pub fn embed_srcset( cache: &mut HashMap>, client: &Client, @@ -188,15 +218,54 @@ pub fn embed_srcset( result } -fn get_child_node_by_name(handle: &Handle, node_name: &str) -> Handle { - let children = handle.children.borrow(); +pub fn find_base_node(node: &Handle) -> Option { + match node.data { + NodeData::Document => { + // Dig deeper + for child in node.children.borrow().iter() { + if let Some(base_node) = find_base_node(child) { + return Some(base_node); + } + } + } + NodeData::Element { ref name, .. 
} => { + match name.local.as_ref() { + "head" => { + return get_child_node_by_name(node, "base"); + } + _ => {} + } + + // Dig deeper + for child in node.children.borrow().iter() { + if let Some(base_node) = find_base_node(child) { + return Some(base_node); + } + } + } + _ => {} + } + + None +} + +pub fn get_base_url(handle: &Handle) -> Option { + if let Some(base_node) = find_base_node(handle) { + get_node_attr(&base_node, "href") + } else { + None + } +} + +pub fn get_child_node_by_name(parent: &Handle, node_name: &str) -> Option { + let children = parent.children.borrow(); let matching_children = children.iter().find(|child| match child.data { NodeData::Element { ref name, .. } => &*name.local == node_name, _ => false, }); match matching_children { - Some(node) => node.clone(), - _ => handle.clone(), + Some(node) => Some(node.clone()), + _ => None, } } @@ -207,77 +276,23 @@ pub fn get_node_name(node: &Handle) -> Option<&'_ str> { } } -pub fn get_parent_node(node: &Handle) -> Handle { - let parent = node.parent.take().clone(); - parent.and_then(|node| node.upgrade()).unwrap() -} - -pub fn has_proper_integrity(data: &[u8], integrity: &str) -> bool { - if integrity.starts_with("sha256-") { - let mut hasher = Sha256::new(); - hasher.update(data); - base64::encode(hasher.finalize()) == integrity[7..] - } else if integrity.starts_with("sha384-") { - let mut hasher = Sha384::new(); - hasher.update(data); - base64::encode(hasher.finalize()) == integrity[7..] - } else if integrity.starts_with("sha512-") { - let mut hasher = Sha512::new(); - hasher.update(data); - base64::encode(hasher.finalize()) == integrity[7..] - } else { - false - } -} - -pub fn has_base_tag(handle: &Handle) -> bool { - let mut found_base_tag: bool = false; - - match handle.data { - NodeData::Document => { - // Dig deeper - for child in handle.children.borrow().iter() { - if has_base_tag(child) { - found_base_tag = true; - break; - } - } - } - NodeData::Element { - ref name, - ref attrs, - .. 
- } => { - match name.local.as_ref() { - "base" => { - let attrs_mut = &mut attrs.borrow_mut(); - - for attr in attrs_mut.iter_mut() { - if &attr.name.local == "href" { - if !attr.value.trim().is_empty() { - found_base_tag = true; - break; - } - } - } - } - _ => {} - } - - if !found_base_tag { - // Dig deeper - for child in handle.children.borrow().iter() { - if has_base_tag(child) { - found_base_tag = true; - break; - } +pub fn get_node_attr(node: &Handle, attr_name: &str) -> Option { + match &node.data { + NodeData::Element { ref attrs, .. } => { + for attr in attrs.borrow().iter() { + if &*attr.name.local == attr_name { + return Some(str!(&*attr.value)); } } + None } - _ => {} + _ => None, } +} - found_base_tag +pub fn get_parent_node(child: &Handle) -> Handle { + let parent = child.parent.take().clone(); + parent.and_then(|node| node.upgrade()).unwrap() } pub fn has_favicon(handle: &Handle) -> bool { @@ -293,21 +308,12 @@ pub fn has_favicon(handle: &Handle) -> bool { } } } - NodeData::Element { - ref name, - ref attrs, - .. - } => { + NodeData::Element { ref name, .. 
} => { match name.local.as_ref() { "link" => { - let attrs_mut = &mut attrs.borrow_mut(); - - for attr in attrs_mut.iter_mut() { - if &attr.name.local == "rel" { - if is_icon(attr.value.trim()) { - found_favicon = true; - break; - } + if let Some(attr_value) = get_node_attr(handle, "rel") { + if is_icon(attr_value.trim()) { + found_favicon = true; } } } @@ -341,46 +347,82 @@ pub fn is_icon(attr_value: &str) -> bool { ICON_VALUES.contains(&attr_value.to_lowercase().as_str()) } -pub fn metadata_tag(url: &str) -> String { - let timestamp = Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true); +pub fn set_base_url(document: &Handle, desired_base_href: String) -> RcDom { + let mut buf: Vec = Vec::new(); + serialize(&mut buf, document, SerializeOpts::default()) + .expect("unable to serialize DOM into buffer"); + let result = String::from_utf8(buf).unwrap(); - // Safe to unwrap (we just put this through an HTTP request) - match Url::parse(url) { - Ok(mut clean_url) => { - clean_url.set_fragment(None); + let mut dom = html_to_dom(&result); + let doc = dom.get_document(); + if let Some(html_node) = get_child_node_by_name(&doc, "html") { + if let Some(head_node) = get_child_node_by_name(&html_node, "head") { + // Check if BASE node already exists in the DOM tree + if let Some(base_node) = get_child_node_by_name(&head_node, "base") { + set_node_attr(&base_node, "href", Some(desired_base_href)); + } else { + let base_node = dom.create_element( + QualName::new(None, ns!(), local_name!("base")), + vec![Attribute { + name: QualName::new(None, ns!(), local_name!("href")), + value: format_tendril!("{}", desired_base_href), + }], + Default::default(), + ); + + // Insert newly created BASE node into HEAD + head_node.children.borrow_mut().push(base_node.clone()); + } + } + } - // Prevent credentials from getting into metadata - if is_http_url(url) { - // Only HTTP(S) URLs may feature credentials - clean_url.set_username("").unwrap(); - clean_url.set_password(None).unwrap(); + dom +} 
+ +pub fn set_node_attr(node: &Handle, attr_name: &str, attr_value: Option) { + match &node.data { + NodeData::Element { ref attrs, .. } => { + let attrs_mut = &mut attrs.borrow_mut(); + let mut i = 0; + let mut found_existing_attr: bool = false; + + while i < attrs_mut.len() { + if &attrs_mut[i].name.local == attr_name { + found_existing_attr = true; + + if let Some(attr_value) = attr_value.clone() { + &attrs_mut[i].value.clear(); + &attrs_mut[i].value.push_slice(&attr_value.as_str()); + } else { + // Remove attr completely if attr_value is not defined + attrs_mut.remove(i); + continue; + } + } + + i += 1; } - if is_http_url(url) { - format!( - "", - &clean_url, - timestamp, - env!("CARGO_PKG_NAME"), - env!("CARGO_PKG_VERSION"), - ) - } else { - format!( - "", - timestamp, - env!("CARGO_PKG_NAME"), - env!("CARGO_PKG_VERSION"), - ) + if !found_existing_attr { + // Add new attribute (since originally the target node didn't have it) + if let Some(attr_value) = attr_value.clone() { + let name = LocalName::from(attr_name); + + attrs_mut.push(Attribute { + name: QualName::new(None, ns!(), name), + value: format_tendril!("{}", attr_value), + }); + } } } - Err(_) => str!(), - } + _ => {} + }; } pub fn stringify_document(handle: &Handle, options: &Options) -> String { let mut buf: Vec = Vec::new(); serialize(&mut buf, handle, SerializeOpts::default()) - .expect("unable to serialize DOM into buffer"); + .expect("Unable to serialize DOM into buffer"); let mut result = String::from_utf8(buf).unwrap(); @@ -398,33 +440,33 @@ pub fn stringify_document(handle: &Handle, options: &Options) -> String { let mut buf: Vec = Vec::new(); let mut dom = html_to_dom(&result); let doc = dom.get_document(); - let html = get_child_node_by_name(&doc, "html"); - let head = get_child_node_by_name(&html, "head"); - let csp_content: String = csp(options); - - let meta = dom.create_element( - QualName::new(None, ns!(), local_name!("meta")), - vec![ - Attribute { - name: QualName::new(None, ns!(), 
local_name!("http-equiv")), - value: format_tendril!("Content-Security-Policy"), - }, - Attribute { - name: QualName::new(None, ns!(), local_name!("content")), - value: format_tendril!("{}", csp_content), - }, - ], - Default::default(), - ); - // Note: the CSP meta-tag has to be prepended, never appended, - // since there already may be one defined in the document, - // and browsers don't allow re-defining them (for obvious reasons) - head.children.borrow_mut().reverse(); - head.children.borrow_mut().push(meta.clone()); - head.children.borrow_mut().reverse(); + if let Some(html) = get_child_node_by_name(&doc, "html") { + if let Some(head) = get_child_node_by_name(&html, "head") { + let meta = dom.create_element( + QualName::new(None, ns!(), local_name!("meta")), + vec![ + Attribute { + name: QualName::new(None, ns!(), local_name!("http-equiv")), + value: format_tendril!("Content-Security-Policy"), + }, + Attribute { + name: QualName::new(None, ns!(), local_name!("content")), + value: format_tendril!("{}", compose_csp(options)), + }, + ], + Default::default(), + ); + // Note: the CSP meta-tag has to be prepended, never appended, + // since there already may be one defined in the original document, + // and browsers don't allow re-defining them (for obvious reasons) + head.children.borrow_mut().reverse(); + head.children.borrow_mut().push(meta.clone()); + head.children.borrow_mut().reverse(); + } + } serialize(&mut buf, &doc, SerializeOpts::default()) - .expect("unable to serialize DOM into buffer"); + .expect("Unable to serialize DOM into buffer"); result = String::from_utf8(buf).unwrap(); } @@ -549,7 +591,7 @@ pub fn walk_and_embed_assets( )) => { // Check integrity if integrity.is_empty() - || has_proper_integrity(&link_href_data, &integrity) + || check_integrity(&link_href_data, &integrity) { let link_href_data_url = data_to_data_url( &link_href_media_type, @@ -622,7 +664,7 @@ pub fn walk_and_embed_assets( )) => { // Check integrity if integrity.is_empty() - || 
has_proper_integrity(&link_href_data, &integrity) + || check_integrity(&link_href_data, &integrity) { let css: String = embed_css( cache, @@ -690,7 +732,7 @@ pub fn walk_and_embed_assets( } "base" => { if is_http_url(url) { - // Ensure BASE href is a full URL, not a relative one + // Ensure the BASE node doesn't have a relative URL for attr in attrs_mut.iter_mut() { let attr_name: &str = &attr.name.local; if attr_name.eq_ignore_ascii_case("href") { @@ -858,74 +900,54 @@ pub fn walk_and_embed_assets( } } "input" => { - // Determine input type - let mut is_image_input: bool = false; - for attr in attrs_mut.iter_mut() { - let attr_name: &str = &attr.name.local; - if attr_name.eq_ignore_ascii_case("type") { - is_image_input = attr.value.to_string().eq_ignore_ascii_case("image"); - } - } - - if is_image_input { - let mut input_image_src: String = str!(); - let mut i = 0; - while i < attrs_mut.len() { - let attr_name: &str = &attrs_mut[i].name.local; - if attr_name.eq_ignore_ascii_case("src") { - input_image_src = str!(attrs_mut.remove(i).value.trim()); - } else { - i += 1; - } - } - - if options.no_images || input_image_src.is_empty() { - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("src")), - value: Tendril::from_slice(if input_image_src.is_empty() { - "" + if let Some(attr_value) = get_node_attr(node, "type") { + if attr_value.to_string().eq_ignore_ascii_case("image") { + let mut input_image_src: String = str!(); + let mut i = 0; + while i < attrs_mut.len() { + let attr_name: &str = &attrs_mut[i].name.local; + if attr_name.eq_ignore_ascii_case("src") { + input_image_src = str!(attrs_mut.remove(i).value.trim()); } else { - empty_image!() - }), - }); - } else { - let input_image_full_url = - resolve_url(&url, input_image_src).unwrap_or_default(); - let input_image_url_fragment = - get_url_fragment(input_image_full_url.clone()); - match retrieve_asset( - cache, - client, - &url, - &input_image_full_url, - options, - depth + 1, - ) { - Ok(( - 
input_image_data, - input_image_final_url, - input_image_media_type, - )) => { - let input_image_data_url = data_to_data_url( - &input_image_media_type, - &input_image_data, - &input_image_final_url, - ); - // Add data URL src attribute - let assembled_url: String = url_with_fragment( - input_image_data_url.as_str(), - input_image_url_fragment.as_str(), - ); - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("src")), - value: Tendril::from_slice(assembled_url.as_ref()), - }); + i += 1; } - Err(_) => { - // Keep remote reference if unable to retrieve the asset - if is_http_url(input_image_full_url.clone()) { + } + + if options.no_images || input_image_src.is_empty() { + attrs_mut.push(Attribute { + name: QualName::new(None, ns!(), local_name!("src")), + value: Tendril::from_slice(if input_image_src.is_empty() { + "" + } else { + empty_image!() + }), + }); + } else { + let input_image_full_url = + resolve_url(&url, input_image_src).unwrap_or_default(); + let input_image_url_fragment = + get_url_fragment(input_image_full_url.clone()); + match retrieve_asset( + cache, + client, + &url, + &input_image_full_url, + options, + depth + 1, + ) { + Ok(( + input_image_data, + input_image_final_url, + input_image_media_type, + )) => { + let input_image_data_url = data_to_data_url( + &input_image_media_type, + &input_image_data, + &input_image_final_url, + ); + // Add data URL src attribute let assembled_url: String = url_with_fragment( - input_image_full_url.as_str(), + input_image_data_url.as_str(), input_image_url_fragment.as_str(), ); attrs_mut.push(Attribute { @@ -933,6 +955,23 @@ pub fn walk_and_embed_assets( value: Tendril::from_slice(assembled_url.as_ref()), }); } + Err(_) => { + // Keep remote reference if unable to retrieve the asset + if is_http_url(input_image_full_url.clone()) { + let assembled_url: String = url_with_fragment( + input_image_full_url.as_str(), + input_image_url_fragment.as_str(), + ); + attrs_mut.push(Attribute { + name: 
QualName::new( + None, + ns!(), + local_name!("src"), + ), + value: Tendril::from_slice(assembled_url.as_ref()), + }); + } + } } } } @@ -1066,7 +1105,7 @@ pub fn walk_and_embed_assets( continue; } - // Don't touch email links or hrefs which begin with a hash sign + // Don't touch email links or hrefs which begin with a hash if attr_value.starts_with('#') || url_has_protocol(attr_value) { continue; } @@ -1109,7 +1148,7 @@ pub fn walk_and_embed_assets( Ok((script_data, script_final_url, _script_media_type)) => { // Only embed if we're able to validate integrity if script_integrity.is_empty() - || has_proper_integrity(&script_data, &script_integrity) + || check_integrity(&script_data, &script_integrity) { let script_data_url = data_to_data_url( "application/javascript", diff --git a/src/main.rs b/src/main.rs index 4930d6a..16174eb 100644 --- a/src/main.rs +++ b/src/main.rs @@ -9,12 +9,12 @@ use std::process; use std::time::Duration; use monolith::html::{ - add_base_tag, add_favicon, has_base_tag, has_favicon, html_to_dom, metadata_tag, + add_favicon, create_metadata_tag, get_base_url, has_favicon, html_to_dom, set_base_url, stringify_document, walk_and_embed_assets, }; use monolith::opts::Options; use monolith::url::{ - data_to_data_url, data_url_to_data, is_data_url, is_file_url, is_http_url, resolve_url, + data_to_data_url, is_data_url, is_file_url, is_http_url, parse_data_url, resolve_url, }; use monolith::utils::retrieve_asset; @@ -52,7 +52,7 @@ fn main() { let options = Options::from_args(); let original_target: &str = &options.target; let target_url: &str; - let base_url; + let mut base_url: String; let mut dom; // Pre-process the input @@ -64,7 +64,9 @@ fn main() { // Determine exact target URL if target.clone().len() == 0 { - eprintln!("No target specified"); + if !options.silent { + eprintln!("No target specified"); + } process::exit(1); } else if is_http_url(target.clone()) || is_data_url(target.clone()) { target_url = target.as_str(); @@ -72,7 +74,9 @@ fn 
main() { target_url = target.as_str(); } else if path.exists() { if !path.is_file() { - eprintln!("Local target is not a file: {}", original_target); + if !options.silent { + eprintln!("Local target is not a file: {}", original_target); + } process::exit(1); } target.insert_str(0, if cfg!(windows) { "file:///" } else { "file://" }); @@ -111,11 +115,16 @@ fn main() { .build() .expect("Failed to initialize HTTP client"); + // At this stage we assume that the base URL is the same as the target URL + base_url = str!(target_url); + // Retrieve target document if is_file_url(target_url) || is_http_url(target_url) { match retrieve_asset(&mut cache, &client, target_url, target_url, &options, 0) { Ok((data, final_url, _media_type)) => { - base_url = final_url; + if options.base_url.clone().unwrap_or(str!()).is_empty() { + base_url = final_url + } dom = html_to_dom(&String::from_utf8_lossy(&data)); } Err(_) => { @@ -126,23 +135,40 @@ fn main() { } } } else if is_data_url(target_url) { - let (media_type, data): (String, Vec) = data_url_to_data(target_url); + let (media_type, data): (String, Vec) = parse_data_url(target_url); if !media_type.eq_ignore_ascii_case("text/html") { - eprintln!("Unsupported data URL media type"); + if !options.silent { + eprintln!("Unsupported data URL media type"); + } process::exit(1); } - base_url = str!(target_url); dom = html_to_dom(&String::from_utf8_lossy(&data)); } else { process::exit(1); } + // Use custom base URL if specified, read and use what's in the DOM otherwise + if !options.base_url.clone().unwrap_or(str!()).is_empty() { + if is_data_url(options.base_url.clone().unwrap()) { + if !options.silent { + eprintln!("Data URLs cannot be used as base URL"); + } + process::exit(1); + } else { + base_url = options.base_url.clone().unwrap(); + } + } else { + if let Some(existing_base_url) = get_base_url(&dom.document) { + base_url = resolve_url(target_url, existing_base_url).unwrap(); + } + } + // Embed remote assets walk_and_embed_assets(&mut 
cache, &client, &base_url, &dom.document, &options, 0); - // Take care of BASE tag - if is_http_url(base_url.clone()) && !has_base_tag(&dom.document) { - dom = add_base_tag(&dom.document, base_url.clone()); + // Update or add new BASE tag to reroute network requests and hash-links in the final document + if let Some(new_base_url) = options.base_url.clone() { + dom = set_base_url(&dom.document, new_base_url); } // Request and embed /favicon.ico (unless it's already linked in the document) @@ -172,7 +198,7 @@ fn main() { // Add metadata tag if !options.no_metadata { - let metadata_comment: String = metadata_tag(&base_url); + let metadata_comment: String = create_metadata_tag(&base_url); result.insert_str(0, &metadata_comment); if metadata_comment.len() > 0 { result.insert_str(metadata_comment.len(), "\n"); diff --git a/src/opts.rs b/src/opts.rs index 81ae9e7..bfe69d7 100644 --- a/src/opts.rs +++ b/src/opts.rs @@ -2,20 +2,21 @@ use clap::{App, Arg}; #[derive(Default)] pub struct Options { - pub target: String, + pub base_url: Option, pub no_css: bool, pub ignore_errors: bool, - pub no_fonts: bool, pub no_frames: bool, + pub no_fonts: bool, pub no_images: bool, + pub isolate: bool, pub no_js: bool, pub insecure: bool, - pub isolate: bool, + pub no_metadata: bool, pub output: String, pub silent: bool, pub timeout: u64, pub user_agent: String, - pub no_metadata: bool, + pub target: String, } const ASCII: &str = " \ @@ -37,14 +38,8 @@ impl Options { .version(crate_version!()) .author(crate_authors!("\n")) .about(format!("{}\n{}", ASCII, crate_description!()).as_str()) - .arg( - Arg::with_name("target") - .required(true) - .takes_value(true) - .index(1) - .help("URL or file path"), - ) // .args_from_usage("-a, --no-audio 'Removes audio sources'") + .args_from_usage("-b, --base-url=[http://localhost/] 'Use custom base URL'") .args_from_usage("-c, --no-css 'Removes CSS'") .args_from_usage("-e, --ignore-errors 'Ignore network errors'") .args_from_usage("-f, --no-frames 
'Removes frames and iframes'") @@ -53,12 +48,19 @@ impl Options { .args_from_usage("-I, --isolate 'Cuts off document from the Internet'") .args_from_usage("-j, --no-js 'Removes JavaScript'") .args_from_usage("-k, --insecure 'Allows invalid X.509 (TLS) certificates'") - .args_from_usage("-M, --no-metadata 'Excludes metadata information from the document'") + .args_from_usage("-M, --no-metadata 'Excludes timestamp and source information'") .args_from_usage("-o, --output=[document.html] 'Write output to '") .args_from_usage("-s, --silent 'Suppresses verbosity'") .args_from_usage("-t, --timeout=[60] 'Adjust network request timeout'") .args_from_usage("-u, --user-agent=[Firefox] 'Set custom User-Agent string'") // .args_from_usage("-v, --no-video 'Removes video sources'") + .arg( + Arg::with_name("target") + .required(true) + .takes_value(true) + .index(1) + .help("URL or file path"), + ) .get_matches(); let mut options: Options = Options::default(); @@ -67,6 +69,9 @@ impl Options { .value_of("target") .expect("please set target") .to_string(); + if let Some(base_url) = app.value_of("base-url") { + options.base_url = Some(str!(base_url)); + } options.no_css = app.is_present("no-css"); options.ignore_errors = app.is_present("ignore-errors"); options.no_frames = app.is_present("no-frames"); diff --git a/src/tests/cli/base_url.rs b/src/tests/cli/base_url.rs new file mode 100644 index 0000000..7ba88d9 --- /dev/null +++ b/src/tests/cli/base_url.rs @@ -0,0 +1,123 @@ +// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗ +// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝ +// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗ +// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║ +// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝ +// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ + +#[cfg(test)] +mod passing { + use assert_cmd::prelude::*; + use std::env; + use std::process::Command; + + #[test] + fn add_new_when_provided() -> Result<(), Box> { + let mut cmd = 
Command::cargo_bin(env!("CARGO_PKG_NAME"))?; + let out = cmd + .arg("-M") + .arg("-b") + .arg("http://localhost:8000/") + .arg("data:text/html,Hello%2C%20World!") + .output() + .unwrap(); + + // STDOUT should contain newly added base URL + assert_eq!( + std::str::from_utf8(&out.stdout).unwrap(), + "\ + \ + Hello, World!\n" + ); + + // STDERR should be empty + assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); + + // The exit code should be 0 + out.assert().code(0); + + Ok(()) + } + + #[test] + fn keep_existing_when_none_provided() -> Result<(), Box> { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; + let out = cmd + .arg("-M") + .arg("data:text/html,Hello%2C%20World!") + .output() + .unwrap(); + + // STDOUT should contain newly added base URL + assert_eq!( + std::str::from_utf8(&out.stdout).unwrap(), + "\ + \ + Hello, World!\n" + ); + + // STDERR should be empty + assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); + + // The exit code should be 0 + out.assert().code(0); + + Ok(()) + } + + #[test] + fn override_existing_when_provided() -> Result<(), Box> { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; + let out = cmd + .arg("-M") + .arg("-b") + .arg("http://localhost/") + .arg("data:text/html,Hello%2C%20World!") + .output() + .unwrap(); + + // STDOUT should contain newly added base URL + assert_eq!( + std::str::from_utf8(&out.stdout).unwrap(), + "\ + \ + Hello, World!\n" + ); + + // STDERR should be empty + assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); + + // The exit code should be 0 + out.assert().code(0); + + Ok(()) + } + + #[test] + fn remove_existing_when_empty_provided() -> Result<(), Box> { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; + let out = cmd + .arg("-M") + .arg("-b") + .arg("") + .arg("data:text/html,Hello%2C%20World!") + .output() + .unwrap(); + + // STDOUT should contain newly added base URL + assert_eq!( + std::str::from_utf8(&out.stdout).unwrap(), + "\ + \ + Hello, World!\n" 
+ ); + + // STDERR should be empty + assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); + + // The exit code should be 0 + out.assert().code(0); + + Ok(()) + } +} diff --git a/src/tests/cli.rs b/src/tests/cli/basic.rs similarity index 100% rename from src/tests/cli.rs rename to src/tests/cli/basic.rs diff --git a/src/tests/cli/mod.rs b/src/tests/cli/mod.rs new file mode 100644 index 0000000..1c20743 --- /dev/null +++ b/src/tests/cli/mod.rs @@ -0,0 +1,2 @@ +mod base_url; +mod basic; diff --git a/src/tests/html/has_proper_integrity.rs b/src/tests/html/check_integrity.rs similarity index 85% rename from src/tests/html/has_proper_integrity.rs rename to src/tests/html/check_integrity.rs index 639bf45..121e412 100644 --- a/src/tests/html/has_proper_integrity.rs +++ b/src/tests/html/check_integrity.rs @@ -11,7 +11,7 @@ mod passing { #[test] fn empty_input_sha256() { - assert!(html::has_proper_integrity( + assert!(html::check_integrity( "".as_bytes(), "sha256-47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=" )); @@ -19,7 +19,7 @@ mod passing { #[test] fn sha256() { - assert!(html::has_proper_integrity( + assert!(html::check_integrity( "abcdef0123456789".as_bytes(), "sha256-9EWAHgy4mSYsm54hmDaIDXPKLRsLnBX7lZyQ6xISNOM=" )); @@ -27,7 +27,7 @@ mod passing { #[test] fn sha384() { - assert!(html::has_proper_integrity( + assert!(html::check_integrity( "abcdef0123456789".as_bytes(), "sha384-gc9l7omltke8C33bedgh15E12M7RrAQa5t63Yb8APlpe7ZhiqV23+oqiulSJl3Kw" )); @@ -35,7 +35,7 @@ mod passing { #[test] fn sha512() { - assert!(html::has_proper_integrity( + assert!(html::check_integrity( "abcdef0123456789".as_bytes(), "sha512-zG5B88cYMqcdiMi9gz0XkOFYw2BpjeYdn5V6+oFrMgSNjRpqL7EF8JEwl17ztZbK3N7I/tTwp3kxQbN1RgFBww==" )); @@ -55,20 +55,17 @@ mod failing { #[test] fn empty_hash() { - assert!(!html::has_proper_integrity( - "abcdef0123456789".as_bytes(), - "" - )); + assert!(!html::check_integrity("abcdef0123456789".as_bytes(), "")); } #[test] fn empty_input_empty_hash() { - 
assert!(!html::has_proper_integrity("".as_bytes(), "")); + assert!(!html::check_integrity("".as_bytes(), "")); } #[test] fn sha256() { - assert!(!html::has_proper_integrity( + assert!(!html::check_integrity( "abcdef0123456789".as_bytes(), "sha256-badhash" )); @@ -76,7 +73,7 @@ mod failing { #[test] fn sha384() { - assert!(!html::has_proper_integrity( + assert!(!html::check_integrity( "abcdef0123456789".as_bytes(), "sha384-badhash" )); @@ -84,7 +81,7 @@ mod failing { #[test] fn sha512() { - assert!(!html::has_proper_integrity( + assert!(!html::check_integrity( "abcdef0123456789".as_bytes(), "sha512-badhash" )); diff --git a/src/tests/html/csp.rs b/src/tests/html/compose_csp.rs similarity index 86% rename from src/tests/html/csp.rs rename to src/tests/html/compose_csp.rs index e4adde9..a05b450 100644 --- a/src/tests/html/csp.rs +++ b/src/tests/html/compose_csp.rs @@ -14,7 +14,7 @@ mod passing { fn isolated() { let mut options = Options::default(); options.isolate = true; - let csp_content = html::csp(&options); + let csp_content = html::compose_csp(&options); assert_eq!(csp_content, "default-src 'unsafe-inline' data:;"); } @@ -23,7 +23,7 @@ mod passing { fn no_css() { let mut options = Options::default(); options.no_css = true; - let csp_content = html::csp(&options); + let csp_content = html::compose_csp(&options); assert_eq!(csp_content, "style-src 'none';"); } @@ -32,7 +32,7 @@ mod passing { fn no_fonts() { let mut options = Options::default(); options.no_fonts = true; - let csp_content = html::csp(&options); + let csp_content = html::compose_csp(&options); assert_eq!(csp_content, "font-src 'none';"); } @@ -41,7 +41,7 @@ mod passing { fn no_frames() { let mut options = Options::default(); options.no_frames = true; - let csp_content = html::csp(&options); + let csp_content = html::compose_csp(&options); assert_eq!(csp_content, "frame-src 'none'; child-src 'none';"); } @@ -50,7 +50,7 @@ mod passing { fn no_js() { let mut options = Options::default(); options.no_js = 
true; - let csp_content = html::csp(&options); + let csp_content = html::compose_csp(&options); assert_eq!(csp_content, "script-src 'none';"); } @@ -59,7 +59,7 @@ mod passing { fn no_images() { let mut options = Options::default(); options.no_images = true; - let csp_content = html::csp(&options); + let csp_content = html::compose_csp(&options); assert_eq!(csp_content, "img-src data:;"); } @@ -73,7 +73,7 @@ mod passing { options.no_frames = true; options.no_js = true; options.no_images = true; - let csp_content = html::csp(&options); + let csp_content = html::compose_csp(&options); assert_eq!(csp_content, "default-src 'unsafe-inline' data:; style-src 'none'; font-src 'none'; frame-src 'none'; child-src 'none'; script-src 'none'; img-src data:;"); } diff --git a/src/tests/html/metadata_tag.rs b/src/tests/html/create_metadata_tag.rs similarity index 92% rename from src/tests/html/metadata_tag.rs rename to src/tests/html/create_metadata_tag.rs index cef13bf..ea59731 100644 --- a/src/tests/html/metadata_tag.rs +++ b/src/tests/html/create_metadata_tag.rs @@ -15,7 +15,7 @@ mod passing { fn http_url() { let url = "http://192.168.1.1/"; let timestamp = Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true); - let metadata_comment: String = html::metadata_tag(url); + let metadata_comment: String = html::create_metadata_tag(url); assert_eq!( metadata_comment, @@ -33,7 +33,7 @@ mod passing { fn file_url() { let url = "file:///home/monolith/index.html"; let timestamp = Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true); - let metadata_comment: String = html::metadata_tag(url); + let metadata_comment: String = html::create_metadata_tag(url); assert_eq!( metadata_comment, @@ -50,7 +50,7 @@ mod passing { fn data_url() { let url = "data:text/html,Hello%2C%20World!"; let timestamp = Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true); - let metadata_comment: String = html::metadata_tag(url); + let metadata_comment: String = html::create_metadata_tag(url); assert_eq!( 
metadata_comment, @@ -77,6 +77,6 @@ mod failing { #[test] fn empty_string() { - assert_eq!(html::metadata_tag(""), ""); + assert_eq!(html::create_metadata_tag(""), ""); } } diff --git a/src/tests/html/get_base_url.rs b/src/tests/html/get_base_url.rs new file mode 100644 index 0000000..a1b959c --- /dev/null +++ b/src/tests/html/get_base_url.rs @@ -0,0 +1,104 @@ +// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗ +// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝ +// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗ +// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║ +// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝ +// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ + +#[cfg(test)] +mod passing { + use crate::html; + + #[test] + fn present() { + let html = " + + + + + + +"; + let dom = html::html_to_dom(&html); + + assert_eq!( + html::get_base_url(&dom.document), + Some(str!("https://musicbrainz.org")) + ); + } + + #[test] + fn multiple_tags() { + let html = " + + + + + + + +"; + let dom = html::html_to_dom(&html); + + assert_eq!( + html::get_base_url(&dom.document), + Some(str!("https://www.discogs.com/")) + ); + } +} + +// ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗ +// ██╔════╝██╔══██╗██║██║ ██║████╗ ██║██╔════╝ +// █████╗ ███████║██║██║ ██║██╔██╗ ██║██║ ███╗ +// ██╔══╝ ██╔══██║██║██║ ██║██║╚██╗██║██║ ██║ +// ██║ ██║ ██║██║███████╗██║██║ ╚████║╚██████╔╝ +// ╚═╝ ╚═╝ ╚═╝╚═╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ + +#[cfg(test)] +mod failing { + use crate::html; + + #[test] + fn absent() { + let html = " + + + + + +"; + let dom = html::html_to_dom(&html); + + assert_eq!(html::get_base_url(&dom.document), None); + } + + #[test] + fn no_href() { + let html = " + + + + + + +"; + let dom = html::html_to_dom(&html); + + assert_eq!(html::get_base_url(&dom.document), None); + } + + #[test] + fn empty_href() { + let html = " + + + + + + +"; + let dom = html::html_to_dom(&html); + + assert_eq!(html::get_base_url(&dom.document), Some(str!())); + } +} diff --git 
a/src/tests/html/get_node_attr.rs b/src/tests/html/get_node_attr.rs new file mode 100644 index 0000000..a8b7448 --- /dev/null +++ b/src/tests/html/get_node_attr.rs @@ -0,0 +1,54 @@ +// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗ +// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝ +// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗ +// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║ +// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝ +// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ + +#[cfg(test)] +mod passing { + use html5ever::rcdom::{Handle, NodeData}; + + use crate::html; + + #[test] + fn div_two_style_attributes() { + let html = "
"; + let dom = html::html_to_dom(&html); + let mut count = 0; + + fn test_walk(node: &Handle, i: &mut i8) { + *i += 1; + + match &node.data { + NodeData::Document => { + // Dig deeper + for child in node.children.borrow().iter() { + test_walk(child, &mut *i); + } + } + NodeData::Element { ref name, .. } => { + let node_name = name.local.as_ref().to_string(); + + if node_name == "body" { + assert_eq!(html::get_node_attr(node, "class"), None); + } else if node_name == "div" { + assert_eq!( + html::get_node_attr(node, "style"), + Some(str!("color: blue;")) + ); + } + + for child in node.children.borrow().iter() { + test_walk(child, &mut *i); + } + } + _ => (), + }; + } + + test_walk(&dom.document, &mut count); + + assert_eq!(count, 6); + } +} diff --git a/src/tests/html/get_node_name.rs b/src/tests/html/get_node_name.rs index 47f339f..79da75e 100644 --- a/src/tests/html/get_node_name.rs +++ b/src/tests/html/get_node_name.rs @@ -12,7 +12,7 @@ mod passing { use crate::html; #[test] - fn get_node_name() { + fn parent_node_names() { let html = "

"; let dom = html::html_to_dom(&html); let mut count = 0; diff --git a/src/tests/html/mod.rs b/src/tests/html/mod.rs index a912338..b9576da 100644 --- a/src/tests/html/mod.rs +++ b/src/tests/html/mod.rs @@ -1,10 +1,13 @@ mod add_favicon; -mod csp; +mod check_integrity; +mod compose_csp; +mod create_metadata_tag; mod embed_srcset; +mod get_base_url; +mod get_node_attr; mod get_node_name; mod has_favicon; -mod has_proper_integrity; mod is_icon; -mod metadata_tag; +mod set_node_attr; mod stringify_document; mod walk_and_embed_assets; diff --git a/src/tests/html/set_node_attr.rs b/src/tests/html/set_node_attr.rs new file mode 100644 index 0000000..73e7766 --- /dev/null +++ b/src/tests/html/set_node_attr.rs @@ -0,0 +1,66 @@ +// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗ +// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝ +// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗ +// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║ +// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝ +// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ + +#[cfg(test)] +mod passing { + use html5ever::rcdom::{Handle, NodeData}; + + use crate::html; + + #[test] + fn html_lang_and_body_style() { + let html = ""; + let dom = html::html_to_dom(&html); + let mut count = 0; + + fn test_walk(node: &Handle, i: &mut i8) { + *i += 1; + + match &node.data { + NodeData::Document => { + // Dig deeper + for child in node.children.borrow().iter() { + test_walk(child, &mut *i); + } + } + NodeData::Element { ref name, .. 
} => { + let node_name = name.local.as_ref().to_string(); + + if node_name == "html" { + assert_eq!(html::get_node_attr(node, "lang"), Some(str!("en"))); + + html::set_node_attr(node, "lang", Some(str!("de"))); + assert_eq!(html::get_node_attr(node, "lang"), Some(str!("de"))); + + html::set_node_attr(node, "lang", None); + assert_eq!(html::get_node_attr(node, "lang"), None); + + html::set_node_attr(node, "lang", Some(str!(""))); + assert_eq!(html::get_node_attr(node, "lang"), Some(str!(""))); + } else if node_name == "body" { + assert_eq!(html::get_node_attr(node, "style"), None); + + html::set_node_attr(node, "style", Some(str!("display: none;"))); + assert_eq!( + html::get_node_attr(node, "style"), + Some(str!("display: none;")) + ); + } + + for child in node.children.borrow().iter() { + test_walk(child, &mut *i); + } + } + _ => (), + }; + } + + test_walk(&dom.document, &mut count); + + assert_eq!(count, 5); + } +} diff --git a/src/tests/url/mod.rs b/src/tests/url/mod.rs index 226c388..fe06cda 100644 --- a/src/tests/url/mod.rs +++ b/src/tests/url/mod.rs @@ -1,12 +1,12 @@ mod clean_url; mod data_to_data_url; -mod data_url_to_data; mod decode_url; mod file_url_to_fs_path; mod get_url_fragment; mod is_data_url; mod is_file_url; mod is_http_url; +mod parse_data_url; mod resolve_url; mod url_has_protocol; mod url_with_fragment; diff --git a/src/tests/url/data_url_to_data.rs b/src/tests/url/parse_data_url.rs similarity index 83% rename from src/tests/url/data_url_to_data.rs rename to src/tests/url/parse_data_url.rs index 2ad5437..589fc5e 100644 --- a/src/tests/url/data_url_to_data.rs +++ b/src/tests/url/parse_data_url.rs @@ -11,7 +11,7 @@ mod passing { #[test] fn parse_text_html_base64() { - let (media_type, data) = url::data_url_to_data("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg=="); + let (media_type, data) = 
url::parse_data_url("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg=="); assert_eq!(media_type, "text/html"); assert_eq!( @@ -22,7 +22,7 @@ mod passing { #[test] fn parse_text_html_utf8() { - let (media_type, data) = url::data_url_to_data( + let (media_type, data) = url::parse_data_url( "data:text/html;utf8,Work expands so as to fill the time available for its completion", ); @@ -35,7 +35,7 @@ mod passing { #[test] fn parse_text_html_plaintext() { - let (media_type, data) = url::data_url_to_data( + let (media_type, data) = url::parse_data_url( "data:text/html,Work expands so as to fill the time available for its completion", ); @@ -48,7 +48,7 @@ mod passing { #[test] fn parse_text_html_charset_utf_8_between_two_whitespaces() { - let (media_type, data) = url::data_url_to_data(" data:text/html;charset=utf-8,Work expands so as to fill the time available for its completion "); + let (media_type, data) = url::parse_data_url(" data:text/html;charset=utf-8,Work expands so as to fill the time available for its completion "); assert_eq!(media_type, "text/html"); assert_eq!( @@ -60,7 +60,7 @@ mod passing { #[test] fn parse_text_css_url_encoded() { let (media_type, data) = - url::data_url_to_data("data:text/css,div{background-color:%23000}"); + url::parse_data_url("data:text/css,div{background-color:%23000}"); assert_eq!(media_type, "text/css"); assert_eq!(String::from_utf8_lossy(&data), "div{background-color:#000}"); @@ -68,7 +68,7 @@ mod passing { #[test] fn parse_no_media_type_base64() { - let (media_type, data) = url::data_url_to_data("data:;base64,dGVzdA=="); + let (media_type, data) = url::parse_data_url("data:;base64,dGVzdA=="); assert_eq!(media_type, ""); assert_eq!(String::from_utf8_lossy(&data), "test"); @@ -76,7 +76,7 @@ mod passing { #[test] fn parse_no_media_type_no_encoding() { - let (media_type, data) = url::data_url_to_data("data:;,test%20test"); + let (media_type, data) = 
url::parse_data_url("data:;,test%20test"); assert_eq!(media_type, ""); assert_eq!(String::from_utf8_lossy(&data), "test test"); @@ -96,7 +96,7 @@ mod failing { #[test] fn just_word_data() { - let (media_type, data) = url::data_url_to_data("data"); + let (media_type, data) = url::parse_data_url("data"); assert_eq!(media_type, ""); assert_eq!(String::from_utf8_lossy(&data), ""); diff --git a/src/url.rs b/src/url.rs index e493ce1..5792fb6 100644 --- a/src/url.rs +++ b/src/url.rs @@ -33,45 +33,6 @@ pub fn data_to_data_url(media_type: &str, data: &[u8], url: &str) -> String { format!("data:{};base64,{}", media_type, base64::encode(data)) } -pub fn data_url_to_data>(url: T) -> (String, Vec) { - let parsed_url: Url = Url::parse(url.as_ref()).unwrap_or(Url::parse("data:,").unwrap()); - let path: String = parsed_url.path().to_string(); - let comma_loc: usize = path.find(',').unwrap_or(path.len()); - - let meta_data: String = path.chars().take(comma_loc).collect(); - let raw_data: String = path.chars().skip(comma_loc + 1).collect(); - - let text: String = decode_url(raw_data); - - let meta_data_items: Vec<&str> = meta_data.split(';').collect(); - let mut media_type: String = str!(); - let mut encoding: &str = ""; - - let mut i: i8 = 0; - for item in &meta_data_items { - if i == 0 { - media_type = str!(item); - } else { - if item.eq_ignore_ascii_case("base64") - || item.eq_ignore_ascii_case("utf8") - || item.eq_ignore_ascii_case("charset=UTF-8") - { - encoding = item; - } - } - - i = i + 1; - } - - let data: Vec = if encoding.eq_ignore_ascii_case("base64") { - base64::decode(&text).unwrap_or(vec![]) - } else { - text.as_bytes().to_vec() - }; - - (media_type, data) -} - pub fn decode_url(input: String) -> String { let input: String = input.replace("+", "%2B"); @@ -138,6 +99,45 @@ pub fn is_http_url>(url: T) -> bool { .unwrap_or(false) } +pub fn parse_data_url>(url: T) -> (String, Vec) { + let parsed_url: Url = Url::parse(url.as_ref()).unwrap_or(Url::parse("data:,").unwrap()); 
+ let path: String = parsed_url.path().to_string(); + let comma_loc: usize = path.find(',').unwrap_or(path.len()); + + let meta_data: String = path.chars().take(comma_loc).collect(); + let raw_data: String = path.chars().skip(comma_loc + 1).collect(); + + let text: String = decode_url(raw_data); + + let meta_data_items: Vec<&str> = meta_data.split(';').collect(); + let mut media_type: String = str!(); + let mut encoding: &str = ""; + + let mut i: i8 = 0; + for item in &meta_data_items { + if i == 0 { + media_type = str!(item); + } else { + if item.eq_ignore_ascii_case("base64") + || item.eq_ignore_ascii_case("utf8") + || item.eq_ignore_ascii_case("charset=UTF-8") + { + encoding = item; + } + } + + i = i + 1; + } + + let data: Vec = if encoding.eq_ignore_ascii_case("base64") { + base64::decode(&text).unwrap_or(vec![]) + } else { + text.as_bytes().to_vec() + }; + + (media_type, data) +} + pub fn resolve_url, U: AsRef>(from: T, to: U) -> Result { let result = if is_http_url(to.as_ref()) { to.as_ref().to_string() diff --git a/src/utils.rs b/src/utils.rs index 014ee6e..5fd76be 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -5,7 +5,7 @@ use std::fs; use std::path::Path; use crate::opts::Options; -use crate::url::{clean_url, data_url_to_data, file_url_to_fs_path, is_data_url, is_file_url}; +use crate::url::{clean_url, file_url_to_fs_path, is_data_url, is_file_url, parse_data_url}; const INDENT: &str = " "; @@ -83,7 +83,7 @@ pub fn retrieve_asset( } if is_data_url(&url) { - let (media_type, data) = data_url_to_data(url); + let (media_type, data) = parse_data_url(url); Ok((data, url.to_string(), media_type)) } else if is_file_url(&url) { // Check if parent_url is also file:/// From d89b4d5f5bf32259c2f83c9fa731d1749933995b Mon Sep 17 00:00:00 2001 From: Sunshine Date: Fri, 25 Dec 2020 11:09:47 -1000 Subject: [PATCH 2/3] refactor code that processes the DOM --- src/html.rs | 1117 ++++++++++------------- src/tests/cli/basic.rs | 4 +- src/tests/html/set_node_attr.rs | 39 + 
src/tests/html/walk_and_embed_assets.rs | 4 +- 4 files changed, 541 insertions(+), 623 deletions(-) diff --git a/src/html.rs b/src/html.rs index 7fadd65..bfe19e6 100644 --- a/src/html.rs +++ b/src/html.rs @@ -4,7 +4,7 @@ use html5ever::interface::QualName; use html5ever::parse_document; use html5ever::rcdom::{Handle, NodeData, RcDom}; use html5ever::serialize::{serialize, SerializeOpts}; -use html5ever::tendril::{format_tendril, Tendril, TendrilSink}; +use html5ever::tendril::{format_tendril, TendrilSink}; use html5ever::tree_builder::{Attribute, TreeSink}; use html5ever::{local_name, namespace_url, ns, LocalName}; use reqwest::blocking::Client; @@ -493,39 +493,33 @@ pub fn walk_and_embed_assets( ref attrs, .. } => { - let attrs_mut = &mut attrs.borrow_mut(); - match name.local.as_ref() { "meta" => { - // Determine type - let mut is_unwanted_meta: bool = false; - for attr in attrs_mut.iter_mut() { - let attr_name: &str = &attr.name.local; - if attr_name.eq_ignore_ascii_case("http-equiv") { - let value: String = attr.value.to_string(); - is_unwanted_meta = value.eq_ignore_ascii_case("refresh") - || value.eq_ignore_ascii_case("location"); - } - } - - if is_unwanted_meta { - // Strip this node off all its attributes - while attrs_mut.len() > 0 { - attrs_mut.remove(0); + // Remove http-equiv attributes from META nodes if they're able to control the page + if let Some(meta_attr_http_equiv_value) = get_node_attr(node, "http-equiv") { + let meta_attr_http_equiv_value: &str = &meta_attr_http_equiv_value; + if meta_attr_http_equiv_value.eq_ignore_ascii_case("refresh") + || meta_attr_http_equiv_value.eq_ignore_ascii_case("location") + { + set_node_attr( + &node, + "http-equiv", + Some(format!( + "disabled by monolith ({})", + meta_attr_http_equiv_value + )), + ); } } } "link" => { - // Remove integrity attributes, keep value of the last one - let mut integrity: String = str!(); - let mut i = 0; - while i < attrs_mut.len() { - let attr_name: &str = &attrs_mut[i].name.local; - 
if attr_name.eq_ignore_ascii_case("integrity") { - integrity = str!(attrs_mut.remove(i).value.trim()); - } else { - i += 1; - } + // Read and remember integrity attribute value of this LINK node + let link_attr_integrity_value: Option = + get_node_attr(node, "integrity"); + + // Remove integrity attribute from the LINK node + if link_attr_integrity_value != None { + set_node_attr(node, "integrity", None); } enum LinkType { @@ -537,195 +531,167 @@ pub fn walk_and_embed_assets( } let mut link_type = LinkType::Unknown; - for attr in attrs_mut.iter_mut() { - if &attr.name.local == "rel" { - let value = attr.value.trim(); - if is_icon(value) { - link_type = LinkType::Icon; - break; - } else if value.eq_ignore_ascii_case("stylesheet") { - link_type = LinkType::Stylesheet; - break; - } else if value.eq_ignore_ascii_case("preload") { - link_type = LinkType::Preload; - break; - } else if value.eq_ignore_ascii_case("dns-prefetch") { - link_type = LinkType::DnsPrefetch; - break; - } + if let Some(link_attr_rel_value) = get_node_attr(node, "rel") { + if is_icon(&link_attr_rel_value) { + link_type = LinkType::Icon; + } else if link_attr_rel_value.eq_ignore_ascii_case("stylesheet") { + link_type = LinkType::Stylesheet; + } else if link_attr_rel_value.eq_ignore_ascii_case("preload") { + link_type = LinkType::Preload; + } else if link_attr_rel_value.eq_ignore_ascii_case("dns-prefetch") { + link_type = LinkType::DnsPrefetch; } } + // Shadow the variable (to make it non-mutable) let link_type = link_type; match link_type { LinkType::Icon => { - // Find and remove href attribute(s), keep value of the last found one - let mut link_href: String = str!(); - let mut i = 0; - while i < attrs_mut.len() { - let attr_name: &str = &attrs_mut[i].name.local; - if attr_name.eq_ignore_ascii_case("href") { - link_href = str!(attrs_mut.remove(i).value.trim()); - } else { - i += 1; - } - } + // Find and resolve this LINK node's href attribute + if let Some(link_attr_href_value) = get_node_attr(node, 
"href") { + if !options.no_images && !link_attr_href_value.is_empty() { + let link_href_full_url = + resolve_url(&url, link_attr_href_value).unwrap_or_default(); + let link_href_url_fragment = + get_url_fragment(link_href_full_url.clone()); + match retrieve_asset( + cache, + client, + &url, + &link_href_full_url, + options, + depth + 1, + ) { + Ok(( + link_href_data, + link_href_final_url, + link_href_media_type, + )) => { + let mut ok_to_include = true; + + // Check integrity + if let Some(link_attr_integrity_value) = + link_attr_integrity_value + { + if !link_attr_integrity_value.is_empty() { + ok_to_include = check_integrity( + &link_href_data, + &link_attr_integrity_value, + ); + } + } - if !options.no_images && !link_href.is_empty() { - let link_href_full_url = - resolve_url(&url, link_href).unwrap_or_default(); - let link_href_url_fragment = - get_url_fragment(link_href_full_url.clone()); - match retrieve_asset( - cache, - client, - &url, - &link_href_full_url, - options, - depth + 1, - ) { - Ok(( - link_href_data, - link_href_final_url, - link_href_media_type, - )) => { - // Check integrity - if integrity.is_empty() - || check_integrity(&link_href_data, &integrity) - { - let link_href_data_url = data_to_data_url( - &link_href_media_type, - &link_href_data, - &link_href_final_url, - ); - // Add new data URL href attribute - let assembled_url: String = url_with_fragment( - link_href_data_url.as_str(), - link_href_url_fragment.as_str(), - ); - attrs_mut.push(Attribute { - name: QualName::new( - None, - ns!(), - local_name!("href"), - ), - value: Tendril::from_slice(assembled_url.as_ref()), - }); + if ok_to_include { + let link_href_data_url = data_to_data_url( + &link_href_media_type, + &link_href_data, + &link_href_final_url, + ); + // Add new data URL href attribute + let assembled_url: String = url_with_fragment( + link_href_data_url.as_str(), + link_href_url_fragment.as_str(), + ); + set_node_attr(&node, "href", Some(assembled_url)); + } } - } - Err(_) => { 
- // Keep remote reference if unable to retrieve the asset - if is_http_url(link_href_full_url.clone()) { - let assembled_url: String = url_with_fragment( - link_href_full_url.as_str(), - link_href_url_fragment.as_str(), - ); - attrs_mut.push(Attribute { - name: QualName::new( - None, - ns!(), - local_name!("href"), - ), - value: Tendril::from_slice(assembled_url.as_ref()), - }); + Err(_) => { + // Keep remote reference if unable to retrieve the asset + if is_http_url(link_href_full_url.clone()) { + let assembled_url: String = url_with_fragment( + link_href_full_url.as_str(), + link_href_url_fragment.as_str(), + ); + set_node_attr(node, "href", Some(assembled_url)); + } } } + } else { + set_node_attr(node, "href", None); } } } LinkType::Stylesheet => { - // Find and remove href attribute(s), keep value of the last found one - let mut link_href: String = str!(); - let mut i = 0; - while i < attrs_mut.len() { - let attr_name: &str = &attrs_mut[i].name.local; - if attr_name.eq_ignore_ascii_case("href") { - link_href = str!(attrs_mut.remove(i).value.trim()); - } else { - i += 1; - } - } + // Find and resolve this LINK node's href attribute + if let Some(link_attr_href_value) = get_node_attr(node, "href") { + set_node_attr(node, "href", None); - if !options.no_css && !link_href.is_empty() { - let link_href_full_url = - resolve_url(&url, link_href).unwrap_or_default(); - match retrieve_asset( - cache, - client, - &url, - &link_href_full_url, - options, - depth + 1, - ) { - Ok(( - link_href_data, - link_href_final_url, - _link_href_media_type, - )) => { - // Check integrity - if integrity.is_empty() - || check_integrity(&link_href_data, &integrity) - { - let css: String = embed_css( - cache, - client, - &link_href_final_url, - &String::from_utf8_lossy(&link_href_data), - options, - depth + 1, - ); - let link_href_data_url = data_to_data_url( - "text/css", - css.as_bytes(), - &link_href_final_url, - ); - // Add new data URL href attribute - attrs_mut.push(Attribute { - 
name: QualName::new( - None, - ns!(), - local_name!("href"), - ), - value: Tendril::from_slice( - link_href_data_url.as_ref(), - ), - }); + if !options.no_css && !link_attr_href_value.is_empty() { + let link_href_full_url = + resolve_url(&url, link_attr_href_value).unwrap_or_default(); + match retrieve_asset( + cache, + client, + &url, + &link_href_full_url, + options, + depth + 1, + ) { + Ok(( + link_href_data, + link_href_final_url, + _link_href_media_type, + )) => { + let mut ok_to_include = true; + + // Check integrity + if let Some(link_attr_integrity_value) = + link_attr_integrity_value + { + if !link_attr_integrity_value.is_empty() { + ok_to_include = check_integrity( + &link_href_data, + &link_attr_integrity_value, + ); + } + } + + if ok_to_include { + let css: String = embed_css( + cache, + client, + &link_href_final_url, + &String::from_utf8_lossy(&link_href_data), + options, + depth + 1, + ); + let link_href_data_url = data_to_data_url( + "text/css", + css.as_bytes(), + &link_href_final_url, + ); + // Add new data URL href attribute + set_node_attr( + &node, + "href", + Some(link_href_data_url), + ); + } } - } - Err(_) => { - // Keep remote reference if unable to retrieve the asset - if is_http_url(link_href_full_url.clone()) { - attrs_mut.push(Attribute { - name: QualName::new( - None, - ns!(), - local_name!("href"), - ), - value: Tendril::from_slice( - link_href_full_url.as_ref(), - ), - }); + Err(_) => { + // Keep remote reference if unable to retrieve the asset + if is_http_url(link_href_full_url.clone()) { + set_node_attr( + &node, + "href", + Some(link_href_full_url), + ); + } } } } } } LinkType::Preload | LinkType::DnsPrefetch => { - // Since all resources are embedded as data URL, preloading and prefetching are unnecessary - for _ in 0..attrs_mut.len() { - attrs_mut.remove(0); - } + // Since all resources are embedded as data URLs, preloading and prefetching are not necessary + set_node_attr(node, "rel", None); } LinkType::Unknown => { // Make 
sure that all other LINKs' href attributes are full URLs - for attr in attrs_mut.iter_mut() { - let attr_name: &str = &attr.name.local; - if attr_name.eq_ignore_ascii_case("href") { - let href_full_url = - resolve_url(&url, attr.value.trim()).unwrap_or_default(); - attr.value.clear(); - attr.value.push_slice(&href_full_url.as_str()); - } + if let Some(link_attr_href_value) = get_node_attr(node, "href") { + let href_full_url = + resolve_url(&url, link_attr_href_value).unwrap_or_default(); + set_node_attr(node, "href", Some(href_full_url)); } } } @@ -733,68 +699,58 @@ pub fn walk_and_embed_assets( "base" => { if is_http_url(url) { // Ensure the BASE node doesn't have a relative URL - for attr in attrs_mut.iter_mut() { - let attr_name: &str = &attr.name.local; - if attr_name.eq_ignore_ascii_case("href") { - let href_full_url = - resolve_url(&url, attr.value.trim()).unwrap_or_default(); - attr.value.clear(); - attr.value.push_slice(&href_full_url.as_str()); - } + if let Some(base_attr_href_value) = get_node_attr(node, "href") { + let href_full_url = + resolve_url(&url, base_attr_href_value).unwrap_or_default(); + set_node_attr(node, "href", Some(href_full_url)); } } } "body" => { - // Find and remove background attribute(s), keep value of the last found one - let mut background: String = str!(); - let mut i = 0; - while i < attrs_mut.len() { - let attr_name: &str = &attrs_mut[i].name.local; - if attr_name.eq_ignore_ascii_case("background") { - background = str!(attrs_mut.remove(i).value.trim()); - } else { - i += 1; - } - } - - if !options.no_images && !background.is_empty() { - let background_full_url = resolve_url(&url, background).unwrap_or_default(); - let background_url_fragment = get_url_fragment(background_full_url.clone()); - match retrieve_asset( - cache, - client, - &url, - &background_full_url, - options, - depth + 1, - ) { - Ok((background_data, background_final_url, background_media_type)) => { - let background_data_url = data_to_data_url( - 
&background_media_type, - &background_data, - &background_final_url, - ); - // Add new data URL background attribute - let assembled_url: String = url_with_fragment( - background_data_url.as_str(), - background_url_fragment.as_str(), - ); - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("background")), - value: Tendril::from_slice(assembled_url.as_ref()), - }); - } - Err(_) => { - // Keep remote reference if unable to retrieve the asset - if is_http_url(background_full_url.clone()) { + // Read and remember background attribute value of this BODY node + if let Some(body_attr_background_value) = get_node_attr(node, "background") { + // Remove background BODY node attribute by default + set_node_attr(node, "background", None); + + if !options.no_images && !body_attr_background_value.is_empty() { + let background_full_url = + resolve_url(&url, body_attr_background_value).unwrap_or_default(); + let background_url_fragment = + get_url_fragment(background_full_url.clone()); + match retrieve_asset( + cache, + client, + &url, + &background_full_url, + options, + depth + 1, + ) { + Ok(( + background_data, + background_final_url, + background_media_type, + )) => { + let background_data_url = data_to_data_url( + &background_media_type, + &background_data, + &background_final_url, + ); + // Convert background attribute to data URL let assembled_url: String = url_with_fragment( - background_full_url.as_str(), + background_data_url.as_str(), background_url_fragment.as_str(), ); - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("background")), - value: Tendril::from_slice(assembled_url.as_ref()), - }); + set_node_attr(node, "background", Some(assembled_url)); + } + Err(_) => { + // Keep remote reference if unable to retrieve the asset + if is_http_url(background_full_url.clone()) { + let assembled_url: String = url_with_fragment( + background_full_url.as_str(), + background_url_fragment.as_str(), + ); + set_node_attr(node, 
"background", Some(assembled_url)); + } } } } @@ -802,48 +758,43 @@ pub fn walk_and_embed_assets( } "img" => { // Find source attribute(s) - let mut img_data_src: String = str!(); - let mut img_src: String = str!(); - let mut img_srcset: String = str!(); - let mut i = 0; - while i < attrs_mut.len() { - let attr_name: &str = &attrs_mut[i].name.local; - if attr_name.eq_ignore_ascii_case("data-src") { - img_data_src = str!(attrs_mut.remove(i).value.trim()); - } else if attr_name.eq_ignore_ascii_case("src") { - img_src = str!(attrs_mut.remove(i).value.trim()); - } else if attr_name.eq_ignore_ascii_case("srcset") { - img_srcset = str!(attrs_mut.remove(i).value.trim()); - } else { - i += 1; - } - } + let img_attr_src_value: Option = get_node_attr(node, "src"); + let img_attr_data_src_value: Option = get_node_attr(node, "data-src"); if options.no_images { - // Add empty image src attribute - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("src")), - value: Tendril::from_slice(empty_image!()), - }); + // Put empty images into src and data-src attributes + if img_attr_src_value != None { + set_node_attr(node, "src", Some(str!(empty_image!()))); + } + if img_attr_data_src_value != None { + set_node_attr(node, "data-src", Some(str!(empty_image!()))); + } } else { - if img_src.is_empty() && img_data_src.is_empty() { + if img_attr_src_value.clone().unwrap_or_default().is_empty() + && img_attr_data_src_value + .clone() + .unwrap_or_default() + .is_empty() + { // Add empty src attribute - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("src")), - value: Tendril::from_slice(""), - }); + set_node_attr(node, "src", Some(str!())); } else { // Add data URL src attribute let img_full_url = resolve_url( &url, - if !img_data_src.is_empty() { - img_data_src + if !img_attr_data_src_value + .clone() + .unwrap_or_default() + .is_empty() + { + img_attr_data_src_value.unwrap_or_default() } else { - img_src + 
img_attr_src_value.unwrap_or_default() }, ) .unwrap_or_default(); let img_url_fragment = get_url_fragment(img_full_url.clone()); + match retrieve_asset( cache, client, @@ -862,36 +813,32 @@ pub fn walk_and_embed_assets( img_data_url.as_str(), img_url_fragment.as_str(), ); - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("src")), - value: Tendril::from_slice(assembled_url.as_ref()), - }); + set_node_attr(node, "src", Some(assembled_url)); } Err(_) => { - // Keep remote reference if unable to retrieve the asset if is_http_url(img_full_url.clone()) { + // Keep remote reference if unable to retrieve the asset let assembled_url: String = url_with_fragment( img_full_url.as_str(), img_url_fragment.as_str(), ); - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("src")), - value: Tendril::from_slice(assembled_url.as_ref()), - }); + set_node_attr(node, "src", Some(assembled_url)); + } else { + // Don't keep original reference if it's not a remote target + set_node_attr(node, "src", None); } } } } } - if !img_srcset.is_empty() { - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("srcset")), - value: Tendril::from_slice( - embed_srcset(cache, client, &url, &img_srcset, options, depth) - .as_ref(), - ), - }); + // Resolve srcset attribute + if let Some(img_srcset) = get_node_attr(node, "srcset") { + if !img_srcset.is_empty() { + let resolved_srcset: String = + embed_srcset(cache, client, &url, &img_srcset, options, depth); + set_node_attr(node, "srcset", Some(resolved_srcset)); + } } } "svg" => { @@ -900,76 +847,55 @@ pub fn walk_and_embed_assets( } } "input" => { - if let Some(attr_value) = get_node_attr(node, "type") { - if attr_value.to_string().eq_ignore_ascii_case("image") { - let mut input_image_src: String = str!(); - let mut i = 0; - while i < attrs_mut.len() { - let attr_name: &str = &attrs_mut[i].name.local; - if attr_name.eq_ignore_ascii_case("src") { - input_image_src = 
str!(attrs_mut.remove(i).value.trim()); - } else { - i += 1; - } - } - - if options.no_images || input_image_src.is_empty() { - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("src")), - value: Tendril::from_slice(if input_image_src.is_empty() { - "" + if let Some(input_attr_type_value) = get_node_attr(node, "type") { + if input_attr_type_value.eq_ignore_ascii_case("image") { + if let Some(input_attr_src_value) = get_node_attr(node, "src") { + if options.no_images || input_attr_src_value.is_empty() { + let value = if input_attr_src_value.is_empty() { + str!() } else { - empty_image!() - }), - }); - } else { - let input_image_full_url = - resolve_url(&url, input_image_src).unwrap_or_default(); - let input_image_url_fragment = - get_url_fragment(input_image_full_url.clone()); - match retrieve_asset( - cache, - client, - &url, - &input_image_full_url, - options, - depth + 1, - ) { - Ok(( - input_image_data, - input_image_final_url, - input_image_media_type, - )) => { - let input_image_data_url = data_to_data_url( - &input_image_media_type, - &input_image_data, - &input_image_final_url, - ); - // Add data URL src attribute - let assembled_url: String = url_with_fragment( - input_image_data_url.as_str(), - input_image_url_fragment.as_str(), - ); - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("src")), - value: Tendril::from_slice(assembled_url.as_ref()), - }); - } - Err(_) => { - // Keep remote reference if unable to retrieve the asset - if is_http_url(input_image_full_url.clone()) { + str!(empty_image!()) + }; + set_node_attr(node, "src", Some(value)); + } else { + let input_image_full_url = + resolve_url(&url, input_attr_src_value).unwrap_or_default(); + let input_image_url_fragment = + get_url_fragment(input_image_full_url.clone()); + match retrieve_asset( + cache, + client, + &url, + &input_image_full_url, + options, + depth + 1, + ) { + Ok(( + input_image_data, + input_image_final_url, + input_image_media_type, 
+ )) => { + let input_image_data_url = data_to_data_url( + &input_image_media_type, + &input_image_data, + &input_image_final_url, + ); + // Add data URL src attribute let assembled_url: String = url_with_fragment( - input_image_full_url.as_str(), + input_image_data_url.as_str(), input_image_url_fragment.as_str(), ); - attrs_mut.push(Attribute { - name: QualName::new( - None, - ns!(), - local_name!("src"), - ), - value: Tendril::from_slice(assembled_url.as_ref()), - }); + set_node_attr(node, "src", Some(assembled_url)); + } + Err(_) => { + // Keep remote reference if unable to retrieve the asset + if is_http_url(input_image_full_url.clone()) { + let assembled_url: String = url_with_fragment( + input_image_full_url.as_str(), + input_image_url_fragment.as_str(), + ); + set_node_attr(node, "src", Some(assembled_url)); + } } } } @@ -978,17 +904,18 @@ pub fn walk_and_embed_assets( } } "image" => { - // Find and remove (xlink:)href attribute(s), keep value of the last one let mut image_href: String = str!(); - let mut i = 0; - while i < attrs_mut.len() { - let attr_name: &str = &attrs_mut[i].name.local; - if attr_name.eq_ignore_ascii_case("xlink:href") - || attr_name.eq_ignore_ascii_case("href") - { - image_href = str!(attrs_mut.remove(i).value.trim()); - } else { - i += 1; + + if let Some(image_attr_href_value) = get_node_attr(node, "href") { + image_href = image_attr_href_value; + if options.no_images { + set_node_attr(node, "href", None); + } + } + if let Some(image_attr_xlink_href_value) = get_node_attr(node, "xlink:href") { + image_href = image_attr_xlink_href_value; + if options.no_images { + set_node_attr(node, "xlink:href", None); } } @@ -1014,10 +941,7 @@ pub fn walk_and_embed_assets( image_data_url.as_str(), image_url_fragment.as_str(), ); - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("href")), - value: Tendril::from_slice(assembled_url.as_ref()), - }); + set_node_attr(node, "href", Some(assembled_url)); } Err(_) => { // Keep 
remote reference if unable to retrieve the asset @@ -1026,65 +950,55 @@ pub fn walk_and_embed_assets( image_full_url.as_str(), image_url_fragment.as_str(), ); - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("href")), - value: Tendril::from_slice(assembled_url.as_ref()), - }); + set_node_attr(node, "href", Some(assembled_url)); } } } } } "source" => { - for attr in attrs_mut.iter_mut() { - let attr_name: &str = &attr.name.local; - - if attr_name.eq_ignore_ascii_case("src") { - let src_full_url = resolve_url(&url, attr.value.trim()) - .unwrap_or_else(|_| attr.value.to_string()); - attr.value.clear(); - attr.value.push_slice(src_full_url.as_str()); - } else if attr_name.eq_ignore_ascii_case("srcset") { - if get_node_name(&get_parent_node(&node)) == Some("picture") { - if options.no_images { - attr.value.clear(); - attr.value.push_slice(empty_image!()); - } else { - let srcset_full_url = - resolve_url(&url, attr.value.trim()).unwrap_or_default(); - let srcset_url_fragment = - get_url_fragment(srcset_full_url.clone()); - match retrieve_asset( - cache, - client, - &url, - &srcset_full_url, - options, - depth + 1, - ) { - Ok((srcset_data, srcset_final_url, srcset_media_type)) => { - let srcset_data_url = data_to_data_url( - &srcset_media_type, - &srcset_data, - &srcset_final_url, - ); - attr.value.clear(); + if let Some(source_attr_src_value) = get_node_attr(node, "src") { + let src_full_url: String = resolve_url(&url, source_attr_src_value.clone()) + .unwrap_or_else(|_| source_attr_src_value.to_string()); + set_node_attr(node, "src", Some(src_full_url)); + } + + if let Some(source_attr_srcset_value) = get_node_attr(node, "srcset") { + if get_node_name(&get_parent_node(&node)) == Some("picture") { + if options.no_images { + set_node_attr(node, "srcset", Some(str!(empty_image!()))); + } else { + let srcset_full_url = + resolve_url(&url, source_attr_srcset_value).unwrap_or_default(); + let srcset_url_fragment = 
get_url_fragment(srcset_full_url.clone()); + match retrieve_asset( + cache, + client, + &url, + &srcset_full_url, + options, + depth + 1, + ) { + Ok((srcset_data, srcset_final_url, srcset_media_type)) => { + let srcset_data_url = data_to_data_url( + &srcset_media_type, + &srcset_data, + &srcset_final_url, + ); + let assembled_url: String = url_with_fragment( + srcset_data_url.as_str(), + srcset_url_fragment.as_str(), + ); + set_node_attr(node, "srcset", Some(assembled_url)); + } + Err(_) => { + // Keep remote reference if unable to retrieve the asset + if is_http_url(srcset_full_url.clone()) { let assembled_url: String = url_with_fragment( - srcset_data_url.as_str(), + srcset_full_url.as_str(), srcset_url_fragment.as_str(), ); - attr.value.push_slice(assembled_url.as_str()); - } - Err(_) => { - // Keep remote reference if unable to retrieve the asset - if is_http_url(srcset_full_url.clone()) { - attr.value.clear(); - let assembled_url: String = url_with_fragment( - srcset_full_url.as_str(), - srcset_url_fragment.as_str(), - ); - attr.value.push_slice(assembled_url.as_str()); - } + set_node_attr(node, "srcset", Some(assembled_url)); } } } @@ -1093,50 +1007,47 @@ pub fn walk_and_embed_assets( } } "a" | "area" => { - for attr in attrs_mut.iter_mut() { - let attr_name: &str = &attr.name.local; - if attr_name.eq_ignore_ascii_case("href") { - let attr_value = attr.value.trim(); - - if options.no_js && attr_value.trim().starts_with("javascript:") { - attr.value.clear(); - // Replace with empty JS call to preserve original behavior - attr.value.push_slice("javascript:;"); - continue; - } - + if let Some(anchor_attr_href_value) = get_node_attr(node, "href") { + if options.no_js + && anchor_attr_href_value + .clone() + .trim() + .starts_with("javascript:") + { + // Replace with empty JS call to preserve original behavior + set_node_attr(node, "href", Some(str!("javascript:;"))); + } else if anchor_attr_href_value.clone().starts_with('#') + || 
url_has_protocol(anchor_attr_href_value.clone()) + { // Don't touch email links or hrefs which begin with a hash - if attr_value.starts_with('#') || url_has_protocol(attr_value) { - continue; - } - - let href_full_url = resolve_url(&url, attr_value).unwrap_or_default(); - attr.value.clear(); - attr.value.push_slice(href_full_url.as_str()); + } else { + let href_full_url = + resolve_url(&url, anchor_attr_href_value).unwrap_or_default(); + set_node_attr(node, "href", Some(href_full_url)); } } } "script" => { - // Remove integrity and src attributes, keep values of the last ones - let mut script_integrity: String = str!(); - let mut script_src: String = str!(); - let mut i = 0; - while i < attrs_mut.len() { - let attr_name: &str = &attrs_mut[i].name.local; - if attr_name.eq_ignore_ascii_case("integrity") { - script_integrity = str!(attrs_mut.remove(i).value.trim()); - } else if attr_name.eq_ignore_ascii_case("src") { - script_src = str!(attrs_mut.remove(i).value.trim()); - } else { - i += 1; - } + // Read values of integrity and src attributes + let script_attr_integrity: Option = get_node_attr(node, "integrity"); + let script_attr_src: Option = get_node_attr(node, "src"); + + // Wipe integrity attribute + if script_attr_integrity != None { + set_node_attr(node, "integrity", None); } if options.no_js { - // Empty inner content (src is already gone) + // Empty inner content node.children.borrow_mut().clear(); - } else if !script_src.is_empty() { - let script_full_url = resolve_url(&url, script_src).unwrap_or_default(); + // Remove src attribute + if script_attr_src != None { + set_node_attr(node, "src", None); + } + } else if !script_attr_src.clone().unwrap_or_default().is_empty() { + let script_full_url = + resolve_url(&url, script_attr_src.unwrap_or_default()) + .unwrap_or_default(); match retrieve_asset( cache, client, @@ -1146,29 +1057,37 @@ pub fn walk_and_embed_assets( depth + 1, ) { Ok((script_data, script_final_url, _script_media_type)) => { - // Only embed if 
we're able to validate integrity - if script_integrity.is_empty() - || check_integrity(&script_data, &script_integrity) - { + let mut ok_to_include = true; + + // Check integrity + if let Some(script_attr_integrity_value) = script_attr_integrity { + if !script_attr_integrity_value.is_empty() { + ok_to_include = check_integrity( + &script_data, + &script_attr_integrity_value, + ); + } + } + + if ok_to_include { + // Only embed if we're able to validate integrity let script_data_url = data_to_data_url( "application/javascript", &script_data, &script_final_url, ); - // Add new data URL src attribute - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("src")), - value: Tendril::from_slice(script_data_url.as_ref()), - }); + set_node_attr(node, "src", Some(script_data_url)); + } else { + set_node_attr(node, "src", None); } } Err(_) => { - // Keep remote reference if unable to retrieve the asset if is_http_url(script_full_url.clone()) { - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("src")), - value: Tendril::from_slice(script_full_url.as_ref()), - }); + // Keep remote reference if unable to retrieve the asset + set_node_attr(node, "src", Some(script_full_url)); + } else { + // Remove src attribute if target is not remote + set_node_attr(node, "src", None); } } }; @@ -1197,86 +1116,74 @@ pub fn walk_and_embed_assets( } } "form" => { - for attr in attrs_mut.iter_mut() { - let attr_name: &str = &attr.name.local; - if attr_name.eq_ignore_ascii_case("action") { - let form_action = attr.value.trim(); - // Modify action property to ensure it's a full URL - if !is_http_url(form_action) { - let form_action_full_url = - resolve_url(&url, form_action).unwrap_or_default(); - attr.value.clear(); - attr.value.push_slice(form_action_full_url.as_str()); - } + if let Some(form_attr_action_value) = get_node_attr(node, "action") { + // Modify action property to ensure it's a full URL + if !is_http_url(form_attr_action_value.clone()) { + 
let form_action_full_url = + resolve_url(&url, form_attr_action_value).unwrap_or_default(); + set_node_attr(node, "action", Some(form_action_full_url)); } } } "frame" | "iframe" => { - for attr in attrs_mut.iter_mut() { - let attr_name: &str = &attr.name.local; - if attr_name.eq_ignore_ascii_case("src") { - if options.no_frames { - // Empty the src attribute - attr.value.clear(); - continue; - } - - let frame_src = attr.value.trim(); - - // Ignore (i)frames with empty source — they cause infinite loops - if frame_src.is_empty() { - continue; - } + if let Some(frame_attr_src_value) = get_node_attr(node, "src") { + if options.no_frames { + // Empty the src attribute + set_node_attr(node, "src", Some(str!())); + } else { + let frame_src = frame_attr_src_value.trim(); - let frame_full_url = resolve_url(&url, frame_src).unwrap_or_default(); - let frame_url_fragment = get_url_fragment(frame_full_url.clone()); - match retrieve_asset( - cache, - client, - &url, - &frame_full_url, - options, - depth + 1, - ) { - Ok((frame_data, frame_final_url, frame_media_type)) => { - let frame_dom = - html_to_dom(&String::from_utf8_lossy(&frame_data)); - walk_and_embed_assets( - cache, - client, - &frame_final_url, - &frame_dom.document, - &options, - depth + 1, - ); - let mut frame_data: Vec = Vec::new(); - serialize( - &mut frame_data, - &frame_dom.document, - SerializeOpts::default(), - ) - .unwrap(); - let frame_data_url = data_to_data_url( - &frame_media_type, - &frame_data, - &frame_final_url, - ); - attr.value.clear(); - let assembled_url: String = url_with_fragment( - frame_data_url.as_str(), - frame_url_fragment.as_str(), - ); - attr.value.push_slice(assembled_url.as_str()); - } - Err(_) => { - // Keep remote reference if unable to retrieve the asset - if is_http_url(frame_full_url.clone()) { - attr.value.clear(); + // Ignore (i)frames with empty source (they cause infinite loops) + if !frame_src.is_empty() { + let frame_full_url = + resolve_url(&url, 
frame_src).unwrap_or_default(); + let frame_url_fragment = get_url_fragment(frame_full_url.clone()); + match retrieve_asset( + cache, + client, + &url, + &frame_full_url, + options, + depth + 1, + ) { + Ok((frame_data, frame_final_url, frame_media_type)) => { + let frame_dom = + html_to_dom(&String::from_utf8_lossy(&frame_data)); + walk_and_embed_assets( + cache, + client, + &frame_final_url, + &frame_dom.document, + &options, + depth + 1, + ); + let mut frame_data: Vec = Vec::new(); + serialize( + &mut frame_data, + &frame_dom.document, + SerializeOpts::default(), + ) + .unwrap(); + let frame_data_url = data_to_data_url( + &frame_media_type, + &frame_data, + &frame_final_url, + ); let assembled_url: String = url_with_fragment( - frame_full_url.as_str(), + frame_data_url.as_str(), frame_url_fragment.as_str(), ); - attr.value.push_slice(assembled_url.as_str()); + set_node_attr(node, "src", Some(assembled_url)); + } + Err(_) => { + // Keep remote reference if unable to retrieve the asset + if is_http_url(frame_full_url.clone()) { + let assembled_url: String = url_with_fragment( + frame_full_url.as_str(), + frame_url_fragment.as_str(), + ); + set_node_attr(node, "src", Some(assembled_url)); + } } } } @@ -1284,59 +1191,49 @@ pub fn walk_and_embed_assets( } } "video" => { - for attr in attrs_mut.iter_mut() { - let attr_name: &str = &attr.name.local; - if attr_name.eq_ignore_ascii_case("poster") { - let video_poster_url = attr.value.trim(); - - // Skip posters with empty source - if video_poster_url.is_empty() { - continue; - } - + if let Some(video_attr_poster_value) = get_node_attr(node, "poster") { + // Skip posters with empty source + if !video_attr_poster_value.is_empty() { if options.no_images { - attr.value.clear(); - continue; - } - - let video_poster_full_url = - resolve_url(&url, video_poster_url).unwrap_or_default(); - let video_poster_url_fragment = - get_url_fragment(video_poster_full_url.clone()); - match retrieve_asset( - cache, - client, - &url, - 
&video_poster_full_url, - options, - depth + 1, - ) { - Ok(( - video_poster_data, - video_poster_final_url, - video_poster_media_type, - )) => { - let video_poster_data_url = data_to_data_url( - &video_poster_media_type, - &video_poster_data, - &video_poster_final_url, - ); - attr.value.clear(); - let assembled_url: String = url_with_fragment( - video_poster_data_url.as_str(), - video_poster_url_fragment.as_str(), - ); - attr.value.push_slice(assembled_url.as_str()); - } - Err(_) => { - // Keep remote reference if unable to retrieve the asset - if is_http_url(video_poster_full_url.clone()) { - attr.value.clear(); + set_node_attr(node, "poster", Some(str!(empty_image!()))); + } else { + let video_poster_full_url = + resolve_url(&url, video_attr_poster_value).unwrap_or_default(); + let video_poster_url_fragment = + get_url_fragment(video_poster_full_url.clone()); + match retrieve_asset( + cache, + client, + &url, + &video_poster_full_url, + options, + depth + 1, + ) { + Ok(( + video_poster_data, + video_poster_final_url, + video_poster_media_type, + )) => { + let video_poster_data_url = data_to_data_url( + &video_poster_media_type, + &video_poster_data, + &video_poster_final_url, + ); let assembled_url: String = url_with_fragment( - video_poster_full_url.as_str(), + video_poster_data_url.as_str(), video_poster_url_fragment.as_str(), ); - attr.value.push_slice(assembled_url.as_str()); + set_node_attr(node, "poster", Some(assembled_url)); + } + Err(_) => { + // Keep remote reference if unable to retrieve the asset + if is_http_url(video_poster_full_url.clone()) { + let assembled_url: String = url_with_fragment( + video_poster_full_url.as_str(), + video_poster_url_fragment.as_str(), + ); + set_node_attr(node, "poster", Some(assembled_url)); + } } } } @@ -1349,39 +1246,21 @@ pub fn walk_and_embed_assets( // Process style attributes if options.no_css { // Get rid of style attributes - let mut i = 0; - while i < attrs_mut.len() { - let attr_name: &str = 
&attrs_mut[i].name.local; - if attr_name.eq_ignore_ascii_case("style") { - attrs_mut.remove(i); - } else { - i += 1; - } - } + set_node_attr(node, "style", None); } else { - // Otherwise, parse any links found in the attributes - for attribute in attrs_mut - .iter_mut() - .filter(|a| a.name.local.as_ref().eq_ignore_ascii_case("style")) - { - let replacement = embed_css( - cache, - client, - &url, - attribute.value.as_ref(), - options, - depth, - ); - // let replacement = str!(); - attribute.value.clear(); - attribute.value.push_slice(&replacement); + // Embed URLs found within the style attribute of this node + if let Some(node_attr_style_value) = get_node_attr(node, "style") { + let embedded_style = + embed_css(cache, client, &url, &node_attr_style_value, options, depth); + set_node_attr(node, "style", Some(embedded_style)); } } if options.no_js { + let attrs_mut = &mut attrs.borrow_mut(); // Get rid of JS event attributes let mut js_attr_indexes = Vec::new(); - for (i, attr) in attrs_mut.iter_mut().enumerate() { + for (i, attr) in attrs_mut.iter().enumerate() { if attr_is_event_handler(&attr.name.local) { js_attr_indexes.push(i); } diff --git a/src/tests/cli/basic.rs b/src/tests/cli/basic.rs index 14c84c4..629d938 100644 --- a/src/tests/cli/basic.rs +++ b/src/tests/cli/basic.rs @@ -337,7 +337,7 @@ mod passing { Local HTML file\n \ \n \ \n\n\n\n \ - \"\"\n \ + \"\"\n \ Tricky href\n \ Remote URL\n \ \n\n\n\n\ @@ -399,7 +399,7 @@ mod passing { Local HTML file\n \ \n \ \n\n\n\n \ - \"\"\n \ + \"\"\n \ Tricky href\n \ Remote URL\n \ \n\n\n\n\ diff --git a/src/tests/html/set_node_attr.rs b/src/tests/html/set_node_attr.rs index 73e7766..140895b 100644 --- a/src/tests/html/set_node_attr.rs +++ b/src/tests/html/set_node_attr.rs @@ -63,4 +63,43 @@ mod passing { assert_eq!(count, 5); } + + #[test] + fn body_background() { + let html = ""; + let dom = html::html_to_dom(&html); + let mut count = 0; + + fn test_walk(node: &Handle, i: &mut i8) { + *i += 1; + + match &node.data 
{ + NodeData::Document => { + // Dig deeper + for child in node.children.borrow().iter() { + test_walk(child, &mut *i); + } + } + NodeData::Element { ref name, .. } => { + let node_name = name.local.as_ref().to_string(); + + if node_name == "body" { + assert_eq!(html::get_node_attr(node, "background"), Some(str!("1"))); + + html::set_node_attr(node, "background", None); + assert_eq!(html::get_node_attr(node, "background"), None); + } + + for child in node.children.borrow().iter() { + test_walk(child, &mut *i); + } + } + _ => (), + }; + } + + test_walk(&dom.document, &mut count); + + assert_eq!(count, 5); + } } diff --git a/src/tests/html/walk_and_embed_assets.rs b/src/tests/html/walk_and_embed_assets.rs index 894bad2..901574a 100644 --- a/src/tests/html/walk_and_embed_assets.rs +++ b/src/tests/html/walk_and_embed_assets.rs @@ -319,8 +319,8 @@ mod passing { buf.iter().map(|&c| c as char).collect::(), "\ \ - \ - \ + \ + \ \ \ " From 816b6175ac307c39a2c0a8e8d4d56aabc97ebb5d Mon Sep 17 00:00:00 2001 From: Sunshine Date: Fri, 25 Dec 2020 12:06:56 -1000 Subject: [PATCH 3/3] rewrite ADR #8 (Base Tag) --- docs/arch/0008-base-tag.md | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/docs/arch/0008-base-tag.md b/docs/arch/0008-base-tag.md index ce21919..491b504 100644 --- a/docs/arch/0008-base-tag.md +++ b/docs/arch/0008-base-tag.md @@ -1,6 +1,6 @@ # 8. Base Tag -Date: 2020-11-22 +Date: 2020-12-25 ## Status @@ -8,20 +8,33 @@ Accepted ## Context -HTML documents may contain `base` tag within `head`, which influences URL resolution prefix for anchor and relative links as well as dynamically loaded resources. Sometimes to make certain saved pages function closer to how they originally operated, the `base` tag specifying the source page's URL may need to be added to the document. +HTML documents may contain `base` tag, which influences resolution of anchor links and relative URLs as well as dynamically loaded resources. 
+ +Sometimes, in order to make certain saved documents function closer to how they operate while being served from a remote server, the `base` tag specifying the source page's URL may need to be added to the document. + +There can be only one such tag. If multiple `base` tags are present, only the first encountered tag ends up being used. ## Decision -Adding the `base` tag should be optional. Saved documents should not contain the `base` tag unless it was requested by the user, or unless the document originally had the `base` tag in it. Only documents donwloaded from remote resources should be able to obtain a new `base` tag, existing `base` tags within documents saved from data URLs and local resources should be kept intact. -The existing `href` attribute's value of the original `base` tag should be used for resolving document's relative links instead of document's own URL. -There can be only one such tag. If multiple `base` tags are provided, only the first encountered tag will end up being used. +Adding the `base` tag should be optional — saved documents should not contain the `base` tag unless it was specified by the user, or the document originally had the `base` tag in it. + +Existing `href` attribute's value of the original `base` tag should be used for resolving the document's relative links instead of document's own URL (precisely the way browsers do it). ## Consequences -In case the remote document had the `base` tag in it: - - By default: the `href` attribute should be resolved to a full URL if it's relative, kept empty in case it was empty or non-existent, all other attributes of that tag should be kept intact. - - If `base` tag was requested to be added: the exsting `base` tag's `href` attribute should be set to page's full URL, all other attributes should be kept intact. 
+#### If the base tag does not exist in the source document + +- If the base tag does not exist in the source document + - With base URL option provided + - use the specified base URL value to retrieve assets, keep original base URL value in the document + - Without base URL option provided + - download document as usual, do not add base tag +- If the base tag already exists in the source document + - With base URL option provided + - we overwrite the original base URL before retrieving assets, keep new base URL value in the document + - Without base URL option provided + - use the base URL from the original document to retrieve assets, keep original base URL value in the document + +The program will obtain the ability to retrieve remote assets for non-remote sources (such as data URLs and local files). -In case the remote document didn't have the `base` tag in it: - - By default: no `base` tag is added to the document, it gets saved to disk without having one. - - If `base` tag was requested to be added: the added `base` tag should contain only one attribute `href`, equal to the remote URL of that HTML document. +The program will obtain the ability to get rid of existing base tag values (by providing an empty one).