From d89b4d5f5bf32259c2f83c9fa731d1749933995b Mon Sep 17 00:00:00 2001 From: Sunshine Date: Fri, 25 Dec 2020 11:09:47 -1000 Subject: [PATCH] refactor code that processes the DOM --- src/html.rs | 1117 ++++++++++------------- src/tests/cli/basic.rs | 4 +- src/tests/html/set_node_attr.rs | 39 + src/tests/html/walk_and_embed_assets.rs | 4 +- 4 files changed, 541 insertions(+), 623 deletions(-) diff --git a/src/html.rs b/src/html.rs index 7fadd65..bfe19e6 100644 --- a/src/html.rs +++ b/src/html.rs @@ -4,7 +4,7 @@ use html5ever::interface::QualName; use html5ever::parse_document; use html5ever::rcdom::{Handle, NodeData, RcDom}; use html5ever::serialize::{serialize, SerializeOpts}; -use html5ever::tendril::{format_tendril, Tendril, TendrilSink}; +use html5ever::tendril::{format_tendril, TendrilSink}; use html5ever::tree_builder::{Attribute, TreeSink}; use html5ever::{local_name, namespace_url, ns, LocalName}; use reqwest::blocking::Client; @@ -493,39 +493,33 @@ pub fn walk_and_embed_assets( ref attrs, .. } => { - let attrs_mut = &mut attrs.borrow_mut(); - match name.local.as_ref() { "meta" => { - // Determine type - let mut is_unwanted_meta: bool = false; - for attr in attrs_mut.iter_mut() { - let attr_name: &str = &attr.name.local; - if attr_name.eq_ignore_ascii_case("http-equiv") { - let value: String = attr.value.to_string(); - is_unwanted_meta = value.eq_ignore_ascii_case("refresh") - || value.eq_ignore_ascii_case("location"); - } - } - - if is_unwanted_meta { - // Strip this node off all its attributes - while attrs_mut.len() > 0 { - attrs_mut.remove(0); + // Remove http-equiv attributes from META nodes if they're able to control the page + if let Some(meta_attr_http_equiv_value) = get_node_attr(node, "http-equiv") { + let meta_attr_http_equiv_value: &str = &meta_attr_http_equiv_value; + if meta_attr_http_equiv_value.eq_ignore_ascii_case("refresh") + || meta_attr_http_equiv_value.eq_ignore_ascii_case("location") + { + set_node_attr( + &node, + "http-equiv", + Some(format!( + "disabled by monolith ({})", + meta_attr_http_equiv_value + )), + ); } } } "link" => { - // Remove integrity attributes, keep value of the last one - let mut integrity: String = str!(); - let mut i = 0; - while i < attrs_mut.len() { - let attr_name: &str = &attrs_mut[i].name.local; - if attr_name.eq_ignore_ascii_case("integrity") { - integrity = str!(attrs_mut.remove(i).value.trim()); - } else { - i += 1; - } + // Read and remember integrity attribute value of this LINK node + let link_attr_integrity_value: Option = + get_node_attr(node, "integrity"); + + // Remove integrity attribute from the LINK node + if link_attr_integrity_value != None { + set_node_attr(node, "integrity", None); } enum LinkType { @@ -537,195 +531,167 @@ pub fn walk_and_embed_assets( } let mut link_type = LinkType::Unknown; - for attr in attrs_mut.iter_mut() { - if &attr.name.local == "rel" { - let value = attr.value.trim(); - if is_icon(value) { - link_type = LinkType::Icon; - break; - } else if value.eq_ignore_ascii_case("stylesheet") { - link_type = LinkType::Stylesheet; - break; - } else if value.eq_ignore_ascii_case("preload") { - link_type = LinkType::Preload; - break; - } else if value.eq_ignore_ascii_case("dns-prefetch") { - link_type = LinkType::DnsPrefetch; - break; - } + if let Some(link_attr_rel_value) = get_node_attr(node, "rel") { + if is_icon(&link_attr_rel_value) { + link_type = LinkType::Icon; + } else if link_attr_rel_value.eq_ignore_ascii_case("stylesheet") { + link_type = LinkType::Stylesheet; + } else if link_attr_rel_value.eq_ignore_ascii_case("preload") { + link_type = LinkType::Preload; + } else if link_attr_rel_value.eq_ignore_ascii_case("dns-prefetch") { + link_type = LinkType::DnsPrefetch; } } + // Shadow the variable (to make it non-mutable) let link_type = link_type; match link_type { LinkType::Icon => { - // Find and remove href attribute(s), keep value of the last found one - let mut link_href: String = str!(); - let mut i = 0; - while i < attrs_mut.len() { - let attr_name: &str = &attrs_mut[i].name.local; - if attr_name.eq_ignore_ascii_case("href") { - link_href = str!(attrs_mut.remove(i).value.trim()); - } else { - i += 1; - } - } + // Find and resolve this LINK node's href attribute + if let Some(link_attr_href_value) = get_node_attr(node, "href") { + if !options.no_images && !link_attr_href_value.is_empty() { + let link_href_full_url = + resolve_url(&url, link_attr_href_value).unwrap_or_default(); + let link_href_url_fragment = + get_url_fragment(link_href_full_url.clone()); + match retrieve_asset( + cache, + client, + &url, + &link_href_full_url, + options, + depth + 1, + ) { + Ok(( + link_href_data, + link_href_final_url, + link_href_media_type, + )) => { + let mut ok_to_include = true; + + // Check integrity + if let Some(link_attr_integrity_value) = + link_attr_integrity_value + { + if !link_attr_integrity_value.is_empty() { + ok_to_include = check_integrity( + &link_href_data, + &link_attr_integrity_value, + ); + } + } - if !options.no_images && !link_href.is_empty() { - let link_href_full_url = - resolve_url(&url, link_href).unwrap_or_default(); - let link_href_url_fragment = - get_url_fragment(link_href_full_url.clone()); - match retrieve_asset( - cache, - client, - &url, - &link_href_full_url, - options, - depth + 1, - ) { - Ok(( - link_href_data, - link_href_final_url, - link_href_media_type, - )) => { - // Check integrity - if integrity.is_empty() - || check_integrity(&link_href_data, &integrity) - { - let link_href_data_url = data_to_data_url( - &link_href_media_type, - &link_href_data, - &link_href_final_url, - ); - // Add new data URL href attribute - let assembled_url: String = url_with_fragment( - link_href_data_url.as_str(), - link_href_url_fragment.as_str(), - ); - attrs_mut.push(Attribute { - name: QualName::new( - None, - ns!(), - local_name!("href"), - ), - value: Tendril::from_slice(assembled_url.as_ref()), - }); + if ok_to_include { + let link_href_data_url = data_to_data_url( + &link_href_media_type, + &link_href_data, + &link_href_final_url, + ); + // Add new data URL href attribute + let assembled_url: String = url_with_fragment( + link_href_data_url.as_str(), + link_href_url_fragment.as_str(), + ); + set_node_attr(&node, "href", Some(assembled_url)); + } } - } - Err(_) => { - // Keep remote reference if unable to retrieve the asset - if is_http_url(link_href_full_url.clone()) { - let assembled_url: String = url_with_fragment( - link_href_full_url.as_str(), - link_href_url_fragment.as_str(), - ); - attrs_mut.push(Attribute { - name: QualName::new( - None, - ns!(), - local_name!("href"), - ), - value: Tendril::from_slice(assembled_url.as_ref()), - }); + Err(_) => { + // Keep remote reference if unable to retrieve the asset + if is_http_url(link_href_full_url.clone()) { + let assembled_url: String = url_with_fragment( + link_href_full_url.as_str(), + link_href_url_fragment.as_str(), + ); + set_node_attr(node, "href", Some(assembled_url)); + } } } + } else { + set_node_attr(node, "href", None); } } } LinkType::Stylesheet => { - // Find and remove href attribute(s), keep value of the last found one - let mut link_href: String = str!(); - let mut i = 0; - while i < attrs_mut.len() { - let attr_name: &str = &attrs_mut[i].name.local; - if attr_name.eq_ignore_ascii_case("href") { - link_href = str!(attrs_mut.remove(i).value.trim()); - } else { - i += 1; - } - } + // Find and resolve this LINK node's href attribute + if let Some(link_attr_href_value) = get_node_attr(node, "href") { + set_node_attr(node, "href", None); - if !options.no_css && !link_href.is_empty() { - let link_href_full_url = - resolve_url(&url, link_href).unwrap_or_default(); - match retrieve_asset( - cache, - client, - &url, - &link_href_full_url, - options, - depth + 1, - ) { - Ok(( - link_href_data, - link_href_final_url, - _link_href_media_type, - )) => { - // Check integrity - if integrity.is_empty() - || check_integrity(&link_href_data, &integrity) - { - let css: String = embed_css( - cache, - client, - &link_href_final_url, - &String::from_utf8_lossy(&link_href_data), - options, - depth + 1, - ); - let link_href_data_url = data_to_data_url( - "text/css", - css.as_bytes(), - &link_href_final_url, - ); - // Add new data URL href attribute - attrs_mut.push(Attribute { - name: QualName::new( - None, - ns!(), - local_name!("href"), - ), - value: Tendril::from_slice( - link_href_data_url.as_ref(), - ), - }); + if !options.no_css && !link_attr_href_value.is_empty() { + let link_href_full_url = + resolve_url(&url, link_attr_href_value).unwrap_or_default(); + match retrieve_asset( + cache, + client, + &url, + &link_href_full_url, + options, + depth + 1, + ) { + Ok(( + link_href_data, + link_href_final_url, + _link_href_media_type, + )) => { + let mut ok_to_include = true; + + // Check integrity + if let Some(link_attr_integrity_value) = + link_attr_integrity_value + { + if !link_attr_integrity_value.is_empty() { + ok_to_include = check_integrity( + &link_href_data, + &link_attr_integrity_value, + ); + } + } + + if ok_to_include { + let css: String = embed_css( + cache, + client, + &link_href_final_url, + &String::from_utf8_lossy(&link_href_data), + options, + depth + 1, + ); + let link_href_data_url = data_to_data_url( + "text/css", + css.as_bytes(), + &link_href_final_url, + ); + // Add new data URL href attribute + set_node_attr( + &node, + "href", + Some(link_href_data_url), + ); + } } - } - Err(_) => { - // Keep remote reference if unable to retrieve the asset - if is_http_url(link_href_full_url.clone()) { - attrs_mut.push(Attribute { - name: QualName::new( - None, - ns!(), - local_name!("href"), - ), - value: Tendril::from_slice( - link_href_full_url.as_ref(), - ), - }); + Err(_) => { + // Keep remote reference if unable to retrieve the asset + if is_http_url(link_href_full_url.clone()) { + set_node_attr( + &node, + "href", + Some(link_href_full_url), + ); + } } } } } } LinkType::Preload | LinkType::DnsPrefetch => { - // Since all resources are embedded as data URL, preloading and prefetching are unnecessary - for _ in 0..attrs_mut.len() { - attrs_mut.remove(0); - } + // Since all resources are embedded as data URLs, preloading and prefetching are not necessary + set_node_attr(node, "rel", None); } LinkType::Unknown => { // Make sure that all other LINKs' href attributes are full URLs - for attr in attrs_mut.iter_mut() { - let attr_name: &str = &attr.name.local; - if attr_name.eq_ignore_ascii_case("href") { - let href_full_url = - resolve_url(&url, attr.value.trim()).unwrap_or_default(); - attr.value.clear(); - attr.value.push_slice(&href_full_url.as_str()); - } + if let Some(link_attr_href_value) = get_node_attr(node, "href") { + let href_full_url = + resolve_url(&url, link_attr_href_value).unwrap_or_default(); + set_node_attr(node, "href", Some(href_full_url)); } } } @@ -733,68 +699,58 @@ pub fn walk_and_embed_assets( "base" => { if is_http_url(url) { // Ensure the BASE node doesn't have a relative URL - for attr in attrs_mut.iter_mut() { - let attr_name: &str = &attr.name.local; - if attr_name.eq_ignore_ascii_case("href") { - let href_full_url = - resolve_url(&url, attr.value.trim()).unwrap_or_default(); - attr.value.clear(); - attr.value.push_slice(&href_full_url.as_str()); - } + if let Some(base_attr_href_value) = get_node_attr(node, "href") { + let href_full_url = + resolve_url(&url, base_attr_href_value).unwrap_or_default(); + set_node_attr(node, "href", Some(href_full_url)); } } } "body" => { - // Find and remove background attribute(s), keep value of the last found one - let mut background: String = str!(); - let mut i = 0; - while i < attrs_mut.len() { - let attr_name: &str = &attrs_mut[i].name.local; - if attr_name.eq_ignore_ascii_case("background") { - background = str!(attrs_mut.remove(i).value.trim()); - } else { - i += 1; - } - } - - if !options.no_images && !background.is_empty() { - let background_full_url = resolve_url(&url, background).unwrap_or_default(); - let background_url_fragment = get_url_fragment(background_full_url.clone()); - match retrieve_asset( - cache, - client, - &url, - &background_full_url, - options, - depth + 1, - ) { - Ok((background_data, background_final_url, background_media_type)) => { - let background_data_url = data_to_data_url( - &background_media_type, - &background_data, - &background_final_url, - ); - // Add new data URL background attribute - let assembled_url: String = url_with_fragment( - background_data_url.as_str(), - background_url_fragment.as_str(), - ); - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("background")), - value: Tendril::from_slice(assembled_url.as_ref()), - }); - } - Err(_) => { - // Keep remote reference if unable to retrieve the asset - if is_http_url(background_full_url.clone()) { + // Read and remember background attribute value of this BODY node + if let Some(body_attr_background_value) = get_node_attr(node, "background") { + // Remove background BODY node attribute by default + set_node_attr(node, "background", None); + + if !options.no_images && !body_attr_background_value.is_empty() { + let background_full_url = + resolve_url(&url, body_attr_background_value).unwrap_or_default(); + let background_url_fragment = + get_url_fragment(background_full_url.clone()); + match retrieve_asset( + cache, + client, + &url, + &background_full_url, + options, + depth + 1, + ) { + Ok(( + background_data, + background_final_url, + background_media_type, + )) => { + let background_data_url = data_to_data_url( + &background_media_type, + &background_data, + &background_final_url, + ); + // Convert background attribute to data URL let assembled_url: String = url_with_fragment( - background_full_url.as_str(), + background_data_url.as_str(), background_url_fragment.as_str(), ); - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("background")), - value: Tendril::from_slice(assembled_url.as_ref()), - }); + set_node_attr(node, "background", Some(assembled_url)); + } + Err(_) => { + // Keep remote reference if unable to retrieve the asset + if is_http_url(background_full_url.clone()) { + let assembled_url: String = url_with_fragment( + background_full_url.as_str(), + background_url_fragment.as_str(), + ); + set_node_attr(node, "background", Some(assembled_url)); + } } } } @@ -802,48 +758,43 @@ pub fn walk_and_embed_assets( } "img" => { // Find source attribute(s) - let mut img_data_src: String = str!(); - let mut img_src: String = str!(); - let mut img_srcset: String = str!(); - let mut i = 0; - while i < attrs_mut.len() { - let attr_name: &str = &attrs_mut[i].name.local; - if attr_name.eq_ignore_ascii_case("data-src") { - img_data_src = str!(attrs_mut.remove(i).value.trim()); - } else if attr_name.eq_ignore_ascii_case("src") { - img_src = str!(attrs_mut.remove(i).value.trim()); - } else if attr_name.eq_ignore_ascii_case("srcset") { - img_srcset = str!(attrs_mut.remove(i).value.trim()); - } else { - i += 1; - } - } + let img_attr_src_value: Option = get_node_attr(node, "src"); + let img_attr_data_src_value: Option = get_node_attr(node, "data-src"); if options.no_images { - // Add empty image src attribute - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("src")), - value: Tendril::from_slice(empty_image!()), - }); + // Put empty images into src and data-src attributes + if img_attr_src_value != None { + set_node_attr(node, "src", Some(str!(empty_image!()))); + } + if img_attr_data_src_value != None { + set_node_attr(node, "data-src", Some(str!(empty_image!()))); + } } else { - if img_src.is_empty() && img_data_src.is_empty() { + if img_attr_src_value.clone().unwrap_or_default().is_empty() + && img_attr_data_src_value + .clone() + .unwrap_or_default() + .is_empty() + { // Add empty src attribute - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("src")), - value: Tendril::from_slice(""), - }); + set_node_attr(node, "src", Some(str!())); } else { // Add data URL src attribute let img_full_url = resolve_url( &url, - if !img_data_src.is_empty() { - img_data_src + if !img_attr_data_src_value + .clone() + .unwrap_or_default() + .is_empty() + { + img_attr_data_src_value.unwrap_or_default() } else { - img_src + img_attr_src_value.unwrap_or_default() }, ) .unwrap_or_default(); let img_url_fragment = get_url_fragment(img_full_url.clone()); + match retrieve_asset( cache, client, @@ -862,36 +813,32 @@ pub fn walk_and_embed_assets( img_data_url.as_str(), img_url_fragment.as_str(), ); - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("src")), - value: Tendril::from_slice(assembled_url.as_ref()), - }); + set_node_attr(node, "src", Some(assembled_url)); } Err(_) => { - // Keep remote reference if unable to retrieve the asset if is_http_url(img_full_url.clone()) { + // Keep remote reference if unable to retrieve the asset let assembled_url: String = url_with_fragment( img_full_url.as_str(), img_url_fragment.as_str(), ); - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("src")), - value: Tendril::from_slice(assembled_url.as_ref()), - }); + set_node_attr(node, "src", Some(assembled_url)); + } else { + // Don't keep original reference if it's not a remote target + set_node_attr(node, "src", None); } } } } } - if !img_srcset.is_empty() { - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("srcset")), - value: Tendril::from_slice( - embed_srcset(cache, client, &url, &img_srcset, options, depth) - .as_ref(), - ), - }); + // Resolve srcset attribute + if let Some(img_srcset) = get_node_attr(node, "srcset") { + if !img_srcset.is_empty() { + let resolved_srcset: String = + embed_srcset(cache, client, &url, &img_srcset, options, depth); + set_node_attr(node, "srcset", Some(resolved_srcset)); + } } } "svg" => { @@ -900,76 +847,55 @@ pub fn walk_and_embed_assets( } } "input" => { - if let Some(attr_value) = get_node_attr(node, "type") { - if attr_value.to_string().eq_ignore_ascii_case("image") { - let mut input_image_src: String = str!(); - let mut i = 0; - while i < attrs_mut.len() { - let attr_name: &str = &attrs_mut[i].name.local; - if attr_name.eq_ignore_ascii_case("src") { - input_image_src = str!(attrs_mut.remove(i).value.trim()); - } else { - i += 1; - } - } - - if options.no_images || input_image_src.is_empty() { - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("src")), - value: Tendril::from_slice(if input_image_src.is_empty() { - "" + if let Some(input_attr_type_value) = get_node_attr(node, "type") { + if input_attr_type_value.eq_ignore_ascii_case("image") { + if let Some(input_attr_src_value) = get_node_attr(node, "src") { + if options.no_images || input_attr_src_value.is_empty() { + let value = if input_attr_src_value.is_empty() { + str!() } else { - empty_image!() - }), - }); - } else { - let input_image_full_url = - resolve_url(&url, input_image_src).unwrap_or_default(); - let input_image_url_fragment = - get_url_fragment(input_image_full_url.clone()); - match retrieve_asset( - cache, - client, - &url, - &input_image_full_url, - options, - depth + 1, - ) { - Ok(( - input_image_data, - input_image_final_url, - input_image_media_type, - )) => { - let input_image_data_url = data_to_data_url( - &input_image_media_type, - &input_image_data, - &input_image_final_url, - ); - // Add data URL src attribute - let assembled_url: String = url_with_fragment( - input_image_data_url.as_str(), - input_image_url_fragment.as_str(), - ); - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("src")), - value: Tendril::from_slice(assembled_url.as_ref()), - }); - } - Err(_) => { - // Keep remote reference if unable to retrieve the asset - if is_http_url(input_image_full_url.clone()) { + str!(empty_image!()) + }; + set_node_attr(node, "src", Some(value)); + } else { + let input_image_full_url = + resolve_url(&url, input_attr_src_value).unwrap_or_default(); + let input_image_url_fragment = + get_url_fragment(input_image_full_url.clone()); + match retrieve_asset( + cache, + client, + &url, + &input_image_full_url, + options, + depth + 1, + ) { + Ok(( + input_image_data, + input_image_final_url, + input_image_media_type, + )) => { + let input_image_data_url = data_to_data_url( + &input_image_media_type, + &input_image_data, + &input_image_final_url, + ); + // Add data URL src attribute let assembled_url: String = url_with_fragment( - input_image_full_url.as_str(), + input_image_data_url.as_str(), input_image_url_fragment.as_str(), ); - attrs_mut.push(Attribute { - name: QualName::new( - None, - ns!(), - local_name!("src"), - ), - value: Tendril::from_slice(assembled_url.as_ref()), - }); + set_node_attr(node, "src", Some(assembled_url)); + } + Err(_) => { + // Keep remote reference if unable to retrieve the asset + if is_http_url(input_image_full_url.clone()) { + let assembled_url: String = url_with_fragment( + input_image_full_url.as_str(), + input_image_url_fragment.as_str(), + ); + set_node_attr(node, "src", Some(assembled_url)); + } } } } @@ -978,17 +904,18 @@ pub fn walk_and_embed_assets( } } "image" => { - // Find and remove (xlink:)href attribute(s), keep value of the last one let mut image_href: String = str!(); - let mut i = 0; - while i < attrs_mut.len() { - let attr_name: &str = &attrs_mut[i].name.local; - if attr_name.eq_ignore_ascii_case("xlink:href") - || attr_name.eq_ignore_ascii_case("href") - { - image_href = str!(attrs_mut.remove(i).value.trim()); - } else { - i += 1; + + if let Some(image_attr_href_value) = get_node_attr(node, "href") { + image_href = image_attr_href_value; + if options.no_images { + set_node_attr(node, "href", None); + } + } + if let Some(image_attr_xlink_href_value) = get_node_attr(node, "xlink:href") { + image_href = image_attr_xlink_href_value; + if options.no_images { + set_node_attr(node, "xlink:href", None); } } @@ -1014,10 +941,7 @@ pub fn walk_and_embed_assets( image_data_url.as_str(), image_url_fragment.as_str(), ); - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("href")), - value: Tendril::from_slice(assembled_url.as_ref()), - }); + set_node_attr(node, "href", Some(assembled_url)); } Err(_) => { // Keep remote reference if unable to retrieve the asset @@ -1026,65 +950,55 @@ pub fn walk_and_embed_assets( image_full_url.as_str(), image_url_fragment.as_str(), ); - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("href")), - value: Tendril::from_slice(assembled_url.as_ref()), - }); + set_node_attr(node, "href", Some(assembled_url)); } } } } } "source" => { - for attr in attrs_mut.iter_mut() { - let attr_name: &str = &attr.name.local; - - if attr_name.eq_ignore_ascii_case("src") { - let src_full_url = resolve_url(&url, attr.value.trim()) - .unwrap_or_else(|_| attr.value.to_string()); - attr.value.clear(); - attr.value.push_slice(src_full_url.as_str()); - } else if attr_name.eq_ignore_ascii_case("srcset") { - if get_node_name(&get_parent_node(&node)) == Some("picture") { - if options.no_images { - attr.value.clear(); - attr.value.push_slice(empty_image!()); - } else { - let srcset_full_url = - resolve_url(&url, attr.value.trim()).unwrap_or_default(); - let srcset_url_fragment = - get_url_fragment(srcset_full_url.clone()); - match retrieve_asset( - cache, - client, - &url, - &srcset_full_url, - options, - depth + 1, - ) { - Ok((srcset_data, srcset_final_url, srcset_media_type)) => { - let srcset_data_url = data_to_data_url( - &srcset_media_type, - &srcset_data, - &srcset_final_url, - ); - attr.value.clear(); + if let Some(source_attr_src_value) = get_node_attr(node, "src") { + let src_full_url: String = resolve_url(&url, source_attr_src_value.clone()) + .unwrap_or_else(|_| source_attr_src_value.to_string()); + set_node_attr(node, "src", Some(src_full_url)); + } + + if let Some(source_attr_srcset_value) = get_node_attr(node, "srcset") { + if get_node_name(&get_parent_node(&node)) == Some("picture") { + if options.no_images { + set_node_attr(node, "srcset", Some(str!(empty_image!()))); + } else { + let srcset_full_url = + resolve_url(&url, source_attr_srcset_value).unwrap_or_default(); + let srcset_url_fragment = get_url_fragment(srcset_full_url.clone()); + match retrieve_asset( + cache, + client, + &url, + &srcset_full_url, + options, + depth + 1, + ) { + Ok((srcset_data, srcset_final_url, srcset_media_type)) => { + let srcset_data_url = data_to_data_url( + &srcset_media_type, + &srcset_data, + &srcset_final_url, + ); + let assembled_url: String = url_with_fragment( + srcset_data_url.as_str(), + srcset_url_fragment.as_str(), + ); + set_node_attr(node, "srcset", Some(assembled_url)); + } + Err(_) => { + // Keep remote reference if unable to retrieve the asset + if is_http_url(srcset_full_url.clone()) { let assembled_url: String = url_with_fragment( - srcset_data_url.as_str(), + srcset_full_url.as_str(), srcset_url_fragment.as_str(), ); - attr.value.push_slice(assembled_url.as_str()); - } - Err(_) => { - // Keep remote reference if unable to retrieve the asset - if is_http_url(srcset_full_url.clone()) { - attr.value.clear(); - let assembled_url: String = url_with_fragment( - srcset_full_url.as_str(), - srcset_url_fragment.as_str(), - ); - attr.value.push_slice(assembled_url.as_str()); - } + set_node_attr(node, "srcset", Some(assembled_url)); } } } @@ -1093,50 +1007,47 @@ pub fn walk_and_embed_assets( } } "a" | "area" => { - for attr in attrs_mut.iter_mut() { - let attr_name: &str = &attr.name.local; - if attr_name.eq_ignore_ascii_case("href") { - let attr_value = attr.value.trim(); - - if options.no_js && attr_value.trim().starts_with("javascript:") { - attr.value.clear(); - // Replace with empty JS call to preserve original behavior - attr.value.push_slice("javascript:;"); - continue; - } - + if let Some(anchor_attr_href_value) = get_node_attr(node, "href") { + if options.no_js + && anchor_attr_href_value + .clone() + .trim() + .starts_with("javascript:") + { + // Replace with empty JS call to preserve original behavior + set_node_attr(node, "href", Some(str!("javascript:;"))); + } else if anchor_attr_href_value.clone().starts_with('#') + || url_has_protocol(anchor_attr_href_value.clone()) + { // Don't touch email links or hrefs which begin with a hash - if attr_value.starts_with('#') || url_has_protocol(attr_value) { - continue; - } - - let href_full_url = resolve_url(&url, attr_value).unwrap_or_default(); - attr.value.clear(); - attr.value.push_slice(href_full_url.as_str()); + } else { + let href_full_url = + resolve_url(&url, anchor_attr_href_value).unwrap_or_default(); + set_node_attr(node, "href", Some(href_full_url)); } } } "script" => { - // Remove integrity and src attributes, keep values of the last ones - let mut script_integrity: String = str!(); - let mut script_src: String = str!(); - let mut i = 0; - while i < attrs_mut.len() { - let attr_name: &str = &attrs_mut[i].name.local; - if attr_name.eq_ignore_ascii_case("integrity") { - script_integrity = str!(attrs_mut.remove(i).value.trim()); - } else if attr_name.eq_ignore_ascii_case("src") { - script_src = str!(attrs_mut.remove(i).value.trim()); - } else { - i += 1; - } + // Read values of integrity and src attributes + let script_attr_integrity: Option = get_node_attr(node, "integrity"); + let script_attr_src: Option = get_node_attr(node, "src"); + + // Wipe integrity attribute + if script_attr_integrity != None { + set_node_attr(node, "integrity", None); } if options.no_js { - // Empty inner content (src is already gone) + // Empty inner content node.children.borrow_mut().clear(); - } else if !script_src.is_empty() { - let script_full_url = resolve_url(&url, script_src).unwrap_or_default(); + // Remove src attribute + if script_attr_src != None { + set_node_attr(node, "src", None); + } + } else if !script_attr_src.clone().unwrap_or_default().is_empty() { + let script_full_url = + resolve_url(&url, script_attr_src.unwrap_or_default()) + .unwrap_or_default(); match retrieve_asset( cache, client, @@ -1146,29 +1057,37 @@ pub fn walk_and_embed_assets( depth + 1, ) { Ok((script_data, script_final_url, _script_media_type)) => { - // Only embed if we're able to validate integrity - if script_integrity.is_empty() - || check_integrity(&script_data, &script_integrity) - { + let mut ok_to_include = true; + + // Check integrity + if let Some(script_attr_integrity_value) = script_attr_integrity { + if !script_attr_integrity_value.is_empty() { + ok_to_include = check_integrity( + &script_data, + &script_attr_integrity_value, + ); + } + } + + if ok_to_include { + // Only embed if we're able to validate integrity let script_data_url = data_to_data_url( "application/javascript", &script_data, &script_final_url, ); - // Add new data URL src attribute - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("src")), - value: Tendril::from_slice(script_data_url.as_ref()), - }); + set_node_attr(node, "src", Some(script_data_url)); + } else { + set_node_attr(node, "src", None); } } Err(_) => { - // Keep remote reference if unable to retrieve the asset if is_http_url(script_full_url.clone()) { - attrs_mut.push(Attribute { - name: QualName::new(None, ns!(), local_name!("src")), - value: Tendril::from_slice(script_full_url.as_ref()), - }); + // Keep remote reference if unable to retrieve the asset + set_node_attr(node, "src", Some(script_full_url)); + } else { + // Remove src attribute if target is not remote + set_node_attr(node, "src", None); } } }; @@ -1197,86 +1116,74 @@ pub fn walk_and_embed_assets( } } "form" => { - for attr in attrs_mut.iter_mut() { - let attr_name: &str = &attr.name.local; - if attr_name.eq_ignore_ascii_case("action") { - let form_action = attr.value.trim(); - // Modify action property to ensure it's a full URL - if !is_http_url(form_action) { - let form_action_full_url = - resolve_url(&url, form_action).unwrap_or_default(); - attr.value.clear(); - attr.value.push_slice(form_action_full_url.as_str()); - } + if let Some(form_attr_action_value) = get_node_attr(node, "action") { + // Modify action property to ensure it's a full URL + if !is_http_url(form_attr_action_value.clone()) { + let form_action_full_url = + resolve_url(&url, form_attr_action_value).unwrap_or_default(); + set_node_attr(node, "action", Some(form_action_full_url)); } } } "frame" | "iframe" => { - for attr in attrs_mut.iter_mut() { - let attr_name: &str = &attr.name.local; - if attr_name.eq_ignore_ascii_case("src") { - if options.no_frames { - // Empty the src attribute - attr.value.clear(); - continue; - } - - let frame_src = attr.value.trim(); - - // Ignore (i)frames with empty source — they cause infinite loops - if frame_src.is_empty() { - continue; - } + if let Some(frame_attr_src_value) = get_node_attr(node, "src") { + if options.no_frames { + // Empty the src attribute + set_node_attr(node, "src", Some(str!())); + } else { + let frame_src = frame_attr_src_value.trim(); - let frame_full_url = resolve_url(&url, frame_src).unwrap_or_default(); - let frame_url_fragment = get_url_fragment(frame_full_url.clone()); - match retrieve_asset( - cache, - client, - &url, - &frame_full_url, - options, - depth + 1, - ) { - Ok((frame_data, frame_final_url, frame_media_type)) => { - let frame_dom = - html_to_dom(&String::from_utf8_lossy(&frame_data)); - walk_and_embed_assets( - cache, - client, - &frame_final_url, - &frame_dom.document, - &options, - depth + 1, - ); - let mut frame_data: Vec = Vec::new(); - serialize( - &mut frame_data, - &frame_dom.document, - SerializeOpts::default(), - ) - .unwrap(); - let frame_data_url = data_to_data_url( - &frame_media_type, - &frame_data, - &frame_final_url, - ); - attr.value.clear(); - let assembled_url: String = url_with_fragment( - frame_data_url.as_str(), - frame_url_fragment.as_str(), - ); - attr.value.push_slice(assembled_url.as_str()); - } - Err(_) => { - // Keep remote reference if unable to retrieve the asset - if is_http_url(frame_full_url.clone()) { - attr.value.clear(); + // Ignore (i)frames with empty source (they cause infinite loops) + if !frame_src.is_empty() { + let frame_full_url = + resolve_url(&url, frame_src).unwrap_or_default(); + let frame_url_fragment = get_url_fragment(frame_full_url.clone()); + match retrieve_asset( + cache, + client, + &url, + &frame_full_url, + options, + depth + 1, + ) { + Ok((frame_data, frame_final_url, frame_media_type)) => { + let frame_dom = + html_to_dom(&String::from_utf8_lossy(&frame_data)); + walk_and_embed_assets( + cache, + client, + &frame_final_url, + &frame_dom.document, + &options, + depth + 1, + ); + let mut frame_data: Vec = Vec::new(); + serialize( + &mut frame_data, + &frame_dom.document, + SerializeOpts::default(), + ) + .unwrap(); + let frame_data_url = data_to_data_url( + &frame_media_type, + &frame_data, + &frame_final_url, + ); let assembled_url: String = url_with_fragment( - frame_full_url.as_str(), + frame_data_url.as_str(), frame_url_fragment.as_str(), ); - attr.value.push_slice(assembled_url.as_str()); + set_node_attr(node, "src", Some(assembled_url)); + } + Err(_) => { + // Keep remote reference if unable to retrieve the asset + if is_http_url(frame_full_url.clone()) { + let assembled_url: String = url_with_fragment( + frame_full_url.as_str(), + frame_url_fragment.as_str(), + ); + set_node_attr(node, "src", Some(assembled_url)); + } } } } @@ -1284,59 +1191,49 @@ pub fn walk_and_embed_assets( } } "video" => { - for attr in attrs_mut.iter_mut() { - let attr_name: &str = &attr.name.local; - if attr_name.eq_ignore_ascii_case("poster") { - let video_poster_url = attr.value.trim(); - - // Skip posters with empty source - if video_poster_url.is_empty() { - continue; - } - + if let Some(video_attr_poster_value) = get_node_attr(node, "poster") { + // Skip posters with empty source + if !video_attr_poster_value.is_empty() { if options.no_images { - attr.value.clear(); - continue; - } - - let video_poster_full_url = - resolve_url(&url, video_poster_url).unwrap_or_default(); - let video_poster_url_fragment = - get_url_fragment(video_poster_full_url.clone()); - match retrieve_asset( - cache, - client, - &url, - &video_poster_full_url, - options, - depth + 1, - ) { - Ok(( - video_poster_data, - video_poster_final_url, - video_poster_media_type, - )) => { - let video_poster_data_url = data_to_data_url( - &video_poster_media_type, - &video_poster_data, - &video_poster_final_url, - ); - attr.value.clear(); - let assembled_url: String = url_with_fragment( - video_poster_data_url.as_str(), - video_poster_url_fragment.as_str(), - ); - attr.value.push_slice(assembled_url.as_str()); - } - Err(_) => { - // Keep remote reference if unable to retrieve the asset - if is_http_url(video_poster_full_url.clone()) { - attr.value.clear(); + set_node_attr(node, "poster", Some(str!(empty_image!()))); + } else { + let video_poster_full_url = + resolve_url(&url, video_attr_poster_value).unwrap_or_default(); + let video_poster_url_fragment = + get_url_fragment(video_poster_full_url.clone()); + match retrieve_asset( + cache, + client, + &url, + &video_poster_full_url, + options, + depth + 1, + ) { + Ok(( + video_poster_data, + video_poster_final_url, + video_poster_media_type, + )) => { + let video_poster_data_url = data_to_data_url( + &video_poster_media_type, + &video_poster_data, + &video_poster_final_url, + ); let assembled_url: String = url_with_fragment( - video_poster_full_url.as_str(), + video_poster_data_url.as_str(), video_poster_url_fragment.as_str(), ); - attr.value.push_slice(assembled_url.as_str()); + set_node_attr(node, "poster", Some(assembled_url)); + } + Err(_) => { + // Keep remote reference if unable to retrieve the asset + if is_http_url(video_poster_full_url.clone()) { + let assembled_url: String = url_with_fragment( + video_poster_full_url.as_str(), + video_poster_url_fragment.as_str(), + ); + set_node_attr(node, "poster", Some(assembled_url)); + } } } } @@ -1349,39 +1246,21 @@ pub fn walk_and_embed_assets( // Process style attributes if options.no_css { // Get rid of style attributes - let mut i = 0; - while i < attrs_mut.len() { - let attr_name: &str = &attrs_mut[i].name.local; - if attr_name.eq_ignore_ascii_case("style") { - attrs_mut.remove(i); - } else { - i += 1; - } - } + set_node_attr(node, "style", None); } else { - // Otherwise, parse any links found in the attributes - for attribute in attrs_mut - .iter_mut() - .filter(|a| a.name.local.as_ref().eq_ignore_ascii_case("style")) - { - let replacement = embed_css( - cache, - client, - &url, - attribute.value.as_ref(), - options, - depth, - ); - // let replacement = str!(); - attribute.value.clear(); - attribute.value.push_slice(&replacement); + // Embed URLs found within the style attribute of this node + if let Some(node_attr_style_value) = get_node_attr(node, "style") { + let embedded_style = + embed_css(cache, client, &url, &node_attr_style_value, options, depth); + set_node_attr(node, "style", Some(embedded_style)); } } if options.no_js { + let attrs_mut = &mut attrs.borrow_mut(); // Get rid of JS event attributes let mut js_attr_indexes = Vec::new(); - for (i, attr) in attrs_mut.iter_mut().enumerate() { + for (i, attr) in attrs_mut.iter().enumerate() { if attr_is_event_handler(&attr.name.local) { js_attr_indexes.push(i); } diff --git a/src/tests/cli/basic.rs b/src/tests/cli/basic.rs index 14c84c4..629d938 100644 --- a/src/tests/cli/basic.rs +++ b/src/tests/cli/basic.rs @@ -337,7 +337,7 @@ mod passing { Local HTML file\n \ \n \ \n\n\n\n \ - \"\"\n \ + \"\"\n \ Tricky href\n \ Remote URL\n \ \n\n\n\n\ @@ -399,7 +399,7 @@ mod passing { Local HTML file\n \ \n \ \n\n\n\n \ - \"\"\n \ + \"\"\n \ Tricky href\n \ Remote URL\n \ \n\n\n\n\ diff --git a/src/tests/html/set_node_attr.rs b/src/tests/html/set_node_attr.rs index 73e7766..140895b 100644 --- a/src/tests/html/set_node_attr.rs +++ b/src/tests/html/set_node_attr.rs @@ -63,4 +63,43 @@ mod passing { assert_eq!(count, 5); } + + #[test] + fn body_background() { + let html = ""; + let dom = html::html_to_dom(&html); + let mut count = 0; + + fn test_walk(node: &Handle, i: &mut i8) { + *i += 1; + + match &node.data { + NodeData::Document => { + // Dig deeper + for child in node.children.borrow().iter() { + test_walk(child, &mut *i); + } + } + NodeData::Element { ref name, .. } => { + let node_name = name.local.as_ref().to_string(); + + if node_name == "body" { + assert_eq!(html::get_node_attr(node, "background"), Some(str!("1"))); + + html::set_node_attr(node, "background", None); + assert_eq!(html::get_node_attr(node, "background"), None); + } + + for child in node.children.borrow().iter() { + test_walk(child, &mut *i); + } + } + _ => (), + }; + } + + test_walk(&dom.document, &mut count); + + assert_eq!(count, 5); + } } diff --git a/src/tests/html/walk_and_embed_assets.rs b/src/tests/html/walk_and_embed_assets.rs index 894bad2..901574a 100644 --- a/src/tests/html/walk_and_embed_assets.rs +++ b/src/tests/html/walk_and_embed_assets.rs @@ -319,8 +319,8 @@ mod passing { buf.iter().map(|&c| c as char).collect::(), "\ \ - \ - \ + \ + \ \ \ "