use crate::css::embed_css;
use crate::js::attr_is_event_handler;
use crate::utils::{
    data_to_data_url, get_url_fragment, is_http_url, resolve_url, retrieve_asset, url_has_protocol,
};
use base64;
use html5ever::interface::QualName;
use html5ever::parse_document;
use html5ever::rcdom::{Handle, NodeData, RcDom};
use html5ever::serialize::{serialize, SerializeOpts};
use html5ever::tendril::{format_tendril, Tendril, TendrilSink};
use html5ever::tree_builder::{Attribute, TreeSink};
use html5ever::{local_name, namespace_url, ns};
use reqwest::blocking::Client;
use sha2::{Digest, Sha256, Sha384, Sha512};
use std::collections::HashMap;
use std::default::Default;

/// One candidate of a `srcset` attribute: the image path and the optional
/// descriptor that follows it (empty when the candidate has none).
struct SrcSetItem<'a> {
    path: &'a str,
    descriptor: &'a str,
}

/// Lower-case `rel` values that mark a `<link>` as an icon.
const ICON_VALUES: &[&str] = &[
    "icon",
    "shortcut icon",
    "mask-icon",
    "apple-touch-icon",
    "fluid-icon",
];

/// Returns the parent of `node`.
///
/// The weak parent pointer is moved out of its cell only long enough to be
/// upgraded and is then put back, so the node stays attached and the function
/// can be called on the same node more than once.
///
/// # Panics
///
/// Panics if `node` has no parent or if the parent has already been dropped.
pub fn get_parent_node(node: &Handle) -> Handle {
    let weak_parent = node.parent.take();
    let parent = weak_parent.as_ref().and_then(|weak| weak.upgrade());
    // Restore the pointer: `take()` left `None` behind, which would otherwise
    // detach the node from its parent as a side effect of a read-only query.
    node.parent.set(weak_parent);
    parent.expect("node has no live parent")
}

/// Returns the local tag name of `node` when it is an element, `None` for any
/// other kind of node.
pub fn get_node_name(node: &Handle) -> Option<&'_ str> {
    match &node.data {
        NodeData::Element { ref name, .. } => Some(name.local.as_ref()),
        _ => None,
    }
}

/// Returns `true` when `attr_value`, compared case-insensitively, is exactly
/// one of [`ICON_VALUES`].
pub fn is_icon(attr_value: &str) -> bool {
    ICON_VALUES.contains(&attr_value.to_lowercase().as_str())
}

/// Checks `data` against an `integrity` value of the form `sha256-<base64>`,
/// `sha384-<base64>` or `sha512-<base64>`.
///
/// Returns `true` only when the base64-encoded digest of `data` equals the
/// text after the prefix; any other prefix (including an empty string) yields
/// `false`.
pub fn has_proper_integrity(data: &[u8], integrity: &str) -> bool {
    // All three supported prefixes are 7 bytes long, hence the `[7..]` slices.
    if integrity.starts_with("sha256-") {
        let mut hasher = Sha256::new();
        hasher.update(data);
        base64::encode(hasher.finalize()) == integrity[7..]
    } else if integrity.starts_with("sha384-") {
        let mut hasher = Sha384::new();
        hasher.update(data);
        base64::encode(hasher.finalize()) == integrity[7..]
    } else if integrity.starts_with("sha512-") {
        let mut hasher = Sha512::new();
        hasher.update(data);
        base64::encode(hasher.finalize()) == integrity[7..]
    } else {
        false
    }
}

/// Rewrites a `srcset` attribute value so that every candidate points at
/// embedded data.
///
/// Each comma-separated candidate is resolved against `parent_url`, fetched
/// through `retrieve_asset` and replaced by a data URL. When the fetch fails,
/// an HTTP(S) URL is kept as a remote reference and anything else becomes the
/// empty-image placeholder. With `opt_no_images` set, every candidate becomes
/// the placeholder without being fetched. The first descriptor of a candidate
/// (e.g. `2x`, `480w`) is preserved, and candidates are joined with `", "`.
///
/// Empty candidates (for example the one produced by a trailing comma) are
/// skipped instead of causing an out-of-bounds panic.
pub fn embed_srcset(
    // NOTE(review): the type parameters of `cache` were missing from the copy
    // of this file under review (`HashMap>`); `String` keys and `Vec<u8>`
    // values are assumed here -- confirm against `utils::retrieve_asset`.
    cache: &mut HashMap<String, Vec<u8>>,
    client: &Client,
    parent_url: &str,
    srcset: &str,
    opt_no_images: bool,
    opt_silent: bool,
) -> String {
    // Split into candidates; a candidate with no path at all is dropped.
    let candidates: Vec<SrcSetItem> = srcset
        .split(',')
        .filter_map(|candidate| {
            let mut parts = candidate.split_whitespace();
            let path = parts.next()?;
            let descriptor = parts.next().unwrap_or("");
            Some(SrcSetItem { path, descriptor })
        })
        .collect();

    let mut rendered: Vec<String> = Vec::with_capacity(candidates.len());
    for candidate in candidates {
        let mut entry = String::new();

        if opt_no_images {
            entry.push_str(empty_image!());
        } else {
            let image_full_url = resolve_url(&parent_url, candidate.path).unwrap_or_default();
            let image_url_fragment = get_url_fragment(image_full_url.clone());
            match retrieve_asset(cache, client, &parent_url, &image_full_url, opt_silent) {
                Ok((image_data, image_final_url, image_media_type)) => {
                    let image_data_url = data_to_data_url(
                        &image_media_type,
                        &image_data,
                        &image_final_url,
                        &image_url_fragment,
                    );
                    // Append retrieved asset as a data URL
                    entry.push_str(image_data_url.as_ref());
                }
                Err(_) => {
                    // Keep remote reference if unable to retrieve the asset
                    if is_http_url(image_full_url.clone()) {
                        entry.push_str(image_full_url.as_ref());
                    } else {
                        // Avoid breaking the structure in case if not an HTTP(S) URL
                        entry.push_str(empty_image!());
                    }
                }
            }
        }

        if !candidate.descriptor.is_empty() {
            entry.push(' ');
            entry.push_str(candidate.descriptor);
        }

        rendered.push(entry);
    }

    rendered.join(", ")
}

pub fn walk_and_embed_assets( cache: &mut HashMap>, client: &Client, url: &str, node: &Handle, opt_no_css: bool, opt_no_fonts: bool, opt_no_frames: bool, opt_no_js: bool, opt_no_images: bool, opt_silent: bool, ) { match node.data { NodeData::Document => { // Dig deeper for child in node.children.borrow().iter() { walk_and_embed_assets( cache, client, &url, child, opt_no_css, opt_no_fonts, opt_no_frames, opt_no_js, 
opt_no_images, opt_silent, ); } } NodeData::Element { ref name, ref attrs, .. } => { let attrs_mut = &mut attrs.borrow_mut(); match name.local.as_ref() { "link" => { // Remove integrity attributes, keep value of the last one let mut integrity: String = str!(); let mut i = 0; while i < attrs_mut.len() { let attr_name: &str = &attrs_mut[i].name.local; if attr_name.eq_ignore_ascii_case("integrity") { integrity = str!(attrs_mut.remove(i).value.trim()); } else { i += 1; } } enum LinkType { Icon, Stylesheet, Preload, DnsPrefetch, Unknown, } let mut link_type = LinkType::Unknown; for attr in attrs_mut.iter_mut() { if &attr.name.local == "rel" { let value = attr.value.trim(); if is_icon(value) { link_type = LinkType::Icon; break; } else if value.eq_ignore_ascii_case("stylesheet") { link_type = LinkType::Stylesheet; break; } else if value.eq_ignore_ascii_case("preload") { link_type = LinkType::Preload; break; } else if value.eq_ignore_ascii_case("dns-prefetch") { link_type = LinkType::DnsPrefetch; break; } } } let link_type = link_type; match link_type { LinkType::Icon => { // Find and remove href attribute(s), keep value of the last found one let mut link_href: String = str!(); let mut i = 0; while i < attrs_mut.len() { let attr_name: &str = &attrs_mut[i].name.local; if attr_name.eq_ignore_ascii_case("href") { link_href = str!(attrs_mut.remove(i).value.trim()); } else { i += 1; } } if !opt_no_images && !link_href.is_empty() { let link_href_full_url = resolve_url(&url, link_href).unwrap_or_default(); let link_href_url_fragment = get_url_fragment(link_href_full_url.clone()); match retrieve_asset( cache, client, &url, &link_href_full_url, opt_silent, ) { Ok(( link_href_data, link_href_final_url, link_href_media_type, )) => { // Check integrity if integrity.is_empty() || has_proper_integrity(&link_href_data, &integrity) { let link_href_data_url = data_to_data_url( &link_href_media_type, &link_href_data, &link_href_final_url, &link_href_url_fragment, ); // Add new data URL href 
attribute attrs_mut.push(Attribute { name: QualName::new( None, ns!(), local_name!("href"), ), value: Tendril::from_slice( link_href_data_url.as_ref(), ), }); } } Err(_) => { // Keep remote reference if unable to retrieve the asset if is_http_url(link_href_full_url.clone()) { attrs_mut.push(Attribute { name: QualName::new( None, ns!(), local_name!("href"), ), value: Tendril::from_slice( link_href_full_url.as_ref(), ), }); } } } } } LinkType::Stylesheet => { // Find and remove href attribute(s), keep value of the last found one let mut link_href: String = str!(); let mut i = 0; while i < attrs_mut.len() { let attr_name: &str = &attrs_mut[i].name.local; if attr_name.eq_ignore_ascii_case("href") { link_href = str!(attrs_mut.remove(i).value.trim()); } else { i += 1; } } if !opt_no_css && !link_href.is_empty() { let link_href_full_url = resolve_url(&url, link_href).unwrap_or_default(); match retrieve_asset( cache, client, &url, &link_href_full_url, opt_silent, ) { Ok(( link_href_data, link_href_final_url, _link_href_media_type, )) => { // Check integrity if integrity.is_empty() || has_proper_integrity(&link_href_data, &integrity) { let css: String = embed_css( cache, client, &link_href_final_url, &String::from_utf8_lossy(&link_href_data), opt_no_fonts, opt_no_images, opt_silent, ); let link_href_data_url = data_to_data_url( "text/css", css.as_bytes(), &link_href_final_url, "", ); // Add new data URL href attribute attrs_mut.push(Attribute { name: QualName::new( None, ns!(), local_name!("href"), ), value: Tendril::from_slice( link_href_data_url.as_ref(), ), }); } } Err(_) => { // Keep remote reference if unable to retrieve the asset if is_http_url(link_href_full_url.clone()) { attrs_mut.push(Attribute { name: QualName::new( None, ns!(), local_name!("href"), ), value: Tendril::from_slice( link_href_full_url.as_ref(), ), }); } } } } } LinkType::Preload | LinkType::DnsPrefetch => { // Since all resources are embedded as data URL, preloading and prefetching are unnecessary 
for _ in 0..attrs_mut.len() { attrs_mut.remove(0); } } LinkType::Unknown => { for attr in attrs_mut.iter_mut() { let attr_name: &str = &attr.name.local; if attr_name.eq_ignore_ascii_case("href") { let href_full_url = resolve_url(&url, attr.value.trim()).unwrap_or_default(); attr.value.clear(); attr.value.push_slice(&href_full_url.as_str()); } } } } } "body" => { // Find and remove background attribute(s), keep value of the last found one let mut background: String = str!(); let mut i = 0; while i < attrs_mut.len() { let attr_name: &str = &attrs_mut[i].name.local; if attr_name.eq_ignore_ascii_case("background") { background = str!(attrs_mut.remove(i).value.trim()); } else { i += 1; } } if !opt_no_images && !background.is_empty() { let background_full_url = resolve_url(&url, background).unwrap_or_default(); let background_url_fragment = get_url_fragment(background_full_url.clone()); match retrieve_asset(cache, client, &url, &background_full_url, opt_silent) { Ok((background_data, background_final_url, background_media_type)) => { let background_data_url = data_to_data_url( &background_media_type, &background_data, &background_final_url, &background_url_fragment, ); // Add new data URL background attribute attrs_mut.push(Attribute { name: QualName::new(None, ns!(), local_name!("background")), value: Tendril::from_slice(background_data_url.as_ref()), }); } Err(_) => { // Keep remote reference if unable to retrieve the asset if is_http_url(background_full_url.clone()) { attrs_mut.push(Attribute { name: QualName::new(None, ns!(), local_name!("background")), value: Tendril::from_slice(background_full_url.as_ref()), }); } } } } } "img" => { // Find source attribute(s) let mut img_data_src: String = str!(); let mut img_src: String = str!(); let mut img_srcset: String = str!(); let mut i = 0; while i < attrs_mut.len() { let attr_name: &str = &attrs_mut[i].name.local; if attr_name.eq_ignore_ascii_case("data-src") { img_data_src = str!(attrs_mut.remove(i).value.trim()); } else 
if attr_name.eq_ignore_ascii_case("src") { img_src = str!(attrs_mut.remove(i).value.trim()); } else if attr_name.eq_ignore_ascii_case("srcset") { img_srcset = str!(attrs_mut.remove(i).value.trim()); } else { i += 1; } } if opt_no_images { // Add empty image src attribute attrs_mut.push(Attribute { name: QualName::new(None, ns!(), local_name!("src")), value: Tendril::from_slice(empty_image!()), }); } else { if img_src.is_empty() && img_data_src.is_empty() { // Add empty src attribute attrs_mut.push(Attribute { name: QualName::new(None, ns!(), local_name!("src")), value: Tendril::from_slice(""), }); } else { // Add data URL src attribute let img_full_url = resolve_url( &url, if !img_data_src.is_empty() { img_data_src } else { img_src }, ) .unwrap_or_default(); let img_url_fragment = get_url_fragment(img_full_url.clone()); match retrieve_asset(cache, client, &url, &img_full_url, opt_silent) { Ok((img_data, img_final_url, img_media_type)) => { let img_data_url = data_to_data_url( &img_media_type, &img_data, &img_final_url, &img_url_fragment, ); attrs_mut.push(Attribute { name: QualName::new(None, ns!(), local_name!("src")), value: Tendril::from_slice(img_data_url.as_ref()), }); } Err(_) => { // Keep remote reference if unable to retrieve the asset if is_http_url(img_full_url.clone()) { attrs_mut.push(Attribute { name: QualName::new(None, ns!(), local_name!("src")), value: Tendril::from_slice(img_full_url.as_ref()), }); } } } } } if !img_srcset.is_empty() { attrs_mut.push(Attribute { name: QualName::new(None, ns!(), local_name!("srcset")), value: Tendril::from_slice( embed_srcset( cache, client, &url, &img_srcset, opt_no_images, opt_silent, ) .as_ref(), ), }); } } "svg" => { if opt_no_images { node.children.borrow_mut().clear(); } } "input" => { // Determine input type let mut is_image_input: bool = false; for attr in attrs_mut.iter_mut() { let attr_name: &str = &attr.name.local; if attr_name.eq_ignore_ascii_case("type") { is_image_input = 
attr.value.to_string().eq_ignore_ascii_case("image"); } } if is_image_input { let mut input_image_src: String = str!(); let mut i = 0; while i < attrs_mut.len() { let attr_name: &str = &attrs_mut[i].name.local; if attr_name.eq_ignore_ascii_case("src") { input_image_src = str!(attrs_mut.remove(i).value.trim()); } else { i += 1; } } if opt_no_images || input_image_src.is_empty() { attrs_mut.push(Attribute { name: QualName::new(None, ns!(), local_name!("src")), value: Tendril::from_slice(if input_image_src.is_empty() { "" } else { empty_image!() }), }); } else { let input_image_full_url = resolve_url(&url, input_image_src).unwrap_or_default(); let input_image_url_fragment = get_url_fragment(input_image_full_url.clone()); match retrieve_asset( cache, client, &url, &input_image_full_url, opt_silent, ) { Ok(( input_image_data, input_image_final_url, input_image_media_type, )) => { let input_image_data_url = data_to_data_url( &input_image_media_type, &input_image_data, &input_image_final_url, &input_image_url_fragment, ); // Add data URL src attribute attrs_mut.push(Attribute { name: QualName::new(None, ns!(), local_name!("src")), value: Tendril::from_slice(input_image_data_url.as_ref()), }); } Err(_) => { // Keep remote reference if unable to retrieve the asset if is_http_url(input_image_full_url.clone()) { attrs_mut.push(Attribute { name: QualName::new(None, ns!(), local_name!("src")), value: Tendril::from_slice( input_image_full_url.as_ref(), ), }); } } } } } } "image" => { // Find and remove (xlink:)href attribute(s), keep value of the last one let mut image_href: String = str!(); let mut i = 0; while i < attrs_mut.len() { let attr_name: &str = &attrs_mut[i].name.local; if attr_name.eq_ignore_ascii_case("xlink:href") || attr_name.eq_ignore_ascii_case("href") { image_href = str!(attrs_mut.remove(i).value.trim()); } else { i += 1; } } if !opt_no_images && !image_href.is_empty() { let image_full_url = resolve_url(&url, image_href).unwrap_or_default(); let 
image_url_fragment = get_url_fragment(image_full_url.clone()); match retrieve_asset(cache, client, &url, &image_full_url, opt_silent) { Ok((image_data, image_final_url, image_media_type)) => { let image_data_url = data_to_data_url( &image_media_type, &image_data, &image_final_url, &image_url_fragment, ); // Add new data URL href attribute attrs_mut.push(Attribute { name: QualName::new(None, ns!(), local_name!("href")), value: Tendril::from_slice(image_data_url.as_ref()), }); } Err(_) => { // Keep remote reference if unable to retrieve the asset if is_http_url(image_full_url.clone()) { attrs_mut.push(Attribute { name: QualName::new(None, ns!(), local_name!("href")), value: Tendril::from_slice(image_full_url.as_ref()), }); } } } } } "source" => { for attr in attrs_mut.iter_mut() { let attr_name: &str = &attr.name.local; if attr_name.eq_ignore_ascii_case("src") { let src_full_url = resolve_url(&url, attr.value.trim()) .unwrap_or_else(|_| attr.value.to_string()); attr.value.clear(); attr.value.push_slice(src_full_url.as_str()); } else if attr_name.eq_ignore_ascii_case("srcset") { if get_node_name(&get_parent_node(&node)) == Some("picture") { if opt_no_images { attr.value.clear(); attr.value.push_slice(empty_image!()); } else { let srcset_full_url = resolve_url(&url, attr.value.trim()).unwrap_or_default(); let srcset_url_fragment = get_url_fragment(srcset_full_url.clone()); match retrieve_asset( cache, client, &url, &srcset_full_url, opt_silent, ) { Ok((srcset_data, srcset_final_url, srcset_media_type)) => { let srcset_data_url = data_to_data_url( &srcset_media_type, &srcset_data, &srcset_final_url, &srcset_url_fragment, ); attr.value.clear(); attr.value.push_slice(srcset_data_url.as_str()); } Err(_) => { // Keep remote reference if unable to retrieve the asset if is_http_url(srcset_full_url.clone()) { attr.value.clear(); attr.value.push_slice(srcset_full_url.as_str()); if !srcset_url_fragment.is_empty() { attr.value.push_slice("#"); attr.value 
.push_slice(srcset_url_fragment.as_str()); } } } } } } } } } "a" | "area" => { for attr in attrs_mut.iter_mut() { let attr_name: &str = &attr.name.local; if attr_name.eq_ignore_ascii_case("href") { let attr_value = attr.value.trim(); if opt_no_js && attr_value.starts_with("javascript:") { attr.value.clear(); // Replace with empty JS call to preserve original behavior attr.value.push_slice("javascript:;"); continue; } // Don't touch email links or hrefs which begin with a hash sign if attr_value.starts_with('#') || url_has_protocol(attr_value) { continue; } let href_full_url = resolve_url(&url, attr_value).unwrap_or_default(); attr.value.clear(); attr.value.push_slice(href_full_url.as_str()); } } } "script" => { // Remove integrity and src attributes, keep values of the last ones let mut script_integrity: String = str!(); let mut script_src: String = str!(); let mut i = 0; while i < attrs_mut.len() { let attr_name: &str = &attrs_mut[i].name.local; if attr_name.eq_ignore_ascii_case("integrity") { script_integrity = str!(attrs_mut.remove(i).value.trim()); } else if attr_name.eq_ignore_ascii_case("src") { script_src = str!(attrs_mut.remove(i).value.trim()); } else { i += 1; } } if opt_no_js { // Empty inner content (src is already gone) node.children.borrow_mut().clear(); } else if !script_src.is_empty() { let script_full_url = resolve_url(&url, script_src).unwrap_or_default(); match retrieve_asset(cache, client, &url, &script_full_url, opt_silent) { Ok((script_data, script_final_url, _script_media_type)) => { // Only embed if we're able to validate integrity if script_integrity.is_empty() || has_proper_integrity(&script_data, &script_integrity) { let script_data_url = data_to_data_url( "application/javascript", &script_data, &script_final_url, "", ); // Add new data URL src attribute attrs_mut.push(Attribute { name: QualName::new(None, ns!(), local_name!("src")), value: Tendril::from_slice(script_data_url.as_ref()), }); } } Err(_) => { // Keep remote reference if 
unable to retrieve the asset if is_http_url(script_full_url.clone()) { attrs_mut.push(Attribute { name: QualName::new(None, ns!(), local_name!("src")), value: Tendril::from_slice(script_full_url.as_ref()), }); } } }; } } "style" => { if opt_no_css { // Empty inner content of STYLE tags node.children.borrow_mut().clear(); } else { for node in node.children.borrow_mut().iter_mut() { if let NodeData::Text { ref contents } = node.data { let mut tendril = contents.borrow_mut(); let replacement = embed_css( cache, client, &url, tendril.as_ref(), opt_no_fonts, opt_no_images, opt_silent, ); tendril.clear(); tendril.push_slice(&replacement); } } } } "form" => { for attr in attrs_mut.iter_mut() { let attr_name: &str = &attr.name.local; if attr_name.eq_ignore_ascii_case("action") { let form_action = attr.value.trim(); // Modify action property to ensure it's a full URL if !is_http_url(form_action) { let form_action_full_url = resolve_url(&url, form_action).unwrap_or_default(); attr.value.clear(); attr.value.push_slice(form_action_full_url.as_str()); } } } } "frame" | "iframe" => { for attr in attrs_mut.iter_mut() { let attr_name: &str = &attr.name.local; if attr_name.eq_ignore_ascii_case("src") { if opt_no_frames { // Empty the src attribute attr.value.clear(); continue; } let frame_src = attr.value.trim(); // Ignore (i)frames with empty source — they cause infinite loops if frame_src.is_empty() { continue; } let frame_full_url = resolve_url(&url, frame_src).unwrap_or_default(); let frame_url_fragment = get_url_fragment(frame_full_url.clone()); match retrieve_asset(cache, client, &url, &frame_full_url, opt_silent) { Ok((frame_data, frame_final_url, frame_media_type)) => { let frame_dom = html_to_dom(&String::from_utf8_lossy(&frame_data)); walk_and_embed_assets( cache, client, &frame_final_url, &frame_dom.document, opt_no_css, opt_no_fonts, opt_no_frames, opt_no_js, opt_no_images, opt_silent, ); let mut frame_data: Vec = Vec::new(); serialize( &mut frame_data, 
&frame_dom.document, SerializeOpts::default(), ) .unwrap(); let frame_data_url = data_to_data_url( &frame_media_type, &frame_data, &frame_final_url, &frame_url_fragment, ); attr.value.clear(); attr.value.push_slice(frame_data_url.as_str()); } Err(_) => { // Keep remote reference if unable to retrieve the asset if is_http_url(frame_full_url.clone()) { attr.value.clear(); attr.value.push_slice(frame_full_url.as_str()); } } } } } } "video" => { for attr in attrs_mut.iter_mut() { let attr_name: &str = &attr.name.local; if attr_name.eq_ignore_ascii_case("poster") { let video_poster_url = attr.value.trim(); // Skip posters with empty source if video_poster_url.is_empty() { continue; } if opt_no_images { attr.value.clear(); continue; } let video_poster_full_url = resolve_url(&url, video_poster_url).unwrap_or_default(); let video_poster_url_fragment = get_url_fragment(video_poster_full_url.clone()); match retrieve_asset( cache, client, &url, &video_poster_full_url, opt_silent, ) { Ok(( video_poster_data, video_poster_final_url, video_poster_media_type, )) => { let video_poster_data_url = data_to_data_url( &video_poster_media_type, &video_poster_data, &video_poster_final_url, &video_poster_url_fragment, ); attr.value.clear(); attr.value.push_slice(video_poster_data_url.as_str()); } Err(_) => { // Keep remote reference if unable to retrieve the asset if is_http_url(video_poster_full_url.clone()) { attr.value.clear(); attr.value.push_slice(video_poster_full_url.as_str()); } } } } } } _ => {} } // Process style attributes if opt_no_css { // Get rid of style attributes let mut i = 0; while i < attrs_mut.len() { let attr_name: &str = &attrs_mut[i].name.local; if attr_name.eq_ignore_ascii_case("style") { attrs_mut.remove(i); } else { i += 1; } } } else { // Otherwise, parse any links found in the attributes for attribute in attrs_mut .iter_mut() .filter(|a| a.name.local.as_ref().eq_ignore_ascii_case("style")) { let replacement = embed_css( cache, client, &url, 
attribute.value.as_ref(), opt_no_fonts, opt_no_images, opt_silent, ); // let replacement = str!(); attribute.value.clear(); attribute.value.push_slice(&replacement); } } if opt_no_js { // Get rid of JS event attributes let mut js_attr_indexes = Vec::new(); for (i, attr) in attrs_mut.iter_mut().enumerate() { if attr_is_event_handler(&attr.name.local) { js_attr_indexes.push(i); } } js_attr_indexes.reverse(); for attr_index in js_attr_indexes { attrs_mut.remove(attr_index); } } // Dig deeper for child in node.children.borrow().iter() { walk_and_embed_assets( cache, client, &url, child, opt_no_css, opt_no_fonts, opt_no_frames, opt_no_js, opt_no_images, opt_silent, ); } } _ => { // Note: in case of opt_no_js being set to true, there's no need to worry about // getting rid of comments that may contain scripts, e.g.