|
|
|
@ -3,16 +3,15 @@ use chrono::prelude::*;
|
|
|
|
|
use encoding_rs::Encoding;
|
|
|
|
|
use html5ever::interface::QualName;
|
|
|
|
|
use html5ever::parse_document;
|
|
|
|
|
use html5ever::rcdom::{Handle, NodeData, RcDom};
|
|
|
|
|
use html5ever::serialize::{serialize, SerializeOpts};
|
|
|
|
|
use html5ever::tendril::{format_tendril, TendrilSink};
|
|
|
|
|
use html5ever::tree_builder::{Attribute, TreeSink};
|
|
|
|
|
use html5ever::{local_name, namespace_url, ns, LocalName};
|
|
|
|
|
use markup5ever_rcdom::{Handle, NodeData, RcDom, SerializableHandle};
|
|
|
|
|
use regex::Regex;
|
|
|
|
|
use reqwest::blocking::Client;
|
|
|
|
|
|
|
|
|
|
use reqwest::Url;
|
|
|
|
|
use sha2::{Digest, Sha256, Sha384, Sha512};
|
|
|
|
|
use std::collections::HashMap;
|
|
|
|
|
use std::default::Default;
|
|
|
|
|
|
|
|
|
|
use crate::css::embed_css;
|
|
|
|
@ -28,11 +27,15 @@ struct SrcSetItem<'a> {
|
|
|
|
|
descriptor: &'a str,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const ICON_VALUES: &'static [&str] = &["icon", "shortcut icon"];
|
|
|
|
|
const ICON_VALUES: &[&str] = &["icon", "shortcut icon"];
|
|
|
|
|
|
|
|
|
|
pub fn add_favicon(document: &Handle, favicon_data_url: String) -> RcDom {
|
|
|
|
|
let mut buf: Vec<u8> = Vec::new();
|
|
|
|
|
serialize(&mut buf, document, SerializeOpts::default())
|
|
|
|
|
serialize(
|
|
|
|
|
&mut buf,
|
|
|
|
|
&SerializableHandle::from(document.clone()),
|
|
|
|
|
SerializeOpts::default(),
|
|
|
|
|
)
|
|
|
|
|
.expect("unable to serialize DOM into buffer");
|
|
|
|
|
|
|
|
|
|
let mut dom = html_to_dom(&buf, "utf-8".to_string());
|
|
|
|
@ -54,7 +57,7 @@ pub fn add_favicon(document: &Handle, favicon_data_url: String) -> RcDom {
|
|
|
|
|
Default::default(),
|
|
|
|
|
);
|
|
|
|
|
// Insert favicon LINK tag into HEAD
|
|
|
|
|
head.children.borrow_mut().push(favicon_node.clone());
|
|
|
|
|
head.children.borrow_mut().push(favicon_node);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -62,18 +65,18 @@ pub fn add_favicon(document: &Handle, favicon_data_url: String) -> RcDom {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn check_integrity(data: &[u8], integrity: &str) -> bool {
|
|
|
|
|
if integrity.starts_with("sha256-") {
|
|
|
|
|
if let Some(stripped) = integrity.strip_prefix("sha256-") {
|
|
|
|
|
let mut hasher = Sha256::new();
|
|
|
|
|
hasher.update(data);
|
|
|
|
|
base64::encode(hasher.finalize()) == integrity[7..]
|
|
|
|
|
} else if integrity.starts_with("sha384-") {
|
|
|
|
|
base64::encode(hasher.finalize()) == stripped
|
|
|
|
|
} else if let Some(stripped) = integrity.strip_prefix("sha384-") {
|
|
|
|
|
let mut hasher = Sha384::new();
|
|
|
|
|
hasher.update(data);
|
|
|
|
|
base64::encode(hasher.finalize()) == integrity[7..]
|
|
|
|
|
} else if integrity.starts_with("sha512-") {
|
|
|
|
|
base64::encode(hasher.finalize()) == stripped
|
|
|
|
|
} else if let Some(stripped) = integrity.strip_prefix("sha512-") {
|
|
|
|
|
let mut hasher = Sha512::new();
|
|
|
|
|
hasher.update(data);
|
|
|
|
|
base64::encode(hasher.finalize()) == integrity[7..]
|
|
|
|
|
base64::encode(hasher.finalize()) == stripped
|
|
|
|
|
} else {
|
|
|
|
|
false
|
|
|
|
|
}
|
|
|
|
@ -125,7 +128,7 @@ pub fn create_metadata_tag(url: &Url) -> String {
|
|
|
|
|
format!(
|
|
|
|
|
"<!-- Saved from {} at {} using {} v{} -->",
|
|
|
|
|
if clean_url.scheme() == "http" || clean_url.scheme() == "https" {
|
|
|
|
|
&clean_url.as_str()
|
|
|
|
|
clean_url.as_str()
|
|
|
|
|
} else {
|
|
|
|
|
"local source"
|
|
|
|
|
},
|
|
|
|
@ -155,19 +158,12 @@ pub fn determine_link_node_type(node: &Handle) -> &str {
|
|
|
|
|
link_type
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn embed_srcset(
|
|
|
|
|
cache: &mut HashMap<String, Vec<u8>>,
|
|
|
|
|
client: &Client,
|
|
|
|
|
document_url: &Url,
|
|
|
|
|
srcset: &str,
|
|
|
|
|
options: &Options,
|
|
|
|
|
depth: u32,
|
|
|
|
|
) -> String {
|
|
|
|
|
pub fn embed_srcset(document_url: &Url, srcset: &str, options: &Options, depth: u32) -> String {
|
|
|
|
|
let mut array: Vec<SrcSetItem> = vec![];
|
|
|
|
|
let re = Regex::new(r",\s+").unwrap();
|
|
|
|
|
for srcset_item in re.split(srcset) {
|
|
|
|
|
let parts: Vec<&str> = srcset_item.trim().split_whitespace().collect();
|
|
|
|
|
if parts.len() > 0 {
|
|
|
|
|
let parts: Vec<&str> = srcset_item.split_whitespace().collect();
|
|
|
|
|
if !parts.is_empty() {
|
|
|
|
|
let path = parts[0].trim();
|
|
|
|
|
let descriptor = if parts.len() > 1 { parts[1].trim() } else { "" };
|
|
|
|
|
let srcset_real_item = SrcSetItem { path, descriptor };
|
|
|
|
@ -181,15 +177,8 @@ pub fn embed_srcset(
|
|
|
|
|
if options.no_images {
|
|
|
|
|
result.push_str(EMPTY_IMAGE_DATA_URL);
|
|
|
|
|
} else {
|
|
|
|
|
let image_full_url: Url = resolve_url(&document_url, part.path);
|
|
|
|
|
match retrieve_asset(
|
|
|
|
|
cache,
|
|
|
|
|
client,
|
|
|
|
|
&document_url,
|
|
|
|
|
&image_full_url,
|
|
|
|
|
options,
|
|
|
|
|
depth + 1,
|
|
|
|
|
) {
|
|
|
|
|
let image_full_url: Url = resolve_url(document_url, part.path);
|
|
|
|
|
match retrieve_asset(document_url, &image_full_url, options, depth + 1) {
|
|
|
|
|
Ok((image_data, image_final_url, image_media_type, image_charset)) => {
|
|
|
|
|
let mut image_data_url = create_data_url(
|
|
|
|
|
&image_media_type,
|
|
|
|
@ -214,7 +203,7 @@ pub fn embed_srcset(
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if !part.descriptor.is_empty() {
|
|
|
|
|
result.push_str(" ");
|
|
|
|
|
result.push(' ');
|
|
|
|
|
result.push_str(part.descriptor);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -239,12 +228,9 @@ pub fn find_base_node(node: &Handle) -> Option<Handle> {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
NodeData::Element { ref name, .. } => {
|
|
|
|
|
match name.local.as_ref() {
|
|
|
|
|
"head" => {
|
|
|
|
|
if name.local.as_ref() == "head" {
|
|
|
|
|
return get_child_node_by_name(node, "base");
|
|
|
|
|
}
|
|
|
|
|
_ => {}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Dig deeper
|
|
|
|
|
for child in node.children.borrow().iter() {
|
|
|
|
@ -273,7 +259,7 @@ pub fn find_meta_charset_or_content_type_node(node: &Handle) -> Option<Handle> {
|
|
|
|
|
match name.local.as_ref() {
|
|
|
|
|
"head" => {
|
|
|
|
|
if let Some(meta_node) = get_child_node_by_name(node, "meta") {
|
|
|
|
|
if let Some(_) = get_node_attr(&meta_node, "charset") {
|
|
|
|
|
if get_node_attr(&meta_node, "charset").is_some() {
|
|
|
|
|
return Some(meta_node);
|
|
|
|
|
} else if let Some(meta_node_http_equiv_attr_value) =
|
|
|
|
|
get_node_attr(&meta_node, "http-equiv")
|
|
|
|
@ -324,7 +310,7 @@ pub fn get_charset(node: &Handle) -> Option<String> {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return None;
|
|
|
|
|
None
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn get_child_node_by_name(parent: &Handle, node_name: &str) -> Option<Handle> {
|
|
|
|
@ -333,10 +319,7 @@ pub fn get_child_node_by_name(parent: &Handle, node_name: &str) -> Option<Handle
|
|
|
|
|
NodeData::Element { ref name, .. } => &*name.local == node_name,
|
|
|
|
|
_ => false,
|
|
|
|
|
});
|
|
|
|
|
match matching_children {
|
|
|
|
|
Some(node) => Some(node.clone()),
|
|
|
|
|
_ => None,
|
|
|
|
|
}
|
|
|
|
|
matching_children.cloned()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn get_node_attr(node: &Handle, attr_name: &str) -> Option<String> {
|
|
|
|
@ -361,7 +344,7 @@ pub fn get_node_name(node: &Handle) -> Option<&'_ str> {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn get_parent_node(child: &Handle) -> Handle {
|
|
|
|
|
let parent = child.parent.take().clone();
|
|
|
|
|
let parent = child.parent.take();
|
|
|
|
|
parent.and_then(|node| node.upgrade()).unwrap()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -406,14 +389,14 @@ pub fn has_favicon(handle: &Handle) -> bool {
|
|
|
|
|
found_favicon
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn html_to_dom(data: &Vec<u8>, document_encoding: String) -> RcDom {
|
|
|
|
|
pub fn html_to_dom(data: &[u8], document_encoding: String) -> RcDom {
|
|
|
|
|
let s: String;
|
|
|
|
|
|
|
|
|
|
if let Some(encoding) = Encoding::for_label(document_encoding.as_bytes()) {
|
|
|
|
|
let (string, _, _) = encoding.decode(&data);
|
|
|
|
|
let (string, _, _) = encoding.decode(data);
|
|
|
|
|
s = string.to_string();
|
|
|
|
|
} else {
|
|
|
|
|
s = String::from_utf8_lossy(&data).to_string();
|
|
|
|
|
s = String::from_utf8_lossy(data).to_string();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
parse_document(RcDom::default(), Default::default())
|
|
|
|
@ -428,7 +411,11 @@ pub fn is_icon(attr_value: &str) -> bool {
|
|
|
|
|
|
|
|
|
|
pub fn set_base_url(document: &Handle, desired_base_href: String) -> RcDom {
|
|
|
|
|
let mut buf: Vec<u8> = Vec::new();
|
|
|
|
|
serialize(&mut buf, document, SerializeOpts::default())
|
|
|
|
|
serialize(
|
|
|
|
|
&mut buf,
|
|
|
|
|
&SerializableHandle::from(document.clone()),
|
|
|
|
|
SerializeOpts::default(),
|
|
|
|
|
)
|
|
|
|
|
.expect("unable to serialize DOM into buffer");
|
|
|
|
|
|
|
|
|
|
let mut dom = html_to_dom(&buf, "utf-8".to_string());
|
|
|
|
@ -449,7 +436,7 @@ pub fn set_base_url(document: &Handle, desired_base_href: String) -> RcDom {
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
// Insert newly created BASE node into HEAD
|
|
|
|
|
head_node.children.borrow_mut().push(base_node.clone());
|
|
|
|
|
head_node.children.borrow_mut().push(base_node);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
@ -459,9 +446,9 @@ pub fn set_base_url(document: &Handle, desired_base_href: String) -> RcDom {
|
|
|
|
|
|
|
|
|
|
pub fn set_charset(mut dom: RcDom, desired_charset: String) -> RcDom {
|
|
|
|
|
if let Some(meta_charset_node) = find_meta_charset_or_content_type_node(&dom.document) {
|
|
|
|
|
if let Some(_) = get_node_attr(&meta_charset_node, "charset") {
|
|
|
|
|
if get_node_attr(&meta_charset_node, "charset").is_some() {
|
|
|
|
|
set_node_attr(&meta_charset_node, "charset", Some(desired_charset));
|
|
|
|
|
} else if let Some(_) = get_node_attr(&meta_charset_node, "content") {
|
|
|
|
|
} else if get_node_attr(&meta_charset_node, "content").is_some() {
|
|
|
|
|
set_node_attr(
|
|
|
|
|
&meta_charset_node,
|
|
|
|
|
"content",
|
|
|
|
@ -481,10 +468,7 @@ pub fn set_charset(mut dom: RcDom, desired_charset: String) -> RcDom {
|
|
|
|
|
// Insert newly created META charset node into HEAD
|
|
|
|
|
if let Some(html_node) = get_child_node_by_name(&dom.document, "html") {
|
|
|
|
|
if let Some(head_node) = get_child_node_by_name(&html_node, "head") {
|
|
|
|
|
head_node
|
|
|
|
|
.children
|
|
|
|
|
.borrow_mut()
|
|
|
|
|
.push(meta_charset_node.clone());
|
|
|
|
|
head_node.children.borrow_mut().push(meta_charset_node);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
@ -505,7 +489,7 @@ pub fn set_node_attr(node: &Handle, attr_name: &str, attr_value: Option<String>)
|
|
|
|
|
|
|
|
|
|
if let Some(attr_value) = attr_value.clone() {
|
|
|
|
|
let _ = &attrs_mut[i].value.clear();
|
|
|
|
|
let _ = &attrs_mut[i].value.push_slice(&attr_value.as_str());
|
|
|
|
|
let _ = &attrs_mut[i].value.push_slice(attr_value.as_str());
|
|
|
|
|
} else {
|
|
|
|
|
// Remove attr completely if attr_value is not defined
|
|
|
|
|
attrs_mut.remove(i);
|
|
|
|
@ -518,7 +502,7 @@ pub fn set_node_attr(node: &Handle, attr_name: &str, attr_value: Option<String>)
|
|
|
|
|
|
|
|
|
|
if !found_existing_attr {
|
|
|
|
|
// Add new attribute (since originally the target node didn't have it)
|
|
|
|
|
if let Some(attr_value) = attr_value.clone() {
|
|
|
|
|
if let Some(attr_value) = attr_value {
|
|
|
|
|
let name = LocalName::from(attr_name);
|
|
|
|
|
|
|
|
|
|
attrs_mut.push(Attribute {
|
|
|
|
@ -564,20 +548,24 @@ pub fn serialize_document(mut dom: RcDom, document_encoding: String, options: &O
|
|
|
|
|
// since there already may be one defined in the original document,
|
|
|
|
|
// and browsers don't allow re-defining them (for obvious reasons)
|
|
|
|
|
head.children.borrow_mut().reverse();
|
|
|
|
|
head.children.borrow_mut().push(meta.clone());
|
|
|
|
|
head.children.borrow_mut().push(meta);
|
|
|
|
|
head.children.borrow_mut().reverse();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
serialize(&mut buf, &doc, SerializeOpts::default())
|
|
|
|
|
serialize(
|
|
|
|
|
&mut buf,
|
|
|
|
|
&SerializableHandle::from(doc),
|
|
|
|
|
SerializeOpts::default(),
|
|
|
|
|
)
|
|
|
|
|
.expect("Unable to serialize DOM into buffer");
|
|
|
|
|
|
|
|
|
|
// Unwrap NOSCRIPT elements
|
|
|
|
|
if options.unwrap_noscript {
|
|
|
|
|
let s: &str = &String::from_utf8_lossy(&buf);
|
|
|
|
|
let noscript_re = Regex::new(r"<(?P<c>/?noscript[^>]*)>").unwrap();
|
|
|
|
|
buf = noscript_re.replace_all(&s, "<!--$c-->").as_bytes().to_vec();
|
|
|
|
|
buf = noscript_re.replace_all(s, "<!--$c-->").as_bytes().to_vec();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if !document_encoding.is_empty() {
|
|
|
|
@ -592,8 +580,6 @@ pub fn serialize_document(mut dom: RcDom, document_encoding: String, options: &O
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn retrieve_and_embed_asset(
|
|
|
|
|
cache: &mut HashMap<String, Vec<u8>>,
|
|
|
|
|
client: &Client,
|
|
|
|
|
document_url: &Url,
|
|
|
|
|
node: &Handle,
|
|
|
|
|
attr_name: &str,
|
|
|
|
@ -601,18 +587,11 @@ pub fn retrieve_and_embed_asset(
|
|
|
|
|
options: &Options,
|
|
|
|
|
depth: u32,
|
|
|
|
|
) {
|
|
|
|
|
let resolved_url: Url = resolve_url(document_url, attr_value.clone());
|
|
|
|
|
let resolved_url: Url = resolve_url(document_url, attr_value);
|
|
|
|
|
|
|
|
|
|
match retrieve_asset(
|
|
|
|
|
cache,
|
|
|
|
|
client,
|
|
|
|
|
&document_url.clone(),
|
|
|
|
|
&resolved_url,
|
|
|
|
|
options,
|
|
|
|
|
depth + 1,
|
|
|
|
|
) {
|
|
|
|
|
match retrieve_asset(&document_url.clone(), &resolved_url, options, depth + 1) {
|
|
|
|
|
Ok((data, final_url, mut media_type, charset)) => {
|
|
|
|
|
let node_name: &str = get_node_name(&node).unwrap();
|
|
|
|
|
let node_name: &str = get_node_name(node).unwrap();
|
|
|
|
|
|
|
|
|
|
// Check integrity if it's a LINK or SCRIPT element
|
|
|
|
|
let mut ok_to_include: bool = true;
|
|
|
|
@ -639,28 +618,21 @@ pub fn retrieve_and_embed_asset(
|
|
|
|
|
|
|
|
|
|
if node_name == "link" && determine_link_node_type(node) == "stylesheet" {
|
|
|
|
|
// Stylesheet LINK elements require special treatment
|
|
|
|
|
let css: String = embed_css(cache, client, &final_url, &s, options, depth + 1);
|
|
|
|
|
let css: String = embed_css(&final_url, &s, options, depth + 1);
|
|
|
|
|
|
|
|
|
|
// Create and embed data URL
|
|
|
|
|
let css_data_url =
|
|
|
|
|
create_data_url(&media_type, &charset, css.as_bytes(), &final_url);
|
|
|
|
|
set_node_attr(&node, attr_name, Some(css_data_url.to_string()));
|
|
|
|
|
set_node_attr(node, attr_name, Some(css_data_url.to_string()));
|
|
|
|
|
} else if node_name == "frame" || node_name == "iframe" {
|
|
|
|
|
// (I)FRAMEs are also quite different from conventional resources
|
|
|
|
|
let frame_dom = html_to_dom(&data, charset.clone());
|
|
|
|
|
walk_and_embed_assets(
|
|
|
|
|
cache,
|
|
|
|
|
client,
|
|
|
|
|
&final_url,
|
|
|
|
|
&frame_dom.document,
|
|
|
|
|
&options,
|
|
|
|
|
depth + 1,
|
|
|
|
|
);
|
|
|
|
|
walk_and_embed_assets(&final_url, &frame_dom.document, options, depth + 1);
|
|
|
|
|
|
|
|
|
|
let mut frame_data: Vec<u8> = Vec::new();
|
|
|
|
|
serialize(
|
|
|
|
|
&mut frame_data,
|
|
|
|
|
&frame_dom.document,
|
|
|
|
|
&SerializableHandle::from(frame_dom.document),
|
|
|
|
|
SerializeOpts::default(),
|
|
|
|
|
)
|
|
|
|
|
.unwrap();
|
|
|
|
@ -674,16 +646,14 @@ pub fn retrieve_and_embed_asset(
|
|
|
|
|
// Every other type of element gets processed here
|
|
|
|
|
|
|
|
|
|
// Parse media type for SCRIPT elements
|
|
|
|
|
if node_name == "script" {
|
|
|
|
|
if let Some(_) = get_node_attr(node, "src") {
|
|
|
|
|
if node_name == "script" && get_node_attr(node, "src").is_some() {
|
|
|
|
|
if let Some(script_node_type_attr_value) = get_node_attr(node, "type") {
|
|
|
|
|
media_type = script_node_type_attr_value.to_string();
|
|
|
|
|
media_type = script_node_type_attr_value;
|
|
|
|
|
} else {
|
|
|
|
|
// Fallback to default one if it's not specified
|
|
|
|
|
media_type = "application/javascript".to_string();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Create and embed data URL
|
|
|
|
|
let mut data_url = create_data_url(&media_type, &charset, &data, &final_url);
|
|
|
|
@ -704,19 +674,12 @@ pub fn retrieve_and_embed_asset(
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn walk_and_embed_assets(
|
|
|
|
|
cache: &mut HashMap<String, Vec<u8>>,
|
|
|
|
|
client: &Client,
|
|
|
|
|
document_url: &Url,
|
|
|
|
|
node: &Handle,
|
|
|
|
|
options: &Options,
|
|
|
|
|
depth: u32,
|
|
|
|
|
) {
|
|
|
|
|
pub fn walk_and_embed_assets(document_url: &Url, node: &Handle, options: &Options, depth: u32) {
|
|
|
|
|
match node.data {
|
|
|
|
|
NodeData::Document => {
|
|
|
|
|
// Dig deeper
|
|
|
|
|
for child in node.children.borrow().iter() {
|
|
|
|
|
walk_and_embed_assets(cache, client, &document_url, child, options, depth);
|
|
|
|
|
walk_and_embed_assets(document_url, child, options, depth);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
NodeData::Element {
|
|
|
|
@ -732,7 +695,7 @@ pub fn walk_and_embed_assets(
|
|
|
|
|
|| meta_attr_http_equiv_value.eq_ignore_ascii_case("location")
|
|
|
|
|
{
|
|
|
|
|
// Remove http-equiv attributes from META nodes if they're able to control the page
|
|
|
|
|
set_node_attr(&node, "http-equiv", None);
|
|
|
|
|
set_node_attr(node, "http-equiv", None);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
@ -744,9 +707,7 @@ pub fn walk_and_embed_assets(
|
|
|
|
|
if let Some(link_attr_href_value) = get_node_attr(node, "href") {
|
|
|
|
|
if !options.no_images && !link_attr_href_value.is_empty() {
|
|
|
|
|
retrieve_and_embed_asset(
|
|
|
|
|
cache,
|
|
|
|
|
client,
|
|
|
|
|
&document_url,
|
|
|
|
|
document_url,
|
|
|
|
|
node,
|
|
|
|
|
"href",
|
|
|
|
|
&link_attr_href_value,
|
|
|
|
@ -764,12 +725,9 @@ pub fn walk_and_embed_assets(
|
|
|
|
|
set_node_attr(node, "href", None);
|
|
|
|
|
// Wipe integrity attribute
|
|
|
|
|
set_node_attr(node, "integrity", None);
|
|
|
|
|
} else {
|
|
|
|
|
if !link_attr_href_value.is_empty() {
|
|
|
|
|
} else if !link_attr_href_value.is_empty() {
|
|
|
|
|
retrieve_and_embed_asset(
|
|
|
|
|
cache,
|
|
|
|
|
client,
|
|
|
|
|
&document_url,
|
|
|
|
|
document_url,
|
|
|
|
|
node,
|
|
|
|
|
"href",
|
|
|
|
|
&link_attr_href_value,
|
|
|
|
@ -778,7 +736,6 @@ pub fn walk_and_embed_assets(
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
} else if link_type == "preload" || link_type == "dns-prefetch" {
|
|
|
|
|
// Since all resources are embedded as data URLs, preloading and prefetching are not necessary
|
|
|
|
|
set_node_attr(node, "rel", None);
|
|
|
|
@ -786,7 +743,7 @@ pub fn walk_and_embed_assets(
|
|
|
|
|
// Make sure that all other LINKs' href attributes are full URLs
|
|
|
|
|
if let Some(link_attr_href_value) = get_node_attr(node, "href") {
|
|
|
|
|
let href_full_url: Url =
|
|
|
|
|
resolve_url(&document_url, &link_attr_href_value);
|
|
|
|
|
resolve_url(document_url, &link_attr_href_value);
|
|
|
|
|
set_node_attr(node, "href", Some(href_full_url.to_string()));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
@ -809,8 +766,6 @@ pub fn walk_and_embed_assets(
|
|
|
|
|
|
|
|
|
|
if !options.no_images && !body_attr_background_value.is_empty() {
|
|
|
|
|
retrieve_and_embed_asset(
|
|
|
|
|
cache,
|
|
|
|
|
client,
|
|
|
|
|
document_url,
|
|
|
|
|
node,
|
|
|
|
|
"background",
|
|
|
|
@ -828,14 +783,13 @@ pub fn walk_and_embed_assets(
|
|
|
|
|
|
|
|
|
|
if options.no_images {
|
|
|
|
|
// Put empty images into src and data-src attributes
|
|
|
|
|
if img_attr_src_value != None {
|
|
|
|
|
if img_attr_src_value.is_some() {
|
|
|
|
|
set_node_attr(node, "src", Some(EMPTY_IMAGE_DATA_URL.to_string()));
|
|
|
|
|
}
|
|
|
|
|
if img_attr_data_src_value != None {
|
|
|
|
|
if img_attr_data_src_value.is_some() {
|
|
|
|
|
set_node_attr(node, "data-src", Some(EMPTY_IMAGE_DATA_URL.to_string()));
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
if img_attr_src_value.clone().unwrap_or_default().is_empty()
|
|
|
|
|
} else if img_attr_src_value.clone().unwrap_or_default().is_empty()
|
|
|
|
|
&& img_attr_data_src_value
|
|
|
|
|
.clone()
|
|
|
|
|
.unwrap_or_default()
|
|
|
|
@ -855,8 +809,6 @@ pub fn walk_and_embed_assets(
|
|
|
|
|
img_attr_src_value.unwrap_or_default()
|
|
|
|
|
};
|
|
|
|
|
retrieve_and_embed_asset(
|
|
|
|
|
cache,
|
|
|
|
|
client,
|
|
|
|
|
document_url,
|
|
|
|
|
node,
|
|
|
|
|
"src",
|
|
|
|
@ -865,19 +817,12 @@ pub fn walk_and_embed_assets(
|
|
|
|
|
depth,
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Resolve srcset attribute
|
|
|
|
|
if let Some(img_srcset) = get_node_attr(node, "srcset") {
|
|
|
|
|
if !img_srcset.is_empty() {
|
|
|
|
|
let resolved_srcset: String = embed_srcset(
|
|
|
|
|
cache,
|
|
|
|
|
client,
|
|
|
|
|
&document_url,
|
|
|
|
|
&img_srcset,
|
|
|
|
|
options,
|
|
|
|
|
depth,
|
|
|
|
|
);
|
|
|
|
|
let resolved_srcset: String =
|
|
|
|
|
embed_srcset(document_url, &img_srcset, options, depth);
|
|
|
|
|
set_node_attr(node, "srcset", Some(resolved_srcset));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
@ -900,8 +845,6 @@ pub fn walk_and_embed_assets(
|
|
|
|
|
set_node_attr(node, "src", Some(value.to_string()));
|
|
|
|
|
} else {
|
|
|
|
|
retrieve_and_embed_asset(
|
|
|
|
|
cache,
|
|
|
|
|
client,
|
|
|
|
|
document_url,
|
|
|
|
|
node,
|
|
|
|
|
"src",
|
|
|
|
@ -933,8 +876,6 @@ pub fn walk_and_embed_assets(
|
|
|
|
|
|
|
|
|
|
if !options.no_images && !image_href.is_empty() {
|
|
|
|
|
retrieve_and_embed_asset(
|
|
|
|
|
cache,
|
|
|
|
|
client,
|
|
|
|
|
document_url,
|
|
|
|
|
node,
|
|
|
|
|
"href",
|
|
|
|
@ -954,8 +895,6 @@ pub fn walk_and_embed_assets(
|
|
|
|
|
set_node_attr(node, "src", None);
|
|
|
|
|
} else {
|
|
|
|
|
retrieve_and_embed_asset(
|
|
|
|
|
cache,
|
|
|
|
|
client,
|
|
|
|
|
document_url,
|
|
|
|
|
node,
|
|
|
|
|
"src",
|
|
|
|
@ -969,8 +908,6 @@ pub fn walk_and_embed_assets(
|
|
|
|
|
set_node_attr(node, "src", None);
|
|
|
|
|
} else {
|
|
|
|
|
retrieve_and_embed_asset(
|
|
|
|
|
cache,
|
|
|
|
|
client,
|
|
|
|
|
document_url,
|
|
|
|
|
node,
|
|
|
|
|
"src",
|
|
|
|
@ -983,8 +920,7 @@ pub fn walk_and_embed_assets(
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if let Some(source_attr_srcset_value) = get_node_attr(node, "srcset") {
|
|
|
|
|
if parent_node_name == "picture" {
|
|
|
|
|
if !source_attr_srcset_value.is_empty() {
|
|
|
|
|
if parent_node_name == "picture" && !source_attr_srcset_value.is_empty() {
|
|
|
|
|
if options.no_images {
|
|
|
|
|
set_node_attr(
|
|
|
|
|
node,
|
|
|
|
@ -993,9 +929,7 @@ pub fn walk_and_embed_assets(
|
|
|
|
|
);
|
|
|
|
|
} else {
|
|
|
|
|
let resolved_srcset: String = embed_srcset(
|
|
|
|
|
cache,
|
|
|
|
|
client,
|
|
|
|
|
&document_url,
|
|
|
|
|
document_url,
|
|
|
|
|
&source_attr_srcset_value,
|
|
|
|
|
options,
|
|
|
|
|
depth,
|
|
|
|
@ -1005,22 +939,17 @@ pub fn walk_and_embed_assets(
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
"a" | "area" => {
|
|
|
|
|
if let Some(anchor_attr_href_value) = get_node_attr(node, "href") {
|
|
|
|
|
if anchor_attr_href_value
|
|
|
|
|
.clone()
|
|
|
|
|
.trim()
|
|
|
|
|
.starts_with("javascript:")
|
|
|
|
|
{
|
|
|
|
|
if anchor_attr_href_value.trim().starts_with("javascript:") {
|
|
|
|
|
if options.no_js {
|
|
|
|
|
// Replace with empty JS call to preserve original behavior
|
|
|
|
|
set_node_attr(node, "href", Some("javascript:;".to_string()));
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
// Don't touch mailto: links or hrefs which begin with a hash sign
|
|
|
|
|
if !anchor_attr_href_value.clone().starts_with('#')
|
|
|
|
|
&& !is_url_and_has_protocol(&anchor_attr_href_value.clone())
|
|
|
|
|
if !anchor_attr_href_value.starts_with('#')
|
|
|
|
|
&& !is_url_and_has_protocol(&anchor_attr_href_value)
|
|
|
|
|
{
|
|
|
|
|
let href_full_url: Url =
|
|
|
|
|
resolve_url(document_url, &anchor_attr_href_value);
|
|
|
|
@ -1037,15 +966,13 @@ pub fn walk_and_embed_assets(
|
|
|
|
|
// Empty inner content
|
|
|
|
|
node.children.borrow_mut().clear();
|
|
|
|
|
// Remove src attribute
|
|
|
|
|
if script_attr_src != None {
|
|
|
|
|
if script_attr_src.is_some() {
|
|
|
|
|
set_node_attr(node, "src", None);
|
|
|
|
|
// Wipe integrity attribute
|
|
|
|
|
set_node_attr(node, "integrity", None);
|
|
|
|
|
}
|
|
|
|
|
} else if !script_attr_src.clone().unwrap_or_default().is_empty() {
|
|
|
|
|
retrieve_and_embed_asset(
|
|
|
|
|
cache,
|
|
|
|
|
client,
|
|
|
|
|
document_url,
|
|
|
|
|
node,
|
|
|
|
|
"src",
|
|
|
|
@ -1063,14 +990,8 @@ pub fn walk_and_embed_assets(
|
|
|
|
|
for child_node in node.children.borrow_mut().iter_mut() {
|
|
|
|
|
if let NodeData::Text { ref contents } = child_node.data {
|
|
|
|
|
let mut tendril = contents.borrow_mut();
|
|
|
|
|
let replacement = embed_css(
|
|
|
|
|
cache,
|
|
|
|
|
client,
|
|
|
|
|
&document_url,
|
|
|
|
|
tendril.as_ref(),
|
|
|
|
|
options,
|
|
|
|
|
depth,
|
|
|
|
|
);
|
|
|
|
|
let replacement =
|
|
|
|
|
embed_css(document_url, tendril.as_ref(), options, depth);
|
|
|
|
|
tendril.clear();
|
|
|
|
|
tendril.push_slice(&replacement);
|
|
|
|
|
}
|
|
|
|
@ -1094,9 +1015,7 @@ pub fn walk_and_embed_assets(
|
|
|
|
|
// Ignore (i)frames with empty source (they cause infinite loops)
|
|
|
|
|
if !frame_attr_src_value.trim().is_empty() {
|
|
|
|
|
retrieve_and_embed_asset(
|
|
|
|
|
cache,
|
|
|
|
|
client,
|
|
|
|
|
&document_url,
|
|
|
|
|
document_url,
|
|
|
|
|
node,
|
|
|
|
|
"src",
|
|
|
|
|
&frame_attr_src_value,
|
|
|
|
@ -1114,8 +1033,6 @@ pub fn walk_and_embed_assets(
|
|
|
|
|
set_node_attr(node, "src", None);
|
|
|
|
|
} else {
|
|
|
|
|
retrieve_and_embed_asset(
|
|
|
|
|
cache,
|
|
|
|
|
client,
|
|
|
|
|
document_url,
|
|
|
|
|
node,
|
|
|
|
|
"src",
|
|
|
|
@ -1133,8 +1050,6 @@ pub fn walk_and_embed_assets(
|
|
|
|
|
set_node_attr(node, "src", None);
|
|
|
|
|
} else {
|
|
|
|
|
retrieve_and_embed_asset(
|
|
|
|
|
cache,
|
|
|
|
|
client,
|
|
|
|
|
document_url,
|
|
|
|
|
node,
|
|
|
|
|
"src",
|
|
|
|
@ -1157,8 +1072,6 @@ pub fn walk_and_embed_assets(
|
|
|
|
|
);
|
|
|
|
|
} else {
|
|
|
|
|
retrieve_and_embed_asset(
|
|
|
|
|
cache,
|
|
|
|
|
client,
|
|
|
|
|
document_url,
|
|
|
|
|
node,
|
|
|
|
|
"poster",
|
|
|
|
@ -1183,11 +1096,9 @@ pub fn walk_and_embed_assets(
|
|
|
|
|
);
|
|
|
|
|
// Embed assets of NOSCRIPT node contents
|
|
|
|
|
walk_and_embed_assets(
|
|
|
|
|
cache,
|
|
|
|
|
client,
|
|
|
|
|
&document_url,
|
|
|
|
|
document_url,
|
|
|
|
|
&noscript_contents_dom.document,
|
|
|
|
|
&options,
|
|
|
|
|
options,
|
|
|
|
|
depth,
|
|
|
|
|
);
|
|
|
|
|
// Get rid of original contents
|
|
|
|
@ -1198,7 +1109,11 @@ pub fn walk_and_embed_assets(
|
|
|
|
|
{
|
|
|
|
|
if let Some(body) = get_child_node_by_name(&html, "body") {
|
|
|
|
|
let mut buf: Vec<u8> = Vec::new();
|
|
|
|
|
serialize(&mut buf, &body, SerializeOpts::default())
|
|
|
|
|
serialize(
|
|
|
|
|
&mut buf,
|
|
|
|
|
&SerializableHandle::from(body),
|
|
|
|
|
SerializeOpts::default(),
|
|
|
|
|
)
|
|
|
|
|
.expect("Unable to serialize DOM into buffer");
|
|
|
|
|
let result = String::from_utf8_lossy(&buf);
|
|
|
|
|
noscript_contents.push_slice(&result);
|
|
|
|
@ -1219,14 +1134,8 @@ pub fn walk_and_embed_assets(
|
|
|
|
|
} else {
|
|
|
|
|
// Embed URLs found within the style attribute of this node
|
|
|
|
|
if let Some(node_attr_style_value) = get_node_attr(node, "style") {
|
|
|
|
|
let embedded_style = embed_css(
|
|
|
|
|
cache,
|
|
|
|
|
client,
|
|
|
|
|
&document_url,
|
|
|
|
|
&node_attr_style_value,
|
|
|
|
|
options,
|
|
|
|
|
depth,
|
|
|
|
|
);
|
|
|
|
|
let embedded_style =
|
|
|
|
|
embed_css(document_url, &node_attr_style_value, options, depth);
|
|
|
|
|
set_node_attr(node, "style", Some(embedded_style));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
@ -1249,7 +1158,7 @@ pub fn walk_and_embed_assets(
|
|
|
|
|
|
|
|
|
|
// Dig deeper
|
|
|
|
|
for child in node.children.borrow().iter() {
|
|
|
|
|
walk_and_embed_assets(cache, client, &document_url, child, options, depth);
|
|
|
|
|
walk_and_embed_assets(document_url, child, options, depth);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
_ => {
|
|
|
|
|