Merge pull request #264 from snshn/fixes

Fixes
pull/265/head
Sunshine 3 years ago committed by GitHub
commit 1bb8141021
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -202,10 +202,11 @@ pub fn process_css<'a>(
import_contents, import_contents,
import_final_url, import_final_url,
import_media_type, import_media_type,
_import_charset, import_charset,
)) => { )) => {
let mut import_data_url = create_data_url( let mut import_data_url = create_data_url(
&import_media_type, &import_media_type,
&import_charset,
embed_css( embed_css(
cache, cache,
client, client,
@ -252,9 +253,9 @@ pub fn process_css<'a>(
options, options,
depth + 1, depth + 1,
) { ) {
Ok((data, final_url, media_type, _charset)) => { Ok((data, final_url, media_type, charset)) => {
let mut data_url = let mut data_url =
create_data_url(&media_type, &data, &final_url); create_data_url(&media_type, &charset, &data, &final_url);
data_url.set_fragment(resolved_url.fragment()); data_url.set_fragment(resolved_url.fragment());
result.push_str( result.push_str(
format_quoted_string(&data_url.to_string()).as_str(), format_quoted_string(&data_url.to_string()).as_str(),
@ -348,9 +349,10 @@ pub fn process_css<'a>(
options, options,
depth + 1, depth + 1,
) { ) {
Ok((css, final_url, media_type, _charset)) => { Ok((css, final_url, media_type, charset)) => {
let mut data_url = create_data_url( let mut data_url = create_data_url(
&media_type, &media_type,
&charset,
embed_css( embed_css(
cache, cache,
client, client,
@ -386,8 +388,9 @@ pub fn process_css<'a>(
options, options,
depth + 1, depth + 1,
) { ) {
Ok((data, final_url, media_type, _charset)) => { Ok((data, final_url, media_type, charset)) => {
let mut data_url = create_data_url(&media_type, &data, &final_url); let mut data_url =
create_data_url(&media_type, &charset, &data, &final_url);
data_url.set_fragment(full_url.fragment()); data_url.set_fragment(full_url.fragment());
result result
.push_str(format_quoted_string(&data_url.to_string()).as_str()); .push_str(format_quoted_string(&data_url.to_string()).as_str());

@ -188,10 +188,13 @@ pub fn embed_srcset(
options, options,
depth + 1, depth + 1,
) { ) {
Ok((image_data, image_final_url, image_media_type, _image_charset)) => { Ok((image_data, image_final_url, image_media_type, image_charset)) => {
// TODO: use image_charset let mut image_data_url = create_data_url(
let mut image_data_url = &image_media_type,
create_data_url(&image_media_type, &image_data, &image_final_url); &image_charset,
&image_data,
&image_final_url,
);
// Append retreved asset as a data URL // Append retreved asset as a data URL
image_data_url.set_fragment(image_full_url.fragment()); image_data_url.set_fragment(image_full_url.fragment());
result.push_str(image_data_url.as_ref()); result.push_str(image_data_url.as_ref());
@ -606,7 +609,7 @@ pub fn retrieve_and_embed_asset(
options, options,
depth + 1, depth + 1,
) { ) {
Ok((data, final_url, mut media_type, _charset)) => { Ok((data, final_url, mut media_type, charset)) => {
let node_name: &str = get_node_name(&node).unwrap(); let node_name: &str = get_node_name(&node).unwrap();
// Check integrity if it's a LINK or SCRIPT element // Check integrity if it's a LINK or SCRIPT element
@ -624,23 +627,25 @@ pub fn retrieve_and_embed_asset(
} }
if ok_to_include { if ok_to_include {
let s: String;
if let Some(encoding) = Encoding::for_label(charset.as_bytes()) {
let (string, _, _) = encoding.decode(&data);
s = string.to_string();
} else {
s = String::from_utf8_lossy(&data).to_string();
}
if node_name == "link" && determine_link_node_type(node) == "stylesheet" { if node_name == "link" && determine_link_node_type(node) == "stylesheet" {
// Stylesheet LINK elements require special treatment // Stylesheet LINK elements require special treatment
let css: String = embed_css( let css: String = embed_css(cache, client, &final_url, &s, options, depth + 1);
cache,
client,
&final_url,
&String::from_utf8_lossy(&data),
options,
depth + 1,
);
// Create and embed data URL // Create and embed data URL
let css_data_url = create_data_url("text/css", css.as_bytes(), &final_url); let css_data_url =
create_data_url(&media_type, &charset, css.as_bytes(), &final_url);
set_node_attr(&node, attr_name, Some(css_data_url.to_string())); set_node_attr(&node, attr_name, Some(css_data_url.to_string()));
} else if node_name == "frame" || node_name == "iframe" { } else if node_name == "frame" || node_name == "iframe" {
// (I)FRAMEs are also quite different from conventional resources // (I)FRAMEs are also quite different from conventional resources
let frame_dom = html_to_dom(&data, "utf-8".to_string()); let frame_dom = html_to_dom(&data, charset.clone());
walk_and_embed_assets( walk_and_embed_assets(
cache, cache,
client, client,
@ -659,8 +664,8 @@ pub fn retrieve_and_embed_asset(
.unwrap(); .unwrap();
// Create and embed data URL // Create and embed data URL
// TODO: use charset let mut frame_data_url =
let mut frame_data_url = create_data_url(&media_type, &frame_data, &final_url); create_data_url(&media_type, &charset, &frame_data, &final_url);
frame_data_url.set_fragment(resolved_url.fragment()); frame_data_url.set_fragment(resolved_url.fragment());
set_node_attr(node, attr_name, Some(frame_data_url.to_string())); set_node_attr(node, attr_name, Some(frame_data_url.to_string()));
} else { } else {
@ -679,7 +684,7 @@ pub fn retrieve_and_embed_asset(
} }
// Create and embed data URL // Create and embed data URL
let mut data_url = create_data_url(&media_type, &data, &final_url); let mut data_url = create_data_url(&media_type, &charset, &data, &final_url);
data_url.set_fragment(resolved_url.fragment()); data_url.set_fragment(resolved_url.fragment());
set_node_attr(node, attr_name, Some(data_url.to_string())); set_node_attr(node, attr_name, Some(data_url.to_string()));
} }
@ -725,14 +730,7 @@ pub fn walk_and_embed_assets(
|| meta_attr_http_equiv_value.eq_ignore_ascii_case("location") || meta_attr_http_equiv_value.eq_ignore_ascii_case("location")
{ {
// Remove http-equiv attributes from META nodes if they're able to control the page // Remove http-equiv attributes from META nodes if they're able to control the page
set_node_attr( set_node_attr(&node, "http-equiv", None);
&node,
"http-equiv",
Some(format!(
"disabled by monolith ({})",
meta_attr_http_equiv_value
)),
);
} }
} }
} }
@ -1094,7 +1092,7 @@ pub fn walk_and_embed_assets(
client, client,
&document_url, &document_url,
node, node,
"href", "src",
&frame_attr_src_value, &frame_attr_src_value,
options, options,
depth, depth,

@ -86,7 +86,6 @@ fn main() {
} }
let target_url: Url; let target_url: Url;
let mut base_url: Url;
let mut use_stdin: bool = false; let mut use_stdin: bool = false;
// Determine exact target URL // Determine exact target URL
@ -156,20 +155,19 @@ fn main() {
HeaderValue::from_str(&user_agent).expect("Invalid User-Agent header specified"), HeaderValue::from_str(&user_agent).expect("Invalid User-Agent header specified"),
); );
} }
let timeout: u64 = if options.timeout > 0 { let client = if options.timeout > 0 {
options.timeout Client::builder().timeout(Duration::from_secs(options.timeout))
} else { } else {
std::u64::MAX / 4 // This is pretty close to infinity // No timeout is default
}; Client::builder()
let client = Client::builder() }
.timeout(Duration::from_secs(timeout)) .danger_accept_invalid_certs(options.insecure)
.danger_accept_invalid_certs(options.insecure) .default_headers(header_map)
.default_headers(header_map) .build()
.build() .expect("Failed to initialize HTTP client");
.expect("Failed to initialize HTTP client");
// At first we assume that base URL is the same as target URL
// At this stage we assume that the base URL is the same as the target URL let mut base_url: Url = target_url.clone();
base_url = target_url.clone();
let data: Vec<u8>; let data: Vec<u8>;
let mut document_encoding: String = str!(); let mut document_encoding: String = str!();
@ -214,16 +212,16 @@ fn main() {
dom = html_to_dom(&data, document_encoding.clone()); dom = html_to_dom(&data, document_encoding.clone());
// TODO: investigate if charset from filesystem/data URL/HTTP headers // TODO: investigate if charset from filesystem/data URL/HTTP headers
// has power over what's specified in HTML // has say over what's specified in HTML
// Attempt to determine document's charset // Attempt to determine document's charset
if let Some(charset) = get_charset(&dom.document) { if let Some(html_charset) = get_charset(&dom.document) {
if !charset.is_empty() { if !html_charset.is_empty() {
// Check if the charset specified inside HTML is valid // Check if the charset specified inside HTML is valid
if let Some(encoding) = Encoding::for_label(charset.as_bytes()) { if let Some(encoding) = Encoding::for_label_no_replacement(html_charset.as_bytes()) {
// No point in parsing HTML again with the same encoding as before // No point in parsing HTML again with the same encoding as before
if encoding.name() != "UTF-8" { if encoding.name() != "UTF-8" {
document_encoding = charset; document_encoding = html_charset;
dom = html_to_dom(&data, document_encoding.clone()); dom = html_to_dom(&data, document_encoding.clone());
} }
} }
@ -233,8 +231,8 @@ fn main() {
// Use custom base URL if specified, read and use what's in the DOM otherwise // Use custom base URL if specified, read and use what's in the DOM otherwise
let custom_base_url: String = options.base_url.clone().unwrap_or(str!()); let custom_base_url: String = options.base_url.clone().unwrap_or(str!());
if custom_base_url.is_empty() { if custom_base_url.is_empty() {
// No custom base URL is specified, // No custom base URL is specified
// try to see if the document has BASE tag // Try to see if document has BASE element
if let Some(existing_base_url) = get_base_url(&dom.document) { if let Some(existing_base_url) = get_base_url(&dom.document) {
base_url = resolve_url(&target_url, &existing_base_url); base_url = resolve_url(&target_url, &existing_base_url);
} }
@ -253,8 +251,7 @@ fn main() {
} }
} }
Err(_) => { Err(_) => {
// Failed to parse given base URL, // Failed to parse given base URL, perhaps it's a filesystem path?
// perhaps it's a filesystem path?
if target_url.scheme() == "file" { if target_url.scheme() == "file" {
// Relative paths could work for documents saved from filesystem // Relative paths could work for documents saved from filesystem
let path: &Path = Path::new(&custom_base_url); let path: &Path = Path::new(&custom_base_url);
@ -302,9 +299,9 @@ fn main() {
&options, &options,
0, 0,
) { ) {
Ok((data, final_url, media_type, _charset)) => { Ok((data, final_url, media_type, charset)) => {
// TODO: use charset let favicon_data_url: Url =
let favicon_data_url: Url = create_data_url(&media_type, &data, &final_url); create_data_url(&media_type, &charset, &data, &final_url);
dom = add_favicon(&dom.document, favicon_data_url.to_string()); dom = add_favicon(&dom.document, favicon_data_url.to_string());
} }
Err(_) => { Err(_) => {
@ -322,7 +319,7 @@ fn main() {
// Serialize DOM tree // Serialize DOM tree
let mut result: Vec<u8> = serialize_document(dom, document_encoding, &options); let mut result: Vec<u8> = serialize_document(dom, document_encoding, &options);
// Add metadata tag // Prepend metadata comment tag
if !options.no_metadata { if !options.no_metadata {
let mut metadata_comment: String = create_metadata_tag(&target_url); let mut metadata_comment: String = create_metadata_tag(&target_url);
metadata_comment += "\n"; metadata_comment += "\n";

@ -290,7 +290,7 @@ mod passing {
} }
#[test] #[test]
fn keeps_integrity_for_linked_assets() { fn keeps_integrity_for_unfamiliar_links() {
let html = "<title>Has integrity</title>\ let html = "<title>Has integrity</title>\
<link integrity=\"sha384-12345\" rel=\"something\" href=\"https://some-site.com/some-file.ext\" />"; <link integrity=\"sha384-12345\" rel=\"something\" href=\"https://some-site.com/some-file.ext\" />";
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
@ -322,7 +322,7 @@ mod passing {
} }
#[test] #[test]
fn discards_integrity_for_linked_assets_nojs_nocss() { fn discards_integrity_for_known_links_nojs_nocss() {
let html = "\ let html = "\
<title>No integrity</title>\ <title>No integrity</title>\
<link integrity=\"\" rel=\"stylesheet\" href=\"data:;\"/>\ <link integrity=\"\" rel=\"stylesheet\" href=\"data:;\"/>\
@ -403,8 +403,8 @@ mod passing {
let html = "\ let html = "\
<html>\ <html>\
<head>\ <head>\
<meta http-equiv=\"Refresh\" value=\"20\"/>\ <meta http-equiv=\"Refresh\" content=\"2\"/>\
<meta http-equiv=\"Location\" value=\"https://freebsd.org\"/>\ <meta http-equiv=\"Location\" content=\"https://freebsd.org\"/>\
</head>\ </head>\
<body>\ <body>\
</body>\ </body>\
@ -433,8 +433,8 @@ mod passing {
"\ "\
<html>\ <html>\
<head>\ <head>\
<meta http-equiv=\"disabled by monolith (Refresh)\" value=\"20\">\ <meta content=\"2\">\
<meta http-equiv=\"disabled by monolith (Location)\" value=\"https://freebsd.org\">\ <meta content=\"https://freebsd.org\">\
</head>\ </head>\
<body>\ <body>\
</body>\ </body>\

@ -13,9 +13,14 @@ mod passing {
#[test] #[test]
fn encode_string_with_specific_media_type() { fn encode_string_with_specific_media_type() {
let mime = "application/javascript"; let media_type = "application/javascript";
let data = "var word = 'hello';\nalert(word);\n"; let data = "var word = 'hello';\nalert(word);\n";
let data_url = url::create_data_url(mime, data.as_bytes(), &Url::parse("data:,").unwrap()); let data_url = url::create_data_url(
media_type,
"",
data.as_bytes(),
&Url::parse("data:,").unwrap(),
);
assert_eq!( assert_eq!(
data_url.as_str(), data_url.as_str(),
@ -28,6 +33,7 @@ mod passing {
let data = "<svg></svg>\n"; let data = "<svg></svg>\n";
let data_url = url::create_data_url( let data_url = url::create_data_url(
"image/svg+xml", "image/svg+xml",
"",
data.as_bytes(), data.as_bytes(),
&Url::parse("data:,").unwrap(), &Url::parse("data:,").unwrap(),
); );
@ -37,4 +43,67 @@ mod passing {
"" ""
); );
} }
#[test]
fn encode_string_with_specific_media_type_and_charset() {
let media_type = "application/javascript";
let charset = "utf8";
let data = "var word = 'hello';\nalert(word);\n";
let data_url = url::create_data_url(
media_type,
charset,
data.as_bytes(),
&Url::parse("data:,").unwrap(),
);
assert_eq!(
data_url.as_str(),
"data:application/javascript;charset=utf8;base64,dmFyIHdvcmQgPSAnaGVsbG8nOwphbGVydCh3b3JkKTsK"
);
}
#[test]
fn create_data_url_with_us_ascii_charset() {
let media_type = "";
let charset = "us-ascii";
let data = "";
let data_url = url::create_data_url(
media_type,
charset,
data.as_bytes(),
&Url::parse("data:,").unwrap(),
);
assert_eq!(data_url.as_str(), "data:;base64,");
}
#[test]
fn create_data_url_with_utf8_charset() {
let media_type = "";
let charset = "utf8";
let data = "";
let data_url = url::create_data_url(
media_type,
charset,
data.as_bytes(),
&Url::parse("data:,").unwrap(),
);
assert_eq!(data_url.as_str(), "data:;charset=utf8;base64,");
}
#[test]
fn create_data_url_with_media_type_text_plain_and_utf8_charset() {
let media_type = "text/plain";
let charset = "utf8";
let data = "";
let data_url = url::create_data_url(
media_type,
charset,
data.as_bytes(),
&Url::parse("data:,").unwrap(),
);
assert_eq!(data_url.as_str(), "data:text/plain;charset=utf8;base64,");
}
} }

@ -195,7 +195,7 @@ mod failing {
let dummy_url: Url = Url::parse("data:,").unwrap(); let dummy_url: Url = Url::parse("data:,").unwrap();
assert_eq!( assert_eq!(
utils::detect_media_type(b"abcdef0123456789", &dummy_url), utils::detect_media_type(b"abcdef0123456789", &dummy_url),
"application/octet-stream" ""
); );
} }
} }

@ -38,7 +38,7 @@ mod passing {
assert_eq!(&media_type, "text/html"); assert_eq!(&media_type, "text/html");
assert_eq!(&charset, "US-ASCII"); assert_eq!(&charset, "US-ASCII");
assert_eq!( assert_eq!(
url::create_data_url(&media_type, &data, &final_url), url::create_data_url(&media_type, &charset, &data, &final_url),
Url::parse("data:text/html;base64,dGFyZ2V0").unwrap(), Url::parse("data:text/html;base64,dGFyZ2V0").unwrap(),
); );
assert_eq!( assert_eq!(
@ -80,7 +80,7 @@ mod passing {
.unwrap(); .unwrap();
assert_eq!(&media_type, "application/javascript"); assert_eq!(&media_type, "application/javascript");
assert_eq!(&charset, ""); assert_eq!(&charset, "");
assert_eq!(url::create_data_url(&media_type, &data, &final_url), Url::parse("data:application/javascript;base64,ZG9jdW1lbnQuYm9keS5zdHlsZS5iYWNrZ3JvdW5kQ29sb3IgPSAiZ3JlZW4iOwpkb2N1bWVudC5ib2R5LnN0eWxlLmNvbG9yID0gInJlZCI7Cg==").unwrap()); assert_eq!(url::create_data_url(&media_type, &charset, &data, &final_url), Url::parse("data:application/javascript;base64,ZG9jdW1lbnQuYm9keS5zdHlsZS5iYWNrZ3JvdW5kQ29sb3IgPSAiZ3JlZW4iOwpkb2N1bWVudC5ib2R5LnN0eWxlLmNvbG9yID0gInJlZCI7Cg==").unwrap());
assert_eq!( assert_eq!(
final_url, final_url,
Url::parse(&format!( Url::parse(&format!(

@ -12,7 +12,8 @@ pub fn clean_url(url: Url) -> Url {
url url
} }
pub fn create_data_url(media_type: &str, data: &[u8], final_asset_url: &Url) -> Url { pub fn create_data_url(media_type: &str, charset: &str, data: &[u8], final_asset_url: &Url) -> Url {
// TODO: move this block out of this function
let media_type: String = if media_type.is_empty() { let media_type: String = if media_type.is_empty() {
detect_media_type(data, &final_asset_url) detect_media_type(data, &final_asset_url)
} else { } else {
@ -21,7 +22,14 @@ pub fn create_data_url(media_type: &str, data: &[u8], final_asset_url: &Url) ->
let mut data_url: Url = Url::parse("data:,").unwrap(); let mut data_url: Url = Url::parse("data:,").unwrap();
data_url.set_path(format!("{};base64,{}", media_type, base64::encode(data)).as_str()); let c: String =
if !charset.trim().is_empty() && !charset.trim().eq_ignore_ascii_case("US-ASCII") {
format!(";charset={}", charset.trim())
} else {
str!()
};
data_url.set_path(format!("{}{};base64,{}", media_type, c, base64::encode(data)).as_str());
data_url data_url
} }

@ -84,9 +84,9 @@ pub fn detect_media_type_by_file_name(filename: &str) -> String {
"woff" => "font/woff", "woff" => "font/woff",
"woff2" => "font/woff2", "woff2" => "font/woff2",
"xml" => "text/xml", "xml" => "text/xml",
&_ => "application/octet-stream", &_ => "",
}, },
None => "application/octet-stream", None => "",
}; };
mime.to_string() mime.to_string()

Loading…
Cancel
Save