From c938ba6a2f3a806e700cfe747b87d4e51d6d3cc7 Mon Sep 17 00:00:00 2001 From: Sunshine Date: Tue, 8 Jun 2021 04:49:14 -1000 Subject: [PATCH 1/4] modify proper attribute for (i)frame elements --- src/html.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/html.rs b/src/html.rs index fcc99fb..1b4130b 100644 --- a/src/html.rs +++ b/src/html.rs @@ -1094,7 +1094,7 @@ pub fn walk_and_embed_assets( client, &document_url, node, - "href", + "src", &frame_attr_src_value, options, depth, From 125aeeec3b7558a9387d737bfdc0f0069ac31d46 Mon Sep 17 00:00:00 2001 From: Sunshine Date: Tue, 8 Jun 2021 11:50:46 -1000 Subject: [PATCH 2/4] improve validation of charset found in HTML, use genuinely infinite timeout --- src/main.rs | 45 +++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/src/main.rs b/src/main.rs index 03181b3..6b5e8e4 100644 --- a/src/main.rs +++ b/src/main.rs @@ -86,7 +86,6 @@ fn main() { } let target_url: Url; - let mut base_url: Url; let mut use_stdin: bool = false; // Determine exact target URL @@ -156,20 +155,19 @@ fn main() { HeaderValue::from_str(&user_agent).expect("Invalid User-Agent header specified"), ); } - let timeout: u64 = if options.timeout > 0 { - options.timeout + let client = if options.timeout > 0 { + Client::builder().timeout(Duration::from_secs(options.timeout)) } else { - std::u64::MAX / 4 // This is pretty close to infinity - }; - let client = Client::builder() - .timeout(Duration::from_secs(timeout)) - .danger_accept_invalid_certs(options.insecure) - .default_headers(header_map) - .build() - .expect("Failed to initialize HTTP client"); - - // At this stage we assume that the base URL is the same as the target URL - base_url = target_url.clone(); + // No timeout is default + Client::builder() + } + .danger_accept_invalid_certs(options.insecure) + .default_headers(header_map) + .build() + .expect("Failed to initialize HTTP client"); + + // At first we assume that base URL is the same as target URL + let mut base_url: Url = target_url.clone(); let data: Vec; let mut document_encoding: String = str!(); @@ -214,16 +212,16 @@ fn main() { dom = html_to_dom(&data, document_encoding.clone()); // TODO: investigate if charset from filesystem/data URL/HTTP headers - // has power over what's specified in HTML + // has say over what's specified in HTML // Attempt to determine document's charset - if let Some(charset) = get_charset(&dom.document) { - if !charset.is_empty() { + if let Some(html_charset) = get_charset(&dom.document) { + if !html_charset.is_empty() { // Check if the charset specified inside HTML is valid - if let Some(encoding) = Encoding::for_label(charset.as_bytes()) { + if let Some(encoding) = Encoding::for_label_no_replacement(html_charset.as_bytes()) { // No point in parsing HTML again with the same encoding as before if encoding.name() != "UTF-8" { - document_encoding = charset; + document_encoding = html_charset; dom = html_to_dom(&data, document_encoding.clone()); } } @@ -233,8 +231,8 @@ fn main() { // Use custom base URL if specified, read and use what's in the DOM otherwise let custom_base_url: String = options.base_url.clone().unwrap_or(str!()); if custom_base_url.is_empty() { - // No custom base URL is specified, - // try to see if the document has BASE tag + // No custom base URL is specified + // Try to see if document has BASE element if let Some(existing_base_url) = get_base_url(&dom.document) { base_url = resolve_url(&target_url, &existing_base_url); } @@ -253,8 +251,7 @@ fn main() { } } Err(_) => { - // Failed to parse given base URL, - // perhaps it's a filesystem path? + // Failed to parse given base URL, perhaps it's a filesystem path? if target_url.scheme() == "file" { // Relative paths could work for documents saved from filesystem let path: &Path = Path::new(&custom_base_url); @@ -322,7 +319,7 @@ fn main() { // Serialize DOM tree let mut result: Vec = serialize_document(dom, document_encoding, &options); - // Add metadata tag + // Prepend metadata comment tag if !options.no_metadata { let mut metadata_comment: String = create_metadata_tag(&target_url); metadata_comment += "\n"; From 5effa38392ce5b6bb5b07f9a0a645d729bacca43 Mon Sep 17 00:00:00 2001 From: Sunshine Date: Tue, 8 Jun 2021 12:25:19 -1000 Subject: [PATCH 3/4] use proper charset detection for linked assets --- src/html.rs | 34 +++++++++++-------------- src/tests/html/walk_and_embed_assets.rs | 12 ++++----- 2 files changed, 21 insertions(+), 25 deletions(-) diff --git a/src/html.rs b/src/html.rs index 1b4130b..d9a64f9 100644 --- a/src/html.rs +++ b/src/html.rs @@ -606,7 +606,7 @@ pub fn retrieve_and_embed_asset( options, depth + 1, ) { - Ok((data, final_url, mut media_type, _charset)) => { + Ok((data, final_url, mut media_type, charset)) => { let node_name: &str = get_node_name(&node).unwrap(); // Check integrity if it's a LINK or SCRIPT element @@ -624,23 +624,25 @@ pub fn retrieve_and_embed_asset( } if ok_to_include { + let s: String; + if let Some(encoding) = Encoding::for_label(charset.as_bytes()) { + let (string, _, _) = encoding.decode(&data); + s = string.to_string(); + } else { + s = String::from_utf8_lossy(&data).to_string(); + } + if node_name == "link" && determine_link_node_type(node) == "stylesheet" { // Stylesheet LINK elements require special treatment - let css: String = embed_css( - cache, - client, - &final_url, - &String::from_utf8_lossy(&data), - options, - depth + 1, - ); + let css: String = embed_css(cache, client, &final_url, &s, options, depth + 1); // Create and embed data URL - let css_data_url = create_data_url("text/css", css.as_bytes(), &final_url); + // TODO: use charset + let css_data_url = create_data_url(&media_type, css.as_bytes(), &final_url); set_node_attr(&node, attr_name, Some(css_data_url.to_string())); } else if node_name == "frame" || node_name == "iframe" { // (I)FRAMEs are also quite different from conventional resources - let frame_dom = html_to_dom(&data, "utf-8".to_string()); + let frame_dom = html_to_dom(&data, charset); walk_and_embed_assets( cache, client, @@ -679,6 +681,7 @@ pub fn retrieve_and_embed_asset( } // Create and embed data URL + // TODO: use charset let mut data_url = create_data_url(&media_type, &data, &final_url); data_url.set_fragment(resolved_url.fragment()); set_node_attr(node, attr_name, Some(data_url.to_string())); @@ -725,14 +728,7 @@ pub fn walk_and_embed_assets( || meta_attr_http_equiv_value.eq_ignore_ascii_case("location") { // Remove http-equiv attributes from META nodes if they're able to control the page - set_node_attr( - &node, - "http-equiv", - Some(format!( - "disabled by monolith ({})", - meta_attr_http_equiv_value - )), - ); + set_node_attr(&node, "http-equiv", None); } } } diff --git a/src/tests/html/walk_and_embed_assets.rs b/src/tests/html/walk_and_embed_assets.rs index 8755cda..d06e2a1 100644 --- a/src/tests/html/walk_and_embed_assets.rs +++ b/src/tests/html/walk_and_embed_assets.rs @@ -290,7 +290,7 @@ mod passing { } #[test] - fn keeps_integrity_for_linked_assets() { + fn keeps_integrity_for_unfamiliar_links() { let html = "Has integrity\ "; let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); @@ -322,7 +322,7 @@ mod passing { } #[test] - fn discards_integrity_for_linked_assets_nojs_nocss() { + fn discards_integrity_for_known_links_nojs_nocss() { let html = "\ No integrity\ \ @@ -403,8 +403,8 @@ mod passing { let html = "\ \ \ - \ - \ + \ + \ \ \ \ @@ -433,8 +433,8 @@ mod passing { "\ \ \ - \ - \ + \ + \ \ \ \ From 4bc8043f0fab2d65b00378637b15d9b6766a56a1 Mon Sep 17 00:00:00 2001 From: Sunshine Date: Tue, 8 Jun 2021 12:54:16 -1000 Subject: [PATCH 4/4] account for charset when creating data URLs --- src/css.rs | 15 +++--- src/html.rs | 24 ++++----- src/main.rs | 6 +-- src/tests/url/create_data_url.rs | 73 +++++++++++++++++++++++++++- src/tests/utils/detect_media_type.rs | 2 +- src/tests/utils/retrieve_asset.rs | 4 +- src/url.rs | 12 ++++- src/utils.rs | 4 +- 8 files changed, 111 insertions(+), 29 deletions(-) diff --git a/src/css.rs b/src/css.rs index f19ac0f..40f5810 100644 --- a/src/css.rs +++ b/src/css.rs @@ -202,10 +202,11 @@ pub fn process_css<'a>( import_contents, import_final_url, import_media_type, - _import_charset, + import_charset, )) => { let mut import_data_url = create_data_url( &import_media_type, + &import_charset, embed_css( cache, client, @@ -252,9 +253,9 @@ pub fn process_css<'a>( options, depth + 1, ) { - Ok((data, final_url, media_type, _charset)) => { + Ok((data, final_url, media_type, charset)) => { let mut data_url = - create_data_url(&media_type, &data, &final_url); + create_data_url(&media_type, &charset, &data, &final_url); data_url.set_fragment(resolved_url.fragment()); result.push_str( format_quoted_string(&data_url.to_string()).as_str(), @@ -348,9 +349,10 @@ pub fn process_css<'a>( options, depth + 1, ) { - Ok((css, final_url, media_type, _charset)) => { + Ok((css, final_url, media_type, charset)) => { let mut data_url = create_data_url( &media_type, + &charset, embed_css( cache, client, @@ -386,8 +388,9 @@ pub fn process_css<'a>( options, depth + 1, ) { - Ok((data, final_url, media_type, _charset)) => { - let mut data_url = create_data_url(&media_type, &data, &final_url); + Ok((data, final_url, media_type, charset)) => { + let mut data_url = + create_data_url(&media_type, &charset, &data, &final_url); data_url.set_fragment(full_url.fragment()); result .push_str(format_quoted_string(&data_url.to_string()).as_str()); diff --git a/src/html.rs b/src/html.rs index d9a64f9..25abcf3 100644 --- a/src/html.rs +++ b/src/html.rs @@ -188,10 +188,13 @@ pub fn embed_srcset( options, depth + 1, ) { - Ok((image_data, image_final_url, image_media_type, _image_charset)) => { - // TODO: use image_charset - let mut image_data_url = - create_data_url(&image_media_type, &image_data, &image_final_url); + Ok((image_data, image_final_url, image_media_type, image_charset)) => { + let mut image_data_url = create_data_url( + &image_media_type, + &image_charset, + &image_data, + &image_final_url, + ); // Append retreved asset as a data URL image_data_url.set_fragment(image_full_url.fragment()); result.push_str(image_data_url.as_ref()); @@ -637,12 +640,12 @@ pub fn retrieve_and_embed_asset( let css: String = embed_css(cache, client, &final_url, &s, options, depth + 1); // Create and embed data URL - // TODO: use charset - let css_data_url = create_data_url(&media_type, css.as_bytes(), &final_url); + let css_data_url = + create_data_url(&media_type, &charset, css.as_bytes(), &final_url); set_node_attr(&node, attr_name, Some(css_data_url.to_string())); } else if node_name == "frame" || node_name == "iframe" { // (I)FRAMEs are also quite different from conventional resources - let frame_dom = html_to_dom(&data, charset); + let frame_dom = html_to_dom(&data, charset.clone()); walk_and_embed_assets( cache, client, @@ -661,8 +664,8 @@ pub fn retrieve_and_embed_asset( .unwrap(); // Create and embed data URL - // TODO: use charset - let mut frame_data_url = create_data_url(&media_type, &frame_data, &final_url); + let mut frame_data_url = + create_data_url(&media_type, &charset, &frame_data, &final_url); frame_data_url.set_fragment(resolved_url.fragment()); set_node_attr(node, attr_name, Some(frame_data_url.to_string())); } else { @@ -681,8 +684,7 @@ pub fn retrieve_and_embed_asset( } // Create and embed data URL - // TODO: use charset - let mut data_url = create_data_url(&media_type, &data, &final_url); + let mut data_url = create_data_url(&media_type, &charset, &data, &final_url); data_url.set_fragment(resolved_url.fragment()); set_node_attr(node, attr_name, Some(data_url.to_string())); } diff --git a/src/main.rs b/src/main.rs index 6b5e8e4..7798759 100644 --- a/src/main.rs +++ b/src/main.rs @@ -299,9 +299,9 @@ fn main() { &options, 0, ) { - Ok((data, final_url, media_type, _charset)) => { - // TODO: use charset - let favicon_data_url: Url = create_data_url(&media_type, &data, &final_url); + Ok((data, final_url, media_type, charset)) => { + let favicon_data_url: Url = + create_data_url(&media_type, &charset, &data, &final_url); dom = add_favicon(&dom.document, favicon_data_url.to_string()); } Err(_) => { diff --git a/src/tests/url/create_data_url.rs b/src/tests/url/create_data_url.rs index 873dbda..f6c3cd5 100644 --- a/src/tests/url/create_data_url.rs +++ b/src/tests/url/create_data_url.rs @@ -13,9 +13,14 @@ mod passing { #[test] fn encode_string_with_specific_media_type() { - let mime = "application/javascript"; + let media_type = "application/javascript"; let data = "var word = 'hello';\nalert(word);\n"; - let data_url = url::create_data_url(mime, data.as_bytes(), &Url::parse("data:,").unwrap()); + let data_url = url::create_data_url( + media_type, + "", + data.as_bytes(), + &Url::parse("data:,").unwrap(), + ); assert_eq!( data_url.as_str(), @@ -28,6 +33,7 @@ mod passing { let data = "\n"; let data_url = url::create_data_url( "image/svg+xml", + "", data.as_bytes(), &Url::parse("data:,").unwrap(), ); @@ -37,4 +43,67 @@ mod passing { "" ); } + + #[test] + fn encode_string_with_specific_media_type_and_charset() { + let media_type = "application/javascript"; + let charset = "utf8"; + let data = "var word = 'hello';\nalert(word);\n"; + let data_url = url::create_data_url( + media_type, + charset, + data.as_bytes(), + &Url::parse("data:,").unwrap(), + ); + + assert_eq!( + data_url.as_str(), + "data:application/javascript;charset=utf8;base64,dmFyIHdvcmQgPSAnaGVsbG8nOwphbGVydCh3b3JkKTsK" + ); + } + + #[test] + fn create_data_url_with_us_ascii_charset() { + let media_type = ""; + let charset = "us-ascii"; + let data = ""; + let data_url = url::create_data_url( + media_type, + charset, + data.as_bytes(), + &Url::parse("data:,").unwrap(), + ); + + assert_eq!(data_url.as_str(), "data:;base64,"); + } + + #[test] + fn create_data_url_with_utf8_charset() { + let media_type = ""; + let charset = "utf8"; + let data = ""; + let data_url = url::create_data_url( + media_type, + charset, + data.as_bytes(), + &Url::parse("data:,").unwrap(), + ); + + assert_eq!(data_url.as_str(), "data:;charset=utf8;base64,"); + } + + #[test] + fn create_data_url_with_media_type_text_plain_and_utf8_charset() { + let media_type = "text/plain"; + let charset = "utf8"; + let data = ""; + let data_url = url::create_data_url( + media_type, + charset, + data.as_bytes(), + &Url::parse("data:,").unwrap(), + ); + + assert_eq!(data_url.as_str(), "data:text/plain;charset=utf8;base64,"); + } } diff --git a/src/tests/utils/detect_media_type.rs b/src/tests/utils/detect_media_type.rs index 707fc44..970af13 100644 --- a/src/tests/utils/detect_media_type.rs +++ b/src/tests/utils/detect_media_type.rs @@ -195,7 +195,7 @@ mod failing { let dummy_url: Url = Url::parse("data:,").unwrap(); assert_eq!( utils::detect_media_type(b"abcdef0123456789", &dummy_url), - "application/octet-stream" + "" ); } } diff --git a/src/tests/utils/retrieve_asset.rs b/src/tests/utils/retrieve_asset.rs index eee881d..1d12559 100644 --- a/src/tests/utils/retrieve_asset.rs +++ b/src/tests/utils/retrieve_asset.rs @@ -38,7 +38,7 @@ mod passing { assert_eq!(&media_type, "text/html"); assert_eq!(&charset, "US-ASCII"); assert_eq!( - url::create_data_url(&media_type, &data, &final_url), + url::create_data_url(&media_type, &charset, &data, &final_url), Url::parse("data:text/html;base64,dGFyZ2V0").unwrap(), ); assert_eq!( @@ -80,7 +80,7 @@ mod passing { .unwrap(); assert_eq!(&media_type, "application/javascript"); assert_eq!(&charset, ""); - assert_eq!(url::create_data_url(&media_type, &data, &final_url), Url::parse("data:application/javascript;base64,ZG9jdW1lbnQuYm9keS5zdHlsZS5iYWNrZ3JvdW5kQ29sb3IgPSAiZ3JlZW4iOwpkb2N1bWVudC5ib2R5LnN0eWxlLmNvbG9yID0gInJlZCI7Cg==").unwrap()); + assert_eq!(url::create_data_url(&media_type, &charset, &data, &final_url), Url::parse("data:application/javascript;base64,ZG9jdW1lbnQuYm9keS5zdHlsZS5iYWNrZ3JvdW5kQ29sb3IgPSAiZ3JlZW4iOwpkb2N1bWVudC5ib2R5LnN0eWxlLmNvbG9yID0gInJlZCI7Cg==").unwrap()); assert_eq!( final_url, Url::parse(&format!( diff --git a/src/url.rs b/src/url.rs index eea0bb3..8f0221b 100644 --- a/src/url.rs +++ b/src/url.rs @@ -12,7 +12,8 @@ pub fn clean_url(url: Url) -> Url { url } -pub fn create_data_url(media_type: &str, data: &[u8], final_asset_url: &Url) -> Url { +pub fn create_data_url(media_type: &str, charset: &str, data: &[u8], final_asset_url: &Url) -> Url { + // TODO: move this block out of this function let media_type: String = if media_type.is_empty() { detect_media_type(data, &final_asset_url) } else { @@ -21,7 +22,14 @@ pub fn create_data_url(media_type: &str, data: &[u8], final_asset_url: &Url) -> let mut data_url: Url = Url::parse("data:,").unwrap(); - data_url.set_path(format!("{};base64,{}", media_type, base64::encode(data)).as_str()); + let c: String = + if !charset.trim().is_empty() && !charset.trim().eq_ignore_ascii_case("US-ASCII") { + format!(";charset={}", charset.trim()) + } else { + str!() + }; + + data_url.set_path(format!("{}{};base64,{}", media_type, c, base64::encode(data)).as_str()); data_url } diff --git a/src/utils.rs b/src/utils.rs index 012b419..1bd2b65 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -84,9 +84,9 @@ pub fn detect_media_type_by_file_name(filename: &str) -> String { "woff" => "font/woff", "woff2" => "font/woff2", "xml" => "text/xml", - &_ => "application/octet-stream", + &_ => "", }, - None => "application/octet-stream", + None => "", }; mime.to_string()