From 125aeeec3b7558a9387d737bfdc0f0069ac31d46 Mon Sep 17 00:00:00 2001 From: Sunshine Date: Tue, 8 Jun 2021 11:50:46 -1000 Subject: [PATCH] improve validation of charset found in HTML, use genuinely infinite timeout --- src/main.rs | 45 +++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/src/main.rs b/src/main.rs index 03181b3..6b5e8e4 100644 --- a/src/main.rs +++ b/src/main.rs @@ -86,7 +86,6 @@ fn main() { } let target_url: Url; - let mut base_url: Url; let mut use_stdin: bool = false; // Determine exact target URL @@ -156,20 +155,19 @@ fn main() { HeaderValue::from_str(&user_agent).expect("Invalid User-Agent header specified"), ); } - let timeout: u64 = if options.timeout > 0 { - options.timeout + let client = if options.timeout > 0 { + Client::builder().timeout(Duration::from_secs(options.timeout)) } else { - std::u64::MAX / 4 // This is pretty close to infinity - }; - let client = Client::builder() - .timeout(Duration::from_secs(timeout)) - .danger_accept_invalid_certs(options.insecure) - .default_headers(header_map) - .build() - .expect("Failed to initialize HTTP client"); - - // At this stage we assume that the base URL is the same as the target URL - base_url = target_url.clone(); + // No timeout is default + Client::builder() + } + .danger_accept_invalid_certs(options.insecure) + .default_headers(header_map) + .build() + .expect("Failed to initialize HTTP client"); + + // At first we assume that base URL is the same as target URL + let mut base_url: Url = target_url.clone(); let data: Vec; let mut document_encoding: String = str!(); @@ -214,16 +212,16 @@ fn main() { dom = html_to_dom(&data, document_encoding.clone()); // TODO: investigate if charset from filesystem/data URL/HTTP headers - // has power over what's specified in HTML + // has say over what's specified in HTML // Attempt to determine document's charset - if let Some(charset) = get_charset(&dom.document) { - if !charset.is_empty() { + if let Some(html_charset) = get_charset(&dom.document) { + if !html_charset.is_empty() { // Check if the charset specified inside HTML is valid - if let Some(encoding) = Encoding::for_label(charset.as_bytes()) { + if let Some(encoding) = Encoding::for_label_no_replacement(html_charset.as_bytes()) { // No point in parsing HTML again with the same encoding as before if encoding.name() != "UTF-8" { - document_encoding = charset; + document_encoding = html_charset; dom = html_to_dom(&data, document_encoding.clone()); } } @@ -233,8 +231,8 @@ fn main() { // Use custom base URL if specified, read and use what's in the DOM otherwise let custom_base_url: String = options.base_url.clone().unwrap_or(str!()); if custom_base_url.is_empty() { - // No custom base URL is specified, - // try to see if the document has BASE tag + // No custom base URL is specified + // Try to see if document has BASE element if let Some(existing_base_url) = get_base_url(&dom.document) { base_url = resolve_url(&target_url, &existing_base_url); } @@ -253,8 +251,7 @@ fn main() { } } Err(_) => { - // Failed to parse given base URL, - // perhaps it's a filesystem path? + // Failed to parse given base URL, perhaps it's a filesystem path? if target_url.scheme() == "file" { // Relative paths could work for documents saved from filesystem let path: &Path = Path::new(&custom_base_url); @@ -322,7 +319,7 @@ fn main() { // Serialize DOM tree let mut result: Vec = serialize_document(dom, document_encoding, &options); - // Add metadata tag + // Prepend metadata comment tag if !options.no_metadata { let mut metadata_comment: String = create_metadata_tag(&target_url); metadata_comment += "\n";