improve data URL media type detection

4 years ago · 594ad55bd8
parent d2615f51dc
commit 594ad55bd8
4 changed files with 68 additions and 40 deletions
--- a/src/main.rs
+++ b/src/main.rs
@@ -122,9 +122,9 @@ fn main() {
        base_url = final_url;
        dom = html_to_dom(&data);
    } else if is_data_url(target_url) {
-        let text: String = data_url_to_text(target_url);
-        if text.len() == 0 {
-            eprintln!("Unsupported data URL input");
+        let (media_type, text): (String, String) = data_url_to_text(target_url);
+        if !media_type.eq_ignore_ascii_case("text/html") {
+            eprintln!("Unsupported data URL media type");
            process::exit(1);
        }
        base_url = str!(target_url);
--- a/src/tests/cli.rs
+++ b/src/tests/cli.rs
@@ -62,7 +62,7 @@ fn passing_bad_input_data_url() -> Result<(), Box<dyn std::error::Error>> {
    // STDERR should contain error description
    assert_eq!(
        std::str::from_utf8(&out.stderr).unwrap(),
-        "Unsupported data URL input\n"
+        "Unsupported data URL media type\n"
    );

    // The exit code should be 1
--- a/src/tests/utils/data_url_to_text.rs
+++ b/src/tests/utils/data_url_to_text.rs
@@ -9,48 +9,74 @@ use crate::utils;

 #[test]
 fn passing_parse_text_html_base64() {
+    let (media_type, text) = utils::data_url_to_text("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg==");
+
+    assert_eq!(media_type, "text/html");
    assert_eq!(
-        utils::data_url_to_text("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg=="),
+        text,
        "Work expands so as to fill the time available for its completion"
    );
 }

 #[test]
 fn passing_parse_text_html_utf8() {
+    let (media_type, text) = utils::data_url_to_text(
+        "data:text/html;utf8,Work expands so as to fill the time available for its completion",
+    );
+
+    assert_eq!(media_type, "text/html");
    assert_eq!(
-        utils::data_url_to_text(
-            "data:text/html;utf8,Work expands so as to fill the time available for its completion"
-        ),
+        text,
        "Work expands so as to fill the time available for its completion"
    );
 }

 #[test]
 fn passing_parse_text_html_plaintext() {
+    let (media_type, text) = utils::data_url_to_text(
+        "data:text/html,Work expands so as to fill the time available for its completion",
+    );
+
+    assert_eq!(media_type, "text/html");
    assert_eq!(
-        utils::data_url_to_text(
-            "data:text/html,Work expands so as to fill the time available for its completion"
-        ),
+        text,
        "Work expands so as to fill the time available for its completion"
    );
 }

 #[test]
 fn passing_parse_text_html_charset_utf_8_between_two_whitespaces() {
+    let (media_type, text) = utils::data_url_to_text(" data:text/html;charset=utf-8,Work expands so as to fill the time available for its completion ");
+
+    assert_eq!(media_type, "text/html");
    assert_eq!(
-        utils::data_url_to_text(
-            " data:text/html;charset=utf-8,Work expands so as to fill the time available for its completion "
-        ),
+        text,
        "Work expands so as to fill the time available for its completion"
    );
 }

 #[test]
 fn passing_parse_text_css_url_encoded() {
-    assert_eq!(
-        utils::data_url_to_text("data:text/css,div{background-color:%23000}"),
-        "div{background-color:#000}"
-    );
+    let (media_type, text) = utils::data_url_to_text("data:text/css,div{background-color:%23000}");
+
+    assert_eq!(media_type, "text/css");
+    assert_eq!(text, "div{background-color:#000}");
+}
+
+#[test]
+fn passing_parse_no_media_type_base64() {
+    let (media_type, text) = utils::data_url_to_text("data:;base64,dGVzdA==");
+
+    assert_eq!(media_type, "");
+    assert_eq!(text, "test");
+}
+
+#[test]
+fn passing_parse_no_media_type_no_encoding() {
+    let (media_type, text) = utils::data_url_to_text("data:;,test%20test");
+
+    assert_eq!(media_type, "");
+    assert_eq!(text, "test test");
 }

 //  ███████╗ █████╗ ██╗██╗     ██╗███╗   ██╗ ██████╗
@@ -62,5 +88,8 @@ fn passing_parse_text_css_url_encoded() {

 #[test]
 fn failing_just_word_data() {
-    assert_eq!(utils::data_url_to_text("data"), "");
+    let (media_type, text) = utils::data_url_to_text("data");
+
+    assert_eq!(media_type, "");
+    assert_eq!(text, "");
 }
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -133,50 +133,47 @@ pub fn clean_url<T: AsRef<str>>(url: T) -> String {
    result.to_string()
 }

-pub fn data_url_to_text<T: AsRef<str>>(url: T) -> String {
-    let parsed_url = Url::parse(url.as_ref()).unwrap_or(Url::parse("http://[::1]").unwrap());
+pub fn data_url_to_text<T: AsRef<str>>(url: T) -> (String, String) {
+    let parsed_url = Url::parse(url.as_ref()).unwrap_or(Url::parse("data:,").unwrap());
    let path: String = parsed_url.path().to_string();
    let comma_loc: usize = path.find(',').unwrap_or(path.len());

-    if comma_loc == path.len() {
-        return str!();
-    }
-
    let meta_data: String = path.chars().take(comma_loc).collect();
    let raw_data: String = path.chars().skip(comma_loc + 1).collect();

    let data: String = decode_url(raw_data);

    let meta_data_items: Vec<&str> = meta_data.split(';').collect();
-    let mut media_type: &str = "";
    let mut encoding: &str = "";

-    // Detect media type and encoding
+    let mut media_type: String = str!();
+    let mut text: String = str!();
+
    let mut i: i8 = 0;
    for item in &meta_data_items {
        if i == 0 {
-            if is_plaintext_media_type(item) {
-                media_type = item;
-                continue;
+            media_type = str!(item);
+        } else {
+            if item.eq_ignore_ascii_case("base64")
+                || item.eq_ignore_ascii_case("utf8")
+                || item.eq_ignore_ascii_case("charset=UTF-8")
+            {
+                encoding = item;
            }
        }

-        if item.eq_ignore_ascii_case("base64") || item.eq_ignore_ascii_case("utf8") {
-            encoding = item;
-        }
-
        i = i + 1;
    }

-    if is_plaintext_media_type(media_type) {
+    if is_plaintext_media_type(&media_type) || media_type.is_empty() {
        if encoding.eq_ignore_ascii_case("base64") {
-            String::from_utf8(base64::decode(&data).unwrap_or(vec![])).unwrap_or(str!())
+            text = String::from_utf8(base64::decode(&data).unwrap_or(vec![])).unwrap_or(str!())
        } else {
-            data
+            text = data
        }
-    } else {
-        str!()
    }
+
+    (media_type, text)
 }

 pub fn decode_url(input: String) -> String {
@@ -238,7 +235,9 @@ pub fn retrieve_asset(
        if as_data_url {
            Ok((url.to_string(), url.to_string()))
        } else {
-            Ok((data_url_to_text(url), url.to_string()))
+            let (_media_type, text) = data_url_to_text(url);
+
+            Ok((text, url.to_string()))
        }
    } else if is_file_url(&url) {
        // Check if parent_url is also file:///