From 44cac65a83486f629260a82ab5788f906265474f Mon Sep 17 00:00:00 2001 From: Sunshine Date: Thu, 25 Jun 2020 23:53:20 -0400 Subject: [PATCH] automatically remove "Refresh" and "Location" META tags --- .../0006-reload-and-location-meta-nodes.md | 19 +++++ src/html.rs | 19 +++++ src/tests/html/walk_and_embed_assets.rs | 74 +++++++++++++++---- 3 files changed, 99 insertions(+), 13 deletions(-) create mode 100644 docs/arch/0006-reload-and-location-meta-nodes.md diff --git a/docs/arch/0006-reload-and-location-meta-nodes.md b/docs/arch/0006-reload-and-location-meta-nodes.md new file mode 100644 index 0000000..3252b61 --- /dev/null +++ b/docs/arch/0006-reload-and-location-meta-nodes.md @@ -0,0 +1,19 @@ +# 6. Reload and location `meta` nodes + +Date: 2020-06-25 + +## Status + +Accepted + +## Context + +HTML documents may contain `meta` tags capable of automatically refreshing the page or redirecting to another location. + +## Decision + +Since the resulting document is saved to disk and generally not intended to be served over the network, it only makes sense to remove `meta` nodes that have an `http-equiv` attribute equal to "Refresh" or "Location" in order to prevent them from forcing the page to reload or redirect to another location. + +## Consequences + +Monolith will ensure that saved documents do not contain `meta` tags capable of changing location or reloading the page. 
diff --git a/src/html.rs b/src/html.rs index f16f4bc..c49148e 100644 --- a/src/html.rs +++ b/src/html.rs @@ -170,6 +170,25 @@ pub fn walk_and_embed_assets( let attrs_mut = &mut attrs.borrow_mut(); match name.local.as_ref() { + "meta" => { + // Determine type + let mut is_unwanted_meta: bool = false; + for attr in attrs_mut.iter_mut() { + let attr_name: &str = &attr.name.local; + if attr_name.eq_ignore_ascii_case("http-equiv") { + let value: String = attr.value.to_string(); + is_unwanted_meta = value.eq_ignore_ascii_case("refresh") + || value.eq_ignore_ascii_case("location"); + } + } + + if is_unwanted_meta { + // Strip this node of all its attributes + while attrs_mut.len() > 0 { + attrs_mut.remove(0); + } + } + } "link" => { // Remove integrity attributes, keep value of the last one let mut integrity: String = str!(); diff --git a/src/tests/html/walk_and_embed_assets.rs b/src/tests/html/walk_and_embed_assets.rs index 9dbec0c..aad8019 100644 --- a/src/tests/html/walk_and_embed_assets.rs +++ b/src/tests/html/walk_and_embed_assets.rs @@ -211,15 +211,15 @@ mod passing { buf.iter().map(|&c| c as char).collect::(), format!( "\ - \ - \ - \ - \ -
\ - \ -
\ - \ - ", + \ + \ + \ + \ +
\ + \ +
\ + \ + ", empty_image = empty_image!() ) ); @@ -341,8 +341,8 @@ mod passing { #[test] fn no_js() { let html = "
\ - \ - \ + \ + \
"; let dom = html::html_to_dom(&html); let url = "http://localhost"; @@ -381,7 +381,7 @@ mod passing { } #[test] - fn with_no_integrity() { + fn discards_integrity() { let html = "No integrity\ \ "; @@ -415,8 +415,56 @@ mod passing { assert_eq!( buf.iter().map(|&c| c as char).collect::(), "\ - No integrity\ + No integrity\ + \ + " + ); + } + + #[test] + fn removes_unwanted_meta_tags() { + let html = "\ + \ + \ + \ + \ \ + "; + let dom = html::html_to_dom(&html); + let url = "http://localhost"; + let cache = &mut HashMap::new(); + let client = Client::new(); + let opt_no_css: bool = true; + let opt_no_fonts: bool = false; + let opt_no_frames: bool = true; + let opt_no_js: bool = true; + let opt_no_images: bool = true; + let opt_silent = true; + + html::walk_and_embed_assets( + cache, + &client, + &url, + &dom.document, + opt_no_css, + opt_no_fonts, + opt_no_frames, + opt_no_js, + opt_no_images, + opt_silent, + ); + + let mut buf: Vec = Vec::new(); + serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); + + assert_eq!( + buf.iter().map(|&c| c as char).collect::(), + "\ + \ + \ + \ + \ + \ " ); }