Whitespace for unwrapped content & fix unwrap bug

This commit is contained in:
Tyler Hallada 2020-04-25 21:21:36 -04:00
parent fce50554a3
commit 00593d3c58
3 changed files with 96 additions and 2 deletions

View File

@ -123,6 +123,7 @@ impl<'arena> Node<'arena> {
if let Some(next_sibling) = next_sibling { if let Some(next_sibling) = next_sibling {
if let Some(last_child) = last_child { if let Some(last_child) = last_child {
next_sibling.previous_sibling.set(Some(last_child)); next_sibling.previous_sibling.set(Some(last_child));
last_child.next_sibling.set(Some(next_sibling));
} else { } else {
next_sibling.previous_sibling.set(previous_sibling); next_sibling.previous_sibling.set(previous_sibling);
} }
@ -138,6 +139,7 @@ impl<'arena> Node<'arena> {
if let Some(previous_sibling) = previous_sibling { if let Some(previous_sibling) = previous_sibling {
if let Some(first_child) = first_child { if let Some(first_child) = first_child {
previous_sibling.next_sibling.set(Some(first_child)); previous_sibling.next_sibling.set(Some(first_child));
first_child.previous_sibling.set(Some(previous_sibling));
} else { } else {
previous_sibling.next_sibling.set(next_sibling); previous_sibling.next_sibling.set(next_sibling);
} }

View File

@ -1,6 +1,6 @@
use std::collections::{HashMap, HashSet}; use std::collections::{HashMap, HashSet};
use crate::sanitizer::SanitizerConfig; use crate::sanitizer::{ContentWhitespace, SanitizerConfig};
lazy_static! { lazy_static! {
pub static ref DEFAULT_CONFIG: SanitizerConfig = SanitizerConfig { pub static ref DEFAULT_CONFIG: SanitizerConfig = SanitizerConfig {
@ -23,5 +23,32 @@ lazy_static! {
local_name!("script"), local_name!("script"),
local_name!("style"), local_name!("style"),
}, },
whitespace_around_unwrapped_content: hashmap! {
local_name!("address") => ContentWhitespace::space_around(),
local_name!("article") => ContentWhitespace::space_around(),
local_name!("aside") => ContentWhitespace::space_around(),
local_name!("blockquote") => ContentWhitespace::space_around(),
local_name!("br") => ContentWhitespace::space_around(),
local_name!("dd") => ContentWhitespace::space_around(),
local_name!("div") => ContentWhitespace::space_around(),
local_name!("dl") => ContentWhitespace::space_around(),
local_name!("footer") => ContentWhitespace::space_around(),
local_name!("h1") => ContentWhitespace::space_around(),
local_name!("h2") => ContentWhitespace::space_around(),
local_name!("h3") => ContentWhitespace::space_around(),
local_name!("h4") => ContentWhitespace::space_around(),
local_name!("h5") => ContentWhitespace::space_around(),
local_name!("h6") => ContentWhitespace::space_around(),
local_name!("header") => ContentWhitespace::space_around(),
local_name!("hgroup") => ContentWhitespace::space_around(),
local_name!("hr") => ContentWhitespace::space_around(),
local_name!("li") => ContentWhitespace::space_around(),
local_name!("nav") => ContentWhitespace::space_around(),
local_name!("ol") => ContentWhitespace::space_around(),
local_name!("p") => ContentWhitespace::space_around(),
local_name!("pre") => ContentWhitespace::space_around(),
local_name!("section") => ContentWhitespace::space_around(),
local_name!("ul") => ContentWhitespace::space_around(),
}
}; };
} }

View File

@ -3,7 +3,7 @@ use std::io::{Error, Read, Write};
use url::{ParseError, Url}; use url::{ParseError, Url};
use html5ever::interface::tree_builder::QuirksMode; use html5ever::interface::tree_builder::QuirksMode;
use html5ever::tendril::{StrTendril, TendrilSink}; use html5ever::tendril::{format_tendril, StrTendril, TendrilSink};
use html5ever::{parse_document, parse_fragment, serialize, Attribute, LocalName, QualName}; use html5ever::{parse_document, parse_fragment, serialize, Attribute, LocalName, QualName};
use crate::arena_dom::{Arena, Node, NodeData, Ref, Sink}; use crate::arena_dom::{Arena, Node, NodeData, Ref, Sink};
@ -31,6 +31,7 @@ pub struct SanitizerConfig {
pub allowed_css_properties: HashSet<CssProperty>, pub allowed_css_properties: HashSet<CssProperty>,
pub allow_css_comments: bool, pub allow_css_comments: bool,
pub remove_contents_when_unwrapped: HashSet<LocalName>, pub remove_contents_when_unwrapped: HashSet<LocalName>,
pub whitespace_around_unwrapped_content: HashMap<LocalName, ContentWhitespace<'static>>,
} }
#[derive(Debug, PartialEq, Eq, Hash, Clone)] #[derive(Debug, PartialEq, Eq, Hash, Clone)]
@ -39,6 +40,21 @@ pub enum Protocol<'a> {
Relative, Relative,
} }
/// Padding strings placed on either side of text content.
///
/// Values of this type are stored per element name in
/// `SanitizerConfig::whitespace_around_unwrapped_content` and are applied to
/// text that is left behind when its wrapping element is unwrapped.
#[derive(Debug, Clone)]
pub struct ContentWhitespace<'a> {
    /// Text emitted immediately before the content.
    before: &'a str,
    /// Text emitted immediately after the content.
    after: &'a str,
}

impl<'a> ContentWhitespace<'a> {
    /// Returns padding made of exactly one ASCII space on each side.
    pub fn space_around() -> Self {
        const SINGLE_SPACE: &str = " ";
        Self {
            before: SINGLE_SPACE,
            after: SINGLE_SPACE,
        }
    }
}
impl<'arena> Sanitizer<'arena> { impl<'arena> Sanitizer<'arena> {
pub fn new( pub fn new(
config: &'arena SanitizerConfig, config: &'arena SanitizerConfig,
@ -109,6 +125,7 @@ impl<'arena> Sanitizer<'arena> {
if self.should_remove_contents_when_unwrapped(node) { if self.should_remove_contents_when_unwrapped(node) {
node.detach(); node.detach();
} else if let Some(unwrapped_node) = node.unwrap() { } else if let Some(unwrapped_node) = node.unwrap() {
self.add_unwrapped_content_whitespace(node, unwrapped_node);
self.traverse(unwrapped_node); self.traverse(unwrapped_node);
} }
@ -347,6 +364,31 @@ impl<'arena> Sanitizer<'arena> {
} }
} }
} }
/// Pads the text of `unwrapped_node` with the whitespace configured for the
/// element that used to wrap it.
///
/// * `wrapping_node` - the node that was unwrapped; only elements are
///   considered, looked up by local name in
///   `config.whitespace_around_unwrapped_content`.
/// * `unwrapped_node` - the node handed back by the unwrap; it is modified
///   only when it is a text node.
///
/// Nothing happens when the wrapper is not an element, when its name has no
/// configured whitespace, or when `unwrapped_node` is not text.
fn add_unwrapped_content_whitespace(
    &self,
    wrapping_node: Ref<'arena>,
    unwrapped_node: Ref<'arena>,
) {
    // Only element wrappers carry a name that can be looked up.
    let wrapper_name = match wrapping_node.data {
        NodeData::Element { ref name, .. } => name,
        _ => return,
    };

    // Bail out unless whitespace was configured for this element.
    let padding = match self
        .config
        .whitespace_around_unwrapped_content
        .get(&wrapper_name.local)
    {
        Some(padding) => padding,
        None => return,
    };

    // Both the leading and trailing padding are applied to this one text node.
    if let NodeData::Text { ref contents, .. } = unwrapped_node.data {
        contents.replace_with(|text| {
            format_tendril!("{}{}{}", padding.before, text, padding.after)
        });
    }
}
} }
#[cfg(test)] #[cfg(test)]
@ -390,6 +432,7 @@ mod test {
allowed_css_properties: HashSet::new(), allowed_css_properties: HashSet::new(),
allow_css_comments: false, allow_css_comments: false,
remove_contents_when_unwrapped: HashSet::new(), remove_contents_when_unwrapped: HashSet::new(),
whitespace_around_unwrapped_content: HashMap::new(),
}; };
} }
@ -779,4 +822,26 @@ mod test {
"<!DOCTYPE html><html><div></div></html>" "<!DOCTYPE html><html><div></div></html>"
); );
} }
/// Unwrapping a disallowed `<span>` that has configured whitespace should pad
/// the text it leaves behind inside the surrounding allowed `<div>`.
#[test]
fn add_unwrapped_content_whitespace() {
    // Allow only <html> and <div>, so every <span> gets unwrapped, and
    // register single-space padding for unwrapped <span> content.
    let mut config = EMPTY_CONFIG.clone();
    config
        .allowed_elements
        .extend(vec![local_name!("html"), local_name!("div")]);
    config
        .whitespace_around_unwrapped_content
        .insert(local_name!("span"), ContentWhitespace::space_around());
    let sanitizer = Sanitizer::new(&config, vec![]);

    let mut reader =
        MockRead::new("<div>div-1<span>content-1</span><span>content-2</span>div-2</div>");
    let mut sanitized = vec![];
    sanitizer
        .sanitize_fragment(&mut reader, &mut sanitized)
        .unwrap();

    // NOTE(review): each span adds both a leading and a trailing space, so two
    // adjacent spaces might be expected between "content-1" and "content-2".
    // Confirm this literal was not whitespace-collapsed when it was copied.
    let rendered = str::from_utf8(&sanitized).unwrap();
    assert_eq!(
        rendered,
        "<html><div>div-1 content-1 content-2 div-2</div></html>"
    );
}
} }