Whitespace for unwrapped content & fix unwrap bug

This commit is contained in:
Tyler Hallada 2020-04-25 21:21:36 -04:00
parent fce50554a3
commit 00593d3c58
3 changed files with 96 additions and 2 deletions

View File

@ -123,6 +123,7 @@ impl<'arena> Node<'arena> {
if let Some(next_sibling) = next_sibling {
if let Some(last_child) = last_child {
next_sibling.previous_sibling.set(Some(last_child));
last_child.next_sibling.set(Some(next_sibling));
} else {
next_sibling.previous_sibling.set(previous_sibling);
}
@ -138,6 +139,7 @@ impl<'arena> Node<'arena> {
if let Some(previous_sibling) = previous_sibling {
if let Some(first_child) = first_child {
previous_sibling.next_sibling.set(Some(first_child));
first_child.previous_sibling.set(Some(previous_sibling));
} else {
previous_sibling.next_sibling.set(next_sibling);
}

View File

@ -1,6 +1,6 @@
use std::collections::{HashMap, HashSet};
use crate::sanitizer::SanitizerConfig;
use crate::sanitizer::{ContentWhitespace, SanitizerConfig};
lazy_static! {
pub static ref DEFAULT_CONFIG: SanitizerConfig = SanitizerConfig {
@ -23,5 +23,32 @@ lazy_static! {
local_name!("script"),
local_name!("style"),
},
whitespace_around_unwrapped_content: hashmap! {
local_name!("address") => ContentWhitespace::space_around(),
local_name!("article") => ContentWhitespace::space_around(),
local_name!("aside") => ContentWhitespace::space_around(),
local_name!("blockquote") => ContentWhitespace::space_around(),
local_name!("br") => ContentWhitespace::space_around(),
local_name!("dd") => ContentWhitespace::space_around(),
local_name!("div") => ContentWhitespace::space_around(),
local_name!("dl") => ContentWhitespace::space_around(),
local_name!("footer") => ContentWhitespace::space_around(),
local_name!("h1") => ContentWhitespace::space_around(),
local_name!("h2") => ContentWhitespace::space_around(),
local_name!("h3") => ContentWhitespace::space_around(),
local_name!("h4") => ContentWhitespace::space_around(),
local_name!("h5") => ContentWhitespace::space_around(),
local_name!("h6") => ContentWhitespace::space_around(),
local_name!("header") => ContentWhitespace::space_around(),
local_name!("hgroup") => ContentWhitespace::space_around(),
local_name!("hr") => ContentWhitespace::space_around(),
local_name!("li") => ContentWhitespace::space_around(),
local_name!("nav") => ContentWhitespace::space_around(),
local_name!("ol") => ContentWhitespace::space_around(),
local_name!("p") => ContentWhitespace::space_around(),
local_name!("pre") => ContentWhitespace::space_around(),
local_name!("section") => ContentWhitespace::space_around(),
local_name!("ul") => ContentWhitespace::space_around(),
}
};
}

View File

@ -3,7 +3,7 @@ use std::io::{Error, Read, Write};
use url::{ParseError, Url};
use html5ever::interface::tree_builder::QuirksMode;
use html5ever::tendril::{StrTendril, TendrilSink};
use html5ever::tendril::{format_tendril, StrTendril, TendrilSink};
use html5ever::{parse_document, parse_fragment, serialize, Attribute, LocalName, QualName};
use crate::arena_dom::{Arena, Node, NodeData, Ref, Sink};
@ -31,6 +31,7 @@ pub struct SanitizerConfig {
pub allowed_css_properties: HashSet<CssProperty>,
pub allow_css_comments: bool,
pub remove_contents_when_unwrapped: HashSet<LocalName>,
pub whitespace_around_unwrapped_content: HashMap<LocalName, ContentWhitespace<'static>>,
}
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
@ -39,6 +40,21 @@ pub enum Protocol<'a> {
Relative,
}
/// A pair of strings to place immediately before and after a piece of content.
///
/// The sanitizer looks these up per element name in
/// `SanitizerConfig::whitespace_around_unwrapped_content` and wraps the text
/// of an unwrapped element with them.
#[derive(Debug, Clone)]
pub struct ContentWhitespace<'a> {
    /// Text prepended to the content.
    before: &'a str,
    /// Text appended to the content.
    after: &'a str,
}

impl<'a> ContentWhitespace<'a> {
    /// Builds a `ContentWhitespace` that puts a single space on each side of
    /// the content.
    pub fn space_around() -> Self {
        Self {
            before: " ",
            after: " ",
        }
    }
}
impl<'arena> Sanitizer<'arena> {
pub fn new(
config: &'arena SanitizerConfig,
@ -109,6 +125,7 @@ impl<'arena> Sanitizer<'arena> {
if self.should_remove_contents_when_unwrapped(node) {
node.detach();
} else if let Some(unwrapped_node) = node.unwrap() {
self.add_unwrapped_content_whitespace(node, unwrapped_node);
self.traverse(unwrapped_node);
}
@ -347,6 +364,31 @@ impl<'arena> Sanitizer<'arena> {
}
}
}
/// Surrounds the text of `unwrapped_node` with the whitespace configured for
/// the element that used to wrap it.
///
/// Nothing happens unless all of the following hold:
/// * `wrapping_node` is an element,
/// * its local name has an entry in
///   `config.whitespace_around_unwrapped_content`,
/// * `unwrapped_node` is a text node.
///
/// When they do, the text contents are replaced in place with
/// `before + contents + after`.
fn add_unwrapped_content_whitespace(
    &self,
    wrapping_node: Ref<'arena>,
    unwrapped_node: Ref<'arena>,
) {
    // Only elements carry a name that can be looked up in the config.
    let element_name = match wrapping_node.data {
        NodeData::Element { ref name, .. } => &name.local,
        _ => return,
    };

    // No configured whitespace for this element: leave the content untouched.
    let whitespace = match self
        .config
        .whitespace_around_unwrapped_content
        .get(element_name)
    {
        Some(whitespace) => whitespace,
        None => return,
    };

    // Only text nodes are padded; any other kind of node is left as-is.
    if let NodeData::Text { ref contents, .. } = unwrapped_node.data {
        contents.replace_with(|text| {
            format_tendril!("{}{}{}", whitespace.before, text, whitespace.after)
        });
    }
}
}
#[cfg(test)]
@ -390,6 +432,7 @@ mod test {
allowed_css_properties: HashSet::new(),
allow_css_comments: false,
remove_contents_when_unwrapped: HashSet::new(),
whitespace_around_unwrapped_content: HashMap::new(),
};
}
@ -779,4 +822,26 @@ mod test {
"<!DOCTYPE html><html><div></div></html>"
);
}
/// Sanitizes a fragment containing two `<span>` elements while whitespace is
/// configured for `span`, and checks the serialized result.
#[test]
fn add_unwrapped_content_whitespace() {
    // `html` and `div` are allowed; `span` is deliberately not added to the
    // allowed elements (the expected output below contains no `<span>` tags).
    let mut config = EMPTY_CONFIG.clone();
    config
        .allowed_elements
        .extend(vec![local_name!("html"), local_name!("div")]);
    config
        .whitespace_around_unwrapped_content
        .insert(local_name!("span"), ContentWhitespace::space_around());

    let sanitizer = Sanitizer::new(&config, vec![]);
    let mut input =
        MockRead::new("<div>div-1<span>content-1</span><span>content-2</span>div-2</div>");
    let mut sanitized = vec![];
    sanitizer
        .sanitize_fragment(&mut input, &mut sanitized)
        .unwrap();

    let rendered = str::from_utf8(&sanitized).unwrap();
    assert_eq!(
        rendered,
        "<html><div>div-1 content-1 content-2 div-2</div></html>"
    );
}
}