Add whitespace around unwrapped content & fix unwrap bug
This commit is contained in:
parent
fce50554a3
commit
00593d3c58
src
@ -123,6 +123,7 @@ impl<'arena> Node<'arena> {
|
||||
if let Some(next_sibling) = next_sibling {
|
||||
if let Some(last_child) = last_child {
|
||||
next_sibling.previous_sibling.set(Some(last_child));
|
||||
last_child.next_sibling.set(Some(next_sibling));
|
||||
} else {
|
||||
next_sibling.previous_sibling.set(previous_sibling);
|
||||
}
|
||||
@ -138,6 +139,7 @@ impl<'arena> Node<'arena> {
|
||||
if let Some(previous_sibling) = previous_sibling {
|
||||
if let Some(first_child) = first_child {
|
||||
previous_sibling.next_sibling.set(Some(first_child));
|
||||
first_child.previous_sibling.set(Some(previous_sibling));
|
||||
} else {
|
||||
previous_sibling.next_sibling.set(next_sibling);
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use crate::sanitizer::SanitizerConfig;
|
||||
use crate::sanitizer::{ContentWhitespace, SanitizerConfig};
|
||||
|
||||
lazy_static! {
|
||||
pub static ref DEFAULT_CONFIG: SanitizerConfig = SanitizerConfig {
|
||||
@ -23,5 +23,32 @@ lazy_static! {
|
||||
local_name!("script"),
|
||||
local_name!("style"),
|
||||
},
|
||||
whitespace_around_unwrapped_content: hashmap! {
|
||||
local_name!("address") => ContentWhitespace::space_around(),
|
||||
local_name!("article") => ContentWhitespace::space_around(),
|
||||
local_name!("aside") => ContentWhitespace::space_around(),
|
||||
local_name!("blockquote") => ContentWhitespace::space_around(),
|
||||
local_name!("br") => ContentWhitespace::space_around(),
|
||||
local_name!("dd") => ContentWhitespace::space_around(),
|
||||
local_name!("div") => ContentWhitespace::space_around(),
|
||||
local_name!("dl") => ContentWhitespace::space_around(),
|
||||
local_name!("footer") => ContentWhitespace::space_around(),
|
||||
local_name!("h1") => ContentWhitespace::space_around(),
|
||||
local_name!("h2") => ContentWhitespace::space_around(),
|
||||
local_name!("h3") => ContentWhitespace::space_around(),
|
||||
local_name!("h4") => ContentWhitespace::space_around(),
|
||||
local_name!("h5") => ContentWhitespace::space_around(),
|
||||
local_name!("h6") => ContentWhitespace::space_around(),
|
||||
local_name!("header") => ContentWhitespace::space_around(),
|
||||
local_name!("hgroup") => ContentWhitespace::space_around(),
|
||||
local_name!("hr") => ContentWhitespace::space_around(),
|
||||
local_name!("li") => ContentWhitespace::space_around(),
|
||||
local_name!("nav") => ContentWhitespace::space_around(),
|
||||
local_name!("ol") => ContentWhitespace::space_around(),
|
||||
local_name!("p") => ContentWhitespace::space_around(),
|
||||
local_name!("pre") => ContentWhitespace::space_around(),
|
||||
local_name!("section") => ContentWhitespace::space_around(),
|
||||
local_name!("ul") => ContentWhitespace::space_around(),
|
||||
}
|
||||
};
|
||||
}
|
||||
|
@ -3,7 +3,7 @@ use std::io::{Error, Read, Write};
|
||||
use url::{ParseError, Url};
|
||||
|
||||
use html5ever::interface::tree_builder::QuirksMode;
|
||||
use html5ever::tendril::{StrTendril, TendrilSink};
|
||||
use html5ever::tendril::{format_tendril, StrTendril, TendrilSink};
|
||||
use html5ever::{parse_document, parse_fragment, serialize, Attribute, LocalName, QualName};
|
||||
|
||||
use crate::arena_dom::{Arena, Node, NodeData, Ref, Sink};
|
||||
@ -31,6 +31,7 @@ pub struct SanitizerConfig {
|
||||
pub allowed_css_properties: HashSet<CssProperty>,
|
||||
pub allow_css_comments: bool,
|
||||
pub remove_contents_when_unwrapped: HashSet<LocalName>,
|
||||
pub whitespace_around_unwrapped_content: HashMap<LocalName, ContentWhitespace<'static>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
|
||||
@ -39,6 +40,21 @@ pub enum Protocol<'a> {
|
||||
Relative,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
/// A pair of strings placed immediately before and after the text of an
/// unwrapped node (see `Sanitizer::add_unwrapped_content_whitespace`).
pub struct ContentWhitespace<'a> {
|
||||
// Text prepended to the unwrapped content.
before: &'a str,
|
||||
// Text appended to the unwrapped content.
after: &'a str,
|
||||
}
|
||||
|
||||
impl<'a> ContentWhitespace<'a> {
|
||||
/// Returns a `ContentWhitespace` whose `before` and `after` are each a
/// single space character.
pub fn space_around() -> ContentWhitespace<'a> {
|
||||
ContentWhitespace {
|
||||
before: " ",
|
||||
after: " ",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'arena> Sanitizer<'arena> {
|
||||
pub fn new(
|
||||
config: &'arena SanitizerConfig,
|
||||
@ -109,6 +125,7 @@ impl<'arena> Sanitizer<'arena> {
|
||||
if self.should_remove_contents_when_unwrapped(node) {
|
||||
node.detach();
|
||||
} else if let Some(unwrapped_node) = node.unwrap() {
|
||||
self.add_unwrapped_content_whitespace(node, unwrapped_node);
|
||||
self.traverse(unwrapped_node);
|
||||
}
|
||||
|
||||
@ -347,6 +364,31 @@ impl<'arena> Sanitizer<'arena> {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Pads the text of `unwrapped_node` with the whitespace configured for the
/// element that used to wrap it.
///
/// Nothing happens unless all of the following hold:
/// * `wrapping_node` is an element,
/// * its local name has an entry in
///   `config.whitespace_around_unwrapped_content`, and
/// * `unwrapped_node` is a text node.
///
/// When they do, the text contents become `before + current text + after`.
fn add_unwrapped_content_whitespace(
|
||||
&self,
|
||||
wrapping_node: Ref<'arena>,
|
||||
unwrapped_node: Ref<'arena>,
|
||||
) {
|
||||
// Only element nodes carry a name that can be looked up in the config.
if let NodeData::Element { ref name, .. } = wrapping_node.data {
|
||||
// Fetch the whitespace configured for this element's local name, if any.
if let Some(content_whitespace) = self
|
||||
.config
|
||||
.whitespace_around_unwrapped_content
|
||||
.get(&name.local)
|
||||
{
|
||||
// NOTE(review): only a text node passed as `unwrapped_node` is padded;
// any other node kind gets no whitespace — confirm this is intended.
if let NodeData::Text { ref contents, .. } = unwrapped_node.data {
|
||||
// Rebuild the text as before + existing contents + after.
contents.replace_with(|current| {
|
||||
format_tendril!(
|
||||
"{}{}{}",
|
||||
content_whitespace.before,
|
||||
current,
|
||||
content_whitespace.after
|
||||
)
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@ -390,6 +432,7 @@ mod test {
|
||||
allowed_css_properties: HashSet::new(),
|
||||
allow_css_comments: false,
|
||||
remove_contents_when_unwrapped: HashSet::new(),
|
||||
whitespace_around_unwrapped_content: HashMap::new(),
|
||||
};
|
||||
}
|
||||
|
||||
@ -779,4 +822,26 @@ mod test {
|
||||
"<!DOCTYPE html><html><div></div></html>"
|
||||
);
|
||||
}
|
||||
|
||||
// Checks that text from `span` elements, which are configured with
// space-around whitespace but are not in `allowed_elements`, appears in the
// sanitized output separated from the neighbouring text by spaces.
#[test]
|
||||
fn add_unwrapped_content_whitespace() {
|
||||
let mut unwrapped_whitespace_config = EMPTY_CONFIG.clone();
|
||||
// Only `html` and `div` are allowed; `span` is deliberately left out.
unwrapped_whitespace_config
|
||||
.allowed_elements
|
||||
.extend(vec![local_name!("html"), local_name!("div")]);
|
||||
// Configure a single space before and after content that came from a `span`.
unwrapped_whitespace_config
|
||||
.whitespace_around_unwrapped_content
|
||||
.insert(local_name!("span"), ContentWhitespace::space_around());
|
||||
let sanitizer = Sanitizer::new(&unwrapped_whitespace_config, vec![]);
|
||||
let mut mock_data =
|
||||
MockRead::new("<div>div-1<span>content-1</span><span>content-2</span>div-2</div>");
|
||||
let mut output = vec![];
|
||||
sanitizer
|
||||
.sanitize_fragment(&mut mock_data, &mut output)
|
||||
.unwrap();
|
||||
// NOTE(review): each span is padded on both sides, so confirm the expected
// spacing between `content-1` and `content-2` in the literal below.
assert_eq!(
|
||||
str::from_utf8(&output).unwrap(),
|
||||
"<html><div>div-1 content-1 content-2 div-2</div></html>"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user