Whitespace for unwrapped content & fix unwrap bug
This commit is contained in:
parent
fce50554a3
commit
00593d3c58
@ -123,6 +123,7 @@ impl<'arena> Node<'arena> {
|
|||||||
if let Some(next_sibling) = next_sibling {
|
if let Some(next_sibling) = next_sibling {
|
||||||
if let Some(last_child) = last_child {
|
if let Some(last_child) = last_child {
|
||||||
next_sibling.previous_sibling.set(Some(last_child));
|
next_sibling.previous_sibling.set(Some(last_child));
|
||||||
|
last_child.next_sibling.set(Some(next_sibling));
|
||||||
} else {
|
} else {
|
||||||
next_sibling.previous_sibling.set(previous_sibling);
|
next_sibling.previous_sibling.set(previous_sibling);
|
||||||
}
|
}
|
||||||
@ -138,6 +139,7 @@ impl<'arena> Node<'arena> {
|
|||||||
if let Some(previous_sibling) = previous_sibling {
|
if let Some(previous_sibling) = previous_sibling {
|
||||||
if let Some(first_child) = first_child {
|
if let Some(first_child) = first_child {
|
||||||
previous_sibling.next_sibling.set(Some(first_child));
|
previous_sibling.next_sibling.set(Some(first_child));
|
||||||
|
first_child.previous_sibling.set(Some(previous_sibling));
|
||||||
} else {
|
} else {
|
||||||
previous_sibling.next_sibling.set(next_sibling);
|
previous_sibling.next_sibling.set(next_sibling);
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
use std::collections::{HashMap, HashSet};
|
use std::collections::{HashMap, HashSet};
|
||||||
|
|
||||||
use crate::sanitizer::SanitizerConfig;
|
use crate::sanitizer::{ContentWhitespace, SanitizerConfig};
|
||||||
|
|
||||||
lazy_static! {
|
lazy_static! {
|
||||||
pub static ref DEFAULT_CONFIG: SanitizerConfig = SanitizerConfig {
|
pub static ref DEFAULT_CONFIG: SanitizerConfig = SanitizerConfig {
|
||||||
@ -23,5 +23,32 @@ lazy_static! {
|
|||||||
local_name!("script"),
|
local_name!("script"),
|
||||||
local_name!("style"),
|
local_name!("style"),
|
||||||
},
|
},
|
||||||
|
whitespace_around_unwrapped_content: hashmap! {
|
||||||
|
local_name!("address") => ContentWhitespace::space_around(),
|
||||||
|
local_name!("article") => ContentWhitespace::space_around(),
|
||||||
|
local_name!("aside") => ContentWhitespace::space_around(),
|
||||||
|
local_name!("blockquote") => ContentWhitespace::space_around(),
|
||||||
|
local_name!("br") => ContentWhitespace::space_around(),
|
||||||
|
local_name!("dd") => ContentWhitespace::space_around(),
|
||||||
|
local_name!("div") => ContentWhitespace::space_around(),
|
||||||
|
local_name!("dl") => ContentWhitespace::space_around(),
|
||||||
|
local_name!("footer") => ContentWhitespace::space_around(),
|
||||||
|
local_name!("h1") => ContentWhitespace::space_around(),
|
||||||
|
local_name!("h2") => ContentWhitespace::space_around(),
|
||||||
|
local_name!("h3") => ContentWhitespace::space_around(),
|
||||||
|
local_name!("h4") => ContentWhitespace::space_around(),
|
||||||
|
local_name!("h5") => ContentWhitespace::space_around(),
|
||||||
|
local_name!("h6") => ContentWhitespace::space_around(),
|
||||||
|
local_name!("header") => ContentWhitespace::space_around(),
|
||||||
|
local_name!("hgroup") => ContentWhitespace::space_around(),
|
||||||
|
local_name!("hr") => ContentWhitespace::space_around(),
|
||||||
|
local_name!("li") => ContentWhitespace::space_around(),
|
||||||
|
local_name!("nav") => ContentWhitespace::space_around(),
|
||||||
|
local_name!("ol") => ContentWhitespace::space_around(),
|
||||||
|
local_name!("p") => ContentWhitespace::space_around(),
|
||||||
|
local_name!("pre") => ContentWhitespace::space_around(),
|
||||||
|
local_name!("section") => ContentWhitespace::space_around(),
|
||||||
|
local_name!("ul") => ContentWhitespace::space_around(),
|
||||||
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -3,7 +3,7 @@ use std::io::{Error, Read, Write};
|
|||||||
use url::{ParseError, Url};
|
use url::{ParseError, Url};
|
||||||
|
|
||||||
use html5ever::interface::tree_builder::QuirksMode;
|
use html5ever::interface::tree_builder::QuirksMode;
|
||||||
use html5ever::tendril::{StrTendril, TendrilSink};
|
use html5ever::tendril::{format_tendril, StrTendril, TendrilSink};
|
||||||
use html5ever::{parse_document, parse_fragment, serialize, Attribute, LocalName, QualName};
|
use html5ever::{parse_document, parse_fragment, serialize, Attribute, LocalName, QualName};
|
||||||
|
|
||||||
use crate::arena_dom::{Arena, Node, NodeData, Ref, Sink};
|
use crate::arena_dom::{Arena, Node, NodeData, Ref, Sink};
|
||||||
@ -31,6 +31,7 @@ pub struct SanitizerConfig {
|
|||||||
pub allowed_css_properties: HashSet<CssProperty>,
|
pub allowed_css_properties: HashSet<CssProperty>,
|
||||||
pub allow_css_comments: bool,
|
pub allow_css_comments: bool,
|
||||||
pub remove_contents_when_unwrapped: HashSet<LocalName>,
|
pub remove_contents_when_unwrapped: HashSet<LocalName>,
|
||||||
|
pub whitespace_around_unwrapped_content: HashMap<LocalName, ContentWhitespace<'static>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
|
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
|
||||||
@ -39,6 +40,21 @@ pub enum Protocol<'a> {
|
|||||||
Relative,
|
Relative,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct ContentWhitespace<'a> {
|
||||||
|
before: &'a str,
|
||||||
|
after: &'a str,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> ContentWhitespace<'a> {
|
||||||
|
pub fn space_around() -> ContentWhitespace<'a> {
|
||||||
|
ContentWhitespace {
|
||||||
|
before: " ",
|
||||||
|
after: " ",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<'arena> Sanitizer<'arena> {
|
impl<'arena> Sanitizer<'arena> {
|
||||||
pub fn new(
|
pub fn new(
|
||||||
config: &'arena SanitizerConfig,
|
config: &'arena SanitizerConfig,
|
||||||
@ -109,6 +125,7 @@ impl<'arena> Sanitizer<'arena> {
|
|||||||
if self.should_remove_contents_when_unwrapped(node) {
|
if self.should_remove_contents_when_unwrapped(node) {
|
||||||
node.detach();
|
node.detach();
|
||||||
} else if let Some(unwrapped_node) = node.unwrap() {
|
} else if let Some(unwrapped_node) = node.unwrap() {
|
||||||
|
self.add_unwrapped_content_whitespace(node, unwrapped_node);
|
||||||
self.traverse(unwrapped_node);
|
self.traverse(unwrapped_node);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -347,6 +364,31 @@ impl<'arena> Sanitizer<'arena> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn add_unwrapped_content_whitespace(
|
||||||
|
&self,
|
||||||
|
wrapping_node: Ref<'arena>,
|
||||||
|
unwrapped_node: Ref<'arena>,
|
||||||
|
) {
|
||||||
|
if let NodeData::Element { ref name, .. } = wrapping_node.data {
|
||||||
|
if let Some(content_whitespace) = self
|
||||||
|
.config
|
||||||
|
.whitespace_around_unwrapped_content
|
||||||
|
.get(&name.local)
|
||||||
|
{
|
||||||
|
if let NodeData::Text { ref contents, .. } = unwrapped_node.data {
|
||||||
|
contents.replace_with(|current| {
|
||||||
|
format_tendril!(
|
||||||
|
"{}{}{}",
|
||||||
|
content_whitespace.before,
|
||||||
|
current,
|
||||||
|
content_whitespace.after
|
||||||
|
)
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
@ -390,6 +432,7 @@ mod test {
|
|||||||
allowed_css_properties: HashSet::new(),
|
allowed_css_properties: HashSet::new(),
|
||||||
allow_css_comments: false,
|
allow_css_comments: false,
|
||||||
remove_contents_when_unwrapped: HashSet::new(),
|
remove_contents_when_unwrapped: HashSet::new(),
|
||||||
|
whitespace_around_unwrapped_content: HashMap::new(),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -779,4 +822,26 @@ mod test {
|
|||||||
"<!DOCTYPE html><html><div></div></html>"
|
"<!DOCTYPE html><html><div></div></html>"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn add_unwrapped_content_whitespace() {
|
||||||
|
let mut unwrapped_whitespace_config = EMPTY_CONFIG.clone();
|
||||||
|
unwrapped_whitespace_config
|
||||||
|
.allowed_elements
|
||||||
|
.extend(vec![local_name!("html"), local_name!("div")]);
|
||||||
|
unwrapped_whitespace_config
|
||||||
|
.whitespace_around_unwrapped_content
|
||||||
|
.insert(local_name!("span"), ContentWhitespace::space_around());
|
||||||
|
let sanitizer = Sanitizer::new(&unwrapped_whitespace_config, vec![]);
|
||||||
|
let mut mock_data =
|
||||||
|
MockRead::new("<div>div-1<span>content-1</span><span>content-2</span>div-2</div>");
|
||||||
|
let mut output = vec![];
|
||||||
|
sanitizer
|
||||||
|
.sanitize_fragment(&mut mock_data, &mut output)
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(
|
||||||
|
str::from_utf8(&output).unwrap(),
|
||||||
|
"<html><div>div-1 content-1 content-2 div-2</div></html>"
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user