Replace Transformer with Sanitizer
Tried to keep them separate, but it was starting to get too messy. Users can just pass custom transformers as the second constructor argument. Plus, this more closely matches the Ruby Sanitize API. Changed the config to be a struct. So far, I have only made a `DEFAULT_CONFIG`; all of the others still need to be made as well. Started working on a `remove_contents_when_unwrapped` config option that will indicate whether to remove the contents of elements that are unwrapped.
This commit is contained in:
@@ -158,8 +158,6 @@ impl<'arena> Node<'arena> {
|
|||||||
|
|
||||||
if let Some(first_child) = first_child {
|
if let Some(first_child) = first_child {
|
||||||
Some(first_child)
|
Some(first_child)
|
||||||
} else if let Some(next_sibling) = next_sibling {
|
|
||||||
Some(next_sibling)
|
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
|
|||||||
25
src/config/default.rs
Normal file
25
src/config/default.rs
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
use std::collections::{HashMap, HashSet};
|
||||||
|
|
||||||
|
use crate::sanitizer::SanitizerConfig;
|
||||||
|
|
||||||
|
lazy_static! {
    /// The most restrictive configuration: comments are disallowed and every
    /// allow-list is empty, so no element name is in `allowed_elements`.
    ///
    /// Elements named in `remove_contents_when_unwrapped` are listed so that
    /// their contents are dropped along with the element itself.
    pub static ref DEFAULT_CONFIG: SanitizerConfig = SanitizerConfig {
        // Nothing is allowed through.
        allow_comments: false,
        allowed_elements: HashSet::new(),
        allowed_attributes: HashSet::new(),
        allowed_attributes_per_element: HashMap::new(),

        // Nothing is added.
        add_attributes: HashMap::new(),
        add_attributes_per_element: HashMap::new(),

        // No protocol or CSS allow-lists.
        allowed_protocols: HashMap::new(),
        allowed_css_at_rules: HashSet::new(),
        allowed_css_properties: HashSet::new(),

        // Elements whose contents are removed together with the element.
        remove_contents_when_unwrapped: hashset! {
            local_name!("iframe"),
            local_name!("noembed"),
            local_name!("noframes"),
            local_name!("noscript"),
            local_name!("script"),
            local_name!("style"),
        },
    };
}
|
||||||
@@ -1,3 +1,4 @@
|
|||||||
|
pub mod default;
|
||||||
pub mod basic;
|
pub mod basic;
|
||||||
pub mod relaxed;
|
pub mod relaxed;
|
||||||
pub mod permissive;
|
pub mod permissive;
|
||||||
|
|||||||
29
src/main.rs
29
src/main.rs
@@ -11,11 +11,10 @@ extern crate string_cache;
|
|||||||
extern crate typed_arena;
|
extern crate typed_arena;
|
||||||
|
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
use std::default::Default;
|
|
||||||
use std::io;
|
use std::io;
|
||||||
|
|
||||||
use html5ever::tendril::StrTendril;
|
use html5ever::tendril::StrTendril;
|
||||||
use html5ever::{serialize, Attribute, LocalName, QualName};
|
use html5ever::{Attribute, LocalName, QualName};
|
||||||
|
|
||||||
use url::{ParseError, Url};
|
use url::{ParseError, Url};
|
||||||
|
|
||||||
@@ -31,19 +30,20 @@ mod css_at_rule {
|
|||||||
mod arena_dom;
|
mod arena_dom;
|
||||||
mod config;
|
mod config;
|
||||||
mod css_parser;
|
mod css_parser;
|
||||||
mod transformer;
|
mod sanitizer;
|
||||||
|
|
||||||
use arena_dom::{create_element, Arena, NodeData, Ref};
|
use arena_dom::{create_element, Arena, NodeData, Ref};
|
||||||
use config::permissive::{ADD_ATTRIBUTES, ALL_ATTRIBUTES, ATTRIBUTES, ELEMENTS, PROTOCOLS};
|
use config::permissive::{ADD_ATTRIBUTES, ALL_ATTRIBUTES, ATTRIBUTES, PROTOCOLS};
|
||||||
use config::relaxed::{CSS_AT_RULES, CSS_PROPERTIES};
|
use config::relaxed::{CSS_AT_RULES, CSS_PROPERTIES};
|
||||||
|
use config::default::DEFAULT_CONFIG;
|
||||||
use css_at_rule::CssAtRule;
|
use css_at_rule::CssAtRule;
|
||||||
use css_parser::{parse_css_style_attribute, parse_css_stylesheet, CssRule};
|
use css_parser::{parse_css_style_attribute, parse_css_stylesheet, CssRule};
|
||||||
use css_property::CssProperty;
|
use css_property::CssProperty;
|
||||||
use transformer::Transformer;
|
use sanitizer::Sanitizer;
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
let transformer = Transformer::new(
|
let sanitizer = Sanitizer::new(
|
||||||
&should_unwrap_node,
|
&DEFAULT_CONFIG,
|
||||||
vec![
|
vec![
|
||||||
&sanitize_style_tag_css,
|
&sanitize_style_tag_css,
|
||||||
&sanitize_style_attribute_css,
|
&sanitize_style_attribute_css,
|
||||||
@@ -53,9 +53,7 @@ fn main() {
|
|||||||
&add_single_elements_around_ul,
|
&add_single_elements_around_ul,
|
||||||
],
|
],
|
||||||
);
|
);
|
||||||
let root = transformer.parse_fragment(&mut io::stdin()).unwrap();
|
sanitizer.sanitize_fragment(&mut io::stdin(), &mut io::stdout()).unwrap();
|
||||||
transformer.traverse(root);
|
|
||||||
serialize(&mut io::stdout(), root, Default::default()).expect("serialization failed")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn css_rules_to_string(rules: Vec<CssRule>) -> String {
|
fn css_rules_to_string(rules: Vec<CssRule>) -> String {
|
||||||
@@ -236,14 +234,3 @@ fn add_single_elements_around_ul<'arena>(node: Ref<'arena>, arena: Arena<'arena>
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn should_unwrap_node(node: Ref) -> bool {
|
|
||||||
match node.data {
|
|
||||||
NodeData::Document
|
|
||||||
| NodeData::Doctype { .. }
|
|
||||||
| NodeData::Text { .. }
|
|
||||||
| NodeData::ProcessingInstruction { .. } => false,
|
|
||||||
NodeData::Comment { .. } => true,
|
|
||||||
NodeData::Element { ref name, .. } => !ELEMENTS.contains(&name.local),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
339
src/sanitizer.rs
Normal file
339
src/sanitizer.rs
Normal file
@@ -0,0 +1,339 @@
|
|||||||
|
use std::collections::{HashMap, HashSet};
|
||||||
|
use std::io::{Error, Read, Write};
|
||||||
|
|
||||||
|
use html5ever::interface::tree_builder::QuirksMode;
|
||||||
|
use html5ever::tendril::TendrilSink;
|
||||||
|
use html5ever::{parse_document, parse_fragment, serialize, LocalName, QualName};
|
||||||
|
|
||||||
|
use crate::arena_dom::{Arena, Node, NodeData, Ref, Sink};
|
||||||
|
use crate::css_at_rule::CssAtRule;
|
||||||
|
use crate::css_property::CssProperty;
|
||||||
|
|
||||||
|
pub struct Sanitizer<'arena> {
|
||||||
|
arena: typed_arena::Arena<Node<'arena>>,
|
||||||
|
config: &'arena SanitizerConfig,
|
||||||
|
transformers: Vec<&'arena dyn Fn(Ref<'arena>, Arena<'arena>)>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct SanitizerConfig {
|
||||||
|
pub allow_comments: bool,
|
||||||
|
pub allowed_elements: HashSet<LocalName>,
|
||||||
|
pub allowed_attributes: HashSet<LocalName>,
|
||||||
|
pub allowed_attributes_per_element: HashMap<LocalName, HashSet<LocalName>>,
|
||||||
|
pub add_attributes: HashMap<LocalName, &'static str>,
|
||||||
|
pub add_attributes_per_element: HashMap<LocalName, HashMap<LocalName, &'static str>>,
|
||||||
|
pub allowed_protocols: HashMap<LocalName, HashMap<LocalName, HashSet<&'static str>>>,
|
||||||
|
pub allowed_css_at_rules: HashSet<CssAtRule>,
|
||||||
|
pub allowed_css_properties: HashSet<CssProperty>,
|
||||||
|
pub remove_contents_when_unwrapped: HashSet<LocalName>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'arena> Sanitizer<'arena> {
|
||||||
|
pub fn new(
|
||||||
|
config: &'arena SanitizerConfig,
|
||||||
|
transformers: Vec<&'arena dyn Fn(Ref<'arena>, Arena<'arena>)>,
|
||||||
|
) -> Sanitizer<'arena> {
|
||||||
|
Sanitizer {
|
||||||
|
arena: typed_arena::Arena::new(),
|
||||||
|
config,
|
||||||
|
transformers,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn sanitize_fragment(
|
||||||
|
&'arena self,
|
||||||
|
input: &mut impl Read,
|
||||||
|
output: &mut impl Write,
|
||||||
|
) -> Result<(), Error> {
|
||||||
|
let root = self.parse_fragment(input)?;
|
||||||
|
self.traverse(root);
|
||||||
|
serialize(output, root, Default::default())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn sanitize_document(
|
||||||
|
&'arena self,
|
||||||
|
input: &mut impl Read,
|
||||||
|
output: &mut impl Write,
|
||||||
|
) -> Result<(), Error> {
|
||||||
|
let root = self.parse_document(input)?;
|
||||||
|
self.traverse(root);
|
||||||
|
serialize(output, root, Default::default())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parse_document(&'arena self, data: &mut impl Read) -> Result<Ref<'arena>, Error> {
|
||||||
|
let mut bytes = Vec::new();
|
||||||
|
data.read_to_end(&mut bytes)?;
|
||||||
|
let sink = Sink {
|
||||||
|
arena: &self.arena,
|
||||||
|
document: self.arena.alloc(Node::new(NodeData::Document)),
|
||||||
|
quirks_mode: QuirksMode::NoQuirks,
|
||||||
|
};
|
||||||
|
Ok(parse_document(sink, Default::default())
|
||||||
|
.from_utf8()
|
||||||
|
.one(&bytes[..]))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parse_fragment(&'arena self, data: &mut impl Read) -> Result<Ref<'arena>, Error> {
|
||||||
|
let mut bytes = Vec::new();
|
||||||
|
data.read_to_end(&mut bytes)?;
|
||||||
|
let sink = Sink {
|
||||||
|
arena: &self.arena,
|
||||||
|
document: self.arena.alloc(Node::new(NodeData::Document)),
|
||||||
|
quirks_mode: QuirksMode::NoQuirks,
|
||||||
|
};
|
||||||
|
Ok(parse_fragment(
|
||||||
|
sink,
|
||||||
|
Default::default(),
|
||||||
|
QualName::new(None, ns!(html), local_name!("body")),
|
||||||
|
vec![],
|
||||||
|
)
|
||||||
|
.from_utf8()
|
||||||
|
.one(&bytes[..]))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn traverse(&'arena self, node: Ref<'arena>) {
|
||||||
|
println!("{}", &node);
|
||||||
|
if self.should_unwrap_node(node) {
|
||||||
|
let sibling = node.next_sibling.get();
|
||||||
|
|
||||||
|
if self.should_remove_contents_when_unwrapped(node) {
|
||||||
|
node.detach();
|
||||||
|
} else if let Some(unwrapped_node) = node.unwrap() {
|
||||||
|
self.traverse(unwrapped_node);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(sibling) = sibling {
|
||||||
|
self.traverse(sibling);
|
||||||
|
}
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
println!("TRANSFORMING: {}", &node);
|
||||||
|
for transformer in self.transformers.iter() {
|
||||||
|
transformer(node, &self.arena);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(child) = node.first_child.get() {
|
||||||
|
println!("traversing child");
|
||||||
|
self.traverse(child);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(sibling) = node.next_sibling.get() {
|
||||||
|
println!("traversing sibling");
|
||||||
|
self.traverse(sibling);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn should_unwrap_node(&'arena self, node: Ref) -> bool {
|
||||||
|
match node.data {
|
||||||
|
NodeData::Document
|
||||||
|
| NodeData::Doctype { .. }
|
||||||
|
| NodeData::Text { .. }
|
||||||
|
| NodeData::ProcessingInstruction { .. } => false,
|
||||||
|
NodeData::Comment { .. } => !self.config.allow_comments,
|
||||||
|
NodeData::Element { ref name, .. } => {
|
||||||
|
!self.config.allowed_elements.contains(&name.local)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn should_remove_contents_when_unwrapped(&'arena self, node: Ref) -> bool {
|
||||||
|
match node.data {
|
||||||
|
NodeData::Document
|
||||||
|
| NodeData::Doctype { .. }
|
||||||
|
| NodeData::Text { .. }
|
||||||
|
| NodeData::ProcessingInstruction { .. }
|
||||||
|
| NodeData::Comment { .. } => false,
|
||||||
|
NodeData::Element { ref name, .. } => self
|
||||||
|
.config
|
||||||
|
.remove_contents_when_unwrapped
|
||||||
|
.contains(&name.local),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod test {
    use super::*;

    /// In-memory reader over a static string.
    ///
    /// Reads are delegated to the `Read` implementation of `&[u8]`, so the
    /// byte counts reported always match the bytes actually copied and the
    /// reader reports end-of-input once the contents are exhausted.
    struct MockRead {
        remaining: &'static [u8],
    }

    impl MockRead {
        fn new(contents: &'static str) -> MockRead {
            MockRead {
                remaining: contents.as_bytes(),
            }
        }
    }

    impl Read for MockRead {
        fn read(&mut self, buf: &mut [u8]) -> Result<usize, Error> {
            self.remaining.read(buf)
        }
    }

    lazy_static! {
        /// Configuration that allows nothing and removes no contents.
        static ref EMPTY_CONFIG: SanitizerConfig = SanitizerConfig {
            allow_comments: false,
            allowed_elements: HashSet::new(),
            allowed_attributes: HashSet::new(),
            allowed_attributes_per_element: HashMap::new(),
            add_attributes: HashMap::new(),
            add_attributes_per_element: HashMap::new(),
            allowed_protocols: HashMap::new(),
            allowed_css_at_rules: HashSet::new(),
            allowed_css_properties: HashSet::new(),
            remove_contents_when_unwrapped: HashSet::new(),
        };
    }

    /// `EMPTY_CONFIG` with `<html>` and `<div>` allowed.
    fn html_and_div_config() -> SanitizerConfig {
        let mut config = EMPTY_CONFIG.clone();
        config
            .allowed_elements
            .extend(vec![local_name!("html"), local_name!("div")]);
        config
    }

    /// `html_and_div_config` that additionally drops the contents of
    /// unwrapped `<script>` elements.
    fn html_and_div_config_removing_script_contents() -> SanitizerConfig {
        let mut config = html_and_div_config();
        config
            .remove_contents_when_unwrapped
            .insert(local_name!("script"));
        config
    }

    /// Sanitizes `input` as a fragment with no custom transformers and
    /// returns the serialized output.
    fn sanitize_fragment_to_string(config: &SanitizerConfig, input: &'static str) -> String {
        let sanitizer = Sanitizer::new(config, vec![]);
        let mut mock_data = MockRead::new(input);
        let mut output = vec![];
        sanitizer
            .sanitize_fragment(&mut mock_data, &mut output)
            .unwrap();
        String::from_utf8(output).unwrap()
    }

    #[test]
    fn disallow_all_elements() {
        let output = sanitize_fragment_to_string(&EMPTY_CONFIG, "<div><!-- remove me --></div>");
        assert_eq!(output, "");
    }

    #[test]
    fn remove_html_comments() {
        let config = html_and_div_config();
        let output = sanitize_fragment_to_string(&config, "<div><!-- remove me --></div>");
        assert_eq!(output, "<html><div></div></html>");
    }

    #[test]
    fn remove_script_elements() {
        let config = html_and_div_config();
        let output =
            sanitize_fragment_to_string(&config, "<div><script>alert('haX0rz')</script></div>");
        assert_eq!(output, "<html><div>alert('haX0rz')</div></html>");
    }

    #[test]
    fn remove_script_element_siblings() {
        let config = html_and_div_config();
        let output = sanitize_fragment_to_string(
            &config,
            "<div><script>alert('haX0rz')</script><script>two</script></div>",
        );
        assert_eq!(output, "<html><div>alert('haX0rz')two</div></html>");
    }

    #[test]
    fn remove_script_element_in_separate_sub_trees() {
        let config = html_and_div_config();
        let output = sanitize_fragment_to_string(
            &config,
            "<div><script>alert('haX0rz')</script></div><div><script>two</script></div>",
        );
        assert_eq!(
            output,
            "<html><div>alert('haX0rz')</div><div>two</div></html>"
        );
    }

    #[test]
    fn remove_script_elements_and_contents() {
        let config = html_and_div_config_removing_script_contents();
        let output =
            sanitize_fragment_to_string(&config, "<div><script>alert('haX0rz')</script></div>");
        assert_eq!(output, "<html><div></div></html>");
    }

    #[test]
    fn remove_script_elements_and_content_siblings() {
        let config = html_and_div_config_removing_script_contents();
        let output = sanitize_fragment_to_string(
            &config,
            "<div><script>alert('haX0rz')</script><script>two</script></div>",
        );
        assert_eq!(output, "<html><div></div></html>");
    }

    // The fragment used here previously read `...</script><div><div><script>...`,
    // opening two more `<div>`s instead of closing the first one. That nests
    // the second sub-tree inside the first, so the two sibling `<div>`s the
    // assertion expects could never be produced. The input now mirrors
    // `remove_script_element_in_separate_sub_trees`.
    #[test]
    fn remove_script_elements_and_content_in_separate_sub_trees() {
        let config = html_and_div_config_removing_script_contents();
        let output = sanitize_fragment_to_string(
            &config,
            "<div><script>alert('haX0rz')</script></div><div><script>two</script></div>",
        );
        assert_eq!(output, "<html><div></div><div></div></html>");
    }
}
|
||||||
@@ -1,206 +0,0 @@
|
|||||||
use std::io::{Error, Read};
|
|
||||||
|
|
||||||
use html5ever::interface::tree_builder::QuirksMode;
|
|
||||||
use html5ever::tendril::TendrilSink;
|
|
||||||
use html5ever::{parse_document, parse_fragment, QualName};
|
|
||||||
|
|
||||||
use crate::arena_dom::{Arena, Node, NodeData, Ref, Sink};
|
|
||||||
|
|
||||||
// TODO: What are the performance implications of using a vec of closures instead of one
|
|
||||||
// transformer function whose size is known at compile time (U: Fn(Ref<'arena>) -> bool)?
|
|
||||||
// TODO: how to integrate CSS parsing and transforming?
|
|
||||||
pub struct Transformer<'arena, T>
|
|
||||||
where
|
|
||||||
T: Fn(Ref) -> bool,
|
|
||||||
{
|
|
||||||
arena: typed_arena::Arena<Node<'arena>>,
|
|
||||||
should_unwrap: T,
|
|
||||||
transformer_fns: Vec<&'arena dyn Fn(Ref<'arena>, Arena<'arena>)>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'arena, T> Transformer<'arena, T>
|
|
||||||
where
|
|
||||||
T: Fn(Ref) -> bool,
|
|
||||||
{
|
|
||||||
pub fn new(
|
|
||||||
should_unwrap: T,
|
|
||||||
transformers: Vec<&'arena dyn Fn(Ref<'arena>, Arena<'arena>)>,
|
|
||||||
) -> Transformer<'arena, T> {
|
|
||||||
Transformer {
|
|
||||||
arena: typed_arena::Arena::new(),
|
|
||||||
should_unwrap,
|
|
||||||
transformer_fns: transformers,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn parse_document(&'arena self, data: &mut impl Read) -> Result<Ref<'arena>, Error> {
|
|
||||||
let mut bytes = Vec::new();
|
|
||||||
data.read_to_end(&mut bytes)?;
|
|
||||||
let sink = Sink {
|
|
||||||
arena: &self.arena,
|
|
||||||
document: self.arena.alloc(Node::new(NodeData::Document)),
|
|
||||||
quirks_mode: QuirksMode::NoQuirks,
|
|
||||||
};
|
|
||||||
Ok(parse_document(sink, Default::default())
|
|
||||||
.from_utf8()
|
|
||||||
.one(&bytes[..]))
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn parse_fragment(&'arena self, data: &mut impl Read) -> Result<Ref<'arena>, Error> {
|
|
||||||
let mut bytes = Vec::new();
|
|
||||||
data.read_to_end(&mut bytes)?;
|
|
||||||
let sink = Sink {
|
|
||||||
arena: &self.arena,
|
|
||||||
document: self.arena.alloc(Node::new(NodeData::Document)),
|
|
||||||
quirks_mode: QuirksMode::NoQuirks,
|
|
||||||
};
|
|
||||||
Ok(parse_fragment(
|
|
||||||
sink,
|
|
||||||
Default::default(),
|
|
||||||
QualName::new(None, ns!(html), local_name!("body")),
|
|
||||||
vec![],
|
|
||||||
)
|
|
||||||
.from_utf8()
|
|
||||||
.one(&bytes[..]))
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn traverse(&'arena self, node: Ref<'arena>) {
|
|
||||||
if (self.should_unwrap)(node) {
|
|
||||||
if let Some(unwrapped_node) = node.unwrap() {
|
|
||||||
return self.traverse(unwrapped_node);
|
|
||||||
} else {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for transformer in self.transformer_fns.iter() {
|
|
||||||
transformer(node, &self.arena);
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(child) = node.first_child.get() {
|
|
||||||
self.traverse(child);
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(sibling) = node.next_sibling.get() {
|
|
||||||
self.traverse(sibling);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod test {
|
|
||||||
use super::*;
|
|
||||||
|
|
||||||
use std::str;
|
|
||||||
|
|
||||||
use html5ever::serialize;
|
|
||||||
|
|
||||||
use crate::arena_dom::{create_element, NodeData};
|
|
||||||
|
|
||||||
struct MockRead {
|
|
||||||
contents: &'static str,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl MockRead {
|
|
||||||
fn new(contents: &'static str) -> MockRead {
|
|
||||||
MockRead { contents }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Read for MockRead {
|
|
||||||
fn read(&mut self, _: &mut [u8]) -> Result<usize, Error> {
|
|
||||||
Ok(1)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn read_to_end(&mut self, buf: &mut Vec<u8>) -> Result<usize, Error> {
|
|
||||||
buf.extend_from_slice(self.contents.as_bytes());
|
|
||||||
Ok(1)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// fn node_contains_tag<'arena>(node: Ref<'arena>, tag_name: &str) -> bool {
|
|
||||||
// if let NodeData::Element { ref name, .. } = node.data {
|
|
||||||
// if name.local == LocalName::from(tag_name) {
|
|
||||||
// return true;
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
// if let Some(child) = node.first_child.get() {
|
|
||||||
// if node_contains_tag(child, tag_name) {
|
|
||||||
// return true;
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
// if let Some(sibling) = node.next_sibling.get() {
|
|
||||||
// if node_contains_tag(sibling, tag_name) {
|
|
||||||
// return true;
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
// false
|
|
||||||
// }
|
|
||||||
|
|
||||||
// fn count_nodes(node: Ref) -> usize {
|
|
||||||
// let mut count = 1;
|
|
||||||
|
|
||||||
// if let Some(child) = node.first_child.get() {
|
|
||||||
// count += count_nodes(child);
|
|
||||||
// }
|
|
||||||
|
|
||||||
// if let Some(sibling) = node.next_sibling.get() {
|
|
||||||
// count += count_nodes(sibling);
|
|
||||||
// }
|
|
||||||
|
|
||||||
// count
|
|
||||||
// }
|
|
||||||
|
|
||||||
fn assert_serialized_html_eq(node: Ref, expected: &str) {
|
|
||||||
let mut output = vec![];
|
|
||||||
serialize(&mut output, node, Default::default()).unwrap();
|
|
||||||
assert_eq!(str::from_utf8(&output).unwrap(), expected);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn traversal() {
|
|
||||||
let transformer = Transformer::new(|_| false, vec![&|_, _| {}]);
|
|
||||||
let mut mock_data = MockRead::new("<div></div>");
|
|
||||||
let root = transformer.parse_fragment(&mut mock_data).unwrap();
|
|
||||||
transformer.traverse(root);
|
|
||||||
assert_serialized_html_eq(root, "<html><div></div></html>");
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn unwraps_element() {
|
|
||||||
let transformer = Transformer::new(
|
|
||||||
|node| {
|
|
||||||
if let NodeData::Element { ref name, .. } = node.data {
|
|
||||||
return name.local == local_name!("div");
|
|
||||||
}
|
|
||||||
false
|
|
||||||
},
|
|
||||||
vec![&|_, _| {}],
|
|
||||||
);
|
|
||||||
let mut mock_data = MockRead::new("<div></div>");
|
|
||||||
let root = transformer.parse_fragment(&mut mock_data).unwrap();
|
|
||||||
transformer.traverse(root);
|
|
||||||
assert_serialized_html_eq(root, "<html></html>");
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn adds_element() {
|
|
||||||
let transformer = Transformer::new(
|
|
||||||
|_| false,
|
|
||||||
vec![&|node, arena| {
|
|
||||||
if let NodeData::Element { ref name, .. } = node.data {
|
|
||||||
if let local_name!("div") = name.local {
|
|
||||||
node.insert_after(create_element(arena, "span"));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}],
|
|
||||||
);
|
|
||||||
let mut mock_data = MockRead::new("<div></div>");
|
|
||||||
let root = transformer.parse_fragment(&mut mock_data).unwrap();
|
|
||||||
transformer.traverse(root);
|
|
||||||
assert_serialized_html_eq(root, "<html><div></div><span></span></html>");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Reference in New Issue
Block a user