Abstracted out all traversal logic to Traverser
This commit is contained in:
parent
e20ec1224e
commit
47bd10f508
@ -17,6 +17,7 @@ use std::borrow::Cow;
|
|||||||
use std::cell::{Cell, RefCell};
|
use std::cell::{Cell, RefCell};
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
use std::default::Default;
|
use std::default::Default;
|
||||||
|
use std::fmt;
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::ptr;
|
use std::ptr;
|
||||||
|
|
||||||
@ -167,7 +168,7 @@ impl<'arena> Node<'arena> {
|
|||||||
Some(next_child) => {
|
Some(next_child) => {
|
||||||
next_child.parent.set(parent);
|
next_child.parent.set(parent);
|
||||||
child = next_child.next_sibling.get();
|
child = next_child.next_sibling.get();
|
||||||
},
|
}
|
||||||
None => break,
|
None => break,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -232,6 +233,32 @@ impl<'arena> Node<'arena> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<'arena> fmt::Display for Node<'arena> {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
|
write_node(self, 0, f)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn write_node<'arena>(
|
||||||
|
node: &Node<'arena>,
|
||||||
|
indent: usize,
|
||||||
|
f: &mut fmt::Formatter<'_>,
|
||||||
|
) -> fmt::Result {
|
||||||
|
let indent_str = " ".repeat(indent);
|
||||||
|
writeln!(f, "{}Node {{", &indent_str)?;
|
||||||
|
writeln!(f, "{} data: {:?}", &indent_str, node.data)?;
|
||||||
|
let mut child = node.first_child.get();
|
||||||
|
if child.is_some() {
|
||||||
|
writeln!(f, "{} children: [", &indent_str)?;
|
||||||
|
while let Some(next_child) = child {
|
||||||
|
write_node(next_child, indent + 2, f)?;
|
||||||
|
child = next_child.next_sibling.get();
|
||||||
|
}
|
||||||
|
writeln!(f, "{} ]", &indent_str)?;
|
||||||
|
}
|
||||||
|
writeln!(f, "{}}}", &indent_str)
|
||||||
|
}
|
||||||
|
|
||||||
impl<'arena> Sink<'arena> {
|
impl<'arena> Sink<'arena> {
|
||||||
fn new_node(&self, data: NodeData<'arena>) -> Ref<'arena> {
|
fn new_node(&self, data: NodeData<'arena>) -> Ref<'arena> {
|
||||||
self.arena.alloc(Node::new(data))
|
self.arena.alloc(Node::new(data))
|
||||||
@ -458,9 +485,11 @@ impl<'arena> Serialize for Node<'arena> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
(&ChildrenOnly(_), _) => {},
|
(&ChildrenOnly(_), _) => {}
|
||||||
|
|
||||||
(&IncludeNode, &NodeData::Doctype { ref name, .. }) => serializer.write_doctype(&name)?,
|
(&IncludeNode, &NodeData::Doctype { ref name, .. }) => {
|
||||||
|
serializer.write_doctype(&name)?
|
||||||
|
}
|
||||||
(&IncludeNode, &NodeData::Text { ref contents }) => {
|
(&IncludeNode, &NodeData::Text { ref contents }) => {
|
||||||
serializer.write_text(&contents.borrow())?
|
serializer.write_text(&contents.borrow())?
|
||||||
}
|
}
|
||||||
|
89
src/main.rs
89
src/main.rs
@ -35,40 +35,17 @@ mod traverser;
|
|||||||
|
|
||||||
use arena_dom::{create_element, html5ever_parse_slice_into_arena, Arena, NodeData, Ref};
|
use arena_dom::{create_element, html5ever_parse_slice_into_arena, Arena, NodeData, Ref};
|
||||||
use config::permissive::{ADD_ATTRIBUTES, ALL_ATTRIBUTES, ATTRIBUTES, ELEMENTS, PROTOCOLS};
|
use config::permissive::{ADD_ATTRIBUTES, ALL_ATTRIBUTES, ATTRIBUTES, ELEMENTS, PROTOCOLS};
|
||||||
use config::relaxed::{CSS_PROPERTIES, CSS_AT_RULES};
|
use config::relaxed::{CSS_AT_RULES, CSS_PROPERTIES};
|
||||||
use css_parser::{CssRule, parse_css_style_attribute, parse_css_stylesheet};
|
|
||||||
use css_property::CssProperty;
|
|
||||||
use css_at_rule::CssAtRule;
|
use css_at_rule::CssAtRule;
|
||||||
|
use css_parser::{parse_css_style_attribute, parse_css_stylesheet, CssRule};
|
||||||
|
use css_property::CssProperty;
|
||||||
|
use traverser::Traverser;
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
let mut bytes = Vec::new();
|
let traverser = Traverser::new(&should_unwrap_node, vec![Box::new(&transform_node)]);
|
||||||
io::stdin().read_to_end(&mut bytes).unwrap();
|
let root = traverser.parse(&mut io::stdin()).unwrap();
|
||||||
let arena = typed_arena::Arena::new();
|
traverser.traverse(root);
|
||||||
let doc = html5ever_parse_slice_into_arena(&bytes, &arena);
|
serialize(&mut io::stdout(), root, Default::default()).expect("serialization failed")
|
||||||
sanitize(doc, &arena);
|
|
||||||
serialize(&mut io::stdout(), doc, Default::default())
|
|
||||||
.ok()
|
|
||||||
.expect("serialization failed")
|
|
||||||
}
|
|
||||||
|
|
||||||
fn sanitize<'arena>(node: Ref<'arena>, arena: Arena<'arena>) {
|
|
||||||
if let Some(unwrapped) = maybe_unwrap_node(&node) {
|
|
||||||
if let Some(unwrapped_node) = unwrapped {
|
|
||||||
return sanitize(unwrapped_node, arena);
|
|
||||||
} else {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
transform_node(&node, arena);
|
|
||||||
|
|
||||||
if let Some(child) = node.first_child.get() {
|
|
||||||
sanitize(child, arena);
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(sibling) = node.next_sibling.get() {
|
|
||||||
sanitize(sibling, arena);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn css_rules_to_string(rules: Vec<CssRule>) -> String {
|
fn css_rules_to_string(rules: Vec<CssRule>) -> String {
|
||||||
@ -80,21 +57,17 @@ fn css_rules_to_string(rules: Vec<CssRule>) -> String {
|
|||||||
sanitized_css += " {\n";
|
sanitized_css += " {\n";
|
||||||
for declaration in style_rule.declarations.into_iter() {
|
for declaration in style_rule.declarations.into_iter() {
|
||||||
let declaration_string = &declaration.to_string();
|
let declaration_string = &declaration.to_string();
|
||||||
if CSS_PROPERTIES
|
if CSS_PROPERTIES.contains(&CssProperty::from(declaration.property)) {
|
||||||
.contains(&CssProperty::from(declaration.property))
|
|
||||||
{
|
|
||||||
sanitized_css += " ";
|
sanitized_css += " ";
|
||||||
sanitized_css += declaration_string;
|
sanitized_css += declaration_string;
|
||||||
sanitized_css += " ";
|
sanitized_css += " ";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
sanitized_css += "\n}";
|
sanitized_css += "\n}";
|
||||||
},
|
}
|
||||||
CssRule::AtRule(at_rule) => {
|
CssRule::AtRule(at_rule) => {
|
||||||
dbg!(&at_rule);
|
dbg!(&at_rule);
|
||||||
if CSS_AT_RULES
|
if CSS_AT_RULES.contains(&CssAtRule::from(at_rule.name.clone())) {
|
||||||
.contains(&CssAtRule::from(at_rule.name.clone()))
|
|
||||||
{
|
|
||||||
sanitized_css += &format!("@{} ", &at_rule.name);
|
sanitized_css += &format!("@{} ", &at_rule.name);
|
||||||
sanitized_css += &at_rule.prelude.trim();
|
sanitized_css += &at_rule.prelude.trim();
|
||||||
if let Some(block) = at_rule.block {
|
if let Some(block) = at_rule.block {
|
||||||
@ -111,21 +84,22 @@ fn css_rules_to_string(rules: Vec<CssRule>) -> String {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// TODO: make separate rich and plain transformers
|
// TODO: make separate rich and plain transformers
|
||||||
// TODO: add whitelist of tags, remove any not in it DONE
|
// DONE: add whitelist of tags, remove any not in it
|
||||||
// TODO: add whitelist of attributes, remove any not in it DONE
|
// DONE: add whitelist of attributes, remove any not in it
|
||||||
// TODO: add map of tags to attributes, remove any on tag not in the mapped value DONE
|
// DONE: add map of tags to attributes, remove any on tag not in the mapped value
|
||||||
// TODO: add whitelist of url schemes, parse urls and remove any not in it DONE
|
// DONE: add whitelist of url schemes, parse urls and remove any not in it
|
||||||
// TODO: strip comments DONE
|
// DONE: strip comments
|
||||||
// TODO: parse style tags and attributes DONE
|
// DONE: parse style tags and attributes
|
||||||
// TODO: add whitelist of CSS properties, remove any not in it DONE
|
// DONE: add whitelist of CSS properties, remove any not in it
|
||||||
// TODO: scope selectors in rich formatter
|
// TODO: scope selectors in rich formatter
|
||||||
// TODO: add class attributes to elements in rich formatter
|
// TODO: add class attributes to elements in rich formatter
|
||||||
fn transform_node<'arena>(node: Ref<'arena>, arena: Arena<'arena>) {
|
// TODO: separate this out into multiple separate transformers
|
||||||
|
fn transform_node<'arena>(node: Ref<'arena>, arena: Arena<'arena>) -> bool {
|
||||||
match node.data {
|
match node.data {
|
||||||
NodeData::Document
|
NodeData::Document
|
||||||
| NodeData::Doctype { .. }
|
| NodeData::Doctype { .. }
|
||||||
| NodeData::Comment { .. }
|
| NodeData::Comment { .. }
|
||||||
| NodeData::ProcessingInstruction { .. } => {}
|
| NodeData::ProcessingInstruction { .. } => false,
|
||||||
NodeData::Text { ref contents } => {
|
NodeData::Text { ref contents } => {
|
||||||
// TODO: seems rather expensive to lookup the parent on every Text node. Better
|
// TODO: seems rather expensive to lookup the parent on every Text node. Better
|
||||||
// solution would be to pass some sort of context from the parent that marks that this
|
// solution would be to pass some sort of context from the parent that marks that this
|
||||||
@ -138,9 +112,11 @@ fn transform_node<'arena>(node: Ref<'arena>, arena: Arena<'arena>) {
|
|||||||
let sanitized_css = css_rules_to_string(rules);
|
let sanitized_css = css_rules_to_string(rules);
|
||||||
dbg!(&sanitized_css);
|
dbg!(&sanitized_css);
|
||||||
contents.replace(StrTendril::from(sanitized_css));
|
contents.replace(StrTendril::from(sanitized_css));
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
false
|
||||||
}
|
}
|
||||||
NodeData::Element {
|
NodeData::Element {
|
||||||
ref attrs,
|
ref attrs,
|
||||||
@ -165,9 +141,7 @@ fn transform_node<'arena>(node: Ref<'arena>, arena: Arena<'arena>) {
|
|||||||
let mut sanitized_css = String::new();
|
let mut sanitized_css = String::new();
|
||||||
for declaration in declarations.into_iter() {
|
for declaration in declarations.into_iter() {
|
||||||
let declaration_string = &declaration.to_string();
|
let declaration_string = &declaration.to_string();
|
||||||
if CSS_PROPERTIES
|
if CSS_PROPERTIES.contains(&CssProperty::from(declaration.property)) {
|
||||||
.contains(&CssProperty::from(declaration.property))
|
|
||||||
{
|
|
||||||
sanitized_css += declaration_string;
|
sanitized_css += declaration_string;
|
||||||
sanitized_css += " ";
|
sanitized_css += " ";
|
||||||
}
|
}
|
||||||
@ -229,23 +203,18 @@ fn transform_node<'arena>(node: Ref<'arena>, arena: Arena<'arena>) {
|
|||||||
}
|
}
|
||||||
_ => {}
|
_ => {}
|
||||||
}
|
}
|
||||||
|
false
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn maybe_unwrap_node(node: Ref) -> Option<Option<Ref>> {
|
fn should_unwrap_node(node: Ref) -> bool {
|
||||||
match node.data {
|
match node.data {
|
||||||
NodeData::Document
|
NodeData::Document
|
||||||
| NodeData::Doctype { .. }
|
| NodeData::Doctype { .. }
|
||||||
| NodeData::Text { .. }
|
| NodeData::Text { .. }
|
||||||
| NodeData::ProcessingInstruction { .. } => None,
|
| NodeData::ProcessingInstruction { .. } => false,
|
||||||
NodeData::Comment { .. } => Some(node.unwrap()),
|
NodeData::Comment { .. } => true,
|
||||||
NodeData::Element { ref name, .. } => {
|
NodeData::Element { ref name, .. } => !ELEMENTS.contains(&name.local),
|
||||||
if !ELEMENTS.contains(&name.local) {
|
|
||||||
Some(node.unwrap())
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,35 +1,67 @@
|
|||||||
extern crate typed_arena;
|
extern crate typed_arena;
|
||||||
|
|
||||||
use std::io::{self, Read, Error};
|
use std::io::{self, Error, Read};
|
||||||
|
|
||||||
use html5ever::{serialize, Attribute, LocalName, QualName};
|
use html5ever::{serialize, Attribute, LocalName, QualName};
|
||||||
|
|
||||||
use crate::arena_dom::{html5ever_parse_slice_into_arena, Arena, Node, Ref};
|
use crate::arena_dom::{html5ever_parse_slice_into_arena, Arena, Node, Ref};
|
||||||
|
|
||||||
pub struct Traverser<'arena> {
|
// TODO: I don't love the "Traverser" name. Should maybe come up with something else.
|
||||||
|
// (it also unwraps nodes and calls transformer functions... does a lot more than traverse)
|
||||||
|
// TODO: What are the performance implications of using a vec of boxed closures instead of one
|
||||||
|
// transformer function whose size is known at compile time (U: Fn(Ref<'arena>) -> bool)?
|
||||||
|
// TODO: how to integrate CSS parsing and transforming?
|
||||||
|
pub struct Traverser<'arena, T>
|
||||||
|
where
|
||||||
|
T: Fn(Ref) -> bool,
|
||||||
|
{
|
||||||
arena: typed_arena::Arena<Node<'arena>>,
|
arena: typed_arena::Arena<Node<'arena>>,
|
||||||
|
should_unwrap: T,
|
||||||
|
transformers: Vec<Box<&'arena dyn Fn(Ref<'arena>, Arena<'arena>) -> bool>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'arena> Traverser<'arena> {
|
impl<'arena, T> Traverser<'arena, T>
|
||||||
fn new() -> Traverser<'arena> {
|
where
|
||||||
|
T: Fn(Ref) -> bool,
|
||||||
|
{
|
||||||
|
pub fn new(
|
||||||
|
should_unwrap: T,
|
||||||
|
transformers: Vec<Box<&'arena dyn Fn(Ref<'arena>, Arena<'arena>) -> bool>>,
|
||||||
|
) -> Traverser<'arena, T> {
|
||||||
Traverser {
|
Traverser {
|
||||||
arena: typed_arena::Arena::new(),
|
arena: typed_arena::Arena::new(),
|
||||||
|
should_unwrap,
|
||||||
|
transformers,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn traverse(&'arena self, data: &mut impl Read) -> Result<(), Error> {
|
pub fn parse(&'arena self, data: &mut impl Read) -> Result<Ref<'arena>, Error> {
|
||||||
dbg!("traverse");
|
|
||||||
let mut bytes = Vec::new();
|
let mut bytes = Vec::new();
|
||||||
data.read_to_end(&mut bytes)?;
|
data.read_to_end(&mut bytes)?;
|
||||||
dbg!(&bytes);
|
Ok(html5ever_parse_slice_into_arena(&bytes, &self.arena))
|
||||||
// let node = html5ever_parse_slice_into_arena(&bytes, &self.arena);
|
|
||||||
// dbg!(&node);
|
|
||||||
// self.visit(node);
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn visit(&'arena self, node: Ref<'arena>) {
|
pub fn traverse(&'arena self, node: Ref<'arena>) {
|
||||||
dbg!(&node);
|
println!("{}", &node);
|
||||||
|
if (self.should_unwrap)(node) {
|
||||||
|
if let Some(unwrapped_node) = node.unwrap() {
|
||||||
|
return self.traverse(unwrapped_node);
|
||||||
|
} else {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for transformer in self.transformers.iter() {
|
||||||
|
println!("transformer result: {}", transformer(node, &self.arena));
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(child) = node.first_child.get() {
|
||||||
|
self.traverse(child);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(sibling) = node.next_sibling.get() {
|
||||||
|
self.traverse(sibling);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -54,10 +86,14 @@ mod test {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn traversal() {
|
fn traversal() {
|
||||||
let mut traverser = Traverser::new();
|
let mut traverser = Traverser::new(
|
||||||
|
|node| false,
|
||||||
|
vec![Box::new(&|n, _| false), Box::new(&|m, _| true)],
|
||||||
|
);
|
||||||
let mut mock_data = MockRead;
|
let mut mock_data = MockRead;
|
||||||
// let mut file = File::open("src/test/div.html").unwrap();
|
// let mut file = File::open("src/test/div.html").unwrap();
|
||||||
traverser.traverse(&mut mock_data).unwrap();
|
let root = traverser.parse(&mut mock_data).unwrap();
|
||||||
|
traverser.traverse(root);
|
||||||
assert!(false);
|
assert!(false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user