Split up transform_node into separate functions

This commit is contained in:
Tyler Hallada 2020-04-16 19:44:59 -04:00
parent 47bd10f508
commit f1671c0758
2 changed files with 169 additions and 99 deletions

View File

@ -42,7 +42,17 @@ use css_property::CssProperty;
use traverser::Traverser; use traverser::Traverser;
fn main() { fn main() {
let traverser = Traverser::new(&should_unwrap_node, vec![Box::new(&transform_node)]); let traverser = Traverser::new(
&should_unwrap_node,
vec![
Box::new(&sanitize_style_tag_css),
Box::new(&sanitize_style_attribute_css),
Box::new(&remove_attributes),
Box::new(&add_attributes),
Box::new(&sanitize_attribute_protocols),
Box::new(&add_single_elements_around_ul),
],
);
let root = traverser.parse(&mut io::stdin()).unwrap(); let root = traverser.parse(&mut io::stdin()).unwrap();
traverser.traverse(root); traverser.traverse(root);
serialize(&mut io::stdout(), root, Default::default()).expect("serialization failed") serialize(&mut io::stdout(), root, Default::default()).expect("serialization failed")
@ -94,13 +104,10 @@ fn css_rules_to_string(rules: Vec<CssRule>) -> String {
// TODO: scope selectors in rich formatter // TODO: scope selectors in rich formatter
// TODO: add class attributes to elements in rich formatter // TODO: add class attributes to elements in rich formatter
// TODO: separate this out into multiple separate transformers // TODO: separate this out into multiple separate transformers
fn transform_node<'arena>(node: Ref<'arena>, arena: Arena<'arena>) -> bool { // TODO: find a way to avoid passing the arena to transformer functions. It's an implementation
match node.data { // detail that doesn't need to be exposed. Also, it's only needed for creating new elements.
NodeData::Document fn sanitize_style_tag_css<'arena>(node: Ref<'arena>, arena: Arena<'arena>) -> bool {
| NodeData::Doctype { .. } if let NodeData::Text { ref contents } = node.data {
| NodeData::Comment { .. }
| NodeData::ProcessingInstruction { .. } => false,
NodeData::Text { ref contents } => {
// TODO: seems rather expensive to lookup the parent on every Text node. Better // TODO: seems rather expensive to lookup the parent on every Text node. Better
// solution would be to pass some sort of context from the parent that marks that this // solution would be to pass some sort of context from the parent that marks that this
// Text node is inside a <style>. // Text node is inside a <style>.
@ -116,26 +123,16 @@ fn transform_node<'arena>(node: Ref<'arena>, arena: Arena<'arena>) -> bool {
} }
} }
} }
}
false false
} }
NodeData::Element {
ref attrs,
ref name,
..
} => {
let ref mut attrs = attrs.borrow_mut();
let mut allowed_attrs: HashSet<LocalName> = ALL_ATTRIBUTES.clone(); fn sanitize_style_attribute_css<'arena>(node: Ref<'arena>, arena: Arena<'arena>) -> bool {
if let Some(element_attrs) = ATTRIBUTES.get(&name.local) { if let NodeData::Element { ref attrs, .. } = node.data {
allowed_attrs = allowed_attrs.union(element_attrs).cloned().collect(); let mut has_transformed = false;
} for attr in attrs.borrow_mut().iter_mut() {
let mut i = 0; if attr.name.local == local_name!("style") {
while i != attrs.len() { let css_str = &attr.value;
if !allowed_attrs.contains(&attrs[i].name.local) {
attrs.remove(i);
} else {
if attrs[i].name.local == local_name!("style") {
let css_str = &attrs[i].value;
let declarations = parse_css_style_attribute(css_str); let declarations = parse_css_style_attribute(css_str);
dbg!(&declarations); dbg!(&declarations);
let mut sanitized_css = String::new(); let mut sanitized_css = String::new();
@ -148,11 +145,51 @@ fn transform_node<'arena>(node: Ref<'arena>, arena: Arena<'arena>) -> bool {
} }
let sanitized_css = sanitized_css.trim(); let sanitized_css = sanitized_css.trim();
dbg!(&sanitized_css); dbg!(&sanitized_css);
attrs[i].value = StrTendril::from(sanitized_css); attr.value = StrTendril::from(sanitized_css);
has_transformed = true;
}
}
return has_transformed;
}
false
}
fn remove_attributes<'arena>(node: Ref<'arena>, arena: Arena<'arena>) -> bool {
if let NodeData::Element {
ref attrs,
ref name,
..
} = node.data
{
let mut has_transformed = false;
let ref mut attrs = attrs.borrow_mut();
let mut allowed_attrs: HashSet<LocalName> = ALL_ATTRIBUTES.clone();
if let Some(element_attrs) = ATTRIBUTES.get(&name.local) {
allowed_attrs = allowed_attrs.union(element_attrs).cloned().collect();
}
let mut i = 0;
while i != attrs.len() {
if !allowed_attrs.contains(&attrs[i].name.local) {
attrs.remove(i);
has_transformed = true;
} }
i += 1; i += 1;
} }
return has_transformed;
} }
false
}
fn add_attributes<'arena>(node: Ref<'arena>, arena: Arena<'arena>) -> bool {
if let NodeData::Element {
ref attrs,
ref name,
..
} = node.data
{
let mut has_transformed = false;
let ref mut attrs = attrs.borrow_mut();
if let Some(add_attributes) = ADD_ATTRIBUTES.get(&name.local) { if let Some(add_attributes) = ADD_ATTRIBUTES.get(&name.local) {
for (name, &value) in add_attributes.iter() { for (name, &value) in add_attributes.iter() {
@ -160,8 +197,23 @@ fn transform_node<'arena>(node: Ref<'arena>, arena: Arena<'arena>) -> bool {
name: QualName::new(None, ns!(), name.clone()), name: QualName::new(None, ns!(), name.clone()),
value: StrTendril::from(value), value: StrTendril::from(value),
}); });
has_transformed = true;
} }
} }
return has_transformed;
}
false
}
fn sanitize_attribute_protocols<'arena>(node: Ref<'arena>, arena: Arena<'arena>) -> bool {
if let NodeData::Element {
ref attrs,
ref name,
..
} = node.data
{
let mut has_transformed = false;
let ref mut attrs = attrs.borrow_mut();
if let Some(protocols) = PROTOCOLS.get(&name.local) { if let Some(protocols) = PROTOCOLS.get(&name.local) {
let mut i = 0; let mut i = 0;
@ -171,27 +223,39 @@ fn transform_node<'arena>(node: Ref<'arena>, arena: Arena<'arena>) -> bool {
Ok(url) => { Ok(url) => {
if !allowed_protocols.contains(url.scheme()) { if !allowed_protocols.contains(url.scheme()) {
attrs.remove(i); attrs.remove(i);
has_transformed = true;
} else { } else {
i += 1; i += 1;
} }
} }
Err(ParseError::RelativeUrlWithoutBase) => { Err(ParseError::RelativeUrlWithoutBase) => {
attrs[i].value = attrs[i].value = StrTendril::from(format!("http://{}", attrs[i].value));
StrTendril::from(format!("http://{}", attrs[i].value)); has_transformed = true;
i += 1; i += 1;
} }
Err(_) => { Err(_) => {
attrs.remove(i); attrs.remove(i);
has_transformed = true;
} }
} }
} else { } else {
i += 1; i += 1;
} }
} }
return has_transformed;
}
}
false
} }
match name.local { fn add_single_elements_around_ul<'arena>(node: Ref<'arena>, arena: Arena<'arena>) -> bool {
local_name!("ul") => { if let NodeData::Element {
ref attrs,
ref name,
..
} = node.data
{
if let local_name!("ul") = name.local {
node.insert_before(create_element( node.insert_before(create_element(
arena, arena,
QualName::new(None, ns!(), LocalName::from("single")), QualName::new(None, ns!(), LocalName::from("single")),
@ -200,13 +264,11 @@ fn transform_node<'arena>(node: Ref<'arena>, arena: Arena<'arena>) -> bool {
arena, arena,
QualName::new(None, ns!(), LocalName::from("single")), QualName::new(None, ns!(), LocalName::from("single")),
)); ));
return true;
} }
_ => {}
} }
false false
} }
}
}
fn should_unwrap_node(node: Ref) -> bool { fn should_unwrap_node(node: Ref) -> bool {
match node.data { match node.data {

View File

@ -4,7 +4,7 @@ use std::io::{self, Error, Read};
use html5ever::{serialize, Attribute, LocalName, QualName}; use html5ever::{serialize, Attribute, LocalName, QualName};
use crate::arena_dom::{html5ever_parse_slice_into_arena, Arena, Node, Ref}; use crate::arena_dom::{create_element, html5ever_parse_slice_into_arena, Arena, Node, Ref};
// TODO: I don't love the "Traverser" name. Should maybe come up with something else. // TODO: I don't love the "Traverser" name. Should maybe come up with something else.
// (it also unwraps nodes and calls transformer functions... does a lot more than traverse) // (it also unwraps nodes and calls transformer functions... does a lot more than traverse)
@ -63,6 +63,14 @@ where
self.traverse(sibling); self.traverse(sibling);
} }
} }
// TODO: how to call this from transformer functions?
/// Creates an element with the local name `name` by delegating to
/// `arena_dom::create_element` with this traverser's arena.
///
/// The qualified name is built with no prefix and the `ns!()` namespace.
pub fn create_element(&'arena self, name: &str) -> Ref<'arena> {
    let qualified_name = QualName::new(None, ns!(), LocalName::from(name));
    create_element(&self.arena, qualified_name)
}
} }
#[cfg(test)] #[cfg(test)]