Clean up lint errors, add tests

This commit is contained in:
Tyler Hallada 2020-04-18 18:30:15 -04:00
parent f1671c0758
commit 34ffccc512
4 changed files with 140 additions and 98 deletions

2
.gitignore vendored
View File

@ -1,2 +1,4 @@
/target /target
**/*.rs.bk **/*.rs.bk
tags
tags.temp

View File

@ -25,11 +25,12 @@ use html5ever::interface::tree_builder::{ElementFlags, NodeOrText, QuirksMode, T
use html5ever::serialize::TraversalScope::{ChildrenOnly, IncludeNode}; use html5ever::serialize::TraversalScope::{ChildrenOnly, IncludeNode};
use html5ever::serialize::{Serialize, Serializer, TraversalScope}; use html5ever::serialize::{Serialize, Serializer, TraversalScope};
use html5ever::tendril::{StrTendril, TendrilSink}; use html5ever::tendril::{StrTendril, TendrilSink};
use html5ever::{parse_document, Attribute, ExpandedName, QualName}; use html5ever::{parse_document, Attribute, ExpandedName, LocalName, QualName};
// TODO: does this function really belong here?
pub fn html5ever_parse_slice_into_arena<'a>(bytes: &[u8], arena: Arena<'a>) -> Ref<'a> { pub fn html5ever_parse_slice_into_arena<'a>(bytes: &[u8], arena: Arena<'a>) -> Ref<'a> {
let sink = Sink { let sink = Sink {
arena: arena, arena,
document: arena.alloc(Node::new(NodeData::Document)), document: arena.alloc(Node::new(NodeData::Document)),
quirks_mode: QuirksMode::NoQuirks, quirks_mode: QuirksMode::NoQuirks,
}; };
@ -38,9 +39,9 @@ pub fn html5ever_parse_slice_into_arena<'a>(bytes: &[u8], arena: Arena<'a>) -> R
.one(bytes) .one(bytes)
} }
pub fn create_element<'arena>(arena: Arena<'arena>, name: QualName) -> Ref<'arena> { pub fn create_element<'arena>(arena: Arena<'arena>, name: &str) -> Ref<'arena> {
arena.alloc(Node::new(NodeData::Element { arena.alloc(Node::new(NodeData::Element {
name: name, name: QualName::new(None, ns!(), LocalName::from(name)),
attrs: RefCell::new(vec![]), attrs: RefCell::new(vec![]),
template_contents: None, template_contents: None,
mathml_annotation_xml_integration_point: false, mathml_annotation_xml_integration_point: false,
@ -103,7 +104,7 @@ impl<'arena> Node<'arena> {
next_sibling: Cell::new(None), next_sibling: Cell::new(None),
first_child: Cell::new(None), first_child: Cell::new(None),
last_child: Cell::new(None), last_child: Cell::new(None),
data: data, data,
} }
} }
@ -163,14 +164,9 @@ impl<'arena> Node<'arena> {
} }
let mut child = first_child; let mut child = first_child;
loop { while let Some(next_child) = child {
match child { next_child.parent.set(parent);
Some(next_child) => { child = next_child.next_sibling.get();
next_child.parent.set(parent);
child = next_child.next_sibling.get();
}
None => break,
}
} }
if let Some(first_child) = first_child { if let Some(first_child) = first_child {
@ -351,7 +347,7 @@ impl<'arena> TreeSink for Sink<'arena> {
flags: ElementFlags, flags: ElementFlags,
) -> Ref<'arena> { ) -> Ref<'arena> {
self.new_node(NodeData::Element { self.new_node(NodeData::Element {
name: name, name,
attrs: RefCell::new(attrs), attrs: RefCell::new(attrs),
template_contents: if flags.template { template_contents: if flags.template {
Some(self.new_node(NodeData::Document)) Some(self.new_node(NodeData::Document))
@ -368,7 +364,7 @@ impl<'arena> TreeSink for Sink<'arena> {
fn create_pi(&mut self, target: StrTendril, data: StrTendril) -> Ref<'arena> { fn create_pi(&mut self, target: StrTendril, data: StrTendril) -> Ref<'arena> {
self.new_node(NodeData::ProcessingInstruction { self.new_node(NodeData::ProcessingInstruction {
target: target, target,
contents: data, contents: data,
}) })
} }
@ -409,9 +405,9 @@ impl<'arena> TreeSink for Sink<'arena> {
system_id: StrTendril, system_id: StrTendril,
) { ) {
self.document.append(self.new_node(NodeData::Doctype { self.document.append(self.new_node(NodeData::Doctype {
name: name, name,
public_id: public_id, public_id,
system_id: system_id, system_id,
})) }))
} }

View File

@ -12,7 +12,7 @@ extern crate typed_arena;
use std::collections::HashSet; use std::collections::HashSet;
use std::default::Default; use std::default::Default;
use std::io::{self, Read}; use std::io;
use html5ever::tendril::StrTendril; use html5ever::tendril::StrTendril;
use html5ever::{serialize, Attribute, LocalName, QualName}; use html5ever::{serialize, Attribute, LocalName, QualName};
@ -33,7 +33,7 @@ mod config;
mod css_parser; mod css_parser;
mod traverser; mod traverser;
use arena_dom::{create_element, html5ever_parse_slice_into_arena, Arena, NodeData, Ref}; use arena_dom::{create_element, Arena, NodeData, Ref};
use config::permissive::{ADD_ATTRIBUTES, ALL_ATTRIBUTES, ATTRIBUTES, ELEMENTS, PROTOCOLS}; use config::permissive::{ADD_ATTRIBUTES, ALL_ATTRIBUTES, ATTRIBUTES, ELEMENTS, PROTOCOLS};
use config::relaxed::{CSS_AT_RULES, CSS_PROPERTIES}; use config::relaxed::{CSS_AT_RULES, CSS_PROPERTIES};
use css_at_rule::CssAtRule; use css_at_rule::CssAtRule;
@ -45,12 +45,12 @@ fn main() {
let traverser = Traverser::new( let traverser = Traverser::new(
&should_unwrap_node, &should_unwrap_node,
vec![ vec![
Box::new(&sanitize_style_tag_css), &sanitize_style_tag_css,
Box::new(&sanitize_style_attribute_css), &sanitize_style_attribute_css,
Box::new(&remove_attributes), &remove_attributes,
Box::new(&add_attributes), &add_attributes,
Box::new(&sanitize_attribute_protocols), &sanitize_attribute_protocols,
Box::new(&add_single_elements_around_ul), &add_single_elements_around_ul,
], ],
); );
let root = traverser.parse(&mut io::stdin()).unwrap(); let root = traverser.parse(&mut io::stdin()).unwrap();
@ -106,7 +106,7 @@ fn css_rules_to_string(rules: Vec<CssRule>) -> String {
// TODO: separate this out into multiple separate transformers // TODO: separate this out into multiple separate transformers
// TODO: find a way to avoid passing the arena to transformer functions. It's an implementation // TODO: find a way to avoid passing the arena to transformer functions. It's an implementation
// detail that doesn't need to be exposed. Also, it's only needed for creating new elements. // detail that doesn't need to be exposed. Also, it's only needed for creating new elements.
fn sanitize_style_tag_css<'arena>(node: Ref<'arena>, arena: Arena<'arena>) -> bool { fn sanitize_style_tag_css<'arena>(node: Ref<'arena>, _: Arena<'arena>) {
if let NodeData::Text { ref contents } = node.data { if let NodeData::Text { ref contents } = node.data {
// TODO: seems rather expensive to lookup the parent on every Text node. Better // TODO: seems rather expensive to lookup the parent on every Text node. Better
// solution would be to pass some sort of context from the parent that marks that this // solution would be to pass some sort of context from the parent that marks that this
@ -119,17 +119,14 @@ fn sanitize_style_tag_css<'arena>(node: Ref<'arena>, arena: Arena<'arena>) -> bo
let sanitized_css = css_rules_to_string(rules); let sanitized_css = css_rules_to_string(rules);
dbg!(&sanitized_css); dbg!(&sanitized_css);
contents.replace(StrTendril::from(sanitized_css)); contents.replace(StrTendril::from(sanitized_css));
return true;
} }
} }
} }
} }
false
} }
fn sanitize_style_attribute_css<'arena>(node: Ref<'arena>, arena: Arena<'arena>) -> bool { fn sanitize_style_attribute_css<'arena>(node: Ref<'arena>, _: Arena<'arena>) {
if let NodeData::Element { ref attrs, .. } = node.data { if let NodeData::Element { ref attrs, .. } = node.data {
let mut has_transformed = false;
for attr in attrs.borrow_mut().iter_mut() { for attr in attrs.borrow_mut().iter_mut() {
if attr.name.local == local_name!("style") { if attr.name.local == local_name!("style") {
let css_str = &attr.value; let css_str = &attr.value;
@ -146,23 +143,19 @@ fn sanitize_style_attribute_css<'arena>(node: Ref<'arena>, arena: Arena<'arena>)
let sanitized_css = sanitized_css.trim(); let sanitized_css = sanitized_css.trim();
dbg!(&sanitized_css); dbg!(&sanitized_css);
attr.value = StrTendril::from(sanitized_css); attr.value = StrTendril::from(sanitized_css);
has_transformed = true;
} }
} }
return has_transformed;
} }
false
} }
fn remove_attributes<'arena>(node: Ref<'arena>, arena: Arena<'arena>) -> bool { fn remove_attributes<'arena>(node: Ref<'arena>, _: Arena<'arena>) {
if let NodeData::Element { if let NodeData::Element {
ref attrs, ref attrs,
ref name, ref name,
.. ..
} = node.data } = node.data
{ {
let mut has_transformed = false; let attrs = &mut attrs.borrow_mut();
let ref mut attrs = attrs.borrow_mut();
let mut allowed_attrs: HashSet<LocalName> = ALL_ATTRIBUTES.clone(); let mut allowed_attrs: HashSet<LocalName> = ALL_ATTRIBUTES.clone();
if let Some(element_attrs) = ATTRIBUTES.get(&name.local) { if let Some(element_attrs) = ATTRIBUTES.get(&name.local) {
allowed_attrs = allowed_attrs.union(element_attrs).cloned().collect(); allowed_attrs = allowed_attrs.union(element_attrs).cloned().collect();
@ -172,24 +165,20 @@ fn remove_attributes<'arena>(node: Ref<'arena>, arena: Arena<'arena>) -> bool {
while i != attrs.len() { while i != attrs.len() {
if !allowed_attrs.contains(&attrs[i].name.local) { if !allowed_attrs.contains(&attrs[i].name.local) {
attrs.remove(i); attrs.remove(i);
has_transformed = true;
} }
i += 1; i += 1;
} }
return has_transformed;
} }
false
} }
fn add_attributes<'arena>(node: Ref<'arena>, arena: Arena<'arena>) -> bool { fn add_attributes<'arena>(node: Ref<'arena>, _: Arena<'arena>) {
if let NodeData::Element { if let NodeData::Element {
ref attrs, ref attrs,
ref name, ref name,
.. ..
} = node.data } = node.data
{ {
let mut has_transformed = false; let attrs = &mut attrs.borrow_mut();
let ref mut attrs = attrs.borrow_mut();
if let Some(add_attributes) = ADD_ATTRIBUTES.get(&name.local) { if let Some(add_attributes) = ADD_ATTRIBUTES.get(&name.local) {
for (name, &value) in add_attributes.iter() { for (name, &value) in add_attributes.iter() {
@ -197,23 +186,19 @@ fn add_attributes<'arena>(node: Ref<'arena>, arena: Arena<'arena>) -> bool {
name: QualName::new(None, ns!(), name.clone()), name: QualName::new(None, ns!(), name.clone()),
value: StrTendril::from(value), value: StrTendril::from(value),
}); });
has_transformed = true;
} }
} }
return has_transformed;
} }
false
} }
fn sanitize_attribute_protocols<'arena>(node: Ref<'arena>, arena: Arena<'arena>) -> bool { fn sanitize_attribute_protocols<'arena>(node: Ref<'arena>, _: Arena<'arena>) {
if let NodeData::Element { if let NodeData::Element {
ref attrs, ref attrs,
ref name, ref name,
.. ..
} = node.data } = node.data
{ {
let mut has_transformed = false; let attrs = &mut attrs.borrow_mut();
let ref mut attrs = attrs.borrow_mut();
if let Some(protocols) = PROTOCOLS.get(&name.local) { if let Some(protocols) = PROTOCOLS.get(&name.local) {
let mut i = 0; let mut i = 0;
@ -223,51 +208,33 @@ fn sanitize_attribute_protocols<'arena>(node: Ref<'arena>, arena: Arena<'arena>)
Ok(url) => { Ok(url) => {
if !allowed_protocols.contains(url.scheme()) { if !allowed_protocols.contains(url.scheme()) {
attrs.remove(i); attrs.remove(i);
has_transformed = true;
} else { } else {
i += 1; i += 1;
} }
} }
Err(ParseError::RelativeUrlWithoutBase) => { Err(ParseError::RelativeUrlWithoutBase) => {
attrs[i].value = StrTendril::from(format!("http://{}", attrs[i].value)); attrs[i].value = StrTendril::from(format!("http://{}", attrs[i].value));
has_transformed = true;
i += 1; i += 1;
} }
Err(_) => { Err(_) => {
attrs.remove(i); attrs.remove(i);
has_transformed = true;
} }
} }
} else { } else {
i += 1; i += 1;
} }
} }
return has_transformed;
} }
} }
false
} }
fn add_single_elements_around_ul<'arena>(node: Ref<'arena>, arena: Arena<'arena>) -> bool { fn add_single_elements_around_ul<'arena>(node: Ref<'arena>, arena: Arena<'arena>) {
if let NodeData::Element { if let NodeData::Element { ref name, .. } = node.data {
ref attrs,
ref name,
..
} = node.data
{
if let local_name!("ul") = name.local { if let local_name!("ul") = name.local {
node.insert_before(create_element( node.insert_before(create_element(arena, "single"));
arena, node.insert_after(create_element(arena, "single"));
QualName::new(None, ns!(), LocalName::from("single")),
));
node.insert_after(create_element(
arena,
QualName::new(None, ns!(), LocalName::from("single")),
));
return true;
} }
} }
false
} }
fn should_unwrap_node(node: Ref) -> bool { fn should_unwrap_node(node: Ref) -> bool {

View File

@ -1,10 +1,8 @@
extern crate typed_arena; extern crate typed_arena;
use std::io::{self, Error, Read}; use std::io::{Error, Read};
use html5ever::{serialize, Attribute, LocalName, QualName}; use crate::arena_dom::{html5ever_parse_slice_into_arena, Arena, Node, Ref};
use crate::arena_dom::{create_element, html5ever_parse_slice_into_arena, Arena, Node, Ref};
// TODO: I don't love the "Traverser" name. Should maybe come up with something else. // TODO: I don't love the "Traverser" name. Should maybe come up with something else.
// (it also unwraps nodes and calls transformer functions... does a lot more than traverse) // (it also unwraps nodes and calls transformer functions... does a lot more than traverse)
@ -17,7 +15,7 @@ where
{ {
arena: typed_arena::Arena<Node<'arena>>, arena: typed_arena::Arena<Node<'arena>>,
should_unwrap: T, should_unwrap: T,
transformers: Vec<Box<&'arena dyn Fn(Ref<'arena>, Arena<'arena>) -> bool>>, transformers: Vec<&'arena dyn Fn(Ref<'arena>, Arena<'arena>)>,
} }
impl<'arena, T> Traverser<'arena, T> impl<'arena, T> Traverser<'arena, T>
@ -26,7 +24,7 @@ where
{ {
pub fn new( pub fn new(
should_unwrap: T, should_unwrap: T,
transformers: Vec<Box<&'arena dyn Fn(Ref<'arena>, Arena<'arena>) -> bool>>, transformers: Vec<&'arena dyn Fn(Ref<'arena>, Arena<'arena>)>,
) -> Traverser<'arena, T> { ) -> Traverser<'arena, T> {
Traverser { Traverser {
arena: typed_arena::Arena::new(), arena: typed_arena::Arena::new(),
@ -42,7 +40,6 @@ where
} }
pub fn traverse(&'arena self, node: Ref<'arena>) { pub fn traverse(&'arena self, node: Ref<'arena>) {
println!("{}", &node);
if (self.should_unwrap)(node) { if (self.should_unwrap)(node) {
if let Some(unwrapped_node) = node.unwrap() { if let Some(unwrapped_node) = node.unwrap() {
return self.traverse(unwrapped_node); return self.traverse(unwrapped_node);
@ -52,7 +49,7 @@ where
} }
for transformer in self.transformers.iter() { for transformer in self.transformers.iter() {
println!("transformer result: {}", transformer(node, &self.arena)); transformer(node, &self.arena);
} }
if let Some(child) = node.first_child.get() { if let Some(child) = node.first_child.get() {
@ -63,45 +60,125 @@ where
self.traverse(sibling); self.traverse(sibling);
} }
} }
// TODO: how to call this from transformer functions?
/// Builds a namespace-less qualified name from `name` and hands it, together
/// with this traverser's arena, to the free `create_element` function.
pub fn create_element(&'arena self, name: &str) -> Ref<'arena> {
    let qualified_name = QualName::new(None, ns!(), LocalName::from(name));
    create_element(&self.arena, qualified_name)
}
} }
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use super::*; use super::*;
use std::fs::File; use std::str;
struct MockRead; use html5ever::serialize;
use crate::arena_dom::{create_element, NodeData};
/// In-memory reader over a fixed string, used by the tests in this module as
/// the input handed to `Traverser::parse`.
struct MockRead {
    /// Bytes that have not been handed out to a caller yet.
    remaining: &'static [u8],
}

impl MockRead {
    /// Creates a reader that will yield exactly the bytes of `contents`.
    fn new(contents: &'static str) -> MockRead {
        MockRead {
            remaining: contents.as_bytes(),
        }
    }
}

impl Read for MockRead {
    /// Copies as many pending bytes as fit into `buf` and returns that count;
    /// returns `Ok(0)` once everything has been consumed.
    ///
    /// `read_to_end` is intentionally not overridden: the trait's default
    /// implementation drives `read` until EOF and reports the real number of
    /// bytes appended.
    fn read(&mut self, buf: &mut [u8]) -> Result<usize, Error> {
        // `&[u8]` already implements `Read` by copying from the front of the
        // slice and advancing it, which is exactly the cursor we need.
        Read::read(&mut self.remaining, buf)
    }
}
// fn node_contains_tag<'arena>(node: Ref<'arena>, tag_name: &str) -> bool {
// if let NodeData::Element { ref name, .. } = node.data {
// if name.local == LocalName::from(tag_name) {
// return true;
// }
// }
// if let Some(child) = node.first_child.get() {
// if node_contains_tag(child, tag_name) {
// return true;
// }
// }
// if let Some(sibling) = node.next_sibling.get() {
// if node_contains_tag(sibling, tag_name) {
// return true;
// }
// }
// false
// }
// fn count_nodes(node: Ref) -> usize {
// let mut count = 1;
// if let Some(child) = node.first_child.get() {
// count += count_nodes(child);
// }
// if let Some(sibling) = node.next_sibling.get() {
// count += count_nodes(sibling);
// }
// count
// }
/// Serializes `node` with the default serializer options and asserts that the
/// resulting UTF-8 text equals `expected`.
///
/// Panics if serialization fails or the output is not valid UTF-8.
fn assert_serialized_html_eq(node: Ref, expected: &str) {
    let mut buffer = Vec::new();
    serialize(&mut buffer, node, Default::default()).unwrap();
    let actual = str::from_utf8(&buffer).unwrap();
    assert_eq!(actual, expected);
}
#[test] #[test]
fn traversal() { fn traversal() {
let mut traverser = Traverser::new( let traverser = Traverser::new(|_| false, vec![&|_, _| {}]);
|node| false, let mut mock_data = MockRead::new("<div></div>");
vec![Box::new(&|n, _| false), Box::new(&|m, _| true)],
);
let mut mock_data = MockRead;
// let mut file = File::open("src/test/div.html").unwrap();
let root = traverser.parse(&mut mock_data).unwrap(); let root = traverser.parse(&mut mock_data).unwrap();
traverser.traverse(root); traverser.traverse(root);
assert!(false); assert_serialized_html_eq(root, "<html><head></head><body><div></div></body></html>");
}
#[test]
fn unwraps_element() {
    // The unwrap predicate selects every `div` element; the single
    // transformer is a no-op so only the unwrapping is observed.
    let traverser = Traverser::new(
        |node| match node.data {
            NodeData::Element { ref name, .. } => name.local == local_name!("div"),
            _ => false,
        },
        vec![&|_, _| {}],
    );
    let mut input = MockRead::new("<div></div>");
    let document = traverser.parse(&mut input).unwrap();
    traverser.traverse(document);
    let expected = "<html><head></head><body></body></html>";
    assert_serialized_html_eq(document, expected);
}
#[test]
fn adds_element() {
    // Nothing is unwrapped; the single transformer inserts a `span` element
    // right after every `div` it visits.
    let traverser = Traverser::new(
        |_| false,
        vec![&|node, arena| match node.data {
            NodeData::Element { ref name, .. } if name.local == local_name!("div") => {
                node.insert_after(create_element(arena, "span"));
            }
            _ => {}
        }],
    );
    let mut input = MockRead::new("<div></div>");
    let document = traverser.parse(&mut input).unwrap();
    traverser.traverse(document);
    let expected = "<html><head></head><body><div></div><span></span></body></html>";
    assert_serialized_html_eq(document, expected);
}
} }