Initial commit: arena_dom and basic sanitize

This commit is contained in:
Tyler Hallada
2019-03-09 20:26:43 -05:00
commit 6f682f1e91
9 changed files with 1616 additions and 0 deletions

484
src/arena_dom.rs Normal file
View File

@@ -0,0 +1,484 @@
// Majority of this file is from the html5ever project.
// https://github.com/servo/html5ever/blob/45b2fca5c6/html5ever/examples/arena.rs
//
// Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
extern crate html5ever;
extern crate typed_arena;
use std::borrow::Cow;
use std::cell::{Cell, RefCell};
use std::collections::HashSet;
use std::default::Default;
use std::io;
use std::ptr;
use html5ever::interface::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
use html5ever::serialize::TraversalScope::{ChildrenOnly, IncludeNode};
use html5ever::serialize::{Serialize, Serializer, TraversalScope};
use html5ever::tendril::{StrTendril, TendrilSink};
use html5ever::{parse_document, Attribute, ExpandedName, QualName};
pub fn html5ever_parse_slice_into_arena<'a>(bytes: &[u8], arena: Arena<'a>) -> Ref<'a> {
let sink = Sink {
arena: arena,
document: arena.alloc(Node::new(NodeData::Document)),
quirks_mode: QuirksMode::NoQuirks,
};
parse_document(sink, Default::default())
.from_utf8()
.one(bytes)
}
pub fn create_element<'arena>(arena: Arena<'arena>, name: QualName) -> Ref<'arena> {
arena.alloc(Node::new(NodeData::Element {
name: name,
attrs: RefCell::new(vec![]),
template_contents: None,
mathml_annotation_xml_integration_point: false,
}))
}
pub type Arena<'arena> = &'arena typed_arena::Arena<Node<'arena>>;
pub type Ref<'arena> = &'arena Node<'arena>;
pub type Link<'arena> = Cell<Option<Ref<'arena>>>;
pub struct Sink<'arena> {
arena: Arena<'arena>,
document: Ref<'arena>,
quirks_mode: QuirksMode,
}
pub struct Node<'arena> {
pub parent: Link<'arena>,
pub next_sibling: Link<'arena>,
pub previous_sibling: Link<'arena>,
pub first_child: Link<'arena>,
pub last_child: Link<'arena>,
pub data: NodeData<'arena>,
}
pub enum NodeData<'arena> {
Document,
Doctype {
name: StrTendril,
public_id: StrTendril,
system_id: StrTendril,
},
Text {
contents: RefCell<StrTendril>,
},
Comment {
contents: StrTendril,
},
Element {
name: QualName,
attrs: RefCell<Vec<Attribute>>,
template_contents: Option<Ref<'arena>>,
mathml_annotation_xml_integration_point: bool,
},
ProcessingInstruction {
target: StrTendril,
contents: StrTendril,
},
}
impl<'arena> Node<'arena> {
pub fn new(data: NodeData<'arena>) -> Self {
Node {
parent: Cell::new(None),
previous_sibling: Cell::new(None),
next_sibling: Cell::new(None),
first_child: Cell::new(None),
last_child: Cell::new(None),
data: data,
}
}
pub fn detach(&self) {
let parent = self.parent.take();
let previous_sibling = self.previous_sibling.take();
let next_sibling = self.next_sibling.take();
if let Some(next_sibling) = next_sibling {
next_sibling.previous_sibling.set(previous_sibling);
} else if let Some(parent) = parent {
parent.last_child.set(previous_sibling);
}
if let Some(previous_sibling) = previous_sibling {
previous_sibling.next_sibling.set(next_sibling);
} else if let Some(parent) = parent {
parent.first_child.set(next_sibling);
}
}
pub fn unwrap(&self) -> Option<&'arena Self> {
let parent = self.parent.take();
let previous_sibling = self.previous_sibling.take();
let next_sibling = self.next_sibling.take();
let first_child = self.first_child.take();
let last_child = self.last_child.take();
if let Some(next_sibling) = next_sibling {
if let Some(last_child) = last_child {
next_sibling.previous_sibling.set(Some(last_child));
} else {
next_sibling.previous_sibling.set(previous_sibling);
}
} else if let Some(parent) = parent {
parent.last_child.set(previous_sibling);
if let Some(last_child) = last_child {
parent.last_child.set(Some(last_child));
} else {
parent.last_child.set(previous_sibling);
}
}
if let Some(previous_sibling) = previous_sibling {
if let Some(first_child) = first_child {
previous_sibling.next_sibling.set(Some(first_child));
} else {
previous_sibling.next_sibling.set(next_sibling);
}
} else if let Some(parent) = parent {
parent.first_child.set(next_sibling);
if let Some(first_child) = first_child {
parent.first_child.set(Some(first_child));
} else {
parent.first_child.set(next_sibling);
}
}
let mut child = first_child;
loop {
match child {
Some(next_child) => {
next_child.parent.set(parent);
child = next_child.next_sibling.get();
},
None => break,
}
}
if let Some(first_child) = first_child {
Some(first_child)
} else if let Some(next_sibling) = next_sibling {
Some(next_sibling)
} else {
None
}
}
pub fn append(&'arena self, new_child: &'arena Self) {
new_child.detach();
new_child.parent.set(Some(self));
if let Some(last_child) = self.last_child.take() {
new_child.previous_sibling.set(Some(last_child));
debug_assert!(last_child.next_sibling.get().is_none());
last_child.next_sibling.set(Some(new_child));
} else {
debug_assert!(self.first_child.get().is_none());
self.first_child.set(Some(new_child));
}
self.last_child.set(Some(new_child));
}
pub fn insert_before(&'arena self, new_sibling: &'arena Self) {
new_sibling.detach();
new_sibling.parent.set(self.parent.get());
new_sibling.next_sibling.set(Some(self));
if let Some(previous_sibling) = self.previous_sibling.take() {
new_sibling.previous_sibling.set(Some(previous_sibling));
debug_assert!(ptr::eq::<Node>(
previous_sibling.next_sibling.get().unwrap(),
self
));
previous_sibling.next_sibling.set(Some(new_sibling));
} else if let Some(parent) = self.parent.get() {
debug_assert!(ptr::eq::<Node>(parent.first_child.get().unwrap(), self));
parent.first_child.set(Some(new_sibling));
}
self.previous_sibling.set(Some(new_sibling));
}
pub fn insert_after(&'arena self, new_sibling: &'arena Self) {
new_sibling.detach();
new_sibling.parent.set(self.parent.get());
new_sibling.previous_sibling.set(Some(self));
if let Some(next_sibling) = self.next_sibling.take() {
new_sibling.next_sibling.set(Some(next_sibling));
debug_assert!(ptr::eq::<Node>(
next_sibling.previous_sibling.get().unwrap(),
self
));
next_sibling.previous_sibling.set(Some(new_sibling));
} else if let Some(parent) = self.parent.get() {
debug_assert!(ptr::eq::<Node>(parent.last_child.get().unwrap(), self));
parent.last_child.set(Some(new_sibling));
}
self.next_sibling.set(Some(new_sibling));
}
}
impl<'arena> Sink<'arena> {
fn new_node(&self, data: NodeData<'arena>) -> Ref<'arena> {
self.arena.alloc(Node::new(data))
}
fn append_common<P, A>(&self, child: NodeOrText<Ref<'arena>>, previous: P, append: A)
where
P: FnOnce() -> Option<Ref<'arena>>,
A: FnOnce(Ref<'arena>),
{
let new_node = match child {
NodeOrText::AppendText(text) => {
// Append to an existing Text node if we have one.
if let Some(&Node {
data: NodeData::Text { ref contents },
..
}) = previous()
{
contents.borrow_mut().push_tendril(&text);
return;
}
self.new_node(NodeData::Text {
contents: RefCell::new(text),
})
}
NodeOrText::AppendNode(node) => node,
};
append(new_node)
}
}
impl<'arena> TreeSink for Sink<'arena> {
type Handle = Ref<'arena>;
type Output = Ref<'arena>;
fn finish(self) -> Ref<'arena> {
self.document
}
fn parse_error(&mut self, _: Cow<'static, str>) {}
fn get_document(&mut self) -> Ref<'arena> {
self.document
}
fn set_quirks_mode(&mut self, mode: QuirksMode) {
self.quirks_mode = mode;
}
fn same_node(&self, x: &Ref<'arena>, y: &Ref<'arena>) -> bool {
ptr::eq::<Node>(*x, *y)
}
fn elem_name<'a>(&self, target: &'a Ref<'arena>) -> ExpandedName<'a> {
match target.data {
NodeData::Element { ref name, .. } => name.expanded(),
_ => panic!("not an element!"),
}
}
fn get_template_contents(&mut self, target: &Ref<'arena>) -> Ref<'arena> {
if let NodeData::Element {
template_contents: Some(ref contents),
..
} = target.data
{
contents
} else {
panic!("not a template element!")
}
}
fn is_mathml_annotation_xml_integration_point(&self, target: &Ref<'arena>) -> bool {
if let NodeData::Element {
mathml_annotation_xml_integration_point,
..
} = target.data
{
mathml_annotation_xml_integration_point
} else {
panic!("not an element!")
}
}
fn create_element(
&mut self,
name: QualName,
attrs: Vec<Attribute>,
flags: ElementFlags,
) -> Ref<'arena> {
self.new_node(NodeData::Element {
name: name,
attrs: RefCell::new(attrs),
template_contents: if flags.template {
Some(self.new_node(NodeData::Document))
} else {
None
},
mathml_annotation_xml_integration_point: flags.mathml_annotation_xml_integration_point,
})
}
fn create_comment(&mut self, text: StrTendril) -> Ref<'arena> {
self.new_node(NodeData::Comment { contents: text })
}
fn create_pi(&mut self, target: StrTendril, data: StrTendril) -> Ref<'arena> {
self.new_node(NodeData::ProcessingInstruction {
target: target,
contents: data,
})
}
fn append(&mut self, parent: &Ref<'arena>, child: NodeOrText<Ref<'arena>>) {
self.append_common(
child,
|| parent.last_child.get(),
|new_node| parent.append(new_node),
)
}
fn append_before_sibling(&mut self, sibling: &Ref<'arena>, child: NodeOrText<Ref<'arena>>) {
self.append_common(
child,
|| sibling.previous_sibling.get(),
|new_node| sibling.insert_before(new_node),
)
}
fn append_based_on_parent_node(
&mut self,
element: &Ref<'arena>,
prev_element: &Ref<'arena>,
child: NodeOrText<Ref<'arena>>,
) {
if element.parent.get().is_some() {
self.append_before_sibling(element, child)
} else {
self.append(prev_element, child)
}
}
fn append_doctype_to_document(
&mut self,
name: StrTendril,
public_id: StrTendril,
system_id: StrTendril,
) {
self.document.append(self.new_node(NodeData::Doctype {
name: name,
public_id: public_id,
system_id: system_id,
}))
}
fn add_attrs_if_missing(&mut self, target: &Ref<'arena>, attrs: Vec<Attribute>) {
let mut existing = if let NodeData::Element { ref attrs, .. } = target.data {
attrs.borrow_mut()
} else {
panic!("not an element")
};
let existing_names = existing
.iter()
.map(|e| e.name.clone())
.collect::<HashSet<_>>();
existing.extend(
attrs
.into_iter()
.filter(|attr| !existing_names.contains(&attr.name)),
);
}
fn remove_from_parent(&mut self, target: &Ref<'arena>) {
target.detach()
}
fn reparent_children(&mut self, node: &Ref<'arena>, new_parent: &Ref<'arena>) {
let mut next_child = node.first_child.get();
while let Some(child) = next_child {
debug_assert!(ptr::eq::<Node>(child.parent.get().unwrap(), *node));
next_child = child.next_sibling.get();
new_parent.append(child)
}
}
}
// Implementation adapted from implementation for RcDom:
// https://github.com/servo/html5ever/blob/45b2fca5c6/markup5ever/rcdom.rs#L410
impl<'arena> Serialize for Node<'arena> {
fn serialize<S>(&self, serializer: &mut S, traversal_scope: TraversalScope) -> io::Result<()>
where
S: Serializer,
{
match (&traversal_scope, &self.data) {
(
_,
&NodeData::Element {
ref name,
ref attrs,
..
},
) => {
if traversal_scope == IncludeNode {
serializer.start_elem(
name.clone(),
attrs.borrow().iter().map(|at| (&at.name, &at.value[..])),
)?;
}
if let Some(child) = self.first_child.get() {
child.serialize(serializer, IncludeNode)?;
}
if traversal_scope == IncludeNode {
serializer.end_elem(name.clone())?;
}
}
(&ChildrenOnly(_), &NodeData::Document) => {
if let Some(child) = self.first_child.get() {
child.serialize(serializer, IncludeNode)?;
}
}
(&ChildrenOnly(_), _) => {},
(&IncludeNode, &NodeData::Doctype { ref name, .. }) => serializer.write_doctype(&name)?,
(&IncludeNode, &NodeData::Text { ref contents }) => {
serializer.write_text(&contents.borrow())?
}
(&IncludeNode, &NodeData::Comment { ref contents }) => {
serializer.write_comment(&contents)?
}
(
&IncludeNode,
&NodeData::ProcessingInstruction {
ref target,
ref contents,
},
) => serializer.write_processing_instruction(target, contents)?,
(&IncludeNode, &NodeData::Document) => panic!("Can't serialize Document node itself"),
}
if let Some(sibling) = self.next_sibling.get() {
sibling.serialize(serializer, IncludeNode)?
}
Ok(())
}
}

72
src/config/basic.rs Normal file
View File

@@ -0,0 +1,72 @@
use html5ever::LocalName;
use std::collections::{HashMap, HashSet};
lazy_static! {
pub static ref ELEMENTS: HashSet<LocalName> = hashset! {
local_name!("a"),
local_name!("abbr"),
local_name!("blockquote"),
local_name!("br"),
local_name!("cite"),
local_name!("code"),
local_name!("dd"),
local_name!("dfn"),
local_name!("dl"),
local_name!("dt"),
local_name!("kbd"),
local_name!("li"),
local_name!("mark"),
local_name!("ol"),
local_name!("p"),
local_name!("pre"),
local_name!("q"),
local_name!("s"),
local_name!("samp"),
local_name!("small"),
local_name!("strike"),
local_name!("sub"),
local_name!("sup"),
local_name!("time"),
local_name!("ul"),
local_name!("var"),
};
pub static ref ALL_ATTRIBUTES: HashSet<LocalName> = hashset! {};
pub static ref ATTRIBUTES: HashMap<LocalName, HashSet<LocalName>> = hashmap! {
local_name!("a") => hashset!{
local_name!("href"),
},
local_name!("abbr") => hashset!{
local_name!("title"),
},
local_name!("blockquote") => hashset!{
local_name!("cite"),
},
local_name!("dfn") => hashset!{
local_name!("title"),
},
local_name!("q") => hashset!{
local_name!("cite"),
},
local_name!("time") => hashset!{
local_name!("datetime"),
LocalName::from("pubdate"),
},
};
pub static ref ADD_ATTRIBUTES: HashMap<LocalName, HashMap<LocalName, &'static str>> = hashmap! {
local_name!("a") => hashmap! {
local_name!("rel") => "nofollow",
},
};
pub static ref PROTOCOLS: HashMap<LocalName, HashMap<LocalName, HashSet<&'static str>>> = hashmap! {
local_name!("a") => hashmap! {
local_name!("href") => hashset!{"ftp", "http", "https", "mailto"},
},
local_name!("blockquote") => hashmap! {
local_name!("cite") => hashset!{"http", "https"},
},
local_name!("q") => hashmap! {
local_name!("cite") => hashset!{"http", "https"},
},
};
}

3
src/config/mod.rs Normal file
View File

@@ -0,0 +1,3 @@
pub mod basic;
pub mod relaxed;
pub mod permissive;

177
src/config/permissive.rs Normal file
View File

@@ -0,0 +1,177 @@
use html5ever::LocalName;
use std::collections::{HashMap, HashSet};
use super::relaxed::{
ADD_ATTRIBUTES as RELAXED_ADD_ATTRIBUTES, ALL_ATTRIBUTES as RELAXED_ALL_ATTRIBUTES,
ATTRIBUTES as RELAXED_ATTRIBUTES, ELEMENTS as RELAXED_ELEMENTS,
};
lazy_static! {
pub static ref ELEMENTS: HashSet<LocalName> = RELAXED_ELEMENTS
.union(&hashset!(
local_name!("acronym"),
local_name!("basefont"),
local_name!("big"),
local_name!("blink"),
local_name!("center"),
LocalName::from("command"),
local_name!("dir"),
local_name!("font"),
local_name!("marquee"),
local_name!("strike"),
local_name!("tt"),
local_name!("form"),
local_name!("input"),
local_name!("button"),
LocalName::from("single"),
LocalName::from("double"),
))
.into_iter()
.cloned()
.collect();
pub static ref ALL_ATTRIBUTES: HashSet<LocalName> = RELAXED_ALL_ATTRIBUTES
.union(&hashset! {
local_name!("bgcolor"),
local_name!("width"),
local_name!("height"),
local_name!("border"),
local_name!("color"),
local_name!("background"),
})
.into_iter()
.cloned()
.collect();
// Can't figure out how to merge HashMaps :(
pub static ref ATTRIBUTES: HashMap<LocalName, HashSet<LocalName>> = hashmap! {
local_name!("a") => hashset!{
local_name!("href"),
local_name!("hreflang"),
local_name!("name"),
local_name!("rel"),
},
local_name!("abbr") => hashset!{
local_name!("title"),
},
local_name!("blockquote") => hashset!{
local_name!("cite"),
},
local_name!("button") => hashset!{
local_name!("type"),
},
local_name!("col") => hashset!{
local_name!("span"),
local_name!("width"),
},
local_name!("colgroup") => hashset!{
local_name!("span"),
local_name!("width"),
},
local_name!("data") => hashset!{
local_name!("value"),
},
local_name!("del") => hashset!{
local_name!("cite"),
local_name!("datetime"),
},
local_name!("dfn") => hashset!{
local_name!("title"),
},
local_name!("img") => hashset!{
local_name!("align"),
local_name!("alt"),
local_name!("border"),
local_name!("height"),
local_name!("src"),
local_name!("srcset"),
local_name!("width"),
},
local_name!("input") => hashset!{
local_name!("type"),
local_name!("name"),
local_name!("value"),
},
local_name!("ins") => hashset!{
local_name!("cite"),
local_name!("datetime"),
},
local_name!("li") => hashset!{
local_name!("value"),
},
local_name!("ol") => hashset!{
LocalName::from("reversed"),
local_name!("start"),
local_name!("type"),
},
local_name!("q") => hashset!{
local_name!("cite"),
},
local_name!("style") => hashset!{
local_name!("media"),
local_name!("scoped"),
local_name!("type"),
},
local_name!("table") => hashset!{
local_name!("align"),
local_name!("bgcolor"),
local_name!("border"),
local_name!("cellpadding"),
local_name!("cellspacing"),
local_name!("frame"),
local_name!("rules"),
LocalName::from("sortable"),
local_name!("summary"),
local_name!("width"),
},
local_name!("td") => hashset!{
local_name!("abbr"),
local_name!("align"),
local_name!("axis"),
local_name!("colspan"),
local_name!("headers"),
local_name!("rowspan"),
local_name!("valign"),
local_name!("width"),
},
local_name!("th") => hashset!{
local_name!("abbr"),
local_name!("align"),
local_name!("axis"),
local_name!("colspan"),
local_name!("headers"),
local_name!("rowspan"),
local_name!("scope"),
LocalName::from("sorted"),
local_name!("valign"),
local_name!("width"),
},
local_name!("time") => hashset!{
local_name!("datetime"),
LocalName::from("pubdate"),
},
local_name!("ul") => hashset!{
local_name!("type"),
},
};
pub static ref ADD_ATTRIBUTES: HashMap<LocalName, HashMap<LocalName, &'static str>> = RELAXED_ADD_ATTRIBUTES.clone();
pub static ref PROTOCOLS: HashMap<LocalName, HashMap<LocalName, HashSet<&'static str>>> = hashmap! {
local_name!("a") => hashmap! {
local_name!("href") => hashset!{"ftp", "http", "https", "mailto"},
},
local_name!("blockquote") => hashmap! {
local_name!("cite") => hashset!{"http", "https"},
},
local_name!("del") => hashmap! {
local_name!("cite") => hashset!{"http", "https"},
},
local_name!("img") => hashmap! {
local_name!("src") => hashset!{"http", "https"},
},
local_name!("ins") => hashmap! {
local_name!("cite") => hashset!{"http", "https"},
},
local_name!("q") => hashmap! {
local_name!("cite") => hashset!{"http", "https"},
},
};
}

199
src/config/relaxed.rs Normal file
View File

@@ -0,0 +1,199 @@
use html5ever::LocalName;
use std::collections::{HashMap, HashSet};
use super::basic::{
ADD_ATTRIBUTES as BASIC_ADD_ATTRIBUTES, ALL_ATTRIBUTES as BASIC_ALL_ATTRIBUTES,
ATTRIBUTES as BASIC_ATTRIBUTES, ELEMENTS as BASIC_ELEMENTS,
};
lazy_static! {
pub static ref ELEMENTS: HashSet<LocalName> = BASIC_ELEMENTS
.union(&hashset!(
local_name!("address"),
local_name!("article"),
local_name!("aside"),
local_name!("bdi"),
local_name!("bdo"),
local_name!("body"),
local_name!("caption"),
local_name!("col"),
local_name!("colgroup"),
local_name!("data"),
local_name!("del"),
local_name!("div"),
local_name!("figcaption"),
local_name!("figure"),
local_name!("footer"),
local_name!("h1"),
local_name!("h2"),
local_name!("h3"),
local_name!("h4"),
local_name!("h5"),
local_name!("h6"),
local_name!("head"),
local_name!("header"),
local_name!("hgroup"),
local_name!("hr"),
local_name!("html"),
local_name!("img"),
local_name!("ins"),
local_name!("main"),
local_name!("nav"),
local_name!("rp"),
local_name!("rt"),
local_name!("ruby"),
local_name!("section"),
local_name!("span"),
local_name!("style"),
local_name!("summary"),
local_name!("sup"),
local_name!("table"),
local_name!("tbody"),
local_name!("td"),
local_name!("tfoot"),
local_name!("th"),
local_name!("thead"),
local_name!("title"),
local_name!("tr"),
local_name!("wbr"),
))
.into_iter()
.cloned()
.collect();
pub static ref ALL_ATTRIBUTES: HashSet<LocalName> = BASIC_ALL_ATTRIBUTES.union(&hashset! {
local_name!("class"),
local_name!("dir"),
local_name!("hidden"),
local_name!("id"),
local_name!("lang"),
local_name!("style"),
local_name!("tabindex"),
local_name!("title"),
LocalName::from("translate"),
}).into_iter().cloned().collect();
// Can't figure out how to merge HashMaps :(
pub static ref ATTRIBUTES: HashMap<LocalName, HashSet<LocalName>> = hashmap! {
local_name!("a") => hashset!{
local_name!("href"),
local_name!("hreflang"),
local_name!("name"),
local_name!("rel"),
},
local_name!("abbr") => hashset!{
local_name!("title"),
},
local_name!("blockquote") => hashset!{
local_name!("cite"),
},
local_name!("col") => hashset!{
local_name!("span"),
local_name!("width"),
},
local_name!("colgroup") => hashset!{
local_name!("span"),
local_name!("width"),
},
local_name!("data") => hashset!{
local_name!("value"),
},
local_name!("del") => hashset!{
local_name!("cite"),
local_name!("datetime"),
},
local_name!("dfn") => hashset!{
local_name!("title"),
},
local_name!("img") => hashset!{
local_name!("align"),
local_name!("alt"),
local_name!("border"),
local_name!("height"),
local_name!("src"),
local_name!("srcset"),
local_name!("width"),
},
local_name!("ins") => hashset!{
local_name!("cite"),
local_name!("datetime"),
},
local_name!("li") => hashset!{
local_name!("value"),
},
local_name!("ol") => hashset!{
LocalName::from("reversed"),
local_name!("start"),
local_name!("type"),
},
local_name!("q") => hashset!{
local_name!("cite"),
},
local_name!("style") => hashset!{
local_name!("media"),
local_name!("scoped"),
local_name!("type"),
},
local_name!("table") => hashset!{
local_name!("align"),
local_name!("bgcolor"),
local_name!("border"),
local_name!("cellpadding"),
local_name!("cellspacing"),
local_name!("frame"),
local_name!("rules"),
LocalName::from("sortable"),
local_name!("summary"),
local_name!("width"),
},
local_name!("td") => hashset!{
local_name!("abbr"),
local_name!("align"),
local_name!("axis"),
local_name!("colspan"),
local_name!("headers"),
local_name!("rowspan"),
local_name!("valign"),
local_name!("width"),
},
local_name!("th") => hashset!{
local_name!("abbr"),
local_name!("align"),
local_name!("axis"),
local_name!("colspan"),
local_name!("headers"),
local_name!("rowspan"),
local_name!("scope"),
LocalName::from("sorted"),
local_name!("valign"),
local_name!("width"),
},
local_name!("time") => hashset!{
local_name!("datetime"),
LocalName::from("pubdate"),
},
local_name!("ul") => hashset!{
local_name!("type"),
},
};
pub static ref ADD_ATTRIBUTES: HashMap<LocalName, HashMap<LocalName, &'static str>> = BASIC_ADD_ATTRIBUTES.clone();
pub static ref PROTOCOLS: HashMap<LocalName, HashMap<LocalName, HashSet<&'static str>>> = hashmap! {
local_name!("a") => hashmap! {
local_name!("href") => hashset!{"ftp", "http", "https", "mailto"},
},
local_name!("blockquote") => hashmap! {
local_name!("cite") => hashset!{"http", "https"},
},
local_name!("del") => hashmap! {
local_name!("cite") => hashset!{"http", "https"},
},
local_name!("img") => hashmap! {
local_name!("src") => hashset!{"http", "https"},
},
local_name!("ins") => hashmap! {
local_name!("cite") => hashset!{"http", "https"},
},
local_name!("q") => hashmap! {
local_name!("cite") => hashset!{"http", "https"},
},
};
}

164
src/main.rs Normal file
View File

@@ -0,0 +1,164 @@
#[macro_use]
extern crate lazy_static;
#[macro_use]
extern crate html5ever;
#[macro_use]
extern crate maplit;
extern crate typed_arena;
use std::collections::HashSet;
use std::default::Default;
use std::io::{self, Read};
use html5ever::tendril::StrTendril;
use html5ever::{serialize, Attribute, LocalName, QualName};
use url::{ParseError, Url};
mod arena_dom;
mod config;
use arena_dom::{create_element, html5ever_parse_slice_into_arena, Arena, NodeData, Ref};
use config::permissive::{ADD_ATTRIBUTES, ALL_ATTRIBUTES, ATTRIBUTES, ELEMENTS, PROTOCOLS};
fn main() {
let mut bytes = Vec::new();
io::stdin().read_to_end(&mut bytes).unwrap();
let arena = typed_arena::Arena::new();
let doc = html5ever_parse_slice_into_arena(&bytes, &arena);
sanitize(doc, &arena);
serialize(&mut io::stdout(), doc, Default::default())
.ok()
.expect("serialization failed")
}
fn sanitize<'arena>(node: Ref<'arena>, arena: Arena<'arena>) {
if let Some(unwrapped) = maybe_unwrap_node(&node) {
if let Some(unwrapped_node) = unwrapped {
return sanitize(unwrapped_node, arena);
} else {
return;
}
}
transform_node(&node, arena);
if let Some(child) = node.first_child.get() {
sanitize(child, arena);
}
if let Some(sibling) = node.next_sibling.get() {
sanitize(sibling, arena);
}
}
// TODO: make separate rich and plain transformers
// TODO: add whitelist of tags, remove any not in it DONE
// TODO: add whitelist of attributes, remove any not in it DONE
// TODO: add map of tags to attributes, remove any on tag not in the mapped value DONE
// TODO: add whitelist of url schemes, parse urls and remove any not in it DONE
// TODO: strip comments DONE
// TODO: parse style tags and attributes
// TODO: add whitelist of CSS properties, remove any not in it
// TODO: scope selectors in rich formatter
// TODO: add class attributes to elements in rich formatter
fn transform_node<'arena>(node: Ref<'arena>, arena: Arena<'arena>) {
match node.data {
NodeData::Document
| NodeData::Doctype { .. }
| NodeData::Text { .. }
| NodeData::Comment { .. }
| NodeData::ProcessingInstruction { .. } => {}
NodeData::Element {
ref attrs,
ref name,
..
} => {
let ref mut attrs = attrs.borrow_mut();
let mut allowed_attrs: HashSet<LocalName> = ALL_ATTRIBUTES.clone();
if let Some(element_attrs) = ATTRIBUTES.get(&name.local) {
allowed_attrs = allowed_attrs
.union(element_attrs)
.into_iter()
.cloned()
.collect();
}
let mut i = 0;
while i != attrs.len() {
if !allowed_attrs.contains(&attrs[i].name.local) {
attrs.remove(i);
} else {
i += 1;
}
}
if let Some(add_attributes) = ADD_ATTRIBUTES.get(&name.local) {
for (name, &value) in add_attributes.iter() {
attrs.push(Attribute {
name: QualName::new(None, ns!(), name.clone()),
value: StrTendril::from(value),
});
}
}
if let Some(protocols) = PROTOCOLS.get(&name.local) {
let mut i = 0;
while i != attrs.len() {
if let Some(allowed_protocols) = protocols.get(&attrs[i].name.local) {
match Url::parse(&attrs[i].value) {
Ok(url) => {
if !allowed_protocols.contains(url.scheme()) {
attrs.remove(i);
} else {
i += 1;
}
}
Err(ParseError::RelativeUrlWithoutBase) => {
attrs[i].value =
StrTendril::from(format!("http://{}", attrs[i].value));
i += 1;
}
Err(_) => {
attrs.remove(i);
}
}
} else {
i += 1;
}
}
}
match name.local {
local_name!("ul") => {
node.insert_before(create_element(
arena,
QualName::new(None, ns!(), LocalName::from("single")),
));
node.insert_after(create_element(
arena,
QualName::new(None, ns!(), LocalName::from("single")),
));
}
_ => {}
}
}
}
}
fn maybe_unwrap_node<'arena>(node: Ref<'arena>) -> Option<Option<Ref<'arena>>> {
match node.data {
NodeData::Document
| NodeData::Doctype { .. }
| NodeData::Text { .. }
| NodeData::ProcessingInstruction { .. } => None,
NodeData::Comment { .. } => Some(node.unwrap()),
NodeData::Element { ref name, .. } => {
if !ELEMENTS.contains(&name.local) {
Some(node.unwrap())
} else {
None
}
}
}
}