#![warn(clippy::all)]

#[macro_use]
extern crate lazy_static;
#[macro_use]
extern crate html5ever;
#[macro_use]
extern crate maplit;
#[macro_use]
extern crate cssparser;
extern crate string_cache;
extern crate typed_arena;

use std::collections::HashSet;
use std::default::Default;
use std::io::{self, Read};

use html5ever::tendril::StrTendril;
use html5ever::{serialize, Attribute, LocalName, QualName};
use url::{ParseError, Url};

// Generated at build time; the file is written into `OUT_DIR` and included verbatim.
#[macro_use]
mod css_property {
    include!(concat!(env!("OUT_DIR"), "/css_property.rs"));
}

mod arena_dom;
mod config;
mod css_parser;

use arena_dom::{create_element, html5ever_parse_slice_into_arena, Arena, NodeData, Ref};
use config::permissive::{ADD_ATTRIBUTES, ALL_ATTRIBUTES, ATTRIBUTES, ELEMENTS, PROTOCOLS};
use config::relaxed::CSS_PROPERTIES;
use css_parser::{CssRule, parse_css_style_attribute, parse_css_stylesheet};
use css_property::CssProperty;

/// Reads a document from stdin, parses it into an arena-backed DOM, sanitizes the tree in
/// place, and serializes the result to stdout.
///
/// # Panics
///
/// Panics if stdin cannot be read or if serialization to stdout fails. In both cases the
/// underlying I/O error is included in the panic message.
fn main() {
    let mut bytes = Vec::new();
    io::stdin()
        .read_to_end(&mut bytes)
        .expect("failed to read input from stdin");

    let arena = typed_arena::Arena::new();
    let doc = html5ever_parse_slice_into_arena(&bytes, &arena);
    sanitize(doc, &arena);

    // `expect` directly on the `Result` (rather than `.ok().expect(..)`) keeps the I/O error
    // in the panic message.
    serialize(&mut io::stdout(), doc, Default::default()).expect("serialization failed")
}

/// Walks the tree starting at `node`, visiting each node, then its children, then its
/// following siblings.
///
/// For every visited node, `maybe_unwrap_node` is consulted first:
///
/// * `Some(Some(replacement))` — traversal continues from `replacement` instead of the
///   current node.
/// * `Some(None)` — traversal of this sibling chain stops.
/// * `None` — the node is passed to `transform_node`, then its first child (if any) is
///   sanitized, then traversal moves on to its next sibling.
///
/// Siblings are walked in a loop rather than by recursion, so stack depth is bounded by the
/// nesting depth of the document and not by the number of siblings.
fn sanitize<'arena>(node: Ref<'arena>, arena: Arena<'arena>) {
    let mut cursor = Some(node);

    while let Some(current) = cursor {
        match maybe_unwrap_node(&current) {
            Some(Some(replacement)) => {
                // Restart the checks on the node that took the current node's place.
                cursor = Some(replacement);
                continue;
            }
            Some(None) => return,
            None => {}
        }

        transform_node(current, arena);

        if let Some(child) = current.first_child.get() {
            sanitize(child, arena);
        }

        // Read the sibling link only after the children were processed, matching the order
        // in which the links are consulted during a depth-first walk.
        cursor = current.next_sibling.get();
    }
}

// TODO: make separate rich and plain transformers
// TODO: add whitelist of tags, remove any not in it DONE
// TODO: add whitelist of attributes, remove any not in it DONE
// TODO: add map of tags to attributes, remove any on tag not in the mapped value DONE
// TODO: add whitelist of url schemes, parse urls and remove any not in it DONE
// TODO: strip comments DONE
// TODO: parse style tags and attributes
DONE // TODO: add whitelist of CSS properties, remove any not in it DONE // TODO: scope selectors in rich formatter // TODO: add class attributes to elements in rich formatter fn transform_node<'arena>(node: Ref<'arena>, arena: Arena<'arena>) { match node.data { NodeData::Document | NodeData::Doctype { .. } | NodeData::Comment { .. } | NodeData::ProcessingInstruction { .. } => {} NodeData::Text { ref contents } => { // TODO: seems rather expensive to lookup the parent on every Text node. Better // solution would be to pass some sort of context from the parent that marks that this // Text node is inside a