CSS sanitization and serialisation in a good place

This commit is contained in:
Tyler Hallada 2020-04-25 18:59:18 -04:00
parent e4316f1a6f
commit caca121bef
5 changed files with 81 additions and 42 deletions

View File

@ -13,6 +13,7 @@ lazy_static! {
allowed_protocols: HashMap::new(), allowed_protocols: HashMap::new(),
allowed_css_at_rules: HashSet::new(), allowed_css_at_rules: HashSet::new(),
allowed_css_properties: HashSet::new(), allowed_css_properties: HashSet::new(),
allow_css_comments: false,
remove_contents_when_unwrapped: hashset! { remove_contents_when_unwrapped: hashset! {
local_name!("iframe"), local_name!("iframe"),
local_name!("noembed"), local_name!("noembed"),

View File

@ -35,6 +35,7 @@ pub struct CssDeclaration {
pub value: String, pub value: String,
} }
#[derive(Debug)]
struct CssAtRulePrelude { struct CssAtRulePrelude {
name: String, name: String,
prelude: String, prelude: String,
@ -145,7 +146,6 @@ impl<'i> QualifiedRuleParser<'i> for CssParser {
prelude.push_str("/**/"); prelude.push_str("/**/");
} }
previous_token = token_type; previous_token = token_type;
dbg!(&token);
token.to_css(&mut prelude).unwrap(); token.to_css(&mut prelude).unwrap();
// TODO: do I need to handle parse_nested_block here? // TODO: do I need to handle parse_nested_block here?
} }
@ -206,7 +206,6 @@ impl<'i> DeclarationParser<'i> for CssDeclarationParser {
name: CowRcStr<'i>, name: CowRcStr<'i>,
input: &mut Parser<'i, 't>, input: &mut Parser<'i, 't>,
) -> Result<Self::Declaration, ParseError<'i, CssError>> { ) -> Result<Self::Declaration, ParseError<'i, CssError>> {
dbg!(&name);
// let start = input.position(); // let start = input.position();
let mut value = String::new(); let mut value = String::new();
let mut previous_token = TokenSerializationType::nothing(); let mut previous_token = TokenSerializationType::nothing();
@ -216,26 +215,45 @@ impl<'i> DeclarationParser<'i> for CssDeclarationParser {
value.push_str("/**/"); value.push_str("/**/");
} }
previous_token = token_type; previous_token = token_type;
dbg!(&token);
token.to_css(&mut value).unwrap(); token.to_css(&mut value).unwrap();
// TODO: do I need to handle parse_nested_block here? // TODO: do I need to handle parse_nested_block here?
} }
// input.next_including_whitespace_and_comments()?; // input.next_including_whitespace_and_comments()?;
// let value = input.slice_from(start); // let value = input.slice_from(start);
dbg!(&value);
Ok(vec![CssDeclaration { Ok(vec![CssDeclaration {
property: name.to_string(), property: name.to_string(),
value: value.to_string(), value,
}]) }])
} }
} }
impl<'i> AtRuleParser<'i> for CssDeclarationParser { impl<'i> AtRuleParser<'i> for CssDeclarationParser {
type PreludeBlock = (); type PreludeBlock = CssAtRulePrelude;
type PreludeNoBlock = (); type PreludeNoBlock = CssAtRulePrelude;
type AtRule = Vec<CssDeclaration>; type AtRule = Vec<CssDeclaration>;
type Error = CssError; type Error = CssError;
fn parse_prelude<'t>(
&mut self,
name: CowRcStr<'i>,
input: &mut Parser<'i, 't>,
) -> Result<AtRuleType<Self::PreludeNoBlock, Self::PreludeBlock>, CssParseError<'i>> {
let mut prelude = String::new();
Ok(AtRuleType::WithBlock(CssAtRulePrelude {
name: name.to_string(),
prelude,
}))
}
fn parse_block<'t>(
&mut self,
prelude: Self::PreludeBlock,
_location: SourceLocation,
input: &mut Parser<'i, 't>,
) -> Result<Self::AtRule, CssParseError<'i>> {
Ok(vec![])
}
} }
pub fn parse_declarations<'i>( pub fn parse_declarations<'i>(

View File

@ -1,15 +1,24 @@
// Note: not using this parser. It would be easier to preserve whitespace in the css strings using
// this parser but it requires me to do too much of what cssparser is already doing for me
// (distinguishing between an at-rule `Ident` vs. a style rule `Ident`).
use std::borrow::Borrow;
use std::cell::Ref; use std::cell::Ref;
use cssparser::{ParseError, Parser, ParserInput, ToCss, Token, TokenSerializationType}; use cssparser::{ParseError, Parser, ParserInput, ToCss, Token, TokenSerializationType};
use html5ever::tendril::StrTendril; use html5ever::tendril::StrTendril;
use crate::css_property::CssProperty;
use crate::sanitizer::SanitizerConfig;
pub fn write_to( pub fn write_to(
mut previous_token: TokenSerializationType, mut previous_token: TokenSerializationType,
input: &mut Parser, input: &mut Parser,
string: &mut String, string: &mut String,
preserve_comments: bool, config: &SanitizerConfig,
skipping_property: bool,
skipping_at_rule: bool,
) { ) {
while let Ok(token) = if preserve_comments { while let Ok(token) = if config.allow_css_comments {
input input
.next_including_whitespace_and_comments() .next_including_whitespace_and_comments()
.map(|t| t.clone()) .map(|t| t.clone())
@ -17,10 +26,24 @@ pub fn write_to(
input.next_including_whitespace().map(|t| t.clone()) input.next_including_whitespace().map(|t| t.clone())
} { } {
let token_type = token.serialization_type(); let token_type = token.serialization_type();
if !preserve_comments && previous_token.needs_separator_when_before(token_type) { let mut skipping_property = skipping_property;
let mut skipping_at_rule = skipping_at_rule;
if !config.allow_css_comments && previous_token.needs_separator_when_before(token_type) {
string.push_str("/**/") string.push_str("/**/")
} }
previous_token = token_type; previous_token = token_type;
match &token {
Token::Ident(property) => {
let property_str: &str = property.borrow();
if !config
.allowed_css_properties
.contains(&CssProperty::from(property_str))
{
skipping_property = true;
}
}
_ => {}
}
dbg!(&token); dbg!(&token);
token.to_css(string).unwrap(); token.to_css(string).unwrap();
let closing_token = match token { let closing_token = match token {
@ -31,7 +54,14 @@ pub fn write_to(
}; };
if let Some(closing_token) = closing_token { if let Some(closing_token) = closing_token {
let result: Result<_, ParseError<()>> = input.parse_nested_block(|input| { let result: Result<_, ParseError<()>> = input.parse_nested_block(|input| {
write_to(previous_token, input, string, preserve_comments); write_to(
previous_token,
input,
string,
config,
skipping_property,
skipping_at_rule,
);
Ok(()) Ok(())
}); });
result.unwrap(); result.unwrap();
@ -40,13 +70,15 @@ pub fn write_to(
} }
} }
pub fn parse_and_serialize(input: Ref<StrTendril>, output: &mut String, preserve_comments: bool) { pub fn parse_and_serialize(input: Ref<StrTendril>, output: &mut String, config: &SanitizerConfig) {
let mut parser_input = ParserInput::new(&input); let mut parser_input = ParserInput::new(&input);
let parser = &mut Parser::new(&mut parser_input); let parser = &mut Parser::new(&mut parser_input);
write_to( write_to(
TokenSerializationType::nothing(), TokenSerializationType::nothing(),
parser, parser,
output, output,
preserve_comments, config,
false,
false,
); );
} }

View File

@ -24,7 +24,6 @@ mod css_at_rule {
mod arena_dom; mod arena_dom;
mod config; mod config;
mod css_parser; mod css_parser;
mod css_parser_2;
mod sanitizer; mod sanitizer;
use arena_dom::{create_element, Arena, NodeData, Ref}; use arena_dom::{create_element, Arena, NodeData, Ref};

View File

@ -9,8 +9,8 @@ use html5ever::{parse_document, parse_fragment, serialize, Attribute, LocalName,
use crate::arena_dom::{Arena, Node, NodeData, Ref, Sink}; use crate::arena_dom::{Arena, Node, NodeData, Ref, Sink};
use crate::css_at_rule::CssAtRule; use crate::css_at_rule::CssAtRule;
use crate::css_parser::{parse_css_style_attribute, parse_css_stylesheet, CssRule}; use crate::css_parser::{parse_css_style_attribute, parse_css_stylesheet, CssRule};
use crate::css_parser_2::parse_and_serialize;
use crate::css_property::CssProperty; use crate::css_property::CssProperty;
use crate::css_token_parser::parse_and_serialize;
pub struct Sanitizer<'arena> { pub struct Sanitizer<'arena> {
arena: typed_arena::Arena<Node<'arena>>, arena: typed_arena::Arena<Node<'arena>>,
@ -29,6 +29,7 @@ pub struct SanitizerConfig {
pub allowed_protocols: HashMap<LocalName, HashMap<LocalName, HashSet<Protocol<'static>>>>, pub allowed_protocols: HashMap<LocalName, HashMap<LocalName, HashSet<Protocol<'static>>>>,
pub allowed_css_at_rules: HashSet<CssAtRule>, pub allowed_css_at_rules: HashSet<CssAtRule>,
pub allowed_css_properties: HashSet<CssProperty>, pub allowed_css_properties: HashSet<CssProperty>,
pub allow_css_comments: bool,
pub remove_contents_when_unwrapped: HashSet<LocalName>, pub remove_contents_when_unwrapped: HashSet<LocalName>,
} }
@ -129,7 +130,7 @@ impl<'arena> Sanitizer<'arena> {
self.add_attributes(node); self.add_attributes(node);
self.sanitize_attribute_protocols(node); self.sanitize_attribute_protocols(node);
self.sanitize_style_tag_css(node); self.sanitize_style_tag_css(node);
// self.sanitize_style_attribute_css(node); self.sanitize_style_attribute_css(node);
// self.serialize_css_test(node); // self.serialize_css_test(node);
for transformer in self.transformers.iter() { for transformer in self.transformers.iter() {
@ -241,16 +242,11 @@ impl<'arena> Sanitizer<'arena> {
let attrs = &mut attrs.borrow_mut(); let attrs = &mut attrs.borrow_mut();
if let Some(protocols) = self.config.allowed_protocols.get(&name.local) { if let Some(protocols) = self.config.allowed_protocols.get(&name.local) {
dbg!(protocols);
dbg!(&attrs);
let mut i = 0; let mut i = 0;
while i != attrs.len() { while i != attrs.len() {
dbg!(&attrs[i].name.local);
if let Some(allowed_protocols) = protocols.get(&attrs[i].name.local) { if let Some(allowed_protocols) = protocols.get(&attrs[i].name.local) {
dbg!(allowed_protocols);
match Url::parse(&attrs[i].value) { match Url::parse(&attrs[i].value) {
Ok(url) => { Ok(url) => {
dbg!(Protocol::Scheme(url.scheme()));
if !allowed_protocols.contains(&Protocol::Scheme(url.scheme())) { if !allowed_protocols.contains(&Protocol::Scheme(url.scheme())) {
attrs.remove(i); attrs.remove(i);
} else { } else {
@ -258,7 +254,6 @@ impl<'arena> Sanitizer<'arena> {
} }
} }
Err(ParseError::RelativeUrlWithoutBase) => { Err(ParseError::RelativeUrlWithoutBase) => {
dbg!("relative");
if !allowed_protocols.contains(&Protocol::Relative) { if !allowed_protocols.contains(&Protocol::Relative) {
attrs.remove(i); attrs.remove(i);
} else { } else {
@ -294,10 +289,9 @@ impl<'arena> Sanitizer<'arena> {
sanitized_css += declaration_string; sanitized_css += declaration_string;
} }
} }
sanitized_css += "}"; sanitized_css += " }";
} }
CssRule::AtRule(at_rule) => { CssRule::AtRule(at_rule) => {
dbg!(&at_rule);
if self if self
.config .config
.allowed_css_at_rules .allowed_css_at_rules
@ -308,7 +302,9 @@ impl<'arena> Sanitizer<'arena> {
if let Some(block) = at_rule.block { if let Some(block) = at_rule.block {
sanitized_css += "{"; sanitized_css += "{";
sanitized_css += &self.serialize_sanitized_css_rules(block); sanitized_css += &self.serialize_sanitized_css_rules(block);
sanitized_css += "}"; sanitized_css += " }";
} else {
sanitized_css += "; ";
} }
} }
} }
@ -326,9 +322,7 @@ impl<'arena> Sanitizer<'arena> {
if let NodeData::Element { ref name, .. } = parent.data { if let NodeData::Element { ref name, .. } = parent.data {
if name.local == local_name!("style") { if name.local == local_name!("style") {
let rules = parse_css_stylesheet(&contents.borrow()); let rules = parse_css_stylesheet(&contents.borrow());
dbg!(&rules);
let sanitized_css = self.serialize_sanitized_css_rules(rules); let sanitized_css = self.serialize_sanitized_css_rules(rules);
dbg!(&sanitized_css);
contents.replace(StrTendril::from(sanitized_css)); contents.replace(StrTendril::from(sanitized_css));
} }
} }
@ -342,7 +336,6 @@ impl<'arena> Sanitizer<'arena> {
if attr.name.local == local_name!("style") { if attr.name.local == local_name!("style") {
let css_str = &attr.value; let css_str = &attr.value;
let declarations = parse_css_style_attribute(css_str); let declarations = parse_css_style_attribute(css_str);
dbg!(&declarations);
let mut sanitized_css = String::new(); let mut sanitized_css = String::new();
for declaration in declarations.into_iter() { for declaration in declarations.into_iter() {
let declaration_string = &declaration.to_string(); let declaration_string = &declaration.to_string();
@ -356,7 +349,6 @@ impl<'arena> Sanitizer<'arena> {
} }
} }
let sanitized_css = sanitized_css.trim(); let sanitized_css = sanitized_css.trim();
dbg!(&sanitized_css);
attr.value = StrTendril::from(sanitized_css); attr.value = StrTendril::from(sanitized_css);
} }
} }
@ -369,7 +361,7 @@ impl<'arena> Sanitizer<'arena> {
if let NodeData::Element { ref name, .. } = parent.data { if let NodeData::Element { ref name, .. } = parent.data {
if name.local == local_name!("style") { if name.local == local_name!("style") {
let mut serialized_css = String::new(); let mut serialized_css = String::new();
parse_and_serialize(contents.borrow(), &mut serialized_css, true); parse_and_serialize(contents.borrow(), &mut serialized_css, self.config);
} }
} }
} }
@ -415,6 +407,7 @@ mod test {
allowed_protocols: HashMap::new(), allowed_protocols: HashMap::new(),
allowed_css_at_rules: HashSet::new(), allowed_css_at_rules: HashSet::new(),
allowed_css_properties: HashSet::new(), allowed_css_properties: HashSet::new(),
allow_css_comments: false,
remove_contents_when_unwrapped: HashSet::new(), remove_contents_when_unwrapped: HashSet::new(),
}; };
} }
@ -728,31 +721,27 @@ mod test {
} }
#[test] #[test]
fn sanitize_style_tag_css() { fn sanitize_style_attribute_css() {
let mut sanitize_css_config = EMPTY_CONFIG.clone(); let mut sanitize_css_config = EMPTY_CONFIG.clone();
sanitize_css_config sanitize_css_config
.allowed_elements .allowed_elements
.extend(vec![local_name!("html"), local_name!("style")]); .extend(vec![local_name!("html"), local_name!("div")]);
sanitize_css_config
.allowed_attributes
.extend(vec![local_name!("style")]);
sanitize_css_config sanitize_css_config
.allowed_css_properties .allowed_css_properties
.extend(vec![css_property!("margin"), css_property!("color")]); .extend(vec![css_property!("margin"), css_property!("color")]);
sanitize_css_config
.allowed_css_at_rules
.extend(vec![css_at_rule!("charset")]);
let sanitizer = Sanitizer::new(&sanitize_css_config, vec![]); let sanitizer = Sanitizer::new(&sanitize_css_config, vec![]);
let mut mock_data = MockRead::new( let mut mock_data =
"<style>@charset \"UTF-8\";\ MockRead::new("<div style=\"margin: 10px; padding: 10px; color: red;\"></div>");
div { margin: 10px; padding: 10px; color: red; }\
@media print { div { margin: 50px; } }</style>",
);
let mut output = vec![]; let mut output = vec![];
sanitizer sanitizer
.sanitize_fragment(&mut mock_data, &mut output) .sanitize_fragment(&mut mock_data, &mut output)
.unwrap(); .unwrap();
assert_eq!( assert_eq!(
str::from_utf8(&output).unwrap(), str::from_utf8(&output).unwrap(),
"<html><style>@charset \"UTF-8\";\ "<html><div style=\"margin: 10px; color: red;\"></div></html>"
div { margin: 10px; color: red; }</style></html>"
); );
} }
} }