Still trying to figure out css serialization

This commit is contained in:
2020-04-24 00:36:04 -04:00
parent 5b0c724bac
commit e4316f1a6f
4 changed files with 240 additions and 197 deletions

View File

@@ -1,10 +1,11 @@
use cssparser::{
AtRuleParser, AtRuleType, CowRcStr, DeclarationListParser, DeclarationParser, ParseError, Parser,
ParserInput, QualifiedRuleParser, RuleListParser, SourceLocation,
AtRuleParser, AtRuleType, CowRcStr, DeclarationListParser, DeclarationParser, ParseError,
Parser, ParserInput, QualifiedRuleParser, RuleListParser, SourceLocation, ToCss, Token,
TokenSerializationType,
};
use std::fmt;
use std::convert::Into;
use std::error::Error;
use std::fmt;
#[derive(Debug)]
pub enum CssRule {
@@ -57,19 +58,30 @@ impl<'i> AtRuleParser<'i> for CssParser {
name: CowRcStr<'i>,
input: &mut Parser<'i, 't>,
) -> Result<AtRuleType<Self::PreludeNoBlock, Self::PreludeBlock>, CssParseError<'i>> {
let position = input.position();
while input.next().is_ok() {}
// let position = input.position();
// while input.next_including_whitespace_and_comments().is_ok() {}
let mut prelude = String::new();
let mut previous_token = TokenSerializationType::nothing();
while let Ok(token) = input.next_including_whitespace_and_comments() {
let token_type = token.serialization_type();
if previous_token.needs_separator_when_before(token_type) {
prelude.push_str("/**/");
}
previous_token = token_type;
token.to_css(&mut prelude).unwrap();
// TODO: do I need to handle parse_nested_block here?
}
match_ignore_ascii_case! { &*name,
"import" | "namespace" | "charset" => {
Ok(AtRuleType::WithoutBlock(CssAtRulePrelude {
name: name.to_string(),
prelude: input.slice_from(position).to_string(),
prelude,
}))
},
_ => {
Ok(AtRuleType::WithBlock(CssAtRulePrelude {
name: name.to_string(),
prelude: input.slice_from(position).to_string(),
prelude,
}))
}
}
@@ -79,7 +91,7 @@ impl<'i> AtRuleParser<'i> for CssParser {
&mut self,
prelude: Self::PreludeBlock,
_location: SourceLocation,
input: &mut Parser<'i, 't>
input: &mut Parser<'i, 't>,
) -> Result<Self::AtRule, CssParseError<'i>> {
let rule_list_parser = RuleListParser::new_for_stylesheet(input, CssParser);
let mut rules = Vec::new();
@@ -105,7 +117,7 @@ impl<'i> AtRuleParser<'i> for CssParser {
fn rule_without_block(
&mut self,
prelude: Self::PreludeNoBlock,
_location: SourceLocation
_location: SourceLocation,
) -> Self::AtRule {
CssRule::AtRule(CssAtRule {
name: prelude.name,
@@ -124,9 +136,20 @@ impl<'i> QualifiedRuleParser<'i> for CssParser {
&mut self,
input: &mut Parser<'i, 't>,
) -> Result<Self::Prelude, CssParseError<'i>> {
let position = input.position();
while input.next().is_ok() {}
Ok(input.slice_from(position).to_string())
// let position = input.position();
let mut prelude = String::new();
let mut previous_token = TokenSerializationType::nothing();
while let Ok(token) = input.next_including_whitespace_and_comments() {
let token_type = token.serialization_type();
if previous_token.needs_separator_when_before(token_type) {
prelude.push_str("/**/");
}
previous_token = token_type;
dbg!(&token);
token.to_css(&mut prelude).unwrap();
// TODO: do I need to handle parse_nested_block here?
}
Ok(prelude)
}
fn parse_block<'t>(
@@ -184,14 +207,26 @@ impl<'i> DeclarationParser<'i> for CssDeclarationParser {
input: &mut Parser<'i, 't>,
) -> Result<Self::Declaration, ParseError<'i, CssError>> {
dbg!(&name);
let start = input.position();
input.next()?;
let value = input.slice_from(start);
// let start = input.position();
let mut value = String::new();
let mut previous_token = TokenSerializationType::nothing();
while let Ok(token) = input.next_including_whitespace_and_comments() {
let token_type = token.serialization_type();
if previous_token.needs_separator_when_before(token_type) {
value.push_str("/**/");
}
previous_token = token_type;
dbg!(&token);
token.to_css(&mut value).unwrap();
// TODO: do I need to handle parse_nested_block here?
}
// input.next_including_whitespace_and_comments()?;
// let value = input.slice_from(start);
dbg!(&value);
Ok(vec![CssDeclaration {
property: name.to_string(),
value: value.trim().to_string(),
value: value.to_string(),
}])
}
}
@@ -227,12 +262,12 @@ pub fn parse_declarations<'i>(
impl fmt::Display for CssDeclaration {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}: {};", self.property, self.value)
write!(f, "{}:{};", self.property, self.value)
}
}
impl Into<String> for CssDeclaration {
fn into(self) -> String {
format!("{}: {};", self.property, self.value)
format!("{}:{};", self.property, self.value)
}
}

52
src/css_parser_2.rs Normal file
View File

@@ -0,0 +1,52 @@
use std::cell::Ref;
use cssparser::{ParseError, Parser, ParserInput, ToCss, Token, TokenSerializationType};
use html5ever::tendril::StrTendril;
pub fn write_to(
mut previous_token: TokenSerializationType,
input: &mut Parser,
string: &mut String,
preserve_comments: bool,
) {
while let Ok(token) = if preserve_comments {
input
.next_including_whitespace_and_comments()
.map(|t| t.clone())
} else {
input.next_including_whitespace().map(|t| t.clone())
} {
let token_type = token.serialization_type();
if !preserve_comments && previous_token.needs_separator_when_before(token_type) {
string.push_str("/**/")
}
previous_token = token_type;
dbg!(&token);
token.to_css(string).unwrap();
let closing_token = match token {
Token::Function(_) | Token::ParenthesisBlock => Some(Token::CloseParenthesis),
Token::SquareBracketBlock => Some(Token::CloseSquareBracket),
Token::CurlyBracketBlock => Some(Token::CloseCurlyBracket),
_ => None,
};
if let Some(closing_token) = closing_token {
let result: Result<_, ParseError<()>> = input.parse_nested_block(|input| {
write_to(previous_token, input, string, preserve_comments);
Ok(())
});
result.unwrap();
closing_token.to_css(string).unwrap();
}
}
}
pub fn parse_and_serialize(input: Ref<StrTendril>, output: &mut String, preserve_comments: bool) {
let mut parser_input = ParserInput::new(&input);
let parser = &mut Parser::new(&mut parser_input);
write_to(
TokenSerializationType::nothing(),
parser,
output,
preserve_comments,
);
}

View File

@@ -10,14 +10,8 @@ extern crate cssparser;
extern crate string_cache;
extern crate typed_arena;
use std::collections::HashSet;
use std::io;
use html5ever::tendril::StrTendril;
use html5ever::{Attribute, LocalName, QualName};
use url::{ParseError, Url};
#[macro_use]
mod css_property {
include!(concat!(env!("OUT_DIR"), "/css_property.rs"));
@@ -30,69 +24,20 @@ mod css_at_rule {
mod arena_dom;
mod config;
mod css_parser;
mod css_parser_2;
mod sanitizer;
use arena_dom::{create_element, Arena, NodeData, Ref};
use config::default::DEFAULT_CONFIG;
use config::permissive::{ADD_ATTRIBUTES, ALL_ATTRIBUTES, ATTRIBUTES, PROTOCOLS};
use config::relaxed::{CSS_AT_RULES, CSS_PROPERTIES};
use css_at_rule::CssAtRule;
use css_parser::{parse_css_style_attribute, parse_css_stylesheet, CssRule};
use css_property::CssProperty;
use sanitizer::Sanitizer;
fn main() {
let sanitizer = Sanitizer::new(
&DEFAULT_CONFIG,
vec![
&sanitize_style_tag_css,
&sanitize_style_attribute_css,
&remove_attributes,
&add_attributes,
&sanitize_attribute_protocols,
&add_single_elements_around_ul,
],
);
let sanitizer = Sanitizer::new(&DEFAULT_CONFIG, vec![&add_single_elements_around_ul]);
sanitizer
.sanitize_fragment(&mut io::stdin(), &mut io::stdout())
.unwrap();
}
fn css_rules_to_string(rules: Vec<CssRule>) -> String {
let mut sanitized_css = String::new();
for rule in rules {
match rule {
CssRule::StyleRule(style_rule) => {
sanitized_css += &style_rule.selectors.trim();
sanitized_css += " {\n";
for declaration in style_rule.declarations.into_iter() {
let declaration_string = &declaration.to_string();
if CSS_PROPERTIES.contains(&CssProperty::from(declaration.property)) {
sanitized_css += " ";
sanitized_css += declaration_string;
sanitized_css += " ";
}
}
sanitized_css += "\n}";
}
CssRule::AtRule(at_rule) => {
dbg!(&at_rule);
if CSS_AT_RULES.contains(&CssAtRule::from(at_rule.name.clone())) {
sanitized_css += &format!("@{} ", &at_rule.name);
sanitized_css += &at_rule.prelude.trim();
if let Some(block) = at_rule.block {
sanitized_css += " {\n";
sanitized_css += &css_rules_to_string(block);
sanitized_css += "\n}";
}
}
}
}
sanitized_css += "\n";
}
sanitized_css.trim().to_string()
}
// TODO: make separate rich and plain transformers
// DONE: add whitelist of tags, remove any not in it
// DONE: add whitelist of attributes, remove any not in it
@@ -106,128 +51,6 @@ fn css_rules_to_string(rules: Vec<CssRule>) -> String {
// DONE: separate this out into multiple separate transformers
// TODO: find a way to avoid passing the arena to transformer functions. It's an implementation
// detail that doesn't need to be exposed. Also, it's only needed for creating new elements.
fn sanitize_style_tag_css<'arena>(node: Ref<'arena>, _: Arena<'arena>) {
if let NodeData::Text { ref contents } = node.data {
// TODO: seems rather expensive to lookup the parent on every Text node. Better
// solution would be to pass some sort of context from the parent that marks that this
// Text node is inside a <style>.
if let Some(parent) = node.parent.get() {
if let NodeData::Element { ref name, .. } = parent.data {
if name.local == local_name!("style") {
let rules = parse_css_stylesheet(&contents.borrow());
dbg!(&rules);
let sanitized_css = css_rules_to_string(rules);
dbg!(&sanitized_css);
contents.replace(StrTendril::from(sanitized_css));
}
}
}
}
}
fn sanitize_style_attribute_css<'arena>(node: Ref<'arena>, _: Arena<'arena>) {
if let NodeData::Element { ref attrs, .. } = node.data {
for attr in attrs.borrow_mut().iter_mut() {
if attr.name.local == local_name!("style") {
let css_str = &attr.value;
let declarations = parse_css_style_attribute(css_str);
dbg!(&declarations);
let mut sanitized_css = String::new();
for declaration in declarations.into_iter() {
let declaration_string = &declaration.to_string();
if CSS_PROPERTIES.contains(&CssProperty::from(declaration.property)) {
sanitized_css += declaration_string;
sanitized_css += " ";
}
}
let sanitized_css = sanitized_css.trim();
dbg!(&sanitized_css);
attr.value = StrTendril::from(sanitized_css);
}
}
}
}
fn remove_attributes<'arena>(node: Ref<'arena>, _: Arena<'arena>) {
if let NodeData::Element {
ref attrs,
ref name,
..
} = node.data
{
let attrs = &mut attrs.borrow_mut();
let mut allowed_attrs: HashSet<LocalName> = ALL_ATTRIBUTES.clone();
if let Some(element_attrs) = ATTRIBUTES.get(&name.local) {
allowed_attrs = allowed_attrs.union(element_attrs).cloned().collect();
}
let mut i = 0;
while i != attrs.len() {
if !allowed_attrs.contains(&attrs[i].name.local) {
attrs.remove(i);
}
i += 1;
}
}
}
fn add_attributes<'arena>(node: Ref<'arena>, _: Arena<'arena>) {
if let NodeData::Element {
ref attrs,
ref name,
..
} = node.data
{
let attrs = &mut attrs.borrow_mut();
if let Some(add_attributes) = ADD_ATTRIBUTES.get(&name.local) {
for (name, &value) in add_attributes.iter() {
attrs.push(Attribute {
name: QualName::new(None, ns!(), name.clone()),
value: StrTendril::from(value),
});
}
}
}
}
fn sanitize_attribute_protocols<'arena>(node: Ref<'arena>, _: Arena<'arena>) {
if let NodeData::Element {
ref attrs,
ref name,
..
} = node.data
{
let attrs = &mut attrs.borrow_mut();
if let Some(protocols) = PROTOCOLS.get(&name.local) {
let mut i = 0;
while i != attrs.len() {
if let Some(allowed_protocols) = protocols.get(&attrs[i].name.local) {
match Url::parse(&attrs[i].value) {
Ok(url) => {
if !allowed_protocols.contains(url.scheme()) {
attrs.remove(i);
} else {
i += 1;
}
}
Err(ParseError::RelativeUrlWithoutBase) => {
attrs[i].value = StrTendril::from(format!("http://{}", attrs[i].value));
i += 1;
}
Err(_) => {
attrs.remove(i);
}
}
} else {
i += 1;
}
}
}
}
}
fn add_single_elements_around_ul<'arena>(node: Ref<'arena>, arena: Arena<'arena>) {
if let NodeData::Element { ref name, .. } = node.data {
if let local_name!("ul") = name.local {

View File

@@ -8,6 +8,8 @@ use html5ever::{parse_document, parse_fragment, serialize, Attribute, LocalName,
use crate::arena_dom::{Arena, Node, NodeData, Ref, Sink};
use crate::css_at_rule::CssAtRule;
use crate::css_parser::{parse_css_style_attribute, parse_css_stylesheet, CssRule};
use crate::css_parser_2::parse_and_serialize;
use crate::css_property::CssProperty;
pub struct Sanitizer<'arena> {
@@ -126,6 +128,9 @@ impl<'arena> Sanitizer<'arena> {
self.remove_attributes(node);
self.add_attributes(node);
self.sanitize_attribute_protocols(node);
self.sanitize_style_tag_css(node);
// self.sanitize_style_attribute_css(node);
// self.serialize_css_test(node);
for transformer in self.transformers.iter() {
transformer(node, &self.arena);
@@ -271,6 +276,105 @@ impl<'arena> Sanitizer<'arena> {
}
}
}
fn serialize_sanitized_css_rules(&self, rules: Vec<CssRule>) -> String {
let mut sanitized_css = String::new();
for rule in rules {
match rule {
CssRule::StyleRule(style_rule) => {
sanitized_css += &style_rule.selectors;
sanitized_css += "{";
for declaration in style_rule.declarations.into_iter() {
let declaration_string = &declaration.to_string();
if self
.config
.allowed_css_properties
.contains(&CssProperty::from(declaration.property))
{
sanitized_css += declaration_string;
}
}
sanitized_css += "}";
}
CssRule::AtRule(at_rule) => {
dbg!(&at_rule);
if self
.config
.allowed_css_at_rules
.contains(&CssAtRule::from(at_rule.name.clone()))
{
sanitized_css += &format!("@{}", &at_rule.name);
sanitized_css += &at_rule.prelude;
if let Some(block) = at_rule.block {
sanitized_css += "{";
sanitized_css += &self.serialize_sanitized_css_rules(block);
sanitized_css += "}";
}
}
}
}
}
sanitized_css
}
fn sanitize_style_tag_css(&self, node: Ref<'arena>) {
if let NodeData::Text { ref contents } = node.data {
// TODO: seems rather expensive to lookup the parent on every Text node. Better
// solution would be to pass some sort of context from the parent that marks that this
// Text node is inside a <style>.
if let Some(parent) = node.parent.get() {
if let NodeData::Element { ref name, .. } = parent.data {
if name.local == local_name!("style") {
let rules = parse_css_stylesheet(&contents.borrow());
dbg!(&rules);
let sanitized_css = self.serialize_sanitized_css_rules(rules);
dbg!(&sanitized_css);
contents.replace(StrTendril::from(sanitized_css));
}
}
}
}
}
fn sanitize_style_attribute_css(&self, node: Ref<'arena>) {
if let NodeData::Element { ref attrs, .. } = node.data {
for attr in attrs.borrow_mut().iter_mut() {
if attr.name.local == local_name!("style") {
let css_str = &attr.value;
let declarations = parse_css_style_attribute(css_str);
dbg!(&declarations);
let mut sanitized_css = String::new();
for declaration in declarations.into_iter() {
let declaration_string = &declaration.to_string();
if self
.config
.allowed_css_properties
.contains(&CssProperty::from(declaration.property))
{
sanitized_css += declaration_string;
sanitized_css += " ";
}
}
let sanitized_css = sanitized_css.trim();
dbg!(&sanitized_css);
attr.value = StrTendril::from(sanitized_css);
}
}
}
}
fn serialize_css_test(&self, node: Ref<'arena>) {
if let NodeData::Text { ref contents } = node.data {
if let Some(parent) = node.parent.get() {
if let NodeData::Element { ref name, .. } = parent.data {
if name.local == local_name!("style") {
let mut serialized_css = String::new();
parse_and_serialize(contents.borrow(), &mut serialized_css, true);
}
}
}
}
}
}
#[cfg(test)]
@@ -622,4 +726,33 @@ mod test {
<img src=\"http://example.com\"></img></html>"
);
}
#[test]
fn sanitize_style_tag_css() {
let mut sanitize_css_config = EMPTY_CONFIG.clone();
sanitize_css_config
.allowed_elements
.extend(vec![local_name!("html"), local_name!("style")]);
sanitize_css_config
.allowed_css_properties
.extend(vec![css_property!("margin"), css_property!("color")]);
sanitize_css_config
.allowed_css_at_rules
.extend(vec![css_at_rule!("charset")]);
let sanitizer = Sanitizer::new(&sanitize_css_config, vec![]);
let mut mock_data = MockRead::new(
"<style>@charset \"UTF-8\";\
div { margin: 10px; padding: 10px; color: red; }\
@media print { div { margin: 50px; } }</style>",
);
let mut output = vec![];
sanitizer
.sanitize_fragment(&mut mock_data, &mut output)
.unwrap();
assert_eq!(
str::from_utf8(&output).unwrap(),
"<html><style>@charset \"UTF-8\";\
div { margin: 10px; color: red; }</style></html>"
);
}
}