Save parsed CSS to the arena_dom tree

User transformer functions now have access to the parsed CSS rules and
declarations directly on the `NodeData`. Serialization of the CSS back to a
string is deferred until the rest of the HTML tree is serialized.
This commit is contained in:
Tyler Hallada 2020-05-09 22:23:12 -04:00
parent 269f296d48
commit 519c4067b7
2 changed files with 212 additions and 106 deletions

View File

@ -24,7 +24,9 @@ use html5ever::interface::tree_builder::{ElementFlags, NodeOrText, QuirksMode, T
use html5ever::serialize::TraversalScope::{ChildrenOnly, IncludeNode}; use html5ever::serialize::TraversalScope::{ChildrenOnly, IncludeNode};
use html5ever::serialize::{Serialize, Serializer, TraversalScope}; use html5ever::serialize::{Serialize, Serializer, TraversalScope};
use html5ever::tendril::StrTendril; use html5ever::tendril::StrTendril;
use html5ever::{Attribute, ExpandedName, LocalName, QualName}; use html5ever::{Attribute as HTML5everAttribute, ExpandedName, LocalName, QualName};
use crate::css_parser::{CssDeclaration, CssRule};
pub fn create_element<'arena>(arena: Arena<'arena>, name: &str) -> Ref<'arena> { pub fn create_element<'arena>(arena: Arena<'arena>, name: &str) -> Ref<'arena> {
arena.alloc(Node::new(NodeData::Element { arena.alloc(Node::new(NodeData::Element {
@ -57,6 +59,23 @@ pub struct Node<'arena> {
pub data: NodeData<'arena>, pub data: NodeData<'arena>,
} }
/// An attribute whose value is kept as parsed CSS declarations rather than as raw text.
#[derive(Debug)]
pub struct StyleAttribute {
/// Qualified name of the attribute.
pub name: QualName,
/// Parsed CSS declarations that make up the attribute's value.
pub value: Vec<CssDeclaration>,
/// Cached serialized form of `value`.
///
/// Need to store the serialized value to the arena because html5ever expects a &str for
/// attribute values during serialization. If this is None, `serialize` will construct a String
/// from serializing the `CssDeclaration`s, store it here, and then reference it with
/// `.as_str()`.
pub serialized_value: Option<String>,
}
/// An element attribute as stored in the arena DOM: either parsed CSS or a plain html5ever
/// attribute.
#[derive(Debug)]
pub enum Attribute {
/// Attribute whose value is held as parsed CSS declarations (see `StyleAttribute`).
Style(StyleAttribute),
/// Attribute held as html5ever's own name/value attribute type.
Text(HTML5everAttribute),
}
#[derive(Debug)] #[derive(Debug)]
pub enum NodeData<'arena> { pub enum NodeData<'arena> {
Document, Document,
@ -68,6 +87,9 @@ pub enum NodeData<'arena> {
Text { Text {
contents: RefCell<StrTendril>, contents: RefCell<StrTendril>,
}, },
StyleSheet {
rules: Vec<CssRule>,
},
Comment { Comment {
contents: StrTendril, contents: StrTendril,
}, },
@ -232,6 +254,14 @@ impl<'arena> fmt::Display for NodeData<'arena> {
"Text: {}...", "Text: {}...",
&contents.borrow().chars().take(10).collect::<String>() &contents.borrow().chars().take(10).collect::<String>()
), ),
NodeData::StyleSheet { rules } => write!(
f,
"Stylesheet: {}...",
&serialize_css_rules(rules)
.chars()
.take(10)
.collect::<String>()
),
NodeData::ProcessingInstruction { .. } => write!(f, "ProcessingInstruction: ..."), NodeData::ProcessingInstruction { .. } => write!(f, "ProcessingInstruction: ..."),
NodeData::Comment { contents } => write!( NodeData::Comment { contents } => write!(
f, f,
@ -379,12 +409,17 @@ impl<'arena> TreeSink for Sink<'arena> {
fn create_element( fn create_element(
&mut self, &mut self,
name: QualName, name: QualName,
attrs: Vec<Attribute>, attrs: Vec<HTML5everAttribute>,
flags: ElementFlags, flags: ElementFlags,
) -> Ref<'arena> { ) -> Ref<'arena> {
self.new_node(NodeData::Element { self.new_node(NodeData::Element {
name, name,
attrs: RefCell::new(attrs), attrs: RefCell::new(
attrs
.into_iter()
.map(|attr| Attribute::Text(attr))
.collect(),
),
template_contents: if flags.template { template_contents: if flags.template {
Some(self.new_node(NodeData::Document)) Some(self.new_node(NodeData::Document))
} else { } else {
@ -447,7 +482,7 @@ impl<'arena> TreeSink for Sink<'arena> {
})) }))
} }
fn add_attrs_if_missing(&mut self, target: &Ref<'arena>, attrs: Vec<Attribute>) { fn add_attrs_if_missing(&mut self, target: &Ref<'arena>, attrs: Vec<HTML5everAttribute>) {
let mut existing = if let NodeData::Element { ref attrs, .. } = target.data { let mut existing = if let NodeData::Element { ref attrs, .. } = target.data {
attrs.borrow_mut() attrs.borrow_mut()
} else { } else {
@ -456,13 +491,18 @@ impl<'arena> TreeSink for Sink<'arena> {
let existing_names = existing let existing_names = existing
.iter() .iter()
.map(|e| e.name.clone()) .map(|e| match e {
Attribute::Style(attr) => attr.name.clone(),
Attribute::Text(attr) => attr.name.clone(),
})
.collect::<HashSet<_>>(); .collect::<HashSet<_>>();
existing.extend( existing.extend(attrs.into_iter().filter_map(|attr| {
attrs if !existing_names.contains(&attr.name) {
.into_iter() Some(Attribute::Text(attr))
.filter(|attr| !existing_names.contains(&attr.name)), } else {
); None
}
}));
} }
fn remove_from_parent(&mut self, target: &Ref<'arena>) { fn remove_from_parent(&mut self, target: &Ref<'arena>) {
@ -479,6 +519,47 @@ impl<'arena> TreeSink for Sink<'arena> {
} }
} }
/// Serializes parsed CSS rules back into a single CSS string.
///
/// Output format:
/// - style rule: `<selectors>{<declarations> }`, where the declarations are
///   produced by `serialize_css_declarations`;
/// - at-rule with a block: `@<name><prelude>{<nested rules> }`, with the
///   nested rules serialized recursively;
/// - at-rule without a block: `@<name><prelude>; `.
///
/// Rules are concatenated in order with no extra separator between them.
fn serialize_css_rules(rules: &[CssRule]) -> String {
    let mut serialized_rules = String::new();
    for rule in rules {
        match rule {
            CssRule::StyleRule(style_rule) => {
                serialized_rules.push_str(&style_rule.selectors);
                serialized_rules.push('{');
                // Emit every declaration exactly once. Appending each declaration by hand
                // *and* calling `serialize_css_declarations` would duplicate the whole
                // declaration list in the output.
                serialized_rules.push_str(&serialize_css_declarations(&style_rule.declarations));
                serialized_rules.push_str(" }");
            }
            CssRule::AtRule(at_rule) => {
                serialized_rules.push('@');
                serialized_rules.push_str(&at_rule.name);
                serialized_rules.push_str(&at_rule.prelude);
                match &at_rule.block {
                    // Block at-rule: recurse into the nested rules.
                    Some(block) => {
                        serialized_rules.push('{');
                        serialized_rules.push_str(&serialize_css_rules(block));
                        serialized_rules.push_str(" }");
                    }
                    // Statement at-rule: terminate it with a semicolon.
                    None => serialized_rules.push_str("; "),
                }
            }
        }
    }
    serialized_rules
}
/// Serializes CSS declarations into one string.
///
/// Each declaration is rendered with its `to_string()` output and the pieces
/// are joined by a single space, with no leading or trailing separator. An
/// empty slice produces an empty string.
fn serialize_css_declarations(declarations: &[CssDeclaration]) -> String {
    declarations
        .iter()
        .map(|declaration| declaration.to_string())
        .collect::<Vec<String>>()
        .join(" ")
}
// Implementation adapted from implementation for RcDom: // Implementation adapted from implementation for RcDom:
// https://github.com/servo/html5ever/blob/45b2fca5c6/markup5ever/rcdom.rs#L410 // https://github.com/servo/html5ever/blob/45b2fca5c6/markup5ever/rcdom.rs#L410
impl<'arena> Serialize for Node<'arena> { impl<'arena> Serialize for Node<'arena> {
@ -498,7 +579,22 @@ impl<'arena> Serialize for Node<'arena> {
if traversal_scope == IncludeNode { if traversal_scope == IncludeNode {
serializer.start_elem( serializer.start_elem(
name.clone(), name.clone(),
attrs.borrow().iter().map(|at| (&at.name, &at.value[..])), attrs.borrow_mut().iter_mut().map(|at| match at {
Attribute::Style(at) => {
if at.serialized_value.is_none() {
let serialized_declaration =
serialize_css_declarations(&at.value);
at.serialized_value = Some(serialized_declaration);
}
if let Some(serialized_declarations) = &at.serialized_value {
(&at.name, serialized_declarations.as_str())
} else {
panic!("Serialized style attribute value was not saved to the arena");
}
}
Attribute::Text(at) => (&at.name, &at.value[..]),
}),
)?; )?;
} }
@ -525,6 +621,9 @@ impl<'arena> Serialize for Node<'arena> {
(&IncludeNode, &NodeData::Text { ref contents }) => { (&IncludeNode, &NodeData::Text { ref contents }) => {
serializer.write_text(&contents.borrow())? serializer.write_text(&contents.borrow())?
} }
(&IncludeNode, &NodeData::StyleSheet { ref rules }) => {
serializer.write_text(&serialize_css_rules(rules))?
}
(&IncludeNode, &NodeData::Comment { ref contents }) => { (&IncludeNode, &NodeData::Comment { ref contents }) => {
serializer.write_comment(&contents)? serializer.write_comment(&contents)?
} }

View File

@ -4,11 +4,13 @@ use url::{ParseError, Url};
use html5ever::interface::tree_builder::QuirksMode; use html5ever::interface::tree_builder::QuirksMode;
use html5ever::tendril::{format_tendril, StrTendril, TendrilSink}; use html5ever::tendril::{format_tendril, StrTendril, TendrilSink};
use html5ever::{parse_document, parse_fragment, serialize, Attribute, LocalName, QualName}; use html5ever::{
parse_document, parse_fragment, serialize, Attribute as HTML5everAttribute, LocalName, QualName,
};
use crate::arena_dom::{Arena, Node, NodeData, Ref, Sink}; use crate::arena_dom::{Arena, Attribute, Node, NodeData, Ref, Sink, StyleAttribute};
use crate::css_at_rule::CssAtRule; use crate::css_at_rule::CssAtRule;
use crate::css_parser::{parse_css_style_attribute, parse_css_stylesheet, CssRule}; use crate::css_parser::{parse_css_style_attribute, parse_css_stylesheet, CssRule, CssStyleRule};
use crate::css_property::CssProperty; use crate::css_property::CssProperty;
pub struct Sanitizer<'arena> { pub struct Sanitizer<'arena> {
@ -139,9 +141,6 @@ impl<'arena> Sanitizer<'arena> {
self.remove_attributes(node); self.remove_attributes(node);
self.add_attributes(node); self.add_attributes(node);
self.sanitize_attribute_protocols(node); self.sanitize_attribute_protocols(node);
// TODO: save the parsed CSS syntax tree from these methods onto the arena dom so that
// user-created transformers below will have access to modify them without having to
// re-parse.
self.sanitize_style_tag_css(node); self.sanitize_style_tag_css(node);
self.sanitize_style_attribute_css(node); self.sanitize_style_attribute_css(node);
@ -160,9 +159,10 @@ impl<'arena> Sanitizer<'arena> {
fn should_unwrap_node(&self, node: Ref) -> bool { fn should_unwrap_node(&self, node: Ref) -> bool {
match node.data { match node.data {
NodeData::Document | NodeData::Text { .. } | NodeData::ProcessingInstruction { .. } => { NodeData::Document
false | NodeData::Text { .. }
} | NodeData::StyleSheet { .. }
| NodeData::ProcessingInstruction { .. } => false,
NodeData::Comment { .. } => !self.config.allow_comments, NodeData::Comment { .. } => !self.config.allow_comments,
NodeData::Doctype { .. } => !self.config.allow_doctype, NodeData::Doctype { .. } => !self.config.allow_doctype,
NodeData::Element { ref name, .. } => { NodeData::Element { ref name, .. } => {
@ -176,6 +176,7 @@ impl<'arena> Sanitizer<'arena> {
NodeData::Document NodeData::Document
| NodeData::Doctype { .. } | NodeData::Doctype { .. }
| NodeData::Text { .. } | NodeData::Text { .. }
| NodeData::StyleSheet { .. }
| NodeData::ProcessingInstruction { .. } | NodeData::ProcessingInstruction { .. }
| NodeData::Comment { .. } => false, | NodeData::Comment { .. } => false,
NodeData::Element { ref name, .. } => self NodeData::Element { ref name, .. } => self
@ -197,15 +198,17 @@ impl<'arena> Sanitizer<'arena> {
let all_allowed = &self.config.allowed_attributes; let all_allowed = &self.config.allowed_attributes;
let per_element_allowed = self.config.allowed_attributes_per_element.get(&name.local); let per_element_allowed = self.config.allowed_attributes_per_element.get(&name.local);
while i != attrs.len() { while i != attrs.len() {
if !all_allowed.contains(&attrs[i].name.local) { if let Attribute::Text(attr) = &attrs[i] {
if let Some(per_element_allowed) = per_element_allowed { if !all_allowed.contains(&attr.name.local) {
if per_element_allowed.contains(&attrs[i].name.local) { if let Some(per_element_allowed) = per_element_allowed {
i += 1; if per_element_allowed.contains(&attr.name.local) {
continue; i += 1;
continue;
}
} }
attrs.remove(i);
continue;
} }
attrs.remove(i);
continue;
} }
i += 1; i += 1;
} }
@ -225,18 +228,18 @@ impl<'arena> Sanitizer<'arena> {
self.config.add_attributes_per_element.get(&name.local); self.config.add_attributes_per_element.get(&name.local);
for (name, &value) in add_attributes.iter() { for (name, &value) in add_attributes.iter() {
attrs.push(Attribute { attrs.push(Attribute::Text(HTML5everAttribute {
name: QualName::new(None, ns!(), name.clone()), name: QualName::new(None, ns!(), name.clone()),
value: StrTendril::from(value), value: StrTendril::from(value),
}); }));
} }
if let Some(add_attributes_per_element) = add_attributes_per_element { if let Some(add_attributes_per_element) = add_attributes_per_element {
for (name, &value) in add_attributes_per_element.iter() { for (name, &value) in add_attributes_per_element.iter() {
attrs.push(Attribute { attrs.push(Attribute::Text(HTML5everAttribute {
name: QualName::new(None, ns!(), name.clone()), name: QualName::new(None, ns!(), name.clone()),
value: StrTendril::from(value), value: StrTendril::from(value),
}); }));
} }
} }
} }
@ -254,25 +257,30 @@ impl<'arena> Sanitizer<'arena> {
if let Some(protocols) = self.config.allowed_protocols.get(&name.local) { if let Some(protocols) = self.config.allowed_protocols.get(&name.local) {
let mut i = 0; let mut i = 0;
while i != attrs.len() { while i != attrs.len() {
if let Some(allowed_protocols) = protocols.get(&attrs[i].name.local) { if let Attribute::Text(attr) = &attrs[i] {
match Url::parse(&attrs[i].value) { if let Some(allowed_protocols) = protocols.get(&attr.name.local) {
Ok(url) => { match Url::parse(&attr.value) {
if !allowed_protocols.contains(&Protocol::Scheme(url.scheme())) { Ok(url) => {
if !allowed_protocols.contains(&Protocol::Scheme(url.scheme()))
{
attrs.remove(i);
} else {
i += 1;
}
}
Err(ParseError::RelativeUrlWithoutBase) => {
if !allowed_protocols.contains(&Protocol::Relative) {
attrs.remove(i);
} else {
i += 1;
}
}
Err(_) => {
attrs.remove(i); attrs.remove(i);
} else {
i += 1;
} }
} }
Err(ParseError::RelativeUrlWithoutBase) => { } else {
if !allowed_protocols.contains(&Protocol::Relative) { i += 1;
attrs.remove(i);
} else {
i += 1;
}
}
Err(_) => {
attrs.remove(i);
}
} }
} else { } else {
i += 1; i += 1;
@ -282,58 +290,50 @@ impl<'arena> Sanitizer<'arena> {
} }
} }
fn serialize_sanitized_css_rules(&self, rules: Vec<CssRule>) -> String { fn sanitize_css_rules(&self, rules: Vec<CssRule>) -> Vec<CssRule> {
let mut sanitized_css = String::new(); rules
for rule in rules { .into_iter()
match rule { .filter_map(|rule| match rule {
CssRule::StyleRule(style_rule) => { CssRule::StyleRule(style_rule) => Some(CssRule::StyleRule(CssStyleRule {
sanitized_css += &style_rule.selectors; selectors: style_rule.selectors,
sanitized_css += "{"; declarations: style_rule
for declaration in style_rule.declarations.into_iter() { .declarations
let declaration_string = &declaration.to_string(); .into_iter()
if self .filter(|declaration| {
.config self.config
.allowed_css_properties .allowed_css_properties
.contains(&CssProperty::from(declaration.property)) .contains(&CssProperty::from(declaration.property.as_str()))
{ })
sanitized_css += declaration_string; .collect(),
} })),
}
sanitized_css += " }";
}
CssRule::AtRule(at_rule) => { CssRule::AtRule(at_rule) => {
if self if self
.config .config
.allowed_css_at_rules .allowed_css_at_rules
.contains(&CssAtRule::from(at_rule.name.clone())) .contains(&CssAtRule::from(at_rule.name.as_str()))
{ {
sanitized_css += &format!("@{}", &at_rule.name); Some(CssRule::AtRule(at_rule))
sanitized_css += &at_rule.prelude; } else {
if let Some(block) = at_rule.block { None
sanitized_css += "{";
sanitized_css += &self.serialize_sanitized_css_rules(block);
sanitized_css += " }";
} else {
sanitized_css += "; ";
}
} }
} }
} })
} .collect()
sanitized_css
} }
fn sanitize_style_tag_css(&self, node: Ref<'arena>) { fn sanitize_style_tag_css(&'arena self, node: Ref<'arena>) {
if let NodeData::Text { ref contents } = node.data { if let NodeData::Element { ref name, .. } = node.data {
// TODO: seems rather expensive to lookup the parent on every Text node. Better if name.local == local_name!("style") {
// solution would be to pass some sort of context from the parent that marks that this // TODO: is it okay to assume <style> tags will only ever have one text node child?
// Text node is inside a <style>. if let Some(first_child) = node.first_child.take() {
if let Some(parent) = node.parent.get() { if let NodeData::Text { ref contents, .. } = first_child.data {
if let NodeData::Element { ref name, .. } = parent.data {
if name.local == local_name!("style") {
let rules = parse_css_stylesheet(&contents.borrow()); let rules = parse_css_stylesheet(&contents.borrow());
let sanitized_css = self.serialize_sanitized_css_rules(rules); let sanitized_rules = self.sanitize_css_rules(rules);
contents.replace(StrTendril::from(sanitized_css)); first_child.detach();
let stylesheet = self.arena.alloc(Node::new(NodeData::StyleSheet {
rules: sanitized_rules,
}));
node.append(stylesheet);
} }
} }
} }
@ -342,25 +342,32 @@ impl<'arena> Sanitizer<'arena> {
fn sanitize_style_attribute_css(&self, node: Ref<'arena>) { fn sanitize_style_attribute_css(&self, node: Ref<'arena>) {
if let NodeData::Element { ref attrs, .. } = node.data { if let NodeData::Element { ref attrs, .. } = node.data {
for attr in attrs.borrow_mut().iter_mut() { let mut i = 0;
if attr.name.local == local_name!("style") { let attrs = &mut attrs.borrow_mut();
let css_str = &attr.value;
let declarations = parse_css_style_attribute(css_str); while i != attrs.len() {
let mut sanitized_css = String::new(); if let Attribute::Text(attr) = &attrs[i] {
for declaration in declarations.into_iter() { if attr.name.local == local_name!("style") {
let declaration_string = &declaration.to_string(); let css_str = &attr.value;
if self let mut declarations = parse_css_style_attribute(css_str);
.config declarations.retain(|declaration| {
.allowed_css_properties self.config
.contains(&CssProperty::from(declaration.property)) .allowed_css_properties
{ .contains(&CssProperty::from(declaration.property.as_str()))
sanitized_css += declaration_string; });
sanitized_css += " "; let name = attr.name.clone();
} attrs.remove(i);
attrs.insert(
i,
Attribute::Style(StyleAttribute {
name,
value: declarations,
serialized_value: None,
}),
);
} }
let sanitized_css = sanitized_css.trim();
attr.value = StrTendril::from(sanitized_css);
} }
i += 1;
} }
} }
} }