Save parsed CSS to the arena_dom tree

Now, user transformer functions have access to the parsed CSS rules and
declarations directly on the `NodeData`. Serialization of CSS back to a string
is deferred to when the rest of the HTML tree is serialized to a string.
This commit is contained in:
Tyler Hallada 2020-05-09 22:23:12 -04:00
parent 269f296d48
commit 519c4067b7
2 changed files with 212 additions and 106 deletions

View File

@ -24,7 +24,9 @@ use html5ever::interface::tree_builder::{ElementFlags, NodeOrText, QuirksMode, T
use html5ever::serialize::TraversalScope::{ChildrenOnly, IncludeNode};
use html5ever::serialize::{Serialize, Serializer, TraversalScope};
use html5ever::tendril::StrTendril;
use html5ever::{Attribute, ExpandedName, LocalName, QualName};
use html5ever::{Attribute as HTML5everAttribute, ExpandedName, LocalName, QualName};
use crate::css_parser::{CssDeclaration, CssRule};
pub fn create_element<'arena>(arena: Arena<'arena>, name: &str) -> Ref<'arena> {
arena.alloc(Node::new(NodeData::Element {
@ -57,6 +59,23 @@ pub struct Node<'arena> {
pub data: NodeData<'arena>,
}
/// A `style` attribute whose value is held as parsed CSS declarations rather than as text.
///
/// The sanitizer builds one of these in place of the plain-text `style` attribute, always
/// starting with `serialized_value` set to `None`.
#[derive(Debug)]
pub struct StyleAttribute {
/// Qualified name of the attribute.
pub name: QualName,
/// Parsed CSS declarations that make up the attribute's value.
pub value: Vec<CssDeclaration>,
// Need to store the serialized value to the arena because html5ever expects a &str for
// attribute values during serialization. If this is None, `serialize` will construct a String
// from serializing the `CssDeclaration`s, store it here, and then reference it with
// `.as_str()`.
// NOTE(review): the visible `serialize` code only fills this when it is `None`, so a string
// cached here would be reused even if `value` is edited afterwards -- confirm that callers
// reset it to `None` when they change `value`.
pub serialized_value: Option<String>,
}
/// An element attribute as stored on the arena DOM.
#[derive(Debug)]
pub enum Attribute {
/// A `style` attribute held as parsed CSS declarations (see `StyleAttribute`).
Style(StyleAttribute),
/// An attribute kept in html5ever's own name/value form. Attributes handed over by the
/// parser are wrapped in this variant.
Text(HTML5everAttribute),
}
#[derive(Debug)]
pub enum NodeData<'arena> {
Document,
@ -68,6 +87,9 @@ pub enum NodeData<'arena> {
Text {
contents: RefCell<StrTendril>,
},
StyleSheet {
rules: Vec<CssRule>,
},
Comment {
contents: StrTendril,
},
@ -232,6 +254,14 @@ impl<'arena> fmt::Display for NodeData<'arena> {
"Text: {}...",
&contents.borrow().chars().take(10).collect::<String>()
),
NodeData::StyleSheet { rules } => write!(
f,
"Stylesheet: {}...",
&serialize_css_rules(rules)
.chars()
.take(10)
.collect::<String>()
),
NodeData::ProcessingInstruction { .. } => write!(f, "ProcessingInstruction: ..."),
NodeData::Comment { contents } => write!(
f,
@ -379,12 +409,17 @@ impl<'arena> TreeSink for Sink<'arena> {
fn create_element(
&mut self,
name: QualName,
attrs: Vec<Attribute>,
attrs: Vec<HTML5everAttribute>,
flags: ElementFlags,
) -> Ref<'arena> {
self.new_node(NodeData::Element {
name,
attrs: RefCell::new(attrs),
attrs: RefCell::new(
attrs
.into_iter()
.map(|attr| Attribute::Text(attr))
.collect(),
),
template_contents: if flags.template {
Some(self.new_node(NodeData::Document))
} else {
@ -447,7 +482,7 @@ impl<'arena> TreeSink for Sink<'arena> {
}))
}
fn add_attrs_if_missing(&mut self, target: &Ref<'arena>, attrs: Vec<Attribute>) {
fn add_attrs_if_missing(&mut self, target: &Ref<'arena>, attrs: Vec<HTML5everAttribute>) {
let mut existing = if let NodeData::Element { ref attrs, .. } = target.data {
attrs.borrow_mut()
} else {
@ -456,13 +491,18 @@ impl<'arena> TreeSink for Sink<'arena> {
let existing_names = existing
.iter()
.map(|e| e.name.clone())
.map(|e| match e {
Attribute::Style(attr) => attr.name.clone(),
Attribute::Text(attr) => attr.name.clone(),
})
.collect::<HashSet<_>>();
existing.extend(
attrs
.into_iter()
.filter(|attr| !existing_names.contains(&attr.name)),
);
existing.extend(attrs.into_iter().filter_map(|attr| {
if !existing_names.contains(&attr.name) {
Some(Attribute::Text(attr))
} else {
None
}
}));
}
fn remove_from_parent(&mut self, target: &Ref<'arena>) {
@ -479,6 +519,47 @@ impl<'arena> TreeSink for Sink<'arena> {
}
}
/// Serializes parsed CSS rules back into a CSS source string.
///
/// Style rules are written as `<selectors>{<declarations> }`, where the declarations are
/// rendered by `serialize_css_declarations`. At-rules are written as `@<name><prelude>`
/// followed by `{<nested rules> }` when the rule has a block, or by `; ` when it has none.
/// Rules are concatenated in order with no extra separator, so an empty slice yields an
/// empty string.
fn serialize_css_rules(rules: &[CssRule]) -> String {
    let mut serialized_rules = String::new();
    for rule in rules {
        match rule {
            CssRule::StyleRule(style_rule) => {
                serialized_rules.push_str(&style_rule.selectors);
                serialized_rules.push('{');
                // The helper is the only place declarations are written, so each declaration
                // appears exactly once in the output.
                serialized_rules.push_str(&serialize_css_declarations(&style_rule.declarations));
                serialized_rules.push_str(" }");
            }
            CssRule::AtRule(at_rule) => {
                serialized_rules.push('@');
                serialized_rules.push_str(&at_rule.name);
                serialized_rules.push_str(&at_rule.prelude);
                match &at_rule.block {
                    // Block at-rules nest further rules, so recurse.
                    Some(block) => {
                        serialized_rules.push('{');
                        serialized_rules.push_str(&serialize_css_rules(block));
                        serialized_rules.push_str(" }");
                    }
                    // Statement at-rules are terminated with a semicolon instead.
                    None => serialized_rules.push_str("; "),
                }
            }
        }
    }
    serialized_rules
}
/// Renders CSS declarations as a single string.
///
/// Each declaration is converted with `to_string()`. Consecutive declarations are separated
/// by one space, and nothing is written after the last one. An empty slice produces an empty
/// string.
fn serialize_css_declarations(declarations: &[CssDeclaration]) -> String {
    let rendered: Vec<String> = declarations
        .iter()
        .map(|declaration| declaration.to_string())
        .collect();
    rendered.join(" ")
}
// Implementation adapted from implementation for RcDom:
// https://github.com/servo/html5ever/blob/45b2fca5c6/markup5ever/rcdom.rs#L410
impl<'arena> Serialize for Node<'arena> {
@ -498,7 +579,22 @@ impl<'arena> Serialize for Node<'arena> {
if traversal_scope == IncludeNode {
serializer.start_elem(
name.clone(),
attrs.borrow().iter().map(|at| (&at.name, &at.value[..])),
attrs.borrow_mut().iter_mut().map(|at| match at {
Attribute::Style(at) => {
if at.serialized_value.is_none() {
let serialized_declaration =
serialize_css_declarations(&at.value);
at.serialized_value = Some(serialized_declaration);
}
if let Some(serialized_declarations) = &at.serialized_value {
(&at.name, serialized_declarations.as_str())
} else {
panic!("Serialized style attribute value was not saved to the arena");
}
}
Attribute::Text(at) => (&at.name, &at.value[..]),
}),
)?;
}
@ -525,6 +621,9 @@ impl<'arena> Serialize for Node<'arena> {
(&IncludeNode, &NodeData::Text { ref contents }) => {
serializer.write_text(&contents.borrow())?
}
(&IncludeNode, &NodeData::StyleSheet { ref rules }) => {
serializer.write_text(&serialize_css_rules(rules))?
}
(&IncludeNode, &NodeData::Comment { ref contents }) => {
serializer.write_comment(&contents)?
}

View File

@ -4,11 +4,13 @@ use url::{ParseError, Url};
use html5ever::interface::tree_builder::QuirksMode;
use html5ever::tendril::{format_tendril, StrTendril, TendrilSink};
use html5ever::{parse_document, parse_fragment, serialize, Attribute, LocalName, QualName};
use html5ever::{
parse_document, parse_fragment, serialize, Attribute as HTML5everAttribute, LocalName, QualName,
};
use crate::arena_dom::{Arena, Node, NodeData, Ref, Sink};
use crate::arena_dom::{Arena, Attribute, Node, NodeData, Ref, Sink, StyleAttribute};
use crate::css_at_rule::CssAtRule;
use crate::css_parser::{parse_css_style_attribute, parse_css_stylesheet, CssRule};
use crate::css_parser::{parse_css_style_attribute, parse_css_stylesheet, CssRule, CssStyleRule};
use crate::css_property::CssProperty;
pub struct Sanitizer<'arena> {
@ -139,9 +141,6 @@ impl<'arena> Sanitizer<'arena> {
self.remove_attributes(node);
self.add_attributes(node);
self.sanitize_attribute_protocols(node);
// TODO: save the parsed CSS syntax tree from these methods onto the arena dom so that
// user-created transformers below will have access to modify them without having to
// re-parse.
self.sanitize_style_tag_css(node);
self.sanitize_style_attribute_css(node);
@ -160,9 +159,10 @@ impl<'arena> Sanitizer<'arena> {
fn should_unwrap_node(&self, node: Ref) -> bool {
match node.data {
NodeData::Document | NodeData::Text { .. } | NodeData::ProcessingInstruction { .. } => {
false
}
NodeData::Document
| NodeData::Text { .. }
| NodeData::StyleSheet { .. }
| NodeData::ProcessingInstruction { .. } => false,
NodeData::Comment { .. } => !self.config.allow_comments,
NodeData::Doctype { .. } => !self.config.allow_doctype,
NodeData::Element { ref name, .. } => {
@ -176,6 +176,7 @@ impl<'arena> Sanitizer<'arena> {
NodeData::Document
| NodeData::Doctype { .. }
| NodeData::Text { .. }
| NodeData::StyleSheet { .. }
| NodeData::ProcessingInstruction { .. }
| NodeData::Comment { .. } => false,
NodeData::Element { ref name, .. } => self
@ -197,15 +198,17 @@ impl<'arena> Sanitizer<'arena> {
let all_allowed = &self.config.allowed_attributes;
let per_element_allowed = self.config.allowed_attributes_per_element.get(&name.local);
while i != attrs.len() {
if !all_allowed.contains(&attrs[i].name.local) {
if let Some(per_element_allowed) = per_element_allowed {
if per_element_allowed.contains(&attrs[i].name.local) {
i += 1;
continue;
if let Attribute::Text(attr) = &attrs[i] {
if !all_allowed.contains(&attr.name.local) {
if let Some(per_element_allowed) = per_element_allowed {
if per_element_allowed.contains(&attr.name.local) {
i += 1;
continue;
}
}
attrs.remove(i);
continue;
}
attrs.remove(i);
continue;
}
i += 1;
}
@ -225,18 +228,18 @@ impl<'arena> Sanitizer<'arena> {
self.config.add_attributes_per_element.get(&name.local);
for (name, &value) in add_attributes.iter() {
attrs.push(Attribute {
attrs.push(Attribute::Text(HTML5everAttribute {
name: QualName::new(None, ns!(), name.clone()),
value: StrTendril::from(value),
});
}));
}
if let Some(add_attributes_per_element) = add_attributes_per_element {
for (name, &value) in add_attributes_per_element.iter() {
attrs.push(Attribute {
attrs.push(Attribute::Text(HTML5everAttribute {
name: QualName::new(None, ns!(), name.clone()),
value: StrTendril::from(value),
});
}));
}
}
}
@ -254,25 +257,30 @@ impl<'arena> Sanitizer<'arena> {
if let Some(protocols) = self.config.allowed_protocols.get(&name.local) {
let mut i = 0;
while i != attrs.len() {
if let Some(allowed_protocols) = protocols.get(&attrs[i].name.local) {
match Url::parse(&attrs[i].value) {
Ok(url) => {
if !allowed_protocols.contains(&Protocol::Scheme(url.scheme())) {
if let Attribute::Text(attr) = &attrs[i] {
if let Some(allowed_protocols) = protocols.get(&attr.name.local) {
match Url::parse(&attr.value) {
Ok(url) => {
if !allowed_protocols.contains(&Protocol::Scheme(url.scheme()))
{
attrs.remove(i);
} else {
i += 1;
}
}
Err(ParseError::RelativeUrlWithoutBase) => {
if !allowed_protocols.contains(&Protocol::Relative) {
attrs.remove(i);
} else {
i += 1;
}
}
Err(_) => {
attrs.remove(i);
} else {
i += 1;
}
}
Err(ParseError::RelativeUrlWithoutBase) => {
if !allowed_protocols.contains(&Protocol::Relative) {
attrs.remove(i);
} else {
i += 1;
}
}
Err(_) => {
attrs.remove(i);
}
} else {
i += 1;
}
} else {
i += 1;
@ -282,58 +290,50 @@ impl<'arena> Sanitizer<'arena> {
}
}
fn serialize_sanitized_css_rules(&self, rules: Vec<CssRule>) -> String {
let mut sanitized_css = String::new();
for rule in rules {
match rule {
CssRule::StyleRule(style_rule) => {
sanitized_css += &style_rule.selectors;
sanitized_css += "{";
for declaration in style_rule.declarations.into_iter() {
let declaration_string = &declaration.to_string();
if self
.config
.allowed_css_properties
.contains(&CssProperty::from(declaration.property))
{
sanitized_css += declaration_string;
}
}
sanitized_css += " }";
}
fn sanitize_css_rules(&self, rules: Vec<CssRule>) -> Vec<CssRule> {
rules
.into_iter()
.filter_map(|rule| match rule {
CssRule::StyleRule(style_rule) => Some(CssRule::StyleRule(CssStyleRule {
selectors: style_rule.selectors,
declarations: style_rule
.declarations
.into_iter()
.filter(|declaration| {
self.config
.allowed_css_properties
.contains(&CssProperty::from(declaration.property.as_str()))
})
.collect(),
})),
CssRule::AtRule(at_rule) => {
if self
.config
.allowed_css_at_rules
.contains(&CssAtRule::from(at_rule.name.clone()))
.contains(&CssAtRule::from(at_rule.name.as_str()))
{
sanitized_css += &format!("@{}", &at_rule.name);
sanitized_css += &at_rule.prelude;
if let Some(block) = at_rule.block {
sanitized_css += "{";
sanitized_css += &self.serialize_sanitized_css_rules(block);
sanitized_css += " }";
} else {
sanitized_css += "; ";
}
Some(CssRule::AtRule(at_rule))
} else {
None
}
}
}
}
sanitized_css
})
.collect()
}
fn sanitize_style_tag_css(&self, node: Ref<'arena>) {
if let NodeData::Text { ref contents } = node.data {
// TODO: seems rather expensive to lookup the parent on every Text node. Better
// solution would be to pass some sort of context from the parent that marks that this
// Text node is inside a <style>.
if let Some(parent) = node.parent.get() {
if let NodeData::Element { ref name, .. } = parent.data {
if name.local == local_name!("style") {
fn sanitize_style_tag_css(&'arena self, node: Ref<'arena>) {
if let NodeData::Element { ref name, .. } = node.data {
if name.local == local_name!("style") {
// TODO: is it okay to assume <style> tags will only ever have one text node child?
if let Some(first_child) = node.first_child.take() {
if let NodeData::Text { ref contents, .. } = first_child.data {
let rules = parse_css_stylesheet(&contents.borrow());
let sanitized_css = self.serialize_sanitized_css_rules(rules);
contents.replace(StrTendril::from(sanitized_css));
let sanitized_rules = self.sanitize_css_rules(rules);
first_child.detach();
let stylesheet = self.arena.alloc(Node::new(NodeData::StyleSheet {
rules: sanitized_rules,
}));
node.append(stylesheet);
}
}
}
@ -342,25 +342,32 @@ impl<'arena> Sanitizer<'arena> {
fn sanitize_style_attribute_css(&self, node: Ref<'arena>) {
if let NodeData::Element { ref attrs, .. } = node.data {
for attr in attrs.borrow_mut().iter_mut() {
if attr.name.local == local_name!("style") {
let css_str = &attr.value;
let declarations = parse_css_style_attribute(css_str);
let mut sanitized_css = String::new();
for declaration in declarations.into_iter() {
let declaration_string = &declaration.to_string();
if self
.config
.allowed_css_properties
.contains(&CssProperty::from(declaration.property))
{
sanitized_css += declaration_string;
sanitized_css += " ";
}
let mut i = 0;
let attrs = &mut attrs.borrow_mut();
while i != attrs.len() {
if let Attribute::Text(attr) = &attrs[i] {
if attr.name.local == local_name!("style") {
let css_str = &attr.value;
let mut declarations = parse_css_style_attribute(css_str);
declarations.retain(|declaration| {
self.config
.allowed_css_properties
.contains(&CssProperty::from(declaration.property.as_str()))
});
let name = attr.name.clone();
attrs.remove(i);
attrs.insert(
i,
Attribute::Style(StyleAttribute {
name,
value: declarations,
serialized_value: None,
}),
);
}
let sanitized_css = sanitized_css.trim();
attr.value = StrTendril::from(sanitized_css);
}
i += 1;
}
}
}