Save parsed CSS to the arena_dom tree
Now, user transformer functions have access to the parsed CSS rules and declarations directly on the `NodeData`. Serialization of CSS back to a string is deferred to when the rest of the HTML tree is serialized to a string.
This commit is contained in:
parent
269f296d48
commit
519c4067b7
121
src/arena_dom.rs
121
src/arena_dom.rs
@ -24,7 +24,9 @@ use html5ever::interface::tree_builder::{ElementFlags, NodeOrText, QuirksMode, T
|
||||
use html5ever::serialize::TraversalScope::{ChildrenOnly, IncludeNode};
|
||||
use html5ever::serialize::{Serialize, Serializer, TraversalScope};
|
||||
use html5ever::tendril::StrTendril;
|
||||
use html5ever::{Attribute, ExpandedName, LocalName, QualName};
|
||||
use html5ever::{Attribute as HTML5everAttribute, ExpandedName, LocalName, QualName};
|
||||
|
||||
use crate::css_parser::{CssDeclaration, CssRule};
|
||||
|
||||
pub fn create_element<'arena>(arena: Arena<'arena>, name: &str) -> Ref<'arena> {
|
||||
arena.alloc(Node::new(NodeData::Element {
|
||||
@ -57,6 +59,23 @@ pub struct Node<'arena> {
|
||||
pub data: NodeData<'arena>,
|
||||
}
|
||||
|
||||
/// A `style` attribute whose value is kept as parsed CSS declarations rather than as text.
#[derive(Debug)]
|
||||
pub struct StyleAttribute {
|
||||
/// Qualified name of the attribute.
pub name: QualName,
|
||||
/// Parsed CSS declarations that make up the attribute's value.
pub value: Vec<CssDeclaration>,
|
||||
// Need to store the serialized value to the arena because html5ever expects a &str for
|
||||
// attribute values during serialization. If this is None, `serialize` will construct a String
|
||||
// from serializing the `CssDeclaration`s, store it here, and then reference it with
|
||||
// `.as_str()`.
|
||||
pub serialized_value: Option<String>,
|
||||
}
|
||||
|
||||
/// An element attribute as stored on the arena DOM: either a parsed `style` attribute or a
/// plain html5ever attribute.
#[derive(Debug)]
|
||||
pub enum Attribute {
|
||||
/// Attribute whose value is held as parsed CSS declarations (see `StyleAttribute`).
Style(StyleAttribute),
|
||||
/// Attribute kept exactly as html5ever produced it (name plus text value).
Text(HTML5everAttribute),
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum NodeData<'arena> {
|
||||
Document,
|
||||
@ -68,6 +87,9 @@ pub enum NodeData<'arena> {
|
||||
Text {
|
||||
contents: RefCell<StrTendril>,
|
||||
},
|
||||
StyleSheet {
|
||||
rules: Vec<CssRule>,
|
||||
},
|
||||
Comment {
|
||||
contents: StrTendril,
|
||||
},
|
||||
@ -232,6 +254,14 @@ impl<'arena> fmt::Display for NodeData<'arena> {
|
||||
"Text: {}...",
|
||||
&contents.borrow().chars().take(10).collect::<String>()
|
||||
),
|
||||
NodeData::StyleSheet { rules } => write!(
|
||||
f,
|
||||
"Stylesheet: {}...",
|
||||
&serialize_css_rules(rules)
|
||||
.chars()
|
||||
.take(10)
|
||||
.collect::<String>()
|
||||
),
|
||||
NodeData::ProcessingInstruction { .. } => write!(f, "ProcessingInstruction: ..."),
|
||||
NodeData::Comment { contents } => write!(
|
||||
f,
|
||||
@ -379,12 +409,17 @@ impl<'arena> TreeSink for Sink<'arena> {
|
||||
fn create_element(
|
||||
&mut self,
|
||||
name: QualName,
|
||||
attrs: Vec<Attribute>,
|
||||
attrs: Vec<HTML5everAttribute>,
|
||||
flags: ElementFlags,
|
||||
) -> Ref<'arena> {
|
||||
self.new_node(NodeData::Element {
|
||||
name,
|
||||
attrs: RefCell::new(attrs),
|
||||
attrs: RefCell::new(
|
||||
attrs
|
||||
.into_iter()
|
||||
.map(|attr| Attribute::Text(attr))
|
||||
.collect(),
|
||||
),
|
||||
template_contents: if flags.template {
|
||||
Some(self.new_node(NodeData::Document))
|
||||
} else {
|
||||
@ -447,7 +482,7 @@ impl<'arena> TreeSink for Sink<'arena> {
|
||||
}))
|
||||
}
|
||||
|
||||
fn add_attrs_if_missing(&mut self, target: &Ref<'arena>, attrs: Vec<Attribute>) {
|
||||
fn add_attrs_if_missing(&mut self, target: &Ref<'arena>, attrs: Vec<HTML5everAttribute>) {
|
||||
let mut existing = if let NodeData::Element { ref attrs, .. } = target.data {
|
||||
attrs.borrow_mut()
|
||||
} else {
|
||||
@ -456,13 +491,18 @@ impl<'arena> TreeSink for Sink<'arena> {
|
||||
|
||||
let existing_names = existing
|
||||
.iter()
|
||||
.map(|e| e.name.clone())
|
||||
.map(|e| match e {
|
||||
Attribute::Style(attr) => attr.name.clone(),
|
||||
Attribute::Text(attr) => attr.name.clone(),
|
||||
})
|
||||
.collect::<HashSet<_>>();
|
||||
existing.extend(
|
||||
attrs
|
||||
.into_iter()
|
||||
.filter(|attr| !existing_names.contains(&attr.name)),
|
||||
);
|
||||
existing.extend(attrs.into_iter().filter_map(|attr| {
|
||||
if !existing_names.contains(&attr.name) {
|
||||
Some(Attribute::Text(attr))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}));
|
||||
}
|
||||
|
||||
fn remove_from_parent(&mut self, target: &Ref<'arena>) {
|
||||
@ -479,6 +519,47 @@ impl<'arena> TreeSink for Sink<'arena> {
|
||||
}
|
||||
}
|
||||
|
||||
fn serialize_css_rules(rules: &[CssRule]) -> String {
|
||||
let mut serialized_rules = String::new();
|
||||
for rule in rules {
|
||||
match rule {
|
||||
CssRule::StyleRule(style_rule) => {
|
||||
serialized_rules += &style_rule.selectors;
|
||||
serialized_rules += "{";
|
||||
for declaration in style_rule.declarations.iter() {
|
||||
serialized_rules += &declaration.to_string();
|
||||
}
|
||||
serialized_rules += &serialize_css_declarations(&style_rule.declarations);
|
||||
serialized_rules += " }";
|
||||
}
|
||||
CssRule::AtRule(at_rule) => {
|
||||
serialized_rules += "@";
|
||||
serialized_rules += &at_rule.name;
|
||||
serialized_rules += &at_rule.prelude;
|
||||
if let Some(block) = &at_rule.block {
|
||||
serialized_rules += "{";
|
||||
serialized_rules += &serialize_css_rules(&block);
|
||||
serialized_rules += " }";
|
||||
} else {
|
||||
serialized_rules += "; ";
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
serialized_rules
|
||||
}
|
||||
|
||||
fn serialize_css_declarations(declarations: &[CssDeclaration]) -> String {
|
||||
let mut serialized_declarations = String::new();
|
||||
for (index, declaration) in declarations.iter().enumerate() {
|
||||
serialized_declarations += &declaration.to_string();
|
||||
if index != declarations.len() - 1 {
|
||||
serialized_declarations += " ";
|
||||
}
|
||||
}
|
||||
serialized_declarations
|
||||
}
|
||||
|
||||
// Implementation adapted from implementation for RcDom:
|
||||
// https://github.com/servo/html5ever/blob/45b2fca5c6/markup5ever/rcdom.rs#L410
|
||||
impl<'arena> Serialize for Node<'arena> {
|
||||
@ -498,7 +579,22 @@ impl<'arena> Serialize for Node<'arena> {
|
||||
if traversal_scope == IncludeNode {
|
||||
serializer.start_elem(
|
||||
name.clone(),
|
||||
attrs.borrow().iter().map(|at| (&at.name, &at.value[..])),
|
||||
attrs.borrow_mut().iter_mut().map(|at| match at {
|
||||
Attribute::Style(at) => {
|
||||
if at.serialized_value.is_none() {
|
||||
let serialized_declaration =
|
||||
serialize_css_declarations(&at.value);
|
||||
at.serialized_value = Some(serialized_declaration);
|
||||
}
|
||||
|
||||
if let Some(serialized_declarations) = &at.serialized_value {
|
||||
(&at.name, serialized_declarations.as_str())
|
||||
} else {
|
||||
panic!("Serialized style attribute value was not saved to the arena");
|
||||
}
|
||||
}
|
||||
Attribute::Text(at) => (&at.name, &at.value[..]),
|
||||
}),
|
||||
)?;
|
||||
}
|
||||
|
||||
@ -525,6 +621,9 @@ impl<'arena> Serialize for Node<'arena> {
|
||||
(&IncludeNode, &NodeData::Text { ref contents }) => {
|
||||
serializer.write_text(&contents.borrow())?
|
||||
}
|
||||
(&IncludeNode, &NodeData::StyleSheet { ref rules }) => {
|
||||
serializer.write_text(&serialize_css_rules(rules))?
|
||||
}
|
||||
(&IncludeNode, &NodeData::Comment { ref contents }) => {
|
||||
serializer.write_comment(&contents)?
|
||||
}
|
||||
|
149
src/sanitizer.rs
149
src/sanitizer.rs
@ -4,11 +4,13 @@ use url::{ParseError, Url};
|
||||
|
||||
use html5ever::interface::tree_builder::QuirksMode;
|
||||
use html5ever::tendril::{format_tendril, StrTendril, TendrilSink};
|
||||
use html5ever::{parse_document, parse_fragment, serialize, Attribute, LocalName, QualName};
|
||||
use html5ever::{
|
||||
parse_document, parse_fragment, serialize, Attribute as HTML5everAttribute, LocalName, QualName,
|
||||
};
|
||||
|
||||
use crate::arena_dom::{Arena, Node, NodeData, Ref, Sink};
|
||||
use crate::arena_dom::{Arena, Attribute, Node, NodeData, Ref, Sink, StyleAttribute};
|
||||
use crate::css_at_rule::CssAtRule;
|
||||
use crate::css_parser::{parse_css_style_attribute, parse_css_stylesheet, CssRule};
|
||||
use crate::css_parser::{parse_css_style_attribute, parse_css_stylesheet, CssRule, CssStyleRule};
|
||||
use crate::css_property::CssProperty;
|
||||
|
||||
pub struct Sanitizer<'arena> {
|
||||
@ -139,9 +141,6 @@ impl<'arena> Sanitizer<'arena> {
|
||||
self.remove_attributes(node);
|
||||
self.add_attributes(node);
|
||||
self.sanitize_attribute_protocols(node);
|
||||
// TODO: save the parsed CSS syntax tree from these methods onto the arena dom so that
|
||||
// user-created transformers below will have access to modify them without having to
|
||||
// re-parse.
|
||||
self.sanitize_style_tag_css(node);
|
||||
self.sanitize_style_attribute_css(node);
|
||||
|
||||
@ -160,9 +159,10 @@ impl<'arena> Sanitizer<'arena> {
|
||||
|
||||
fn should_unwrap_node(&self, node: Ref) -> bool {
|
||||
match node.data {
|
||||
NodeData::Document | NodeData::Text { .. } | NodeData::ProcessingInstruction { .. } => {
|
||||
false
|
||||
}
|
||||
NodeData::Document
|
||||
| NodeData::Text { .. }
|
||||
| NodeData::StyleSheet { .. }
|
||||
| NodeData::ProcessingInstruction { .. } => false,
|
||||
NodeData::Comment { .. } => !self.config.allow_comments,
|
||||
NodeData::Doctype { .. } => !self.config.allow_doctype,
|
||||
NodeData::Element { ref name, .. } => {
|
||||
@ -176,6 +176,7 @@ impl<'arena> Sanitizer<'arena> {
|
||||
NodeData::Document
|
||||
| NodeData::Doctype { .. }
|
||||
| NodeData::Text { .. }
|
||||
| NodeData::StyleSheet { .. }
|
||||
| NodeData::ProcessingInstruction { .. }
|
||||
| NodeData::Comment { .. } => false,
|
||||
NodeData::Element { ref name, .. } => self
|
||||
@ -197,9 +198,10 @@ impl<'arena> Sanitizer<'arena> {
|
||||
let all_allowed = &self.config.allowed_attributes;
|
||||
let per_element_allowed = self.config.allowed_attributes_per_element.get(&name.local);
|
||||
while i != attrs.len() {
|
||||
if !all_allowed.contains(&attrs[i].name.local) {
|
||||
if let Attribute::Text(attr) = &attrs[i] {
|
||||
if !all_allowed.contains(&attr.name.local) {
|
||||
if let Some(per_element_allowed) = per_element_allowed {
|
||||
if per_element_allowed.contains(&attrs[i].name.local) {
|
||||
if per_element_allowed.contains(&attr.name.local) {
|
||||
i += 1;
|
||||
continue;
|
||||
}
|
||||
@ -207,6 +209,7 @@ impl<'arena> Sanitizer<'arena> {
|
||||
attrs.remove(i);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
@ -225,18 +228,18 @@ impl<'arena> Sanitizer<'arena> {
|
||||
self.config.add_attributes_per_element.get(&name.local);
|
||||
|
||||
for (name, &value) in add_attributes.iter() {
|
||||
attrs.push(Attribute {
|
||||
attrs.push(Attribute::Text(HTML5everAttribute {
|
||||
name: QualName::new(None, ns!(), name.clone()),
|
||||
value: StrTendril::from(value),
|
||||
});
|
||||
}));
|
||||
}
|
||||
|
||||
if let Some(add_attributes_per_element) = add_attributes_per_element {
|
||||
for (name, &value) in add_attributes_per_element.iter() {
|
||||
attrs.push(Attribute {
|
||||
attrs.push(Attribute::Text(HTML5everAttribute {
|
||||
name: QualName::new(None, ns!(), name.clone()),
|
||||
value: StrTendril::from(value),
|
||||
});
|
||||
}));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -254,10 +257,12 @@ impl<'arena> Sanitizer<'arena> {
|
||||
if let Some(protocols) = self.config.allowed_protocols.get(&name.local) {
|
||||
let mut i = 0;
|
||||
while i != attrs.len() {
|
||||
if let Some(allowed_protocols) = protocols.get(&attrs[i].name.local) {
|
||||
match Url::parse(&attrs[i].value) {
|
||||
if let Attribute::Text(attr) = &attrs[i] {
|
||||
if let Some(allowed_protocols) = protocols.get(&attr.name.local) {
|
||||
match Url::parse(&attr.value) {
|
||||
Ok(url) => {
|
||||
if !allowed_protocols.contains(&Protocol::Scheme(url.scheme())) {
|
||||
if !allowed_protocols.contains(&Protocol::Scheme(url.scheme()))
|
||||
{
|
||||
attrs.remove(i);
|
||||
} else {
|
||||
i += 1;
|
||||
@ -277,63 +282,58 @@ impl<'arena> Sanitizer<'arena> {
|
||||
} else {
|
||||
i += 1;
|
||||
}
|
||||
} else {
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn serialize_sanitized_css_rules(&self, rules: Vec<CssRule>) -> String {
|
||||
let mut sanitized_css = String::new();
|
||||
for rule in rules {
|
||||
match rule {
|
||||
CssRule::StyleRule(style_rule) => {
|
||||
sanitized_css += &style_rule.selectors;
|
||||
sanitized_css += "{";
|
||||
for declaration in style_rule.declarations.into_iter() {
|
||||
let declaration_string = &declaration.to_string();
|
||||
if self
|
||||
.config
|
||||
fn sanitize_css_rules(&self, rules: Vec<CssRule>) -> Vec<CssRule> {
|
||||
rules
|
||||
.into_iter()
|
||||
.filter_map(|rule| match rule {
|
||||
CssRule::StyleRule(style_rule) => Some(CssRule::StyleRule(CssStyleRule {
|
||||
selectors: style_rule.selectors,
|
||||
declarations: style_rule
|
||||
.declarations
|
||||
.into_iter()
|
||||
.filter(|declaration| {
|
||||
self.config
|
||||
.allowed_css_properties
|
||||
.contains(&CssProperty::from(declaration.property))
|
||||
{
|
||||
sanitized_css += declaration_string;
|
||||
}
|
||||
}
|
||||
sanitized_css += " }";
|
||||
}
|
||||
.contains(&CssProperty::from(declaration.property.as_str()))
|
||||
})
|
||||
.collect(),
|
||||
})),
|
||||
CssRule::AtRule(at_rule) => {
|
||||
if self
|
||||
.config
|
||||
.allowed_css_at_rules
|
||||
.contains(&CssAtRule::from(at_rule.name.clone()))
|
||||
.contains(&CssAtRule::from(at_rule.name.as_str()))
|
||||
{
|
||||
sanitized_css += &format!("@{}", &at_rule.name);
|
||||
sanitized_css += &at_rule.prelude;
|
||||
if let Some(block) = at_rule.block {
|
||||
sanitized_css += "{";
|
||||
sanitized_css += &self.serialize_sanitized_css_rules(block);
|
||||
sanitized_css += " }";
|
||||
Some(CssRule::AtRule(at_rule))
|
||||
} else {
|
||||
sanitized_css += "; ";
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
sanitized_css
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn sanitize_style_tag_css(&self, node: Ref<'arena>) {
|
||||
if let NodeData::Text { ref contents } = node.data {
|
||||
// TODO: seems rather expensive to lookup the parent on every Text node. Better
|
||||
// solution would be to pass some sort of context from the parent that marks that this
|
||||
// Text node is inside a <style>.
|
||||
if let Some(parent) = node.parent.get() {
|
||||
if let NodeData::Element { ref name, .. } = parent.data {
|
||||
fn sanitize_style_tag_css(&'arena self, node: Ref<'arena>) {
|
||||
if let NodeData::Element { ref name, .. } = node.data {
|
||||
if name.local == local_name!("style") {
|
||||
// TODO: is it okay to assume <style> tags will only ever have one text node child?
|
||||
if let Some(first_child) = node.first_child.take() {
|
||||
if let NodeData::Text { ref contents, .. } = first_child.data {
|
||||
let rules = parse_css_stylesheet(&contents.borrow());
|
||||
let sanitized_css = self.serialize_sanitized_css_rules(rules);
|
||||
contents.replace(StrTendril::from(sanitized_css));
|
||||
let sanitized_rules = self.sanitize_css_rules(rules);
|
||||
first_child.detach();
|
||||
let stylesheet = self.arena.alloc(Node::new(NodeData::StyleSheet {
|
||||
rules: sanitized_rules,
|
||||
}));
|
||||
node.append(stylesheet);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -342,25 +342,32 @@ impl<'arena> Sanitizer<'arena> {
|
||||
|
||||
fn sanitize_style_attribute_css(&self, node: Ref<'arena>) {
|
||||
if let NodeData::Element { ref attrs, .. } = node.data {
|
||||
for attr in attrs.borrow_mut().iter_mut() {
|
||||
let mut i = 0;
|
||||
let attrs = &mut attrs.borrow_mut();
|
||||
|
||||
while i != attrs.len() {
|
||||
if let Attribute::Text(attr) = &attrs[i] {
|
||||
if attr.name.local == local_name!("style") {
|
||||
let css_str = &attr.value;
|
||||
let declarations = parse_css_style_attribute(css_str);
|
||||
let mut sanitized_css = String::new();
|
||||
for declaration in declarations.into_iter() {
|
||||
let declaration_string = &declaration.to_string();
|
||||
if self
|
||||
.config
|
||||
let mut declarations = parse_css_style_attribute(css_str);
|
||||
declarations.retain(|declaration| {
|
||||
self.config
|
||||
.allowed_css_properties
|
||||
.contains(&CssProperty::from(declaration.property))
|
||||
{
|
||||
sanitized_css += declaration_string;
|
||||
sanitized_css += " ";
|
||||
.contains(&CssProperty::from(declaration.property.as_str()))
|
||||
});
|
||||
let name = attr.name.clone();
|
||||
attrs.remove(i);
|
||||
attrs.insert(
|
||||
i,
|
||||
Attribute::Style(StyleAttribute {
|
||||
name,
|
||||
value: declarations,
|
||||
serialized_value: None,
|
||||
}),
|
||||
);
|
||||
}
|
||||
}
|
||||
let sanitized_css = sanitized_css.trim();
|
||||
attr.value = StrTendril::from(sanitized_css);
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user