Parse HTML fragments

This commit is contained in:
Tyler Hallada 2020-04-19 15:41:16 -04:00
parent 28caafb41c
commit 446aff77af
3 changed files with 71 additions and 60 deletions

View File

@ -16,7 +16,6 @@ extern crate typed_arena;
use std::borrow::Cow; use std::borrow::Cow;
use std::cell::{Cell, RefCell}; use std::cell::{Cell, RefCell};
use std::collections::HashSet; use std::collections::HashSet;
use std::default::Default;
use std::fmt; use std::fmt;
use std::io; use std::io;
use std::ptr; use std::ptr;
@ -24,24 +23,12 @@ use std::ptr;
use html5ever::interface::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink}; use html5ever::interface::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
use html5ever::serialize::TraversalScope::{ChildrenOnly, IncludeNode}; use html5ever::serialize::TraversalScope::{ChildrenOnly, IncludeNode};
use html5ever::serialize::{Serialize, Serializer, TraversalScope}; use html5ever::serialize::{Serialize, Serializer, TraversalScope};
use html5ever::tendril::{StrTendril, TendrilSink}; use html5ever::tendril::StrTendril;
use html5ever::{parse_document, Attribute, ExpandedName, LocalName, QualName}; use html5ever::{Attribute, ExpandedName, LocalName, QualName};
// TODO: does this function really belong here?
pub fn html5ever_parse_slice_into_arena<'a>(bytes: &[u8], arena: Arena<'a>) -> Ref<'a> {
let sink = Sink {
arena,
document: arena.alloc(Node::new(NodeData::Document)),
quirks_mode: QuirksMode::NoQuirks,
};
parse_document(sink, Default::default())
.from_utf8()
.one(bytes)
}
pub fn create_element<'arena>(arena: Arena<'arena>, name: &str) -> Ref<'arena> { pub fn create_element<'arena>(arena: Arena<'arena>, name: &str) -> Ref<'arena> {
arena.alloc(Node::new(NodeData::Element { arena.alloc(Node::new(NodeData::Element {
name: QualName::new(None, ns!(), LocalName::from(name)), name: QualName::new(None, ns!(html), LocalName::from(name)),
attrs: RefCell::new(vec![]), attrs: RefCell::new(vec![]),
template_contents: None, template_contents: None,
mathml_annotation_xml_integration_point: false, mathml_annotation_xml_integration_point: false,
@ -55,9 +42,9 @@ pub type Ref<'arena> = &'arena Node<'arena>;
pub type Link<'arena> = Cell<Option<Ref<'arena>>>; pub type Link<'arena> = Cell<Option<Ref<'arena>>>;
pub struct Sink<'arena> { pub struct Sink<'arena> {
arena: Arena<'arena>, pub arena: Arena<'arena>,
document: Ref<'arena>, pub document: Ref<'arena>,
quirks_mode: QuirksMode, pub quirks_mode: QuirksMode,
} }
#[derive(Debug)] #[derive(Debug)]

View File

@ -53,7 +53,7 @@ fn main() {
&add_single_elements_around_ul, &add_single_elements_around_ul,
], ],
); );
let root = transformer.parse(&mut io::stdin()).unwrap(); let root = transformer.parse_fragment(&mut io::stdin()).unwrap();
transformer.traverse(root); transformer.traverse(root);
serialize(&mut io::stdout(), root, Default::default()).expect("serialization failed") serialize(&mut io::stdout(), root, Default::default()).expect("serialization failed")
} }
@ -103,7 +103,7 @@ fn css_rules_to_string(rules: Vec<CssRule>) -> String {
// DONE: add whitelist of CSS properties, remove any not in it // DONE: add whitelist of CSS properties, remove any not in it
// TODO: scope selectors in rich formatter // TODO: scope selectors in rich formatter
// TODO: add class attributes to elements in rich formatter // TODO: add class attributes to elements in rich formatter
// TODO: separate this out into multiple separate transformers // DONE: separate this out into multiple separate transformers
// TODO: find a way to avoid passing the arena to transformer functions. It's an implementation // TODO: find a way to avoid passing the arena to transformer functions. It's an implementation
// detail that doesn't need to be exposed. Also, it's only needed for creating new elements. // detail that doesn't need to be exposed. Also, it's only needed for creating new elements.
fn sanitize_style_tag_css<'arena>(node: Ref<'arena>, _: Arena<'arena>) { fn sanitize_style_tag_css<'arena>(node: Ref<'arena>, _: Arena<'arena>) {

View File

@ -1,10 +1,12 @@
extern crate typed_arena;
use std::io::{Error, Read}; use std::io::{Error, Read};
use crate::arena_dom::{html5ever_parse_slice_into_arena, Arena, Node, Ref}; use html5ever::interface::tree_builder::QuirksMode;
use html5ever::tendril::TendrilSink;
use html5ever::{parse_document, parse_fragment, QualName};
// TODO: What are the performance implications of using a vec of boxed closures instead of one use crate::arena_dom::{Arena, Node, NodeData, Ref, Sink};
// TODO: What are the performance implications of using a vec of closures instead of one
// transformer function whose size is known at compile time (U: Fn(Ref<'arena>) -> bool)? // transformer function whose size is known at compile time (U: Fn(Ref<'arena>) -> bool)?
// TODO: how to integrate CSS parsing and transforming? // TODO: how to integrate CSS parsing and transforming?
pub struct Transformer<'arena, T> pub struct Transformer<'arena, T>
@ -31,10 +33,35 @@ where
} }
} }
pub fn parse(&'arena self, data: &mut impl Read) -> Result<Ref<'arena>, Error> { pub fn parse_document(&'arena self, data: &mut impl Read) -> Result<Ref<'arena>, Error> {
let mut bytes = Vec::new(); let mut bytes = Vec::new();
data.read_to_end(&mut bytes)?; data.read_to_end(&mut bytes)?;
Ok(html5ever_parse_slice_into_arena(&bytes, &self.arena)) let sink = Sink {
arena: &self.arena,
document: self.arena.alloc(Node::new(NodeData::Document)),
quirks_mode: QuirksMode::NoQuirks,
};
Ok(parse_document(sink, Default::default())
.from_utf8()
.one(&bytes[..]))
}
pub fn parse_fragment(&'arena self, data: &mut impl Read) -> Result<Ref<'arena>, Error> {
let mut bytes = Vec::new();
data.read_to_end(&mut bytes)?;
let sink = Sink {
arena: &self.arena,
document: self.arena.alloc(Node::new(NodeData::Document)),
quirks_mode: QuirksMode::NoQuirks,
};
Ok(parse_fragment(
sink,
Default::default(),
QualName::new(None, ns!(html), local_name!("body")),
vec![],
)
.from_utf8()
.one(&bytes[..]))
} }
pub fn traverse(&'arena self, node: Ref<'arena>) { pub fn traverse(&'arena self, node: Ref<'arena>) {
@ -92,39 +119,39 @@ mod test {
} }
// fn node_contains_tag<'arena>(node: Ref<'arena>, tag_name: &str) -> bool { // fn node_contains_tag<'arena>(node: Ref<'arena>, tag_name: &str) -> bool {
// if let NodeData::Element { ref name, .. } = node.data { // if let NodeData::Element { ref name, .. } = node.data {
// if name.local == LocalName::from(tag_name) { // if name.local == LocalName::from(tag_name) {
// return true; // return true;
// } // }
// } // }
// if let Some(child) = node.first_child.get() { // if let Some(child) = node.first_child.get() {
// if node_contains_tag(child, tag_name) { // if node_contains_tag(child, tag_name) {
// return true; // return true;
// } // }
// } // }
// if let Some(sibling) = node.next_sibling.get() { // if let Some(sibling) = node.next_sibling.get() {
// if node_contains_tag(sibling, tag_name) { // if node_contains_tag(sibling, tag_name) {
// return true; // return true;
// } // }
// } // }
// false // false
// } // }
// fn count_nodes(node: Ref) -> usize { // fn count_nodes(node: Ref) -> usize {
// let mut count = 1; // let mut count = 1;
// if let Some(child) = node.first_child.get() { // if let Some(child) = node.first_child.get() {
// count += count_nodes(child); // count += count_nodes(child);
// } // }
// if let Some(sibling) = node.next_sibling.get() { // if let Some(sibling) = node.next_sibling.get() {
// count += count_nodes(sibling); // count += count_nodes(sibling);
// } // }
// count // count
// } // }
fn assert_serialized_html_eq(node: Ref, expected: &str) { fn assert_serialized_html_eq(node: Ref, expected: &str) {
@ -137,9 +164,9 @@ mod test {
fn traversal() { fn traversal() {
let transformer = Transformer::new(|_| false, vec![&|_, _| {}]); let transformer = Transformer::new(|_| false, vec![&|_, _| {}]);
let mut mock_data = MockRead::new("<div></div>"); let mut mock_data = MockRead::new("<div></div>");
let root = transformer.parse(&mut mock_data).unwrap(); let root = transformer.parse_fragment(&mut mock_data).unwrap();
transformer.traverse(root); transformer.traverse(root);
assert_serialized_html_eq(root, "<html><head></head><body><div></div></body></html>"); assert_serialized_html_eq(root, "<html><div></div></html>");
} }
#[test] #[test]
@ -154,9 +181,9 @@ mod test {
vec![&|_, _| {}], vec![&|_, _| {}],
); );
let mut mock_data = MockRead::new("<div></div>"); let mut mock_data = MockRead::new("<div></div>");
let root = transformer.parse(&mut mock_data).unwrap(); let root = transformer.parse_fragment(&mut mock_data).unwrap();
transformer.traverse(root); transformer.traverse(root);
assert_serialized_html_eq(root, "<html><head></head><body></body></html>"); assert_serialized_html_eq(root, "<html></html>");
} }
#[test] #[test]
@ -172,11 +199,8 @@ mod test {
}], }],
); );
let mut mock_data = MockRead::new("<div></div>"); let mut mock_data = MockRead::new("<div></div>");
let root = transformer.parse(&mut mock_data).unwrap(); let root = transformer.parse_fragment(&mut mock_data).unwrap();
transformer.traverse(root); transformer.traverse(root);
assert_serialized_html_eq( assert_serialized_html_eq(root, "<html><div></div><span></span></html>");
root,
"<html><head></head><body><div></div><span></span></body></html>",
);
} }
} }