Parse HTML fragments

This commit is contained in:
Tyler Hallada 2020-04-19 15:41:16 -04:00
parent 28caafb41c
commit 446aff77af
3 changed files with 71 additions and 60 deletions

View File

@ -16,7 +16,6 @@ extern crate typed_arena;
use std::borrow::Cow;
use std::cell::{Cell, RefCell};
use std::collections::HashSet;
use std::default::Default;
use std::fmt;
use std::io;
use std::ptr;
@ -24,24 +23,12 @@ use std::ptr;
use html5ever::interface::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
use html5ever::serialize::TraversalScope::{ChildrenOnly, IncludeNode};
use html5ever::serialize::{Serialize, Serializer, TraversalScope};
use html5ever::tendril::{StrTendril, TendrilSink};
use html5ever::{parse_document, Attribute, ExpandedName, LocalName, QualName};
// TODO: does this function really belong here?
/// Parses `bytes` as an HTML document, decoding the input through `from_utf8()`, and
/// allocates the document node in `arena`.
///
/// A fresh `NodeData::Document` node is created up front and handed to the `Sink` together
/// with `QuirksMode::NoQuirks`; the returned reference is whatever the sink yields once
/// `parse_document` has consumed the whole slice.
pub fn html5ever_parse_slice_into_arena<'a>(bytes: &[u8], arena: Arena<'a>) -> Ref<'a> {
    let document = arena.alloc(Node::new(NodeData::Document));
    let sink = Sink {
        arena,
        document,
        quirks_mode: QuirksMode::NoQuirks,
    };
    // Default parser options; the whole input is fed in a single chunk.
    let parser = parse_document(sink, Default::default());
    parser.from_utf8().one(bytes)
}
use html5ever::tendril::StrTendril;
use html5ever::{Attribute, ExpandedName, LocalName, QualName};
pub fn create_element<'arena>(arena: Arena<'arena>, name: &str) -> Ref<'arena> {
arena.alloc(Node::new(NodeData::Element {
name: QualName::new(None, ns!(), LocalName::from(name)),
name: QualName::new(None, ns!(html), LocalName::from(name)),
attrs: RefCell::new(vec![]),
template_contents: None,
mathml_annotation_xml_integration_point: false,
@ -55,9 +42,9 @@ pub type Ref<'arena> = &'arena Node<'arena>;
pub type Link<'arena> = Cell<Option<Ref<'arena>>>;
/// State handed to html5ever's parser entry points when building a tree inside an arena:
/// the arena itself, the document node reference, and the quirks mode.
// NOTE(review): each field appears twice below (private, then `pub`) — presumably the
// before/after lines of a diff rendered without +/- markers; confirm which set is current.
pub struct Sink<'arena> {
arena: Arena<'arena>,
document: Ref<'arena>,
quirks_mode: QuirksMode,
pub arena: Arena<'arena>,
pub document: Ref<'arena>,
pub quirks_mode: QuirksMode,
}
#[derive(Debug)]

View File

@ -53,7 +53,7 @@ fn main() {
&add_single_elements_around_ul,
],
);
let root = transformer.parse(&mut io::stdin()).unwrap();
let root = transformer.parse_fragment(&mut io::stdin()).unwrap();
transformer.traverse(root);
serialize(&mut io::stdout(), root, Default::default()).expect("serialization failed")
}
@ -103,7 +103,7 @@ fn css_rules_to_string(rules: Vec<CssRule>) -> String {
// DONE: add whitelist of CSS properties, remove any not in it
// TODO: scope selectors in rich formatter
// TODO: add class attributes to elements in rich formatter
// TODO: separate this out into multiple separate transformers
// DONE: separate this out into multiple separate transformers
// TODO: find a way to avoid passing the arena to transformer functions. It's an implementation
// detail that doesn't need to be exposed. Also, it's only needed for creating new elements.
fn sanitize_style_tag_css<'arena>(node: Ref<'arena>, _: Arena<'arena>) {

View File

@ -1,10 +1,12 @@
extern crate typed_arena;
use std::io::{Error, Read};
use crate::arena_dom::{html5ever_parse_slice_into_arena, Arena, Node, Ref};
use html5ever::interface::tree_builder::QuirksMode;
use html5ever::tendril::TendrilSink;
use html5ever::{parse_document, parse_fragment, QualName};
// TODO: What are the performance implications of using a vec of boxed closures instead of one
use crate::arena_dom::{Arena, Node, NodeData, Ref, Sink};
// TODO: What are the performance implications of using a vec of closures instead of one
// transformer function whose size is known at compile time (U: Fn(Ref<'arena>) -> bool)?
// TODO: how to integrate CSS parsing and transforming?
pub struct Transformer<'arena, T>
@ -31,10 +33,35 @@ where
}
}
/// Reads `data` to the end and parses the collected bytes as a complete HTML document
/// (decoded through `from_utf8()`), using this transformer's arena.
///
/// # Errors
///
/// Propagates the `Error` returned by `data.read_to_end`.
// NOTE(review): two signatures (`parse`, `parse_document`) and two return expressions appear
// below — presumably interleaved before/after diff lines; confirm which version is current.
pub fn parse(&'arena self, data: &mut impl Read) -> Result<Ref<'arena>, Error> {
pub fn parse_document(&'arena self, data: &mut impl Read) -> Result<Ref<'arena>, Error> {
let mut bytes = Vec::new();
data.read_to_end(&mut bytes)?;
Ok(html5ever_parse_slice_into_arena(&bytes, &self.arena))
// The sink is seeded with a freshly allocated `NodeData::Document` node and no-quirks mode.
let sink = Sink {
arena: &self.arena,
document: self.arena.alloc(Node::new(NodeData::Document)),
quirks_mode: QuirksMode::NoQuirks,
};
// Default parser options; the whole buffer is fed to the parser in one chunk.
Ok(parse_document(sink, Default::default())
.from_utf8()
.one(&bytes[..]))
}
/// Reads `data` to the end and parses the collected bytes as an HTML fragment (decoded
/// through `from_utf8()`) whose context element is `body` in the HTML namespace, with no
/// context attributes.
///
/// A fresh `NodeData::Document` node is allocated in this transformer's arena and handed to
/// the `Sink` together with `QuirksMode::NoQuirks`; the returned reference is whatever the
/// sink yields once parsing finishes.
///
/// # Errors
///
/// Propagates the `Error` returned by `data.read_to_end`.
pub fn parse_fragment(&'arena self, data: &mut impl Read) -> Result<Ref<'arena>, Error> {
    let mut input = Vec::new();
    data.read_to_end(&mut input)?;

    let document = self.arena.alloc(Node::new(NodeData::Document));
    let sink = Sink {
        arena: &self.arena,
        document,
        quirks_mode: QuirksMode::NoQuirks,
    };

    // The fragment is parsed as though it were the content of a `<body>` element.
    let context_name = QualName::new(None, ns!(html), local_name!("body"));
    let parser = parse_fragment(sink, Default::default(), context_name, Vec::new());
    Ok(parser.from_utf8().one(input.as_slice()))
}
pub fn traverse(&'arena self, node: Ref<'arena>) {
@ -92,39 +119,39 @@ mod test {
}
// fn node_contains_tag<'arena>(node: Ref<'arena>, tag_name: &str) -> bool {
// if let NodeData::Element { ref name, .. } = node.data {
// if name.local == LocalName::from(tag_name) {
// return true;
// }
// }
// if let NodeData::Element { ref name, .. } = node.data {
// if name.local == LocalName::from(tag_name) {
// return true;
// }
// }
// if let Some(child) = node.first_child.get() {
// if node_contains_tag(child, tag_name) {
// return true;
// }
// }
// if let Some(child) = node.first_child.get() {
// if node_contains_tag(child, tag_name) {
// return true;
// }
// }
// if let Some(sibling) = node.next_sibling.get() {
// if node_contains_tag(sibling, tag_name) {
// return true;
// }
// }
// if let Some(sibling) = node.next_sibling.get() {
// if node_contains_tag(sibling, tag_name) {
// return true;
// }
// }
// false
// false
// }
// fn count_nodes(node: Ref) -> usize {
// let mut count = 1;
// let mut count = 1;
// if let Some(child) = node.first_child.get() {
// count += count_nodes(child);
// }
// if let Some(child) = node.first_child.get() {
// count += count_nodes(child);
// }
// if let Some(sibling) = node.next_sibling.get() {
// count += count_nodes(sibling);
// }
// if let Some(sibling) = node.next_sibling.get() {
// count += count_nodes(sibling);
// }
// count
// count
// }
fn assert_serialized_html_eq(node: Ref, expected: &str) {
@ -137,9 +164,9 @@ mod test {
// Parses `<div></div>`, traverses it with a transformer built from a closure that always
// returns `false` plus a single no-op transformer function, and compares the serialized tree
// against the expected markup.
// NOTE(review): the duplicated `let root` and assertion lines below look like interleaved
// before/after diff lines — confirm which pair is current.
fn traversal() {
let transformer = Transformer::new(|_| false, vec![&|_, _| {}]);
let mut mock_data = MockRead::new("<div></div>");
let root = transformer.parse(&mut mock_data).unwrap();
let root = transformer.parse_fragment(&mut mock_data).unwrap();
transformer.traverse(root);
assert_serialized_html_eq(root, "<html><head></head><body><div></div></body></html>");
assert_serialized_html_eq(root, "<html><div></div></html>");
}
#[test]
@ -154,9 +181,9 @@ mod test {
vec![&|_, _| {}],
);
let mut mock_data = MockRead::new("<div></div>");
let root = transformer.parse(&mut mock_data).unwrap();
let root = transformer.parse_fragment(&mut mock_data).unwrap();
transformer.traverse(root);
assert_serialized_html_eq(root, "<html><head></head><body></body></html>");
assert_serialized_html_eq(root, "<html></html>");
}
#[test]
@ -172,11 +199,8 @@ mod test {
}],
);
let mut mock_data = MockRead::new("<div></div>");
let root = transformer.parse(&mut mock_data).unwrap();
let root = transformer.parse_fragment(&mut mock_data).unwrap();
transformer.traverse(root);
assert_serialized_html_eq(
root,
"<html><head></head><body><div></div><span></span></body></html>",
);
assert_serialized_html_eq(root, "<html><div></div><span></span></html>");
}
}