Parse HTML fragments
This commit is contained in:
parent
28caafb41c
commit
446aff77af
@ -16,7 +16,6 @@ extern crate typed_arena;
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::cell::{Cell, RefCell};
|
use std::cell::{Cell, RefCell};
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
use std::default::Default;
|
|
||||||
use std::fmt;
|
use std::fmt;
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::ptr;
|
use std::ptr;
|
||||||
@ -24,24 +23,12 @@ use std::ptr;
|
|||||||
use html5ever::interface::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
|
use html5ever::interface::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
|
||||||
use html5ever::serialize::TraversalScope::{ChildrenOnly, IncludeNode};
|
use html5ever::serialize::TraversalScope::{ChildrenOnly, IncludeNode};
|
||||||
use html5ever::serialize::{Serialize, Serializer, TraversalScope};
|
use html5ever::serialize::{Serialize, Serializer, TraversalScope};
|
||||||
use html5ever::tendril::{StrTendril, TendrilSink};
|
use html5ever::tendril::StrTendril;
|
||||||
use html5ever::{parse_document, Attribute, ExpandedName, LocalName, QualName};
|
use html5ever::{Attribute, ExpandedName, LocalName, QualName};
|
||||||
|
|
||||||
// TODO: does this function really belong here?
|
|
||||||
pub fn html5ever_parse_slice_into_arena<'a>(bytes: &[u8], arena: Arena<'a>) -> Ref<'a> {
|
|
||||||
let sink = Sink {
|
|
||||||
arena,
|
|
||||||
document: arena.alloc(Node::new(NodeData::Document)),
|
|
||||||
quirks_mode: QuirksMode::NoQuirks,
|
|
||||||
};
|
|
||||||
parse_document(sink, Default::default())
|
|
||||||
.from_utf8()
|
|
||||||
.one(bytes)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn create_element<'arena>(arena: Arena<'arena>, name: &str) -> Ref<'arena> {
|
pub fn create_element<'arena>(arena: Arena<'arena>, name: &str) -> Ref<'arena> {
|
||||||
arena.alloc(Node::new(NodeData::Element {
|
arena.alloc(Node::new(NodeData::Element {
|
||||||
name: QualName::new(None, ns!(), LocalName::from(name)),
|
name: QualName::new(None, ns!(html), LocalName::from(name)),
|
||||||
attrs: RefCell::new(vec![]),
|
attrs: RefCell::new(vec![]),
|
||||||
template_contents: None,
|
template_contents: None,
|
||||||
mathml_annotation_xml_integration_point: false,
|
mathml_annotation_xml_integration_point: false,
|
||||||
@ -55,9 +42,9 @@ pub type Ref<'arena> = &'arena Node<'arena>;
|
|||||||
pub type Link<'arena> = Cell<Option<Ref<'arena>>>;
|
pub type Link<'arena> = Cell<Option<Ref<'arena>>>;
|
||||||
|
|
||||||
pub struct Sink<'arena> {
|
pub struct Sink<'arena> {
|
||||||
arena: Arena<'arena>,
|
pub arena: Arena<'arena>,
|
||||||
document: Ref<'arena>,
|
pub document: Ref<'arena>,
|
||||||
quirks_mode: QuirksMode,
|
pub quirks_mode: QuirksMode,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
|
@ -53,7 +53,7 @@ fn main() {
|
|||||||
&add_single_elements_around_ul,
|
&add_single_elements_around_ul,
|
||||||
],
|
],
|
||||||
);
|
);
|
||||||
let root = transformer.parse(&mut io::stdin()).unwrap();
|
let root = transformer.parse_fragment(&mut io::stdin()).unwrap();
|
||||||
transformer.traverse(root);
|
transformer.traverse(root);
|
||||||
serialize(&mut io::stdout(), root, Default::default()).expect("serialization failed")
|
serialize(&mut io::stdout(), root, Default::default()).expect("serialization failed")
|
||||||
}
|
}
|
||||||
@ -103,7 +103,7 @@ fn css_rules_to_string(rules: Vec<CssRule>) -> String {
|
|||||||
// DONE: add whitelist of CSS properties, remove any not in it
|
// DONE: add whitelist of CSS properties, remove any not in it
|
||||||
// TODO: scope selectors in rich formatter
|
// TODO: scope selectors in rich formatter
|
||||||
// TODO: add class attributes to elements in rich formatter
|
// TODO: add class attributes to elements in rich formatter
|
||||||
// TODO: separate this out into multiple separate transformers
|
// DONE: separate this out into multiple separate transformers
|
||||||
// TODO: find a way to avoid passing the arena to transformer functions. It's an implementation
|
// TODO: find a way to avoid passing the arena to transformer functions. It's an implementation
|
||||||
// detail that doesn't need to be exposed. Also, it's only needed for creating new elements.
|
// detail that doesn't need to be exposed. Also, it's only needed for creating new elements.
|
||||||
fn sanitize_style_tag_css<'arena>(node: Ref<'arena>, _: Arena<'arena>) {
|
fn sanitize_style_tag_css<'arena>(node: Ref<'arena>, _: Arena<'arena>) {
|
||||||
|
@ -1,10 +1,12 @@
|
|||||||
extern crate typed_arena;
|
|
||||||
|
|
||||||
use std::io::{Error, Read};
|
use std::io::{Error, Read};
|
||||||
|
|
||||||
use crate::arena_dom::{html5ever_parse_slice_into_arena, Arena, Node, Ref};
|
use html5ever::interface::tree_builder::QuirksMode;
|
||||||
|
use html5ever::tendril::TendrilSink;
|
||||||
|
use html5ever::{parse_document, parse_fragment, QualName};
|
||||||
|
|
||||||
// TODO: What are the performance implications of using a vec of boxed closures instead of one
|
use crate::arena_dom::{Arena, Node, NodeData, Ref, Sink};
|
||||||
|
|
||||||
|
// TODO: What are the performance implications of using a vec of closures instead of one
|
||||||
// transformer function whose size is known at compile time (U: Fn(Ref<'arena>) -> bool)?
|
// transformer function whose size is known at compile time (U: Fn(Ref<'arena>) -> bool)?
|
||||||
// TODO: how to integrate CSS parsing and transforming?
|
// TODO: how to integrate CSS parsing and transforming?
|
||||||
pub struct Transformer<'arena, T>
|
pub struct Transformer<'arena, T>
|
||||||
@ -31,10 +33,35 @@ where
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn parse(&'arena self, data: &mut impl Read) -> Result<Ref<'arena>, Error> {
|
pub fn parse_document(&'arena self, data: &mut impl Read) -> Result<Ref<'arena>, Error> {
|
||||||
let mut bytes = Vec::new();
|
let mut bytes = Vec::new();
|
||||||
data.read_to_end(&mut bytes)?;
|
data.read_to_end(&mut bytes)?;
|
||||||
Ok(html5ever_parse_slice_into_arena(&bytes, &self.arena))
|
let sink = Sink {
|
||||||
|
arena: &self.arena,
|
||||||
|
document: self.arena.alloc(Node::new(NodeData::Document)),
|
||||||
|
quirks_mode: QuirksMode::NoQuirks,
|
||||||
|
};
|
||||||
|
Ok(parse_document(sink, Default::default())
|
||||||
|
.from_utf8()
|
||||||
|
.one(&bytes[..]))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn parse_fragment(&'arena self, data: &mut impl Read) -> Result<Ref<'arena>, Error> {
|
||||||
|
let mut bytes = Vec::new();
|
||||||
|
data.read_to_end(&mut bytes)?;
|
||||||
|
let sink = Sink {
|
||||||
|
arena: &self.arena,
|
||||||
|
document: self.arena.alloc(Node::new(NodeData::Document)),
|
||||||
|
quirks_mode: QuirksMode::NoQuirks,
|
||||||
|
};
|
||||||
|
Ok(parse_fragment(
|
||||||
|
sink,
|
||||||
|
Default::default(),
|
||||||
|
QualName::new(None, ns!(html), local_name!("body")),
|
||||||
|
vec![],
|
||||||
|
)
|
||||||
|
.from_utf8()
|
||||||
|
.one(&bytes[..]))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn traverse(&'arena self, node: Ref<'arena>) {
|
pub fn traverse(&'arena self, node: Ref<'arena>) {
|
||||||
@ -92,39 +119,39 @@ mod test {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// fn node_contains_tag<'arena>(node: Ref<'arena>, tag_name: &str) -> bool {
|
// fn node_contains_tag<'arena>(node: Ref<'arena>, tag_name: &str) -> bool {
|
||||||
// if let NodeData::Element { ref name, .. } = node.data {
|
// if let NodeData::Element { ref name, .. } = node.data {
|
||||||
// if name.local == LocalName::from(tag_name) {
|
// if name.local == LocalName::from(tag_name) {
|
||||||
// return true;
|
// return true;
|
||||||
// }
|
// }
|
||||||
// }
|
// }
|
||||||
|
|
||||||
// if let Some(child) = node.first_child.get() {
|
// if let Some(child) = node.first_child.get() {
|
||||||
// if node_contains_tag(child, tag_name) {
|
// if node_contains_tag(child, tag_name) {
|
||||||
// return true;
|
// return true;
|
||||||
// }
|
// }
|
||||||
// }
|
// }
|
||||||
|
|
||||||
// if let Some(sibling) = node.next_sibling.get() {
|
// if let Some(sibling) = node.next_sibling.get() {
|
||||||
// if node_contains_tag(sibling, tag_name) {
|
// if node_contains_tag(sibling, tag_name) {
|
||||||
// return true;
|
// return true;
|
||||||
// }
|
// }
|
||||||
// }
|
// }
|
||||||
|
|
||||||
// false
|
// false
|
||||||
// }
|
// }
|
||||||
|
|
||||||
// fn count_nodes(node: Ref) -> usize {
|
// fn count_nodes(node: Ref) -> usize {
|
||||||
// let mut count = 1;
|
// let mut count = 1;
|
||||||
|
|
||||||
// if let Some(child) = node.first_child.get() {
|
// if let Some(child) = node.first_child.get() {
|
||||||
// count += count_nodes(child);
|
// count += count_nodes(child);
|
||||||
// }
|
// }
|
||||||
|
|
||||||
// if let Some(sibling) = node.next_sibling.get() {
|
// if let Some(sibling) = node.next_sibling.get() {
|
||||||
// count += count_nodes(sibling);
|
// count += count_nodes(sibling);
|
||||||
// }
|
// }
|
||||||
|
|
||||||
// count
|
// count
|
||||||
// }
|
// }
|
||||||
|
|
||||||
fn assert_serialized_html_eq(node: Ref, expected: &str) {
|
fn assert_serialized_html_eq(node: Ref, expected: &str) {
|
||||||
@ -137,9 +164,9 @@ mod test {
|
|||||||
fn traversal() {
|
fn traversal() {
|
||||||
let transformer = Transformer::new(|_| false, vec![&|_, _| {}]);
|
let transformer = Transformer::new(|_| false, vec![&|_, _| {}]);
|
||||||
let mut mock_data = MockRead::new("<div></div>");
|
let mut mock_data = MockRead::new("<div></div>");
|
||||||
let root = transformer.parse(&mut mock_data).unwrap();
|
let root = transformer.parse_fragment(&mut mock_data).unwrap();
|
||||||
transformer.traverse(root);
|
transformer.traverse(root);
|
||||||
assert_serialized_html_eq(root, "<html><head></head><body><div></div></body></html>");
|
assert_serialized_html_eq(root, "<html><div></div></html>");
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@ -154,9 +181,9 @@ mod test {
|
|||||||
vec![&|_, _| {}],
|
vec![&|_, _| {}],
|
||||||
);
|
);
|
||||||
let mut mock_data = MockRead::new("<div></div>");
|
let mut mock_data = MockRead::new("<div></div>");
|
||||||
let root = transformer.parse(&mut mock_data).unwrap();
|
let root = transformer.parse_fragment(&mut mock_data).unwrap();
|
||||||
transformer.traverse(root);
|
transformer.traverse(root);
|
||||||
assert_serialized_html_eq(root, "<html><head></head><body></body></html>");
|
assert_serialized_html_eq(root, "<html></html>");
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@ -172,11 +199,8 @@ mod test {
|
|||||||
}],
|
}],
|
||||||
);
|
);
|
||||||
let mut mock_data = MockRead::new("<div></div>");
|
let mut mock_data = MockRead::new("<div></div>");
|
||||||
let root = transformer.parse(&mut mock_data).unwrap();
|
let root = transformer.parse_fragment(&mut mock_data).unwrap();
|
||||||
transformer.traverse(root);
|
transformer.traverse(root);
|
||||||
assert_serialized_html_eq(
|
assert_serialized_html_eq(root, "<html><div></div><span></span></html>");
|
||||||
root,
|
|
||||||
"<html><head></head><body><div></div><span></span></body></html>",
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user