Parse HTML fragments
This commit is contained in:
parent
28caafb41c
commit
446aff77af
@ -16,7 +16,6 @@ extern crate typed_arena;
|
||||
use std::borrow::Cow;
|
||||
use std::cell::{Cell, RefCell};
|
||||
use std::collections::HashSet;
|
||||
use std::default::Default;
|
||||
use std::fmt;
|
||||
use std::io;
|
||||
use std::ptr;
|
||||
@ -24,24 +23,12 @@ use std::ptr;
|
||||
use html5ever::interface::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
|
||||
use html5ever::serialize::TraversalScope::{ChildrenOnly, IncludeNode};
|
||||
use html5ever::serialize::{Serialize, Serializer, TraversalScope};
|
||||
use html5ever::tendril::{StrTendril, TendrilSink};
|
||||
use html5ever::{parse_document, Attribute, ExpandedName, LocalName, QualName};
|
||||
|
||||
// TODO: does this function really belong here?
|
||||
pub fn html5ever_parse_slice_into_arena<'a>(bytes: &[u8], arena: Arena<'a>) -> Ref<'a> {
|
||||
let sink = Sink {
|
||||
arena,
|
||||
document: arena.alloc(Node::new(NodeData::Document)),
|
||||
quirks_mode: QuirksMode::NoQuirks,
|
||||
};
|
||||
parse_document(sink, Default::default())
|
||||
.from_utf8()
|
||||
.one(bytes)
|
||||
}
|
||||
use html5ever::tendril::StrTendril;
|
||||
use html5ever::{Attribute, ExpandedName, LocalName, QualName};
|
||||
|
||||
pub fn create_element<'arena>(arena: Arena<'arena>, name: &str) -> Ref<'arena> {
|
||||
arena.alloc(Node::new(NodeData::Element {
|
||||
name: QualName::new(None, ns!(), LocalName::from(name)),
|
||||
name: QualName::new(None, ns!(html), LocalName::from(name)),
|
||||
attrs: RefCell::new(vec![]),
|
||||
template_contents: None,
|
||||
mathml_annotation_xml_integration_point: false,
|
||||
@ -55,9 +42,9 @@ pub type Ref<'arena> = &'arena Node<'arena>;
|
||||
pub type Link<'arena> = Cell<Option<Ref<'arena>>>;
|
||||
|
||||
pub struct Sink<'arena> {
|
||||
arena: Arena<'arena>,
|
||||
document: Ref<'arena>,
|
||||
quirks_mode: QuirksMode,
|
||||
pub arena: Arena<'arena>,
|
||||
pub document: Ref<'arena>,
|
||||
pub quirks_mode: QuirksMode,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
|
@ -53,7 +53,7 @@ fn main() {
|
||||
&add_single_elements_around_ul,
|
||||
],
|
||||
);
|
||||
let root = transformer.parse(&mut io::stdin()).unwrap();
|
||||
let root = transformer.parse_fragment(&mut io::stdin()).unwrap();
|
||||
transformer.traverse(root);
|
||||
serialize(&mut io::stdout(), root, Default::default()).expect("serialization failed")
|
||||
}
|
||||
@ -103,7 +103,7 @@ fn css_rules_to_string(rules: Vec<CssRule>) -> String {
|
||||
// DONE: add whitelist of CSS properties, remove any not in it
|
||||
// TODO: scope selectors in rich formatter
|
||||
// TODO: add class attributes to elements in rich formatter
|
||||
// TODO: separate this out into multiple separate transformers
|
||||
// DONE: separate this out into multiple separate transformers
|
||||
// TODO: find a way to avoid passing the arena to transformer functions. It's an implementation
|
||||
// detail that doesn't need to be exposed. Also, it's only needed for creating new elements.
|
||||
fn sanitize_style_tag_css<'arena>(node: Ref<'arena>, _: Arena<'arena>) {
|
||||
|
@ -1,10 +1,12 @@
|
||||
extern crate typed_arena;
|
||||
|
||||
use std::io::{Error, Read};
|
||||
|
||||
use crate::arena_dom::{html5ever_parse_slice_into_arena, Arena, Node, Ref};
|
||||
use html5ever::interface::tree_builder::QuirksMode;
|
||||
use html5ever::tendril::TendrilSink;
|
||||
use html5ever::{parse_document, parse_fragment, QualName};
|
||||
|
||||
// TODO: What are the performance implications of using a vec of boxed closures instead of one
|
||||
use crate::arena_dom::{Arena, Node, NodeData, Ref, Sink};
|
||||
|
||||
// TODO: What are the performance implications of using a vec of closures instead of one
|
||||
// transformer function whose size is known at compile time (U: Fn(Ref<'arena>) -> bool)?
|
||||
// TODO: how to integrate CSS parsing and transforming?
|
||||
pub struct Transformer<'arena, T>
|
||||
@ -31,10 +33,35 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
pub fn parse(&'arena self, data: &mut impl Read) -> Result<Ref<'arena>, Error> {
|
||||
pub fn parse_document(&'arena self, data: &mut impl Read) -> Result<Ref<'arena>, Error> {
|
||||
let mut bytes = Vec::new();
|
||||
data.read_to_end(&mut bytes)?;
|
||||
Ok(html5ever_parse_slice_into_arena(&bytes, &self.arena))
|
||||
let sink = Sink {
|
||||
arena: &self.arena,
|
||||
document: self.arena.alloc(Node::new(NodeData::Document)),
|
||||
quirks_mode: QuirksMode::NoQuirks,
|
||||
};
|
||||
Ok(parse_document(sink, Default::default())
|
||||
.from_utf8()
|
||||
.one(&bytes[..]))
|
||||
}
|
||||
|
||||
/// Reads all of `data` and parses it as an HTML fragment into this transformer's arena.
///
/// The reader is drained into an in-memory buffer with `read_to_end`; an I/O error from that
/// read is propagated to the caller via `?`. A `Sink` is then built around `self.arena` with a
/// freshly allocated `NodeData::Document` node and `QuirksMode::NoQuirks`, and handed to
/// html5ever's `parse_fragment` along with an HTML-namespace `body` name and an empty vec.
/// The buffered bytes are fed through `from_utf8().one(..)` and the resulting node is returned
/// wrapped in `Ok`.
///
/// NOTE(review): the `body` name and the empty vec are presumably the fragment's context
/// element and its attributes; confirm against the html5ever `parse_fragment` documentation.
pub fn parse_fragment(&'arena self, data: &mut impl Read) -> Result<Ref<'arena>, Error> {
|
||||
let mut bytes = Vec::new();
|
||||
data.read_to_end(&mut bytes)?;
|
||||
let sink = Sink {
|
||||
arena: &self.arena,
|
||||
document: self.arena.alloc(Node::new(NodeData::Document)),
|
||||
quirks_mode: QuirksMode::NoQuirks,
|
||||
};
|
||||
Ok(parse_fragment(
|
||||
sink,
|
||||
Default::default(),
|
||||
QualName::new(None, ns!(html), local_name!("body")),
|
||||
vec![],
|
||||
)
|
||||
.from_utf8()
|
||||
.one(&bytes[..]))
|
||||
}
|
||||
|
||||
pub fn traverse(&'arena self, node: Ref<'arena>) {
|
||||
@ -92,39 +119,39 @@ mod test {
|
||||
}
|
||||
|
||||
// fn node_contains_tag<'arena>(node: Ref<'arena>, tag_name: &str) -> bool {
|
||||
// if let NodeData::Element { ref name, .. } = node.data {
|
||||
// if name.local == LocalName::from(tag_name) {
|
||||
// return true;
|
||||
// }
|
||||
// }
|
||||
// if let NodeData::Element { ref name, .. } = node.data {
|
||||
// if name.local == LocalName::from(tag_name) {
|
||||
// return true;
|
||||
// }
|
||||
// }
|
||||
|
||||
// if let Some(child) = node.first_child.get() {
|
||||
// if node_contains_tag(child, tag_name) {
|
||||
// return true;
|
||||
// }
|
||||
// }
|
||||
// if let Some(child) = node.first_child.get() {
|
||||
// if node_contains_tag(child, tag_name) {
|
||||
// return true;
|
||||
// }
|
||||
// }
|
||||
|
||||
// if let Some(sibling) = node.next_sibling.get() {
|
||||
// if node_contains_tag(sibling, tag_name) {
|
||||
// return true;
|
||||
// }
|
||||
// }
|
||||
// if let Some(sibling) = node.next_sibling.get() {
|
||||
// if node_contains_tag(sibling, tag_name) {
|
||||
// return true;
|
||||
// }
|
||||
// }
|
||||
|
||||
// false
|
||||
// false
|
||||
// }
|
||||
|
||||
// fn count_nodes(node: Ref) -> usize {
|
||||
// let mut count = 1;
|
||||
// let mut count = 1;
|
||||
|
||||
// if let Some(child) = node.first_child.get() {
|
||||
// count += count_nodes(child);
|
||||
// }
|
||||
// if let Some(child) = node.first_child.get() {
|
||||
// count += count_nodes(child);
|
||||
// }
|
||||
|
||||
// if let Some(sibling) = node.next_sibling.get() {
|
||||
// count += count_nodes(sibling);
|
||||
// }
|
||||
// if let Some(sibling) = node.next_sibling.get() {
|
||||
// count += count_nodes(sibling);
|
||||
// }
|
||||
|
||||
// count
|
||||
// count
|
||||
// }
|
||||
|
||||
fn assert_serialized_html_eq(node: Ref, expected: &str) {
|
||||
@ -137,9 +164,9 @@ mod test {
|
||||
fn traversal() {
|
||||
let transformer = Transformer::new(|_| false, vec![&|_, _| {}]);
|
||||
let mut mock_data = MockRead::new("<div></div>");
|
||||
let root = transformer.parse(&mut mock_data).unwrap();
|
||||
let root = transformer.parse_fragment(&mut mock_data).unwrap();
|
||||
transformer.traverse(root);
|
||||
assert_serialized_html_eq(root, "<html><head></head><body><div></div></body></html>");
|
||||
assert_serialized_html_eq(root, "<html><div></div></html>");
|
||||
}
|
||||
|
||||
#[test]
|
||||
@ -154,9 +181,9 @@ mod test {
|
||||
vec![&|_, _| {}],
|
||||
);
|
||||
let mut mock_data = MockRead::new("<div></div>");
|
||||
let root = transformer.parse(&mut mock_data).unwrap();
|
||||
let root = transformer.parse_fragment(&mut mock_data).unwrap();
|
||||
transformer.traverse(root);
|
||||
assert_serialized_html_eq(root, "<html><head></head><body></body></html>");
|
||||
assert_serialized_html_eq(root, "<html></html>");
|
||||
}
|
||||
|
||||
#[test]
|
||||
@ -172,11 +199,8 @@ mod test {
|
||||
}],
|
||||
);
|
||||
let mut mock_data = MockRead::new("<div></div>");
|
||||
let root = transformer.parse(&mut mock_data).unwrap();
|
||||
let root = transformer.parse_fragment(&mut mock_data).unwrap();
|
||||
transformer.traverse(root);
|
||||
assert_serialized_html_eq(
|
||||
root,
|
||||
"<html><head></head><body><div></div><span></span></body></html>",
|
||||
);
|
||||
assert_serialized_html_eq(root, "<html><div></div><span></span></html>");
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user