diff --git a/html5ever/src/lib.rs b/html5ever/src/lib.rs index 4b8029d9..11e3fef0 100644 --- a/html5ever/src/lib.rs +++ b/html5ever/src/lib.rs @@ -29,6 +29,12 @@ mod util { pub mod str; } +pub trait Sendable { + type SendableSelf: Send; + fn get_sendable(&self) -> Self::SendableSelf; + fn get_self_from_sendable(sendable: Self::SendableSelf) -> Self; +} + pub mod serialize; pub mod tokenizer; pub mod tree_builder; diff --git a/html5ever/src/tokenizer/char_ref/mod.rs b/html5ever/src/tokenizer/char_ref/mod.rs index be7fbe3a..317ac4f6 100644 --- a/html5ever/src/tokenizer/char_ref/mod.rs +++ b/html5ever/src/tokenizer/char_ref/mod.rs @@ -7,10 +7,11 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -use super::{Tokenizer, TokenSink}; +use super::{Tokenizer, TokenSink, Sendable}; use buffer_queue::BufferQueue; use data; -use tendril::StrTendril; +use tendril::{SendTendril, StrTendril}; +use tendril::fmt::UTF8; use util::str::{is_ascii_alnum}; use std::char::from_u32; @@ -20,6 +21,7 @@ pub use self::Status::*; use self::State::*; //§ tokenizing-character-references +#[derive(Clone, Copy)] pub struct CharRef { /// The resulting character(s) pub chars: [char; 2], @@ -34,7 +36,7 @@ pub enum Status { Done, } -#[derive(Debug)] +#[derive(Clone, Copy, Debug)] enum State { Begin, Octothorpe, @@ -44,6 +46,22 @@ enum State { BogusName, } +pub struct SendableCharRefTokenizer { + state: State, + addnl_allowed: Option, + result: Option, + + num: u32, + num_too_big: bool, + seen_digit: bool, + hex_marker: Option, + + name_buf_opt: Option>, + name_match: Option<(u32, u32)>, + name_len: usize, +} + +#[derive(Clone)] pub struct CharRefTokenizer { state: State, addnl_allowed: Option, @@ -110,6 +128,40 @@ impl CharRefTokenizer { } } +impl Sendable for CharRefTokenizer { + type SendableSelf = SendableCharRefTokenizer; + + fn get_sendable(&self) -> Self::SendableSelf { + SendableCharRefTokenizer { + state: self.state, + addnl_allowed: self.addnl_allowed, + result: self.result, + num: self.num, + num_too_big: self.num_too_big, + seen_digit: self.seen_digit, + hex_marker: self.hex_marker, + name_buf_opt: self.name_buf_opt.clone().map(|s| SendTendril::from(s)), + name_match: self.name_match, + name_len: self.name_len + } + } + + fn get_self_from_sendable(sendable_self: Self::SendableSelf) -> Self { + CharRefTokenizer { + state: sendable_self.state, + addnl_allowed: sendable_self.addnl_allowed, + result: sendable_self.result, + num: sendable_self.num, + num_too_big: sendable_self.num_too_big, + seen_digit: sendable_self.seen_digit, + hex_marker: sendable_self.hex_marker, + name_buf_opt: sendable_self.name_buf_opt.clone().map(|s| StrTendril::from(s)), + name_match: sendable_self.name_match, + name_len: sendable_self.name_len + } + } +} + impl CharRefTokenizer { pub fn step( &mut self, diff --git a/html5ever/src/tokenizer/mod.rs b/html5ever/src/tokenizer/mod.rs index c4301714..3082a39c 100644 --- a/html5ever/src/tokenizer/mod.rs +++ b/html5ever/src/tokenizer/mod.rs @@ -19,7 +19,7 @@ use self::states::{Escaped, DoubleEscaped}; use self::states::{Unquoted, SingleQuoted, DoubleQuoted}; use self::states::{DoctypeIdKind, Public, System}; -use self::char_ref::{CharRef, CharRefTokenizer}; +use self::char_ref::{CharRef, CharRefTokenizer, SendableCharRefTokenizer}; use util::str::lower_ascii_letter; @@ -30,9 +30,12 @@ use std::borrow::Cow::{self, Borrowed}; use std::collections::BTreeMap; use {LocalName, QualName, Attribute, SmallCharSet}; -use tendril::StrTendril; +use tendril::{SendTendril, StrTendril}; +use tendril::fmt::UTF8; pub use buffer_queue::{BufferQueue, SetResult, FromSet, NotFromSet}; +use super::Sendable; + pub mod states; mod interface; mod char_ref; @@ -95,6 +98,38 @@ impl Default for TokenizerOpts { } } +/// Similar to Tokenizer, except this type uses SendTendril instead of StrTendril. +pub struct SendableTokenizer { + opts: TokenizerOpts, + sink: Sink, + state: states::State, + at_eof: bool, + char_ref_tokenizer: Option, + current_char: char, + reconsume: bool, + ignore_lf: bool, + discard_bom: bool, + current_tag_kind: TagKind, + current_tag_name: SendTendril, + current_tag_self_closing: bool, + current_tag_attrs: Vec<(QualName, SendTendril)>, + current_attr_name: SendTendril, + current_attr_value: SendTendril, + current_comment: SendTendril, + + /// current doctype's fields + curr_doctype_name: Option>, + curr_doctype_public_id: Option>, + curr_doctype_system_id: Option>, + curr_doctype_force_quirks: bool, + + last_start_tag_name: Option, + temp_buf: SendTendril, + state_profile: BTreeMap, + time_in_sink: u64, + current_line: u64, +} + /// The HTML tokenizer. pub struct Tokenizer { /// Options controlling the behavior of the tokenizer. @@ -559,6 +594,97 @@ impl Tokenizer { self.process_token_and_continue(ParseError(error)); } } + +impl Sendable for Tokenizer +{ + type SendableSelf = SendableTokenizer<::SendableSelf>; + + /// Returns an instance containing the necessary information required to + /// create a Tokenizer with the exact same state. Instances of this + /// type can be sent between threads. + fn get_sendable(&self) -> Self::SendableSelf { + let mut sendable_current_tag_attrs = vec!(); + let mut current_tag_attrs = self.current_tag_attrs.iter(); + while let Some(attr) = current_tag_attrs.next() { + sendable_current_tag_attrs.push((attr.name.clone(), SendTendril::from(attr.value.clone()))); + } + + SendableTokenizer { + opts: self.opts.clone(), + sink: self.sink.get_sendable(), + state: self.state, + char_ref_tokenizer: self.char_ref_tokenizer.clone().map(|tok| tok.get_sendable()), + at_eof: self.at_eof, + current_char: self.current_char, + reconsume: self.reconsume, + ignore_lf: self.ignore_lf, + discard_bom: self.discard_bom, + current_tag_kind: self.current_tag_kind, + current_tag_name: SendTendril::from(self.current_tag_name.clone()), + current_tag_self_closing: self.current_tag_self_closing, + current_tag_attrs: sendable_current_tag_attrs, + current_attr_name: SendTendril::from(self.current_attr_name.clone()), + current_attr_value: SendTendril::from(self.current_attr_value.clone()), + current_comment: SendTendril::from(self.current_comment.clone()), + + curr_doctype_name: self.current_doctype.name.clone().map(|s| SendTendril::from(s)), + curr_doctype_public_id: self.current_doctype.public_id.clone().map(|s| SendTendril::from(s)), + curr_doctype_system_id: self.current_doctype.system_id.clone().map(|s| SendTendril::from(s)), + curr_doctype_force_quirks: self.current_doctype.force_quirks, + + last_start_tag_name: self.last_start_tag_name.clone(), + temp_buf: SendTendril::from(self.temp_buf.clone()), + state_profile: self.state_profile.clone(), + time_in_sink: self.time_in_sink, + current_line: self.current_line + } + } + + fn get_self_from_sendable(sendable_self: Self::SendableSelf) -> Self { + let mut current_tag_attrs = vec!(); + let mut sendable_current_tag_attrs = sendable_self.current_tag_attrs.iter(); + while let Some(attr) = sendable_current_tag_attrs.next() { + let (name, value) = attr.clone(); + current_tag_attrs.push(Attribute { + name: name, + value: StrTendril::from(value), + }); + } + + Tokenizer { + opts: sendable_self.opts, + sink: Sink::get_self_from_sendable(sendable_self.sink), + state: sendable_self.state, + char_ref_tokenizer: sendable_self.char_ref_tokenizer + .map(|tok| Box::new(CharRefTokenizer::get_self_from_sendable(tok))), + at_eof: sendable_self.at_eof, + current_char: sendable_self.current_char, + reconsume: sendable_self.reconsume, + ignore_lf: sendable_self.ignore_lf, + discard_bom: sendable_self.discard_bom, + current_tag_kind: sendable_self.current_tag_kind, + current_tag_name: StrTendril::from(sendable_self.current_tag_name), + current_tag_self_closing: sendable_self.current_tag_self_closing, + current_tag_attrs: current_tag_attrs, + current_attr_name: StrTendril::from(sendable_self.current_attr_name), + current_attr_value: StrTendril::from(sendable_self.current_attr_value), + current_comment: StrTendril::from(sendable_self.current_comment), + + current_doctype: Doctype { + name: sendable_self.curr_doctype_name.map(|s| StrTendril::from(s)), + public_id: sendable_self.curr_doctype_public_id.map(|s| StrTendril::from(s)), + system_id: sendable_self.curr_doctype_system_id.map(|s| StrTendril::from(s)), + force_quirks: sendable_self.curr_doctype_force_quirks, + }, + + last_start_tag_name: sendable_self.last_start_tag_name, + temp_buf: StrTendril::from(sendable_self.temp_buf), + state_profile: sendable_self.state_profile, + time_in_sink: sendable_self.time_in_sink, + current_line: sendable_self.current_line + } + } +} //§ END // Shorthand for common state machine behaviors. diff --git a/html5ever/src/tree_builder/mod.rs b/html5ever/src/tree_builder/mod.rs index 445e0a36..7b5440b5 100644 --- a/html5ever/src/tree_builder/mod.rs +++ b/html5ever/src/tree_builder/mod.rs @@ -18,7 +18,8 @@ pub use interface::{TreeSink, Tracer, NextParserState, create_element, ElementFl use self::types::*; use {ExpandedName, QualName, LocalName, Namespace}; -use tendril::StrTendril; +use tendril::{SendTendril, StrTendril}; +use tendril::fmt::UTF8; use tokenizer; use tokenizer::{Doctype, StartTag, Tag, EndTag, TokenSink, TokenSinkResult}; @@ -39,6 +40,8 @@ use tree_builder::types::*; use tree_builder::tag_sets::*; use util::str::to_escaped_string; +use super::Sendable; + pub use self::PushFlag::*; #[macro_use] mod tag_sets; @@ -84,6 +87,27 @@ impl Default for TreeBuilderOpts { } } +/// Similar to TreeBuilder, except this type uses SendTendril instead of StrTendril. +pub struct SendableTreeBuilder { + opts: TreeBuilderOpts, + sink: Sink, + mode: InsertionMode, + orig_mode: Option, + template_modes: Vec, + pending_table_text: Vec<(SplitStatus, SendTendril)>, + quirks_mode: QuirksMode, + doc_handle: Handle, + open_elems: Vec, + active_formatting: Vec>, + head_elem: Option, + form_elem: Option, + frameset_ok: bool, + ignore_lf: bool, + foster_parenting: bool, + context_elem: Option, + current_line: u64, +} + /// The HTML tree builder. pub struct TreeBuilder { /// Options controlling the behavior of the tree builder. @@ -408,6 +432,132 @@ impl TreeBuilder } } +impl Sendable for TreeBuilder + where Handle: Clone + Send, + Sink: TreeSink + Sendable, +{ + type SendableSelf = SendableTreeBuilder::SendableSelf>; + + /// Returns an instance containing the necessary information required to + /// create a TreeBuilder with the exact same state. Instances of this + /// type can be sent between threads. + fn get_sendable(&self) -> Self::SendableSelf { + let mut sendable_pending_table_text = vec!(); + let mut pending_table_text = self.pending_table_text.iter(); + while let Some(elem) = pending_table_text.next() { + let (split_status, ref str) = *elem; + sendable_pending_table_text.push((split_status, SendTendril::from(str.clone()))); + } + + let mut sendable_active_formatting = vec!(); + let mut active_formatting = self.active_formatting.iter(); + while let Some(elem) = active_formatting.next() { + let sendable_elem = match *elem { + FormatEntry::Element(ref handle, ref tag) => { + let mut sendable_attrs = vec!(); + let mut attrs = tag.attrs.iter(); + while let Some(ref attr) = attrs.next() { + sendable_attrs.push((attr.name.clone(), SendTendril::from(attr.value.clone()))); + } + SendableFormatEntry::Element { + handle: handle.clone(), + tag_kind: tag.kind, + tag_name: tag.name.clone(), + tag_self_closing: tag.self_closing, + tag_attrs: sendable_attrs + } + }, + FormatEntry::Marker => SendableFormatEntry::Marker, + }; + sendable_active_formatting.push(sendable_elem); + } + + SendableTreeBuilder { + opts: self.opts, + sink: self.sink.get_sendable(), + mode: self.mode, + orig_mode: self.orig_mode, + template_modes: self.template_modes.clone(), + pending_table_text: sendable_pending_table_text, + quirks_mode: self.quirks_mode, + doc_handle: self.doc_handle.clone(), + open_elems: self.open_elems.clone(), + active_formatting: sendable_active_formatting, + head_elem: self.head_elem.clone(), + form_elem: self.form_elem.clone(), + frameset_ok: self.frameset_ok, + ignore_lf: self.ignore_lf, + foster_parenting: self.foster_parenting, + context_elem: self.context_elem.clone(), + current_line: self.current_line + } + } + + fn get_self_from_sendable(sendable_self: Self::SendableSelf) -> Self { + let mut pending_table_text = vec!(); + let mut sendable_pending_table_text = sendable_self.pending_table_text.iter(); + while let Some(elem) = sendable_pending_table_text.next() { + let (split_status, str_tendril) = elem.clone(); + pending_table_text.push((split_status, StrTendril::from(str_tendril))); + } + + let mut active_formatting = vec!(); + let mut sendable_active_formatting = sendable_self.active_formatting.iter(); + while let Some(sendable_elem) = sendable_active_formatting.next() { + let elem = match sendable_elem.clone() { + SendableFormatEntry::Element { + handle, + tag_kind, + tag_name, + tag_self_closing, + tag_attrs, + } => { + let mut attrs = vec!(); + let mut tag_attrs = tag_attrs.iter(); + while let Some(attr) = tag_attrs.next() { + let (name, value) = attr.clone(); + attrs.push(Attribute { + name: name, + value: StrTendril::from(value), + }); + } + FormatEntry::Element( + handle, + Tag { + kind: tag_kind, + name: tag_name, + self_closing: tag_self_closing, + attrs: attrs, + } + ) + }, + SendableFormatEntry::Marker => FormatEntry::Marker, + }; + active_formatting.push(elem); + } + + TreeBuilder { + opts: sendable_self.opts, + sink: Sink::get_self_from_sendable(sendable_self.sink), + mode: sendable_self.mode, + orig_mode: sendable_self.orig_mode, + template_modes: sendable_self.template_modes, + pending_table_text: pending_table_text, + quirks_mode: sendable_self.quirks_mode, + doc_handle: sendable_self.doc_handle, + open_elems: sendable_self.open_elems, + active_formatting: active_formatting, + head_elem: sendable_self.head_elem, + form_elem: sendable_self.form_elem, + frameset_ok: sendable_self.frameset_ok, + ignore_lf: sendable_self.ignore_lf, + foster_parenting: sendable_self.foster_parenting, + context_elem: sendable_self.context_elem, + current_line: sendable_self.current_line + } + } +} + impl TokenSink for TreeBuilder where Handle: Clone, diff --git a/html5ever/src/tree_builder/types.rs b/html5ever/src/tree_builder/types.rs index 1192472c..0bc51da1 100644 --- a/html5ever/src/tree_builder/types.rs +++ b/html5ever/src/tree_builder/types.rs @@ -9,10 +9,13 @@ //! Types used within the tree builder code. Not exported to users. +use {LocalName, QualName}; use tokenizer::Tag; use tokenizer::states::RawKind; +use tokenizer::TagKind; -use tendril::StrTendril; +use tendril::{SendTendril, StrTendril}; +use tendril::fmt::UTF8; pub use self::InsertionMode::*; pub use self::SplitStatus::*; @@ -77,6 +80,18 @@ pub enum ProcessResult { ToRawData(RawKind), } +#[derive(Clone)] +pub enum SendableFormatEntry { + Element { + handle: Handle, + tag_kind: TagKind, + tag_name: LocalName, + tag_self_closing: bool, + tag_attrs: Vec<(QualName, SendTendril)>, + }, + Marker +} + pub enum FormatEntry { Element(Handle, Tag), Marker, diff --git a/markup5ever/util/buffer_queue.rs b/markup5ever/util/buffer_queue.rs index ab32961a..cc99a9ea 100644 --- a/markup5ever/util/buffer_queue.rs +++ b/markup5ever/util/buffer_queue.rs @@ -19,6 +19,7 @@ //! [`BufferQueue`]: struct.BufferQueue.html +use std::cmp::Ordering; use std::collections::VecDeque; use tendril::StrTendril; @@ -47,6 +48,8 @@ pub enum SetResult { pub struct BufferQueue { /// Buffers to process. buffers: VecDeque, + /// Used during speculative parsing. + recorded_len: Option, } impl BufferQueue { @@ -55,6 +58,31 @@ impl BufferQueue { pub fn new() -> BufferQueue { BufferQueue { buffers: VecDeque::with_capacity(16), + recorded_len: None, + } + } + + pub fn notify_speculative_parsing_has_started(&mut self) { + self.recorded_len = Some(self.buffers.len()); + } + + /// During speculative parsing, the async tokenizer might mutate network input's contents. + /// Also, some chunks might be pushed onto network input. This method is used to + /// update network input accordingly. + pub fn update_with_new_data(&mut self, new_data: Option>) { + let recorded_len = self.recorded_len.expect("This should contain some value!"); + self.recorded_len.take(); + let mut new_data = match new_data { + None => return, + Some(data) => data, + }; + + let len = self.buffers.len(); + assert_ne!(len.cmp(&recorded_len), Ordering::Less); + self.buffers = self.buffers.split_off(recorded_len); + new_data.append(&mut self.buffers); + while let Some(chunk) = new_data.pop_front() { + self.buffers.push_back(chunk); } }