diff --git a/src/wp-includes/html-api/class-wp-html-element-stack-item.php b/src/wp-includes/html-api/class-wp-html-element-stack-item.php new file mode 100644 index 0000000000000..7f1222ad58dbc --- /dev/null +++ b/src/wp-includes/html-api/class-wp-html-element-stack-item.php @@ -0,0 +1,46 @@ +bookmark_name = $bookmark_name; + $this->element = $element; + $this->flags = $flags; + $this->related_item = $related_item; + } +} diff --git a/src/wp-includes/html-api/class-wp-html-element-stack.php b/src/wp-includes/html-api/class-wp-html-element-stack.php new file mode 100644 index 0000000000000..f95a7753dcd11 --- /dev/null +++ b/src/wp-includes/html-api/class-wp-html-element-stack.php @@ -0,0 +1,306 @@ += $this->count() ) { + return null; + } + + return $this->stack[ $this->count() - $nth_from_bottom - 1 ]; + } + + /** + * Add an item to the top of the stack. + * + * The item is appended to the end of the internal array. + * + * @TODO: Do we need to insertion-sort these? + * + * @param WP_HTML_Element_Stack_Item $stack_item Item to add. + * @return void + */ + public function push( $stack_item ) { + $this->stack[] = $stack_item; + } + + /** + * Removes an item of a given value from the stack. + * + * The search starts at the most recently pushed item and walks back toward + * the first one; only the first strictly identical (`===`) match is removed. + * + * @TODO: Should this be done by index? What performance + * tradeoffs are we making here by isolating the + * index from the callee? It's safer, but is it + * worth the cost? + * + * @TODO: Measure performance against `remove_at( $nth_from_bottom )`. + * + * @param WP_HTML_Element_Stack_Item $stack_item Which item to remove. + * @return bool Whether the item was removed. + */ + public function remove( $stack_item ) { + // Step downward: incrementing from the last index would run past the end of the array. + for ( $i = $this->count() - 1; $i >= 0; $i-- ) { + if ( $this->stack[ $i ] === $stack_item ) { + array_splice( $this->stack, $i, 1 ); + return true; + } + } + + return false; + } + + /** + * Returns how many items are currently on the stack. + * + * @return int + */ + public function count() { + return count( $this->stack ); + } + + /** + * Returns the bottom-most node on the stack. + * + * In array terms this is the most recently pushed item; null is returned when the stack is empty. + * + * @return WP_HTML_Element_Stack_Item|null + */ + public function current_node() { + $count = $this->count(); + + return $count > 0 + ? 
$this->stack[ $count - 1 ] + : null; + } + + /** + * Returns whether the given element is on the stack. + * + * @param string $element the ::class name of the element to check for. + * @return boolean whether the given element is on the stack. + */ + public function has_element( $element ) { + for ( $i = 0; $i < $this->count(); $i++ ) { + if ( $this->peek( $i )->element === $element ) { + return true; + } + } + + return false; + } + + /** + * Returns whether an element is in a specific scope. + * + * Walks from the most recently pushed item back toward the first one. The + * search succeeds when a node's element equals the target, and fails as soon + * as a node's element appears in the termination list or the stack runs out. + * + * @see https://html.spec.whatwg.org/#has-an-element-in-the-specific-scope + * + * @param string $target_node The target node. + * @param string[] $termination_list List of elements that terminate the search. + * @return bool Whether the target was found before any terminating element. + */ + public function has_element_in_specific_scope( $target_node, $termination_list ) { + for ( $i = $this->count() - 1; $i >= 0; $i-- ) { + $node = $this->stack[ $i ]; + + if ( $node->element === $target_node ) { + return true; + } + + // The node being visited, not the target, decides whether the scope ends here. + if ( in_array( $node->element, $termination_list, true ) ) { + return false; + } + } + + return false; + } + + /** + * Returns whether a given element is in a particular scope. 
+ * + * Delegates to has_element_in_specific_scope() with the default list of scope-terminating elements. + * + * @see https://html.spec.whatwg.org/#has-an-element-in-scope + * + * @param string $element Element to look for. + * @return bool Result of the specific-scope search. + */ + public function has_element_in_particular_scope( $element ) { + return $this->has_element_in_specific_scope( $element, array( + WP_HTMLAppletElement::class, + WP_HTMLCaptionElement::class, + WP_HTMLHtmlElement::class, + WP_HTMLTableElement::class, + WP_HTMLTdElement::class, + WP_HTMLThElement::class, + WP_HTMLMarqueeElement::class, + WP_HTMLObjectElement::class, + WP_HTMLTemplateElement::class, + WP_MathML_Mi_Element::class, + WP_MathML_Mo_Element::class, + WP_MathML_Mn_Element::class, + WP_MathML_Ms_Element::class, + WP_MathML_Mtext_Element::class, + WP_MathML_Annotation_Xml_Element::class, + WP_SVG_ForeignObject_Element::class, + WP_SVG_Description_Element::class, + WP_SVG_Title_Element::class, + ) ); + } + + /** + * Returns whether a given element is in list item scope. + * + * Uses the default termination list plus the OL and UL elements. + * + * @see https://html.spec.whatwg.org/#has-an-element-in-list-item-scope + * + * @param string $element Element to look for. + * @return bool Result of the specific-scope search. + */ + public function has_element_in_list_item_scope( $element ) { + return $this->has_element_in_specific_scope( $element, array( + WP_HTMLAppletElement::class, + WP_HTMLCaptionElement::class, + WP_HTMLHtmlElement::class, + WP_HTMLTableElement::class, + WP_HTMLTdElement::class, + WP_HTMLThElement::class, + WP_HTMLMarqueeElement::class, + WP_HTMLObjectElement::class, + WP_HTMLTemplateElement::class, + WP_MathML_Mi_Element::class, + WP_MathML_Mo_Element::class, + WP_MathML_Mn_Element::class, + WP_MathML_Ms_Element::class, + WP_MathML_Mtext_Element::class, + WP_MathML_Annotation_Xml_Element::class, + WP_SVG_ForeignObject_Element::class, + WP_SVG_Description_Element::class, + WP_SVG_Title_Element::class, + + // Additionally these elements. + WP_HTMLOlElement::class, + WP_HTMLUlElement::class, + ) ); + } + + /** + * Returns whether a given element is in button scope. 
+ * + * Uses the default termination list plus the BUTTON element. + * + * @see https://html.spec.whatwg.org/#has-an-element-in-button-scope + * + * @param string $element Element to look for. + * @return boolean Result of the specific-scope search. + */ + public function has_element_in_button_scope( $element ) { + return $this->has_element_in_specific_scope( $element, array( + WP_HTMLAppletElement::class, + WP_HTMLCaptionElement::class, + WP_HTMLHtmlElement::class, + WP_HTMLTableElement::class, + WP_HTMLTdElement::class, + WP_HTMLThElement::class, + WP_HTMLMarqueeElement::class, + WP_HTMLObjectElement::class, + WP_HTMLTemplateElement::class, + WP_MathML_Mi_Element::class, + WP_MathML_Mo_Element::class, + WP_MathML_Mn_Element::class, + WP_MathML_Ms_Element::class, + WP_MathML_Mtext_Element::class, + WP_MathML_Annotation_Xml_Element::class, + WP_SVG_ForeignObject_Element::class, + WP_SVG_Description_Element::class, + WP_SVG_Title_Element::class, + + // Additionally these elements. + WP_HTMLButtonElement::class, + ) ); + } + + /** + * Returns whether the given element is in table scope. 
+ * + * Unlike the other scopes, table scope does not extend the default list: the + * HTML standard defines it as terminating only at HTML, TABLE and TEMPLATE elements. + * + * @see https://html.spec.whatwg.org/#has-an-element-in-table-scope + * + * @param string $element Element to look for. + * @return bool Result of the specific-scope search. + */ + public function has_element_in_table_scope( $element ) { + return $this->has_element_in_specific_scope( $element, array( + WP_HTMLHtmlElement::class, + WP_HTMLTableElement::class, + WP_HTMLTemplateElement::class, + ) ); + } + + /** + * Returns whether a given element is in select scope. + * + * @TODO: The HTML standard defines select scope as every element type except + * OPTGROUP and OPTION. The list passed below is the default list plus + * those two elements, which is not the same thing; review before relying on this. + * + * @see https://html.spec.whatwg.org/#has-an-element-in-select-scope + * + * @param string $element Element to look for. + * @return bool Result of the specific-scope search. + */ + public function has_element_in_select_scope( $element ) { + return $this->has_element_in_specific_scope( $element, array( + WP_HTMLAppletElement::class, + WP_HTMLCaptionElement::class, + WP_HTMLHtmlElement::class, + WP_HTMLTableElement::class, + WP_HTMLTdElement::class, + WP_HTMLThElement::class, + WP_HTMLMarqueeElement::class, + WP_HTMLObjectElement::class, + WP_HTMLTemplateElement::class, + WP_MathML_Mi_Element::class, + WP_MathML_Mo_Element::class, + WP_MathML_Mn_Element::class, + WP_MathML_Ms_Element::class, + WP_MathML_Mtext_Element::class, + WP_MathML_Annotation_Xml_Element::class, + WP_SVG_ForeignObject_Element::class, + WP_SVG_Description_Element::class, + WP_SVG_Title_Element::class, + + // Additionally these elements. + WP_HTMLOptgroupElement::class, + WP_HTMLOptionElement::class, + ) ); + } +} diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php new file mode 100644 index 0000000000000..748e3eabdb318 --- /dev/null +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -0,0 +1,1271 @@ + 'visit' ); + + /** + * @var int Unique id for creating bookmarks. + */ + private $bookmark_id = 0; + + /** + * @var WP_HTML_Element_Stack Refers to element opening tags. + */ + private $tag_openers = null; + + /** + * @var WP_HTML_Element_Stack Refers to element closing tags. + */ + private $tag_closers = null; + + /** + * Used to handle mis-nested formatting element tags. + * + * @see https://html.spec.whatwg.org/#the-list-of-active-formatting-elements + * + * @var WP_HTML_Element_Stack + */ + private $active_formatting_elements = null; + + /** + * @var string Tree construction insertion mode. 
+ */ + private $insertion_mode = 'initial'; + + /** + * Context node initializing HTML fragment parsing, if in that mode. + * + * NOTE(review): presumably a pair of ( element class name, attribute map ), matching + * the `context_node` option built in `createFragment()` - confirm against the constructor. + * + * @var array|null + */ + private $context_node = null; + + /** + * Points to the HEAD element once one has been parsed, either implicitly or explicitly. + * + * @TODO: Implement this. + * + * @see https://html.spec.whatwg.org/#head-element-pointer + * + * @var null + */ + private $head_element_pointer = null; + + /** + * Points to the last form element that was opened and whose end tag has not yet been seen, if any. + * + * @TODO: Implement this. + * + * This is used to make form controls associate with forms in the face of dramatically + * bad markup, for historical reasons. It is ignored inside template elements. + * + * @see https://html.spec.whatwg.org/#form-element-pointer + * + * @var null + */ + private $form_element_pointer = null; + + /** + * Original insertion mode when entering 'text' or 'in-table-text' modes. + * + * Not implemented yet. + * + * @var string|null + */ + private $original_insertion_mode = null; + + /** + * Specifies whether FRAMESET elements can be processed in the current mode. + * + * @see https://html.spec.whatwg.org/#frameset-ok-flag + * + * @var bool + */ + private $frameset_ok = true; + + /** + * Stack of template insertion modes. + * + * Not implemented yet. + * + * @var string[] + */ + private $template_insertion_mode_stack = array(); + + /** + * Indicates if the stack of open elements contains a TEMPLATE element. + * + * This is an optimization to bypass scanning the stack of open elements + * in less common cases. + * + * @var bool + */ + private $template_element_is_on_stack_of_open_elements = false; + + /** + * Create an HTML processor in the full HTML parsing mode. + * + * Use this for cases where you have an entire HTML document from + * the start to the end. If you have a section of HTML that's part + * of a bigger document, then use `createFragment()` instead. 
+ * + * @TODO: Proper version number. + * @since 6.3.0 + * + * @param string $html Input HTML document to process. + * @param string $encoding Text encoding of the document; only supported value is 'utf-8'. + * @return WP_HTML_Processor|null The created processor if successful, otherwise null. + */ + public static function createDocument( $html, $encoding = 'utf-8' ) { + // Full-document parsing starts in the initial insertion mode and has no context node. + return new self( + $html, + array( + 'parser_mode' => self::FULL_PARSER, + 'insertion_mode' => self::INITIAL_MODE, + 'context_node' => null, + ), + $encoding + ); + } + + /** + * Create an HTML processor in the fragment parsing mode. + * + * Use this for cases where you are processing chunks of HTML that + * will be found within a bigger HTML document, such as rendered + * block output that exists within a post, `the_content` inside a + * rendered site layout. + * + * Fragment parsing occurs within a context, which is an HTML element + * that the document will eventually be placed in. It becomes important + * when special elements have different rules than others, such as inside + * a TEXTAREA or a TITLE tag where things that look like tags are text, + * or inside a SCRIPT tag where things that look like HTML syntax are JS. + * + * The context value should be a representation of the tag into which the + * HTML is found. For most cases this will be the body element. The HTML + * form is provided because a context element may have attributes that + * impact the parse, such as with a SCRIPT tag and its `type` attribute. + * + * @TODO: Proper version number. + * @since 6.3.0 + * + * @param string $html Input HTML fragment to process. + * @param string $context Context element for the fragment, defaults to `<body>
`. + * @param string $encoding Text encoding of the document; only supported value is 'utf-8'. + * @return WP_HTML_Processor|null The created processor if successful, otherwise null when the context contains no tag. + */ + public static function createFragment( $html, $context = '<body>', $encoding = 'utf-8' ) { + $p = new WP_HTML_Tag_Processor( $context ); + if ( ! $p->next_tag() ) { + return null; + } + + // Read the tag already matched above; calling next_tag() again would move past it and yield a bool. + $context_node = WP_HTML_Spec::element_info( $p->get_tag() ); + $context_attributes = array(); + foreach ( $p->get_attribute_names_with_prefix( '' ) as $attribute_name ) { + $context_attributes[ $attribute_name ] = $p->get_attribute( $attribute_name ); + } + + // @TODO: we have to manually "pump" the tokenizer to skip the initial content. + $h = new self( $html ); + switch ( $context_node ) { + case WP_HTMLTitleElement::class: + case WP_HTMLTextareaElement::class: + $h-> + } + + // @TODO: these options are built but not yet handed to the processor, and $encoding is unused. + $options = array( + 'parser_mode' => self::FRAGMENT_PARSER, + 'insertion_mode' => null, + 'context_node' => array( $context_node, $context_attributes ), + ); + + return $h; + } + + /** + * Create a new HTML Processor for reading and modifying HTML structure. + * + * ## Initial mode + * + * Most invocations of the HTML parser operate in the "fragment parsing" mode, + * which assumes that the given HTML document exists within an existing HTML + * document. For example, block HTML exists within a larger document, and some + * inner block HTML might exist within a TABLE element, which holds special + * parsing rules. + * + * The parser can operate in a full parsing mode or the fragment parsing mode, + * and it's important to indicate which is necessary when creating the HTML + * processor. + * + * Example + * // Parse an entire HTML document + * $p = new WP_HTML_Processor( $html, array( 'full', WP_HTML_Processor::INITIAL ) ); + * + * // Parse a full HTML document, but inside a BODY element. E.g. when parsing `post_content`. 
+ * $p = new WP_HTML_Processor( $html, array( 'full', WP_HTML_Processor::IN_BODY ) ); + * + * // Parse a chunk of HTML provided inside a post's block content. + * $p = new WP_HTML_Processor( $html, array( 'fragment', '' ) ); + * + * // Parse a chunk of HTML provided inside a post's block content, using the default initial mode. + * $p = new WP_HTML_Processor( $html ); + * + * // Parse a chunk of HTML known to exist within a TEXTAREA element. E.g. when parsing code input. + * $p = new WP_HTML_Processor( $html, array( 'fragment', '