From dc3abb1958dcb1e4d561bdc9b38f9f1ad69c94fa Mon Sep 17 00:00:00 2001 From: Adam Roben Date: Fri, 30 Jan 2015 17:04:17 -0500 Subject: [PATCH] Match elements on qualified names, not just local names There are many places where the HTML spec says to look for an HTML element (i.e., an element in the HTML namespace) with a given tag, but where we were only looking at tag names and ignoring the namespace name. Now we always use qualified names where required. This is implemented using a new GumboQualName type, which is just a bit field that combines a GumboNamespaceEnum with a GumboTag. A series of macros makes it easy to construct and inspect these bit fields. It's currently represented by a uintptr_t. This is larger than necessary; only 9 bits are required. At first I attempted to use a `short` but the compiler didn't like that being used with varargs, so then I tried an `unsigned int` but the compiler didn't like casting that to a pointer, so here we are. I also tried `typedef enum { QNSIZE = SHRT_MAX } GumboQualName` to induce more compiler warnings when mistakenly passing a tag to a function that expects a qualified name. This worked nicely but generated warnings about missing cases in switch statements. I'm not sure what the best option is, though I am quite tempted by the extra warnings the enum provides. 
--- src/parser.c | 597 ++++++++++++++++++++++++------------------------ tests/parser.cc | 84 +++++++ 2 files changed, 381 insertions(+), 300 deletions(-) diff --git a/src/parser.c b/src/parser.c index a97e54b4..47e9481d 100644 --- a/src/parser.c +++ b/src/parser.c @@ -38,6 +38,15 @@ #define GUMBO_STRING(literal) { literal, sizeof(literal) - 1 } #define TERMINATOR { "", 0 } +typedef uintptr_t GumboQualName; +#define QUAL_NAME(namespace, tag) (GumboQualName)(((namespace) << 8) | (tag)) +#define QUAL_NAME_TAG(qualname) (GumboTag)((qualname) & 0xFF) +#define QUAL_NAME_LAST QUAL_NAME(GUMBO_NAMESPACE_HTML, GUMBO_TAG_LAST) +#define NODE_QUAL_NAME(node) QUAL_NAME(node->v.element.tag_namespace, node->v.element.tag) +#define HTML_QN(tag) QUAL_NAME(GUMBO_NAMESPACE_HTML, GUMBO_TAG_ ## tag) +#define MATHML_QN(tag) QUAL_NAME(GUMBO_NAMESPACE_MATHML, GUMBO_TAG_ ## tag) +#define SVG_QN(tag) QUAL_NAME(GUMBO_NAMESPACE_SVG, GUMBO_TAG_ ## tag) + static void* malloc_wrapper(void* unused, size_t size) { return malloc(size); } @@ -549,31 +558,31 @@ static void set_insertion_mode(GumboParser* parser, GumboInsertionMode mode) { static GumboInsertionMode get_appropriate_insertion_mode( const GumboNode* node, bool is_last) { assert(node->type == GUMBO_NODE_ELEMENT); - switch (node->v.element.tag) { - case GUMBO_TAG_SELECT: + switch (NODE_QUAL_NAME(node)) { + case HTML_QN(SELECT): return GUMBO_INSERTION_MODE_IN_SELECT; - case GUMBO_TAG_TD: - case GUMBO_TAG_TH: + case HTML_QN(TD): + case HTML_QN(TH): return is_last ? 
GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_IN_CELL; - case GUMBO_TAG_TR: + case HTML_QN(TR): return GUMBO_INSERTION_MODE_IN_ROW; - case GUMBO_TAG_TBODY: - case GUMBO_TAG_THEAD: - case GUMBO_TAG_TFOOT: + case HTML_QN(TBODY): + case HTML_QN(THEAD): + case HTML_QN(TFOOT): return GUMBO_INSERTION_MODE_IN_TABLE_BODY; - case GUMBO_TAG_CAPTION: + case HTML_QN(CAPTION): return GUMBO_INSERTION_MODE_IN_CAPTION; - case GUMBO_TAG_COLGROUP: + case HTML_QN(COLGROUP): return GUMBO_INSERTION_MODE_IN_COLUMN_GROUP; - case GUMBO_TAG_TABLE: + case HTML_QN(TABLE): return GUMBO_INSERTION_MODE_IN_TABLE; - case GUMBO_TAG_HEAD: - case GUMBO_TAG_BODY: + case HTML_QN(HEAD): + case HTML_QN(BODY): return GUMBO_INSERTION_MODE_IN_BODY; - case GUMBO_TAG_FRAMESET: + case HTML_QN(FRAMESET): return GUMBO_INSERTION_MODE_IN_FRAMESET; - case GUMBO_TAG_HTML: + case HTML_QN(HTML): return GUMBO_INSERTION_MODE_BEFORE_HEAD; default: return is_last ? @@ -673,46 +682,44 @@ static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) { } // Like tag_in, but checks for the tag of a node, rather than a token. -static bool node_tag_in(const GumboNode* node, ...) { +static bool node_qualname_in(const GumboNode* node, ...) 
{ assert(node != NULL); if (node->type != GUMBO_NODE_ELEMENT) { return false; } - GumboTag node_tag = node->v.element.tag; + GumboQualName node_qualname = NODE_QUAL_NAME(node); - va_list tags; - va_start(tags, node); + va_list qualnames; + va_start(qualnames, node); bool result = false; - for (GumboTag tag = va_arg(tags, GumboTag); tag != GUMBO_TAG_LAST; - tag = va_arg(tags, GumboTag)) { - assert(tag <= GUMBO_TAG_LAST); - if (tag == node_tag) { + for (GumboQualName qualname = va_arg(qualnames, GumboQualName); qualname != QUAL_NAME_LAST; + qualname = va_arg(qualnames, GumboQualName)) { + assert(QUAL_NAME_TAG(qualname) <= GUMBO_TAG_LAST); + if (qualname == node_qualname) { result = true; break; } } - va_end(tags); + va_end(qualnames); return result; } -// Like node_tag_in, but for the single-tag case. -static bool node_tag_is(const GumboNode* node, GumboTag tag) { - return node->type == GUMBO_NODE_ELEMENT && node->v.element.tag == tag; +// Like node_qualname_in, but for the single-tag case. 
+static bool node_qualname_is(const GumboNode* node, GumboQualName qualname) { + return node->type == GUMBO_NODE_ELEMENT && NODE_QUAL_NAME(node) == qualname; } // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#mathml-text-integration-point static bool is_mathml_integration_point(const GumboNode* node) { - return node_tag_in(node, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, - GUMBO_TAG_MS, GUMBO_TAG_MTEXT, GUMBO_TAG_LAST) && - node->v.element.tag_namespace == GUMBO_NAMESPACE_MATHML; + return node_qualname_in(node, MATHML_QN(MI), MATHML_QN(MO), MATHML_QN(MN), + MATHML_QN(MS), MATHML_QN(MTEXT), QUAL_NAME_LAST); } // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#html-integration-point static bool is_html_integration_point(const GumboNode* node) { - return (node_tag_in(node, GUMBO_TAG_FOREIGNOBJECT, GUMBO_TAG_DESC, - GUMBO_TAG_TITLE, GUMBO_TAG_LAST) && - node->v.element.tag_namespace == GUMBO_NAMESPACE_SVG) || - (node_tag_is(node, GUMBO_TAG_ANNOTATION_XML) && ( + return node_qualname_in(node, SVG_QN(FOREIGNOBJECT), SVG_QN(DESC), + SVG_QN(TITLE), QUAL_NAME_LAST) || + (node_qualname_is(node, MATHML_QN(ANNOTATION_XML)) && ( attribute_matches(&node->v.element.attributes, "encoding", "text/html") || attribute_matches(&node->v.element.attributes, @@ -767,10 +774,10 @@ static void foster_parent_element(GumboParser* parser, GumboNode* node) { node->parse_flags |= GUMBO_INSERTION_FOSTER_PARENTED; GumboNode* foster_parent_element = open_elements->data[0]; assert(foster_parent_element->type == GUMBO_NODE_ELEMENT); - assert(node_tag_is(foster_parent_element, GUMBO_TAG_HTML)); + assert(node_qualname_is(foster_parent_element, HTML_QN(HTML))); for (int i = open_elements->length; --i > 1; ) { GumboNode* table_element = open_elements->data[i]; - if (node_tag_is(table_element, GUMBO_TAG_TABLE)) { + if (node_qualname_is(table_element, HTML_QN(TABLE))) { foster_parent_element = table_element->parent; if 
(!foster_parent_element || foster_parent_element->type != GUMBO_NODE_ELEMENT) { @@ -816,9 +823,9 @@ static void maybe_flush_text_node_buffer(GumboParser* parser) { state->_current_token->original_text.data - buffer_state->_start_original_text; text_node_data->start_pos = buffer_state->_start_position; - if (state->_foster_parent_insertions && node_tag_in( - get_current_node(parser), GUMBO_TAG_TABLE, GUMBO_TAG_TBODY, - GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR, GUMBO_TAG_LAST)) { + if (state->_foster_parent_insertions && node_qualname_in( + get_current_node(parser), HTML_QN(TABLE), HTML_QN(TBODY), HTML_QN(TFOOT), + HTML_QN(THEAD), HTML_QN(TR), QUAL_NAME_LAST)) { foster_parent_element(parser, text_node); } else { append_node( @@ -846,7 +853,7 @@ static GumboNode* pop_current_node(GumboParser* parser) { GumboParserState* state = parser->_parser_state; maybe_flush_text_node_buffer(parser); if (state->_open_elements.length > 0) { - assert(node_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML)); + assert(node_qualname_is(state->_open_elements.data[0], HTML_QN(HTML))); gumbo_debug( "Popping %s node.\n", gumbo_normalized_tagname(get_current_node(parser)->v.element.tag)); @@ -858,10 +865,10 @@ static GumboNode* pop_current_node(GumboParser* parser) { } assert(current_node->type == GUMBO_NODE_ELEMENT); bool is_closed_body_or_html_tag = - (node_tag_is(current_node, GUMBO_TAG_BODY) && state->_closed_body_tag) || - (node_tag_is(current_node, GUMBO_TAG_HTML) && state->_closed_html_tag); + (node_qualname_is(current_node, HTML_QN(BODY)) && state->_closed_body_tag) || + (node_qualname_is(current_node, HTML_QN(HTML)) && state->_closed_html_tag); if ((state->_current_token->type != GUMBO_TOKEN_END_TAG || - !node_tag_is(current_node, state->_current_token->v.end_tag)) && + current_node->v.element.tag != state->_current_token->v.end_tag) && !is_closed_body_or_html_tag) { current_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG; } @@ -885,25 +892,24 @@ static void 
append_comment_node( // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-row-context static void clear_stack_to_table_row_context(GumboParser* parser) { - while (!node_tag_in(get_current_node(parser), - GUMBO_TAG_HTML, GUMBO_TAG_TR, GUMBO_TAG_LAST)) { + while (!node_qualname_in(get_current_node(parser), HTML_QN(HTML), HTML_QN(TR), + QUAL_NAME_LAST)) { pop_current_node(parser); } } // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-context static void clear_stack_to_table_context(GumboParser* parser) { - while (!node_tag_in(get_current_node(parser), - GUMBO_TAG_HTML, GUMBO_TAG_TABLE, GUMBO_TAG_LAST)) { + while (!node_qualname_in(get_current_node(parser), HTML_QN(HTML), HTML_QN(TABLE), + QUAL_NAME_LAST)) { pop_current_node(parser); } } // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-body-context void clear_stack_to_table_body_context(GumboParser* parser) { - while (!node_tag_in(get_current_node(parser), GUMBO_TAG_HTML, - GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, - GUMBO_TAG_LAST)) { + while (!node_qualname_in(get_current_node(parser), HTML_QN(HTML), HTML_QN(TBODY), + HTML_QN(TFOOT), HTML_QN(THEAD), QUAL_NAME_LAST)) { pop_current_node(parser); } } @@ -966,9 +972,9 @@ static void insert_element(GumboParser* parser, GumboNode* node, if (!is_reconstructing_formatting_elements) { maybe_flush_text_node_buffer(parser); } - if (state->_foster_parent_insertions && node_tag_in( - get_current_node(parser), GUMBO_TAG_TABLE, GUMBO_TAG_TBODY, - GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR, GUMBO_TAG_LAST)) { + if (state->_foster_parent_insertions && node_qualname_in( + get_current_node(parser), HTML_QN(TABLE), HTML_QN(TBODY), HTML_QN(TFOOT), + HTML_QN(THEAD), HTML_QN(TR), QUAL_NAME_LAST)) { foster_parent_element(parser, node); gumbo_vector_add(parser, (void*) node, 
&state->_open_elements); return; @@ -1073,7 +1079,7 @@ static bool find_last_anchor_index(GumboParser* parser, int* anchor_index) { if (node == &kActiveFormattingScopeMarker) { return false; } - if (node_tag_is(node, GUMBO_TAG_A)) { + if (node_qualname_is(node, HTML_QN(A))) { *anchor_index = i; return true; } @@ -1097,10 +1103,8 @@ static int count_formatting_elements_of_tag( break; } assert(node->type == GUMBO_NODE_ELEMENT); - GumboElement* element = &node->v.element; - if (node_tag_is(node, desired_element->tag) && - element->tag_namespace == desired_element->tag_namespace && - all_attributes_match(&element->attributes, + if (node_qualname_is(node, NODE_QUAL_NAME(desired_node)) && + all_attributes_match(&node->v.element.attributes, &desired_element->attributes)) { num_identical_elements++; *earliest_matching_index = i; @@ -1269,28 +1273,28 @@ static GumboQuirksModeEnum compute_quirks_mode( // The following functions are all defined by the "has an element in __ scope" // sections of the HTML5 spec: // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-the-specific-scope -// The basic idea behind them is that they check for an element of the given tag -// name, contained within a scope formed by a set of other tag names. For -// example, "has an element in list scope" looks for an element of the given tag -// within the nearest enclosing
    or