From 5f77e9852df155b42f03f587aa4702c8d2ed729a Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Sun, 9 Nov 2014 15:23:51 -0800 Subject: [PATCH 01/12] Add a token type for CDATA. --- src/parser.c | 2 +- src/token_type.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/parser.c b/src/parser.c index a97e54b4..e12fd9c1 100644 --- a/src/parser.c +++ b/src/parser.c @@ -336,7 +336,7 @@ typedef struct _TextNodeBufferState { // The source position of the start of this text node. GumboSourcePosition _start_position; - // The type of node that will be inserted (TEXT or WHITESPACE). + // The type of node that will be inserted (TEXT, CDATA, or WHITESPACE). GumboNodeType _type; } TextNodeBufferState; diff --git a/src/token_type.h b/src/token_type.h index 5874d1a2..eeab5078 100644 --- a/src/token_type.h +++ b/src/token_type.h @@ -29,6 +29,7 @@ typedef enum { GUMBO_TOKEN_COMMENT, GUMBO_TOKEN_WHITESPACE, GUMBO_TOKEN_CHARACTER, + GUMBO_TOKEN_CDATA, GUMBO_TOKEN_NULL, GUMBO_TOKEN_EOF } GumboTokenType; From 2adaeb0c9d7c5295382e93d16b8a48d178f913a1 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Sun, 9 Nov 2014 15:29:48 -0800 Subject: [PATCH 02/12] Add a state flag for whether the tokenizer is in a cdata section, and set it as appropriate. --- src/tokenizer.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/tokenizer.c b/src/tokenizer.c index dcaf8165..a9b1a72b 100644 --- a/src/tokenizer.c +++ b/src/tokenizer.c @@ -136,6 +136,10 @@ typedef struct GumboInternalTokenizerState { // markup declaration state. bool _is_current_node_foreign; + // A flag indicating whether the tokenizer is in a CDATA section. If so, then + // text tokens emitted will be GUMBO_TOKEN_CDATA. + bool _is_in_cdata; + // Certain states (notably character references) may emit two character tokens // at once, but the contract for lex() fills in only one token at a time. The // extra character is buffered here, and then this is checked on entry to @@ -475,7 +479,11 @@ static void finish_doctype_system_id(GumboParser* parser) { // Writes a single specified character to the output token. static void emit_char(GumboParser* parser, int c, GumboToken* output) { - output->type = get_char_token_type(c); + if (parser->_tokenizer_state->_is_in_cdata) { + output->type = GUMBO_TOKEN_CDATA; + } else { + output->type = get_char_token_type(c); + } output->v.character = c; finish_token(parser, output); } @@ -850,6 +858,7 @@ void gumbo_tokenizer_state_init( gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); tokenizer->_reconsume_current_input = false; tokenizer->_is_current_node_foreign = false; + tokenizer->_is_in_cdata = false; tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST; tokenizer->_buffered_emit_char = kGumboNoChar; @@ -2041,6 +2050,7 @@ static StateResult handle_markup_declaration_state( utf8iterator_maybe_consume_match( &tokenizer->_input, "[CDATA[", sizeof("[CDATA[") - 1, true)) { gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA); + tokenizer->_is_in_cdata = true; tokenizer->_reconsume_current_input = true; } else { tokenizer_add_parse_error(parser, GUMBO_ERR_DASHES_OR_DOCTYPE); @@ -2813,6 +2823,7 @@ static StateResult handle_cdata_state( tokenizer->_reconsume_current_input = true; reset_token_start_point(tokenizer); gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); + tokenizer->_is_in_cdata = true; return NEXT_CHAR; } else { return emit_current_char(parser, output); From 9662fd0cd9333776e1914e25469e57df9abc6da0 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Sun, 9 Nov 2014 20:41:25 -0800 Subject: [PATCH 03/12] Add CDATA handling to parser, including a test for it. --- src/parser.c | 12 +++++++++--- src/tokenizer.c | 12 ++++++------ tests/parser.cc | 15 +++++++++++++++ 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/src/parser.c b/src/parser.c index e12fd9c1..17dcf55c 100644 --- a/src/parser.c +++ b/src/parser.c @@ -806,7 +806,8 @@ static void maybe_flush_text_node_buffer(GumboParser* parser) { } assert(buffer_state->_type == GUMBO_NODE_WHITESPACE || - buffer_state->_type == GUMBO_NODE_TEXT); + buffer_state->_type == GUMBO_NODE_TEXT || + buffer_state->_type == GUMBO_NODE_CDATA); GumboNode* text_node = create_node(parser, buffer_state->_type); GumboText* text_node_data = &text_node->v.text; text_node_data->text = gumbo_string_buffer_to_string( @@ -1035,7 +1036,8 @@ static GumboNode* insert_foreign_element( static void insert_text_token(GumboParser* parser, GumboToken* token) { assert(token->type == GUMBO_TOKEN_WHITESPACE || - token->type == GUMBO_TOKEN_CHARACTER); + token->type == GUMBO_TOKEN_CHARACTER || + token->type == GUMBO_TOKEN_CDATA); TextNodeBufferState* buffer_state = &parser->_parser_state->_text_node; if (buffer_state->_buffer.length == 0) { // Initialize position fields. @@ -1046,6 +1048,8 @@ static void insert_text_token(GumboParser* parser, GumboToken* token) { parser, token->v.character, &buffer_state->_buffer); if (token->type == GUMBO_TOKEN_CHARACTER) { buffer_state->_type = GUMBO_NODE_TEXT; + } else if (token->type == GUMBO_TOKEN_CDATA) { + buffer_state->_type = GUMBO_NODE_CDATA; } gumbo_debug("Inserting text token '%c'.\n", token->v.character); } @@ -2307,7 +2311,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { reconstruct_active_formatting_elements(parser); insert_text_token(parser, token); return true; - } else if (token->type == GUMBO_TOKEN_CHARACTER) { + } else if (token->type == GUMBO_TOKEN_CHARACTER || + token->type == GUMBO_TOKEN_CDATA) { reconstruct_active_formatting_elements(parser); insert_text_token(parser, token); set_frameset_not_ok(parser); @@ -3638,6 +3643,7 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) { case GUMBO_TOKEN_WHITESPACE: insert_text_token(parser, token); return true; + case GUMBO_TOKEN_CDATA: case GUMBO_TOKEN_CHARACTER: insert_text_token(parser, token); set_frameset_not_ok(parser); diff --git a/src/tokenizer.c b/src/tokenizer.c index a9b1a72b..a281ead2 100644 --- a/src/tokenizer.c +++ b/src/tokenizer.c @@ -319,7 +319,11 @@ static int ensure_lowercase(int c) { return c >= 'A' && c <= 'Z' ? c + 0x20 : c; } -static GumboTokenType get_char_token_type(int c) { +static GumboTokenType get_char_token_type(bool is_in_cdata, int c) { + if (is_in_cdata && c != -1) { + return GUMBO_TOKEN_CDATA; + } + switch (c) { case '\t': case '\n': @@ -479,11 +483,7 @@ static void finish_doctype_system_id(GumboParser* parser) { // Writes a single specified character to the output token. static void emit_char(GumboParser* parser, int c, GumboToken* output) { - if (parser->_tokenizer_state->_is_in_cdata) { - output->type = GUMBO_TOKEN_CDATA; - } else { - output->type = get_char_token_type(c); - } + output->type = get_char_token_type(parser->_tokenizer_state->_is_in_cdata, c); output->v.character = c; finish_token(parser, output); } diff --git a/tests/parser.cc b/tests/parser.cc index 4299b389..e5836eec 100644 --- a/tests/parser.cc +++ b/tests/parser.cc @@ -1522,6 +1522,21 @@ TEST_F(GumboParserTest, ImplicitlyCloseLists) { ASSERT_EQ(1, GetChildCount(li2)); } +TEST_F(GumboParserTest, CData) { + Parse("this is text"); + + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(1, GetChildCount(body)); + + GumboNode* svg = GetChild(body, 0); + ASSERT_EQ(1, GetChildCount(svg)); + + GumboNode* cdata = GetChild(svg, 0); + ASSERT_EQ(GUMBO_NODE_CDATA, cdata->type); + EXPECT_STREQ("this is text", cdata->v.text.text); +} + TEST_F(GumboParserTest, FormattingTagsInHeading) { Parse("

This is old

text"); From 025480ac42600b199edc0698be9cb844f0efd3cd Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Sun, 9 Nov 2014 20:48:32 -0800 Subject: [PATCH 04/12] Add test for CDATA sections not in foreign content. --- tests/parser.cc | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/parser.cc b/tests/parser.cc index e5836eec..51600222 100644 --- a/tests/parser.cc +++ b/tests/parser.cc @@ -1537,6 +1537,21 @@ TEST_F(GumboParserTest, CData) { EXPECT_STREQ("this is text", cdata->v.text.text); } +TEST_F(GumboParserTest, CDataInBody) { + Parse("
"); + + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(1, GetChildCount(body)); + + GumboNode* div = GetChild(body, 0); + ASSERT_EQ(1, GetChildCount(div)); + + GumboNode* cdata = GetChild(div, 0); + ASSERT_EQ(GUMBO_NODE_COMMENT, cdata->type); + EXPECT_STREQ("[CDATA[this is text]]", cdata->v.text.text); +} + TEST_F(GumboParserTest, FormattingTagsInHeading) { Parse("

This is old

text"); From a14277248948ecc1dd4a2405e0c38440864b8390 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Mon, 10 Nov 2014 13:09:38 -0800 Subject: [PATCH 05/12] Fix a couple comment issues (line-wrapping, unfinished comments) in utf8.c --- src/utf8.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/utf8.c b/src/utf8.c index a5c5b0e2..a6a30376 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -133,10 +133,10 @@ static void read_char(Utf8Iterator* iter) { decode(&state, &code_point, (uint32_t) (unsigned char) (*c)); if (state == UTF8_ACCEPT) { iter->_width = c - iter->_start + 1; - // This is the special handling for carriage returns that is mandated by the - // HTML5 spec. Since we're looking for particular 7-bit literal characters, - // we operate in terms of chars and only need a check for iter overrun, - // instead of having to read in a full next code point. + // This is the special handling for carriage returns that is mandated by + // the HTML5 spec. Since we're looking for particular 7-bit literal + // characters, we operate in terms of chars and only need a check for iter + // overrun, instead of having to read in a full next code point. // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream if (code_point == '\r') { assert(iter->_width == 1); @@ -165,10 +165,11 @@ static void read_char(Utf8Iterator* iter) { return; } } - // If we got here without exiting early, then we've reached the end of the iterator. - // Add an error for truncated input, set the width to consume the rest of the - // iterator, and emit a replacement character. The next time we enter this method, - // it will detect that there's no input to consume and + // If we got here without exiting early, then we've reached the end of the + // iterator. Add an error for truncated input, set the width to consume the + // rest of the iterator, and emit a replacement character. The next time we + // enter this method, it will detect that there's no input to consume and + // output an EOF. iter->_current = kUtf8ReplacementChar; iter->_width = iter->_end - iter->_start; add_error(iter, GUMBO_ERR_UTF8_TRUNCATED); From a688ca249d59294bd3938d01a34619fc4f7863a9 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Mon, 10 Nov 2014 13:10:25 -0800 Subject: [PATCH 06/12] Print the decimal value of the current character in the debug output for lexing, to ease debugging non-printable characters. --- src/tokenizer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/tokenizer.c b/src/tokenizer.c index a281ead2..f88f9d4c 100644 --- a/src/tokenizer.c +++ b/src/tokenizer.c @@ -2940,7 +2940,8 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) { assert(!tokenizer->_temporary_buffer_emit); assert(tokenizer->_buffered_emit_char == kGumboNoChar); int c = utf8iterator_current(&tokenizer->_input); - gumbo_debug("Lexing character '%c' in state %d.\n", c, tokenizer->_state); + gumbo_debug("Lexing character '%c' (%d) in state %d.\n", + c, c, tokenizer->_state); StateResult result = dispatch_table[tokenizer->_state](parser, tokenizer, c, output); // We need to clear reconsume_current_input before returning to prevent From 191311af43048e7fed194a5c0a0066aeedbab09d Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Mon, 15 Dec 2014 11:41:18 -0800 Subject: [PATCH 07/12] Add test for unsafe cdata. --- tests/parser.cc | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/parser.cc b/tests/parser.cc index 51600222..89f24f33 100644 --- a/tests/parser.cc +++ b/tests/parser.cc @@ -1537,6 +1537,22 @@ TEST_F(GumboParserTest, CData) { EXPECT_STREQ("this is text", cdata->v.text.text); } +TEST_F(GumboParserTest, CDataUnsafe) { + Parse("\0filler\0text\0"); + + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(1, GetChildCount(body)); + + GumboNode* svg = GetChild(body, 0); + ASSERT_EQ(1, GetChildCount(svg)); + + GumboNode* cdata = GetChild(svg, 0); + ASSERT_EQ(GUMBO_NODE_CDATA, cdata->type); + // \xEF\xBF\xBD = unicode replacement char + EXPECT_STREQ("fillertext", cdata->v.text.text); +} + TEST_F(GumboParserTest, CDataInBody) { Parse("
"); From 566890cd41b9af2910b16718aa32e482e71a01e6 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Tue, 10 Feb 2015 18:17:06 -0800 Subject: [PATCH 08/12] Fix missing case statement for GUMBO_TOKEN_CDATA in handle_parser_error. (The whole error handling really needs to be redone, it's not very helpful to users.) --- src/error.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/error.c b/src/error.c index 3239a0b6..0cae4639 100644 --- a/src/error.c +++ b/src/error.c @@ -106,6 +106,7 @@ static void handle_parser_error(GumboParser* parser, // But just in case... print_message(parser, output, "Comments aren't legal here"); return; + case GUMBO_TOKEN_CDATA: case GUMBO_TOKEN_WHITESPACE: case GUMBO_TOKEN_CHARACTER: print_message(parser, output, "Character tokens aren't legal here"); From 484cff340338acbcd66b546eeb8a2724dbd76f6a Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Tue, 10 Feb 2015 18:24:33 -0800 Subject: [PATCH 09/12] Additional debugging instructions. --- DEBUGGING.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/DEBUGGING.md b/DEBUGGING.md index 262ba1f1..8b8a56df 100644 --- a/DEBUGGING.md +++ b/DEBUGGING.md @@ -48,6 +48,9 @@ $ gdb .libs/lt-gumbo_test core The same goes for core dumps in other example binaries. +To run only a single unit test, pass the --gtest_filter='TestName' flag to the +lt-gumbo_test binary. + Assertions ========== From c629587a9936186e9fd0b92047d0489aaf2ffa03 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Mon, 16 Feb 2015 18:06:12 -0800 Subject: [PATCH 10/12] Add a test for utf8iterator_maybe_consume_match followed by a null. --- tests/utf8.cc | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/utf8.cc b/tests/utf8.cc index 479e4bc8..a98c69c2 100644 --- a/tests/utf8.cc +++ b/tests/utf8.cc @@ -556,6 +556,21 @@ TEST_F(Utf8Test, MatchesCaseInsensitive) { EXPECT_EQ(-1, utf8iterator_current(&input_)); } +TEST_F(Utf8Test, MatchFollowedByNullByte) { + // Can't use ResetText, as the implicit strlen will choke on the null. + text_ = "CDATA\0f"; + utf8iterator_init(&parser_, text_, 7, &input_); + + EXPECT_TRUE(utf8iterator_maybe_consume_match( + &input_, "cdata", sizeof("cdata") - 1, false)); + + EXPECT_EQ(0, utf8iterator_current(&input_)); + EXPECT_EQ('\0', *utf8iterator_get_char_pointer(&input_)); + utf8iterator_next(&input_); + EXPECT_EQ('f', utf8iterator_current(&input_)); + EXPECT_EQ('f', *utf8iterator_get_char_pointer(&input_)); +} + TEST_F(Utf8Test, MarkReset) { ResetText("this is a test"); Advance(5); From bab2d4a342bf20cf5000cca8ab6831275604238a Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Mon, 16 Feb 2015 21:56:51 -0800 Subject: [PATCH 11/12] Update parser and tokenizer tests with testcases for null CDATA, and make sure their input mechanisms can accept this without relying on strlen. --- tests/parser.cc | 6 +++++- tests/tokenizer.cc | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/tests/parser.cc b/tests/parser.cc index 72b60b41..b87f60c0 100644 --- a/tests/parser.cc +++ b/tests/parser.cc @@ -1538,7 +1538,11 @@ TEST_F(GumboParserTest, CData) { } TEST_F(GumboParserTest, CDataUnsafe) { - Parse("\0filler\0text\0"); + // Can't use Parse() because of the strlen + output_ = gumbo_parse_with_options( + &options_, "\0filler\0text\0", + sizeof("\0filler\0text\0") - 1); + root_ = output_->document; GumboNode* body; GetAndAssertBody(root_, &body); diff --git a/tests/tokenizer.cc b/tests/tokenizer.cc index 532bad98..2e4b04ac 100644 --- a/tests/tokenizer.cc +++ b/tests/tokenizer.cc @@ -450,6 +450,24 @@ TEST_F(GumboTokenizerTest, ScriptDoubleEscaped) { EXPECT_EQ('>', token_.v.character); } +TEST_F(GumboTokenizerTest, CData) { + // SetInput uses strlen and so can't handle nulls. + text_ = "\0filler\0text\0"; + gumbo_tokenizer_state_destroy(&parser_); + gumbo_tokenizer_state_init( + &parser_, text_, sizeof("\0filler\0text\0") - 1); + gumbo_tokenizer_set_is_current_node_foreign(&parser_, true); + + EXPECT_TRUE(gumbo_lex(&parser_, &token_)); + EXPECT_EQ(GUMBO_TOKEN_CDATA, token_.type); + EXPECT_EQ(0, token_.v.character); + + gumbo_token_destroy(&parser_, &token_); + EXPECT_TRUE(gumbo_lex(&parser_, &token_)); + EXPECT_EQ(GUMBO_TOKEN_CDATA, token_.type); + EXPECT_EQ('f', token_.v.character); +} + TEST_F(GumboTokenizerTest, StyleHasTagEmbedded) { SetInput(" */"); Advance(1); From ece6a44bbfa6d5e355b996cfa8a89ef3497947cc Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Mon, 16 Feb 2015 22:35:41 -0800 Subject: [PATCH 12/12] Fix handling of nulls in CDATA sections. --- src/parser.c | 2 +- src/tokenizer.c | 2 +- tests/parser.cc | 3 ++- tests/tokenizer.cc | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/parser.c b/src/parser.c index b2c1ad8b..9296e5d8 100644 --- a/src/parser.c +++ b/src/parser.c @@ -1021,6 +1021,7 @@ static GumboNode* insert_foreign_element( static void insert_text_token(GumboParser* parser, GumboToken* token) { assert(token->type == GUMBO_TOKEN_WHITESPACE || token->type == GUMBO_TOKEN_CHARACTER || + token->type == GUMBO_TOKEN_NULL || token->type == GUMBO_TOKEN_CDATA); TextNodeBufferState* buffer_state = &parser->_parser_state->_text_node; if (buffer_state->_buffer.length == 0) { @@ -3490,7 +3491,6 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) { switch (token->type) { case GUMBO_TOKEN_NULL: parser_add_parse_error(parser, token); - token->type = GUMBO_TOKEN_CHARACTER; token->v.character = kUtf8ReplacementChar; insert_text_token(parser, token); return false; diff --git a/src/tokenizer.c b/src/tokenizer.c index 89c22d13..8c9272c0 100644 --- a/src/tokenizer.c +++ b/src/tokenizer.c @@ -320,7 +320,7 @@ static int ensure_lowercase(int c) { } static GumboTokenType get_char_token_type(bool is_in_cdata, int c) { - if (is_in_cdata && c != -1) { + if (is_in_cdata && c > 0) { return GUMBO_TOKEN_CDATA; } diff --git a/tests/parser.cc b/tests/parser.cc index b87f60c0..590f549a 100644 --- a/tests/parser.cc +++ b/tests/parser.cc @@ -1554,7 +1554,8 @@ TEST_F(GumboParserTest, CDataUnsafe) { GumboNode* cdata = GetChild(svg, 0); ASSERT_EQ(GUMBO_NODE_CDATA, cdata->type); // \xEF\xBF\xBD = unicode replacement char - EXPECT_STREQ("fillertext", cdata->v.text.text); + EXPECT_STREQ("\xEF\xBF\xBD" "filler\xEF\xBF\xBD" "text\xEF\xBF\xBD", + cdata->v.text.text); } TEST_F(GumboParserTest, CDataInBody) { diff --git a/tests/tokenizer.cc b/tests/tokenizer.cc index 2e4b04ac..916494e2 100644 --- a/tests/tokenizer.cc +++ b/tests/tokenizer.cc @@ -459,7 +459,7 @@ TEST_F(GumboTokenizerTest, CData) { gumbo_tokenizer_set_is_current_node_foreign(&parser_, true); EXPECT_TRUE(gumbo_lex(&parser_, &token_)); - EXPECT_EQ(GUMBO_TOKEN_CDATA, token_.type); + EXPECT_EQ(GUMBO_TOKEN_NULL, token_.type); EXPECT_EQ(0, token_.v.character); gumbo_token_destroy(&parser_, &token_);