Merge pull request #273 from nostrademons/cdata

Handle CDATA properly
google · Feb 17, 2015 · 7a55fdc · 7a55fdc
2 parents 12ce725 + ece6a44
commit 7a55fdc
Show file tree

Hide file tree

Showing 9 changed files with 124 additions and 16 deletions.
diff --git a/DEBUGGING.md b/DEBUGGING.md
@@ -48,6 +48,9 @@ $ gdb .libs/lt-gumbo_test core
 
 The same goes for core dumps in other example binaries.
 
+To run only a single unit test, pass the --gtest_filter='TestName' flag to the
+lt-gumbo_test binary.
+
 Assertions
 ==========
 

diff --git a/src/error.c b/src/error.c
@@ -106,6 +106,7 @@ static void handle_parser_error(GumboParser* parser,
       // But just in case...
       print_message(parser, output, "Comments aren't legal here");
       return;
+    case GUMBO_TOKEN_CDATA:
     case GUMBO_TOKEN_WHITESPACE:
     case GUMBO_TOKEN_CHARACTER:
       print_message(parser, output, "Character tokens aren't legal here");

diff --git a/src/parser.c b/src/parser.c
@@ -345,7 +345,7 @@ typedef struct _TextNodeBufferState {
   // The source position of the start of this text node.
   GumboSourcePosition _start_position;
 
-  // The type of node that will be inserted (TEXT or WHITESPACE).
+  // The type of node that will be inserted (TEXT, CDATA, or WHITESPACE).
   GumboNodeType _type;
 } TextNodeBufferState;
 
@@ -793,7 +793,8 @@ static void maybe_flush_text_node_buffer(GumboParser* parser) {
   }
 
   assert(buffer_state->_type == GUMBO_NODE_WHITESPACE ||
-         buffer_state->_type == GUMBO_NODE_TEXT);
+         buffer_state->_type == GUMBO_NODE_TEXT ||
+         buffer_state->_type == GUMBO_NODE_CDATA);
   GumboNode* text_node = create_node(parser, buffer_state->_type);
   GumboText* text_node_data = &text_node->v.text;
   text_node_data->text = gumbo_string_buffer_to_string(
@@ -1019,7 +1020,9 @@ static GumboNode* insert_foreign_element(
 
 static void insert_text_token(GumboParser* parser, GumboToken* token) {
   assert(token->type == GUMBO_TOKEN_WHITESPACE ||
-         token->type == GUMBO_TOKEN_CHARACTER);
+         token->type == GUMBO_TOKEN_CHARACTER ||
+         token->type == GUMBO_TOKEN_NULL ||
+         token->type == GUMBO_TOKEN_CDATA);
   TextNodeBufferState* buffer_state = &parser->_parser_state->_text_node;
   if (buffer_state->_buffer.length == 0) {
     // Initialize position fields.
@@ -1030,6 +1033,8 @@ static void insert_text_token(GumboParser* parser, GumboToken* token) {
       parser, token->v.character, &buffer_state->_buffer);
   if (token->type == GUMBO_TOKEN_CHARACTER) {
     buffer_state->_type = GUMBO_NODE_TEXT;
+  } else if (token->type == GUMBO_TOKEN_CDATA) {
+    buffer_state->_type = GUMBO_NODE_CDATA;
   }
   gumbo_debug("Inserting text token '%c'.\n", token->v.character);
 }
@@ -2207,7 +2212,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
     reconstruct_active_formatting_elements(parser);
     insert_text_token(parser, token);
     return true;
-  } else if (token->type == GUMBO_TOKEN_CHARACTER) {
+  } else if (token->type == GUMBO_TOKEN_CHARACTER ||
+             token->type == GUMBO_TOKEN_CDATA) {
     reconstruct_active_formatting_elements(parser);
     insert_text_token(parser, token);
     set_frameset_not_ok(parser);
@@ -3485,13 +3491,13 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
   switch (token->type) {
     case GUMBO_TOKEN_NULL:
       parser_add_parse_error(parser, token);
-      token->type = GUMBO_TOKEN_CHARACTER;
       token->v.character = kUtf8ReplacementChar;
       insert_text_token(parser, token);
       return false;
     case GUMBO_TOKEN_WHITESPACE:
       insert_text_token(parser, token);
       return true;
+    case GUMBO_TOKEN_CDATA:
     case GUMBO_TOKEN_CHARACTER:
       insert_text_token(parser, token);
       set_frameset_not_ok(parser);

diff --git a/src/token_type.h b/src/token_type.h
@@ -29,6 +29,7 @@ typedef enum {
   GUMBO_TOKEN_COMMENT,
   GUMBO_TOKEN_WHITESPACE,
   GUMBO_TOKEN_CHARACTER,
+  GUMBO_TOKEN_CDATA,
   GUMBO_TOKEN_NULL,
   GUMBO_TOKEN_EOF
 } GumboTokenType;

diff --git a/src/tokenizer.c b/src/tokenizer.c
@@ -136,6 +136,10 @@ typedef struct GumboInternalTokenizerState {
   // markup declaration state.
   bool _is_current_node_foreign;
 
+  // A flag indicating whether the tokenizer is in a CDATA section.  If so, then
+  // text tokens emitted will be GUMBO_TOKEN_CDATA.
+  bool _is_in_cdata;
+
   // Certain states (notably character references) may emit two character tokens
   // at once, but the contract for lex() fills in only one token at a time.  The
   // extra character is buffered here, and then this is checked on entry to
@@ -315,7 +319,11 @@ static int ensure_lowercase(int c) {
   return c >= 'A' && c <= 'Z' ? c + 0x20 : c;
 }
 
-static GumboTokenType get_char_token_type(int c) {
+static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
+  if (is_in_cdata && c > 0) {
+    return GUMBO_TOKEN_CDATA;
+  }
+
   switch (c) {
     case '\t':
     case '\n':
@@ -475,7 +483,7 @@ static void finish_doctype_system_id(GumboParser* parser) {
 
 // Writes a single specified character to the output token.
 static void emit_char(GumboParser* parser, int c, GumboToken* output) {
-  output->type = get_char_token_type(c);
+  output->type = get_char_token_type(parser->_tokenizer_state->_is_in_cdata, c);
   output->v.character = c;
   finish_token(parser, output);
 }
@@ -850,6 +858,7 @@ void gumbo_tokenizer_state_init(
   gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
   tokenizer->_reconsume_current_input = false;
   tokenizer->_is_current_node_foreign = false;
+  tokenizer->_is_in_cdata = false;
   tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
 
   tokenizer->_buffered_emit_char = kGumboNoChar;
@@ -2041,6 +2050,7 @@ static StateResult handle_markup_declaration_state(
              utf8iterator_maybe_consume_match(
                 &tokenizer->_input, "[CDATA[", sizeof("[CDATA[") - 1, true)) {
     gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA);
+    tokenizer->_is_in_cdata = true;
     tokenizer->_reconsume_current_input = true;
   } else {
     tokenizer_add_parse_error(parser, GUMBO_ERR_DASHES_OR_DOCTYPE);
@@ -2814,6 +2824,7 @@ static StateResult handle_cdata_state(
     tokenizer->_reconsume_current_input = true;
     reset_token_start_point(tokenizer);
     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+    tokenizer->_is_in_cdata = false;  // CDATA ended; emit normal text again.
     return NEXT_CHAR;
   } else {
     return emit_current_char(parser, output);
@@ -2930,7 +2941,8 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
     assert(!tokenizer->_temporary_buffer_emit);
     assert(tokenizer->_buffered_emit_char == kGumboNoChar);
     int c = utf8iterator_current(&tokenizer->_input);
-    gumbo_debug("Lexing character '%c' in state %d.\n", c, tokenizer->_state);
+    gumbo_debug("Lexing character '%c' (%d) in state %d.\n",
+        c, c, tokenizer->_state);
     StateResult result =
         dispatch_table[tokenizer->_state](parser, tokenizer, c, output);
     // We need to clear reconsume_current_input before returning to prevent

diff --git a/src/utf8.c b/src/utf8.c
@@ -133,10 +133,10 @@ static void read_char(Utf8Iterator* iter) {
     decode(&state, &code_point, (uint32_t) (unsigned char) (*c));
     if (state == UTF8_ACCEPT) {
       iter->_width = c - iter->_start + 1;
-      // This is the special handling for carriage returns that is mandated by the
-      // HTML5 spec.  Since we're looking for particular 7-bit literal characters,
-      // we operate in terms of chars and only need a check for iter overrun,
-      // instead of having to read in a full next code point.
+      // This is the special handling for carriage returns that is mandated by
+      // the HTML5 spec.  Since we're looking for particular 7-bit literal
+      // characters, we operate in terms of chars and only need a check for iter
+      // overrun, instead of having to read in a full next code point.
       // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
       if (code_point == '\r') {
         assert(iter->_width == 1);
@@ -165,10 +165,11 @@ static void read_char(Utf8Iterator* iter) {
       return;
     }
   }
-  // If we got here without exiting early, then we've reached the end of the iterator.
-  // Add an error for truncated input, set the width to consume the rest of the
-  // iterator, and emit a replacement character.  The next time we enter this method,
-  // it will detect that there's no input to consume and 
+  // If we got here without exiting early, then we've reached the end of the
+  // iterator.  Add an error for truncated input, set the width to consume the
+  // rest of the iterator, and emit a replacement character.  The next time we
+  // enter this method, it will detect that there's no input to consume and
+  // output an EOF.
   iter->_current = kUtf8ReplacementChar;
   iter->_width = iter->_end - iter->_start;
   add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);

diff --git a/tests/parser.cc b/tests/parser.cc
@@ -1522,6 +1522,57 @@ TEST_F(GumboParserTest, ImplicitlyCloseLists) {
   ASSERT_EQ(1, GetChildCount(li2));
 }
 
+TEST_F(GumboParserTest, CData) {
+  Parse("<svg><![CDATA[this is text]]></svg>");
+
+  GumboNode* body;
+  GetAndAssertBody(root_, &body);
+  ASSERT_EQ(1, GetChildCount(body));
+
+  GumboNode* svg = GetChild(body, 0);
+  ASSERT_EQ(1, GetChildCount(svg));
+
+  GumboNode* cdata = GetChild(svg, 0);
+  ASSERT_EQ(GUMBO_NODE_CDATA, cdata->type);
+  EXPECT_STREQ("this is text", cdata->v.text.text);
+}
+
+TEST_F(GumboParserTest, CDataUnsafe) {
+  // Can't use Parse() because of the strlen
+  output_ = gumbo_parse_with_options(
+      &options_, "<svg><![CDATA[\0filler\0text\0]]>",
+      sizeof("<svg><![CDATA[\0filler\0text\0]]>") - 1);
+  root_ = output_->document;
+
+  GumboNode* body;
+  GetAndAssertBody(root_, &body);
+  ASSERT_EQ(1, GetChildCount(body));
+
+  GumboNode* svg = GetChild(body, 0);
+  ASSERT_EQ(1, GetChildCount(svg));
+
+  GumboNode* cdata = GetChild(svg, 0);
+  ASSERT_EQ(GUMBO_NODE_CDATA, cdata->type);
+  // \xEF\xBF\xBD = unicode replacement char
+  EXPECT_STREQ("\xEF\xBF\xBD" "filler\xEF\xBF\xBD" "text\xEF\xBF\xBD",
+      cdata->v.text.text);
+}
+
+TEST_F(GumboParserTest, CDataInBody) {
+  Parse("<div><![CDATA[this is text]]></div>");
+
+  GumboNode* body;
+  GetAndAssertBody(root_, &body);
+  ASSERT_EQ(1, GetChildCount(body));
+
+  GumboNode* div = GetChild(body, 0);
+  ASSERT_EQ(1, GetChildCount(div));
+
+  GumboNode* cdata = GetChild(div, 0);
+  ASSERT_EQ(GUMBO_NODE_COMMENT, cdata->type);
+  EXPECT_STREQ("[CDATA[this is text]]", cdata->v.text.text);
+}
+
 TEST_F(GumboParserTest, FormattingTagsInHeading) {
   Parse("<h2>This is <b>old</h2>text");
 

diff --git a/tests/tokenizer.cc b/tests/tokenizer.cc
@@ -450,6 +450,24 @@ TEST_F(GumboTokenizerTest, ScriptDoubleEscaped) {
   EXPECT_EQ('>', token_.v.character);
 }
 
+TEST_F(GumboTokenizerTest, CData) {
+  // SetInput uses strlen and so can't handle nulls.
+  text_ = "<![CDATA[\0filler\0text\0]]>";
+  gumbo_tokenizer_state_destroy(&parser_);
+  gumbo_tokenizer_state_init(
+      &parser_, text_, sizeof("<![CDATA[\0filler\0text\0]]>") - 1);
+  gumbo_tokenizer_set_is_current_node_foreign(&parser_, true);
+
+  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
+  EXPECT_EQ(GUMBO_TOKEN_NULL, token_.type);
+  EXPECT_EQ(0, token_.v.character);
+
+  gumbo_token_destroy(&parser_, &token_);
+  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
+  EXPECT_EQ(GUMBO_TOKEN_CDATA, token_.type);
+  EXPECT_EQ('f', token_.v.character);
+}
+
 TEST_F(GumboTokenizerTest, StyleHasTagEmbedded) {
   SetInput("<style>/* For <head> */</style>");
   Advance(1);

diff --git a/tests/utf8.cc b/tests/utf8.cc
@@ -556,6 +556,21 @@ TEST_F(Utf8Test, MatchesCaseInsensitive) {
   EXPECT_EQ(-1, utf8iterator_current(&input_));
 }
 
+TEST_F(Utf8Test, MatchFollowedByNullByte) {
+  // Can't use ResetText, as the implicit strlen will choke on the null.
+  text_ = "CDATA\0f";
+  utf8iterator_init(&parser_, text_, 7, &input_);
+
+  EXPECT_TRUE(utf8iterator_maybe_consume_match(
+        &input_, "cdata", sizeof("cdata") - 1, false));
+
+  EXPECT_EQ(0, utf8iterator_current(&input_));
+  EXPECT_EQ('\0', *utf8iterator_get_char_pointer(&input_));
+  utf8iterator_next(&input_);
+  EXPECT_EQ('f', utf8iterator_current(&input_));
+  EXPECT_EQ('f', *utf8iterator_get_char_pointer(&input_));
+}
+
 TEST_F(Utf8Test, MarkReset) {
   ResetText("this is a test");
   Advance(5);