From 5f77e9852df155b42f03f587aa4702c8d2ed729a Mon Sep 17 00:00:00 2001
From: Jonathan Tang <jonathan.d.tang@gmail.com>
Date: Sun, 9 Nov 2014 15:23:51 -0800
Subject: [PATCH 01/12] Add a token type for CDATA.

---
 src/parser.c     | 2 +-
 src/token_type.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/parser.c b/src/parser.c
index a97e54b4..e12fd9c1 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -336,7 +336,7 @@ typedef struct _TextNodeBufferState {
   // The source position of the start of this text node.
   GumboSourcePosition _start_position;
 
-  // The type of node that will be inserted (TEXT or WHITESPACE).
+  // The type of node that will be inserted (TEXT, CDATA, or WHITESPACE).
   GumboNodeType _type;
 } TextNodeBufferState;
 
diff --git a/src/token_type.h b/src/token_type.h
index 5874d1a2..eeab5078 100644
--- a/src/token_type.h
+++ b/src/token_type.h
@@ -29,6 +29,7 @@ typedef enum {
   GUMBO_TOKEN_COMMENT,
   GUMBO_TOKEN_WHITESPACE,
   GUMBO_TOKEN_CHARACTER,
+  GUMBO_TOKEN_CDATA,
   GUMBO_TOKEN_NULL,
   GUMBO_TOKEN_EOF
 } GumboTokenType;

From 2adaeb0c9d7c5295382e93d16b8a48d178f913a1 Mon Sep 17 00:00:00 2001
From: Jonathan Tang <jonathan.d.tang@gmail.com>
Date: Sun, 9 Nov 2014 15:29:48 -0800
Subject: [PATCH 02/12] Add a state flag for whether the tokenizer is in a
 cdata section, and set it as appropriate.

---
 src/tokenizer.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/tokenizer.c b/src/tokenizer.c
index dcaf8165..a9b1a72b 100644
--- a/src/tokenizer.c
+++ b/src/tokenizer.c
@@ -136,6 +136,10 @@ typedef struct GumboInternalTokenizerState {
   // markup declaration state.
   bool _is_current_node_foreign;
 
+  // A flag indicating whether the tokenizer is in a CDATA section.  If so, then
+  // text tokens emitted will be GUMBO_TOKEN_CDATA.
+  bool _is_in_cdata;
+
   // Certain states (notably character references) may emit two character tokens
   // at once, but the contract for lex() fills in only one token at a time.  The
   // extra character is buffered here, and then this is checked on entry to
@@ -475,7 +479,11 @@ static void finish_doctype_system_id(GumboParser* parser) {
 
 // Writes a single specified character to the output token.
 static void emit_char(GumboParser* parser, int c, GumboToken* output) {
-  output->type = get_char_token_type(c);
+  if (parser->_tokenizer_state->_is_in_cdata) {
+    output->type = GUMBO_TOKEN_CDATA;
+  } else {
+    output->type = get_char_token_type(c);
+  }
   output->v.character = c;
   finish_token(parser, output);
 }
@@ -850,6 +858,7 @@ void gumbo_tokenizer_state_init(
   gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
   tokenizer->_reconsume_current_input = false;
   tokenizer->_is_current_node_foreign = false;
+  tokenizer->_is_in_cdata = false;
   tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
 
   tokenizer->_buffered_emit_char = kGumboNoChar;
@@ -2041,6 +2050,7 @@ static StateResult handle_markup_declaration_state(
              utf8iterator_maybe_consume_match(
                 &tokenizer->_input, "[CDATA[", sizeof("[CDATA[") - 1, true)) {
     gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA);
+    tokenizer->_is_in_cdata = true;
     tokenizer->_reconsume_current_input = true;
   } else {
     tokenizer_add_parse_error(parser, GUMBO_ERR_DASHES_OR_DOCTYPE);
@@ -2813,6 +2823,7 @@ static StateResult handle_cdata_state(
     tokenizer->_reconsume_current_input = true;
     reset_token_start_point(tokenizer);
     gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+    tokenizer->_is_in_cdata = true;
     return NEXT_CHAR;
   } else {
     return emit_current_char(parser, output);

From 9662fd0cd9333776e1914e25469e57df9abc6da0 Mon Sep 17 00:00:00 2001
From: Jonathan Tang <jonathan.d.tang@gmail.com>
Date: Sun, 9 Nov 2014 20:41:25 -0800
Subject: [PATCH 03/12] Add CDATA handling to parser, including a test for it.

---
 src/parser.c    | 12 +++++++++---
 src/tokenizer.c | 12 ++++++------
 tests/parser.cc | 15 +++++++++++++++
 3 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/src/parser.c b/src/parser.c
index e12fd9c1..17dcf55c 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -806,7 +806,8 @@ static void maybe_flush_text_node_buffer(GumboParser* parser) {
   }
 
   assert(buffer_state->_type == GUMBO_NODE_WHITESPACE ||
-         buffer_state->_type == GUMBO_NODE_TEXT);
+         buffer_state->_type == GUMBO_NODE_TEXT ||
+         buffer_state->_type == GUMBO_NODE_CDATA);
   GumboNode* text_node = create_node(parser, buffer_state->_type);
   GumboText* text_node_data = &text_node->v.text;
   text_node_data->text = gumbo_string_buffer_to_string(
@@ -1035,7 +1036,8 @@ static GumboNode* insert_foreign_element(
 
 static void insert_text_token(GumboParser* parser, GumboToken* token) {
   assert(token->type == GUMBO_TOKEN_WHITESPACE ||
-         token->type == GUMBO_TOKEN_CHARACTER);
+         token->type == GUMBO_TOKEN_CHARACTER ||
+         token->type == GUMBO_TOKEN_CDATA);
   TextNodeBufferState* buffer_state = &parser->_parser_state->_text_node;
   if (buffer_state->_buffer.length == 0) {
     // Initialize position fields.
@@ -1046,6 +1048,8 @@ static void insert_text_token(GumboParser* parser, GumboToken* token) {
       parser, token->v.character, &buffer_state->_buffer);
   if (token->type == GUMBO_TOKEN_CHARACTER) {
     buffer_state->_type = GUMBO_NODE_TEXT;
+  } else if (token->type == GUMBO_TOKEN_CDATA) {
+    buffer_state->_type = GUMBO_NODE_CDATA;
   }
   gumbo_debug("Inserting text token '%c'.\n", token->v.character);
 }
@@ -2307,7 +2311,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
     reconstruct_active_formatting_elements(parser);
     insert_text_token(parser, token);
     return true;
-  } else if (token->type == GUMBO_TOKEN_CHARACTER) {
+  } else if (token->type == GUMBO_TOKEN_CHARACTER ||
+             token->type == GUMBO_TOKEN_CDATA) {
     reconstruct_active_formatting_elements(parser);
     insert_text_token(parser, token);
     set_frameset_not_ok(parser);
@@ -3638,6 +3643,7 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
     case GUMBO_TOKEN_WHITESPACE:
       insert_text_token(parser, token);
       return true;
+    case GUMBO_TOKEN_CDATA:
     case GUMBO_TOKEN_CHARACTER:
       insert_text_token(parser, token);
       set_frameset_not_ok(parser);
diff --git a/src/tokenizer.c b/src/tokenizer.c
index a9b1a72b..a281ead2 100644
--- a/src/tokenizer.c
+++ b/src/tokenizer.c
@@ -319,7 +319,11 @@ static int ensure_lowercase(int c) {
   return c >= 'A' && c <= 'Z' ? c + 0x20 : c;
 }
 
-static GumboTokenType get_char_token_type(int c) {
+static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
+  if (is_in_cdata && c != -1) {
+    return GUMBO_TOKEN_CDATA;
+  }
+
   switch (c) {
     case '\t':
     case '\n':
@@ -479,11 +483,7 @@ static void finish_doctype_system_id(GumboParser* parser) {
 
 // Writes a single specified character to the output token.
 static void emit_char(GumboParser* parser, int c, GumboToken* output) {
-  if (parser->_tokenizer_state->_is_in_cdata) {
-    output->type = GUMBO_TOKEN_CDATA;
-  } else {
-    output->type = get_char_token_type(c);
-  }
+  output->type = get_char_token_type(parser->_tokenizer_state->_is_in_cdata, c);
   output->v.character = c;
   finish_token(parser, output);
 }
diff --git a/tests/parser.cc b/tests/parser.cc
index 4299b389..e5836eec 100644
--- a/tests/parser.cc
+++ b/tests/parser.cc
@@ -1522,6 +1522,21 @@ TEST_F(GumboParserTest, ImplicitlyCloseLists) {
   ASSERT_EQ(1, GetChildCount(li2));
 }
 
+TEST_F(GumboParserTest, CData) {
+  Parse("<svg><![CDATA[this is text]]></svg>");
+
+  GumboNode* body;
+  GetAndAssertBody(root_, &body);
+  ASSERT_EQ(1, GetChildCount(body));
+
+  GumboNode* svg = GetChild(body, 0);
+  ASSERT_EQ(1, GetChildCount(svg));
+
+  GumboNode* cdata = GetChild(svg, 0);
+  ASSERT_EQ(GUMBO_NODE_CDATA, cdata->type);
+  EXPECT_STREQ("this is text", cdata->v.text.text);
+}
+
 TEST_F(GumboParserTest, FormattingTagsInHeading) {
   Parse("<h2>This is <b>old</h2>text");
 

From 025480ac42600b199edc0698be9cb844f0efd3cd Mon Sep 17 00:00:00 2001
From: Jonathan Tang <jonathan.d.tang@gmail.com>
Date: Sun, 9 Nov 2014 20:48:32 -0800
Subject: [PATCH 04/12] Add test for CDATA sections not in foreign content.

---
 tests/parser.cc | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/tests/parser.cc b/tests/parser.cc
index e5836eec..51600222 100644
--- a/tests/parser.cc
+++ b/tests/parser.cc
@@ -1537,6 +1537,21 @@ TEST_F(GumboParserTest, CData) {
   EXPECT_STREQ("this is text", cdata->v.text.text);
 }
 
+TEST_F(GumboParserTest, CDataInBody) {
+  Parse("<div><![CDATA[this is text]]></div>");
+
+  GumboNode* body;
+  GetAndAssertBody(root_, &body);
+  ASSERT_EQ(1, GetChildCount(body));
+
+  GumboNode* div = GetChild(body, 0);
+  ASSERT_EQ(1, GetChildCount(div));
+
+  GumboNode* cdata = GetChild(div, 0);
+  ASSERT_EQ(GUMBO_NODE_COMMENT, cdata->type);
+  EXPECT_STREQ("[CDATA[this is text]]", cdata->v.text.text);
+}
+
 TEST_F(GumboParserTest, FormattingTagsInHeading) {
   Parse("<h2>This is <b>old</h2>text");
 

From a14277248948ecc1dd4a2405e0c38440864b8390 Mon Sep 17 00:00:00 2001
From: Jonathan Tang <jonathan.d.tang@gmail.com>
Date: Mon, 10 Nov 2014 13:09:38 -0800
Subject: [PATCH 05/12] Fix a couple comment issues (line-wrapping, unfinished
 comments) in utf8.c

---
 src/utf8.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/utf8.c b/src/utf8.c
index a5c5b0e2..a6a30376 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -133,10 +133,10 @@ static void read_char(Utf8Iterator* iter) {
     decode(&state, &code_point, (uint32_t) (unsigned char) (*c));
     if (state == UTF8_ACCEPT) {
       iter->_width = c - iter->_start + 1;
-      // This is the special handling for carriage returns that is mandated by the
-      // HTML5 spec.  Since we're looking for particular 7-bit literal characters,
-      // we operate in terms of chars and only need a check for iter overrun,
-      // instead of having to read in a full next code point.
+      // This is the special handling for carriage returns that is mandated by
+      // the HTML5 spec.  Since we're looking for particular 7-bit literal
+      // characters, we operate in terms of chars and only need a check for iter
+      // overrun, instead of having to read in a full next code point.
       // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
       if (code_point == '\r') {
         assert(iter->_width == 1);
@@ -165,10 +165,11 @@ static void read_char(Utf8Iterator* iter) {
       return;
     }
   }
-  // If we got here without exiting early, then we've reached the end of the iterator.
-  // Add an error for truncated input, set the width to consume the rest of the
-  // iterator, and emit a replacement character.  The next time we enter this method,
-  // it will detect that there's no input to consume and 
+  // If we got here without exiting early, then we've reached the end of the
+  // iterator.  Add an error for truncated input, set the width to consume the
+  // rest of the iterator, and emit a replacement character.  The next time we
+  // enter this method, it will detect that there's no input to consume and
+  // output an EOF.
   iter->_current = kUtf8ReplacementChar;
   iter->_width = iter->_end - iter->_start;
   add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);

From a688ca249d59294bd3938d01a34619fc4f7863a9 Mon Sep 17 00:00:00 2001
From: Jonathan Tang <jonathan.d.tang@gmail.com>
Date: Mon, 10 Nov 2014 13:10:25 -0800
Subject: [PATCH 06/12] Print the decimal value of the current character in the
 debug output for lexing, to ease debugging non-printable characters.

---
 src/tokenizer.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/tokenizer.c b/src/tokenizer.c
index a281ead2..f88f9d4c 100644
--- a/src/tokenizer.c
+++ b/src/tokenizer.c
@@ -2940,7 +2940,8 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
     assert(!tokenizer->_temporary_buffer_emit);
     assert(tokenizer->_buffered_emit_char == kGumboNoChar);
     int c = utf8iterator_current(&tokenizer->_input);
-    gumbo_debug("Lexing character '%c' in state %d.\n", c, tokenizer->_state);
+    gumbo_debug("Lexing character '%c' (%d) in state %d.\n",
+        c, c, tokenizer->_state);
     StateResult result =
         dispatch_table[tokenizer->_state](parser, tokenizer, c, output);
     // We need to clear reconsume_current_input before returning to prevent

From 191311af43048e7fed194a5c0a0066aeedbab09d Mon Sep 17 00:00:00 2001
From: Jonathan Tang <jonathan.d.tang@gmail.com>
Date: Mon, 15 Dec 2014 11:41:18 -0800
Subject: [PATCH 07/12] Add test for unsafe cdata.

---
 tests/parser.cc | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tests/parser.cc b/tests/parser.cc
index 51600222..89f24f33 100644
--- a/tests/parser.cc
+++ b/tests/parser.cc
@@ -1537,6 +1537,22 @@ TEST_F(GumboParserTest, CData) {
   EXPECT_STREQ("this is text", cdata->v.text.text);
 }
 
+TEST_F(GumboParserTest, CDataUnsafe) {
+  Parse("<svg><![CDATA[\0filler\0text\0]]>");
+
+  GumboNode* body;
+  GetAndAssertBody(root_, &body);
+  ASSERT_EQ(1, GetChildCount(body));
+
+  GumboNode* svg = GetChild(body, 0);
+  ASSERT_EQ(1, GetChildCount(svg));
+
+  GumboNode* cdata = GetChild(svg, 0);
+  ASSERT_EQ(GUMBO_NODE_CDATA, cdata->type);
+  // \xEF\xBF\xBD = unicode replacement char
+  EXPECT_STREQ("fillertext", cdata->v.text.text);
+}
+
 TEST_F(GumboParserTest, CDataInBody) {
   Parse("<div><![CDATA[this is text]]></div>");
 

From 566890cd41b9af2910b16718aa32e482e71a01e6 Mon Sep 17 00:00:00 2001
From: Jonathan Tang <jonathan.d.tang@gmail.com>
Date: Tue, 10 Feb 2015 18:17:06 -0800
Subject: [PATCH 08/12] Fix missing case statement for GUMBO_TOKEN_CDATA in
 handle_parser_error.  (The whole error handling really needs to be redone,
 it's not very helpful to users.)

---
 src/error.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/error.c b/src/error.c
index 3239a0b6..0cae4639 100644
--- a/src/error.c
+++ b/src/error.c
@@ -106,6 +106,7 @@ static void handle_parser_error(GumboParser* parser,
       // But just in case...
       print_message(parser, output, "Comments aren't legal here");
       return;
+    case GUMBO_TOKEN_CDATA:
     case GUMBO_TOKEN_WHITESPACE:
     case GUMBO_TOKEN_CHARACTER:
       print_message(parser, output, "Character tokens aren't legal here");

From 484cff340338acbcd66b546eeb8a2724dbd76f6a Mon Sep 17 00:00:00 2001
From: Jonathan Tang <jonathan.d.tang@gmail.com>
Date: Tue, 10 Feb 2015 18:24:33 -0800
Subject: [PATCH 09/12] Additional debugging instructions.

---
 DEBUGGING.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/DEBUGGING.md b/DEBUGGING.md
index 262ba1f1..8b8a56df 100644
--- a/DEBUGGING.md
+++ b/DEBUGGING.md
@@ -48,6 +48,9 @@ $ gdb .libs/lt-gumbo_test core
 
 The same goes for core dumps in other example binaries.
 
+To run only a single unit test, pass the --gtest_filter='TestName' flag to the
+lt-gumbo_test binary.
+
 Assertions
 ==========
 

From c629587a9936186e9fd0b92047d0489aaf2ffa03 Mon Sep 17 00:00:00 2001
From: Jonathan Tang <jonathan.d.tang@gmail.com>
Date: Mon, 16 Feb 2015 18:06:12 -0800
Subject: [PATCH 10/12] Add a test for utf8iterator_maybe_consume_match
 followed by a null.

---
 tests/utf8.cc | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/tests/utf8.cc b/tests/utf8.cc
index 479e4bc8..a98c69c2 100644
--- a/tests/utf8.cc
+++ b/tests/utf8.cc
@@ -556,6 +556,21 @@ TEST_F(Utf8Test, MatchesCaseInsensitive) {
   EXPECT_EQ(-1, utf8iterator_current(&input_));
 }
 
+TEST_F(Utf8Test, MatchFollowedByNullByte) {
+  // Can't use ResetText, as the implicit strlen will choke on the null.
+  text_ = "CDATA\0f";
+  utf8iterator_init(&parser_, text_, 7, &input_);
+
+  EXPECT_TRUE(utf8iterator_maybe_consume_match(
+        &input_, "cdata", sizeof("cdata") - 1, false));
+
+  EXPECT_EQ(0, utf8iterator_current(&input_));
+  EXPECT_EQ('\0', *utf8iterator_get_char_pointer(&input_));
+  utf8iterator_next(&input_);
+  EXPECT_EQ('f', utf8iterator_current(&input_));
+  EXPECT_EQ('f', *utf8iterator_get_char_pointer(&input_));
+}
+
 TEST_F(Utf8Test, MarkReset) {
   ResetText("this is a test");
   Advance(5);

From bab2d4a342bf20cf5000cca8ab6831275604238a Mon Sep 17 00:00:00 2001
From: Jonathan Tang <jonathan.d.tang@gmail.com>
Date: Mon, 16 Feb 2015 21:56:51 -0800
Subject: [PATCH 11/12] Update parser and tokenizer tests with testcases for
 null CDATA, and make sure their input mechanisms can accept this without
 relying on strlen.

---
 tests/parser.cc    |  6 +++++-
 tests/tokenizer.cc | 18 ++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/tests/parser.cc b/tests/parser.cc
index 72b60b41..b87f60c0 100644
--- a/tests/parser.cc
+++ b/tests/parser.cc
@@ -1538,7 +1538,11 @@ TEST_F(GumboParserTest, CData) {
 }
 
 TEST_F(GumboParserTest, CDataUnsafe) {
-  Parse("<svg><![CDATA[\0filler\0text\0]]>");
+  // Can't use Parse() because of the strlen
+  output_ = gumbo_parse_with_options(
+      &options_, "<svg><![CDATA[\0filler\0text\0]]>",
+      sizeof("<svg><![CDATA[\0filler\0text\0]]>") - 1);
+  root_ = output_->document;
 
   GumboNode* body;
   GetAndAssertBody(root_, &body);
diff --git a/tests/tokenizer.cc b/tests/tokenizer.cc
index 532bad98..2e4b04ac 100644
--- a/tests/tokenizer.cc
+++ b/tests/tokenizer.cc
@@ -450,6 +450,24 @@ TEST_F(GumboTokenizerTest, ScriptDoubleEscaped) {
   EXPECT_EQ('>', token_.v.character);
 }
 
+TEST_F(GumboTokenizerTest, CData) {
+  // SetInput uses strlen and so can't handle nulls.
+  text_ = "<![CDATA[\0filler\0text\0]]>";
+  gumbo_tokenizer_state_destroy(&parser_);
+  gumbo_tokenizer_state_init(
+      &parser_, text_, sizeof("<![CDATA[\0filler\0text\0]]>") - 1);
+  gumbo_tokenizer_set_is_current_node_foreign(&parser_, true);
+
+  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
+  EXPECT_EQ(GUMBO_TOKEN_CDATA, token_.type);
+  EXPECT_EQ(0, token_.v.character);
+
+  gumbo_token_destroy(&parser_, &token_);
+  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
+  EXPECT_EQ(GUMBO_TOKEN_CDATA, token_.type);
+  EXPECT_EQ('f', token_.v.character);
+}
+
 TEST_F(GumboTokenizerTest, StyleHasTagEmbedded) {
   SetInput("<style>/* For <head> */</style>");
   Advance(1);

From ece6a44bbfa6d5e355b996cfa8a89ef3497947cc Mon Sep 17 00:00:00 2001
From: Jonathan Tang <jonathan.d.tang@gmail.com>
Date: Mon, 16 Feb 2015 22:35:41 -0800
Subject: [PATCH 12/12] Fix handling of nulls in CDATA sections.

---
 src/parser.c       | 2 +-
 src/tokenizer.c    | 2 +-
 tests/parser.cc    | 3 ++-
 tests/tokenizer.cc | 2 +-
 4 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/parser.c b/src/parser.c
index b2c1ad8b..9296e5d8 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -1021,6 +1021,7 @@ static GumboNode* insert_foreign_element(
 static void insert_text_token(GumboParser* parser, GumboToken* token) {
   assert(token->type == GUMBO_TOKEN_WHITESPACE ||
          token->type == GUMBO_TOKEN_CHARACTER ||
+         token->type == GUMBO_TOKEN_NULL ||
          token->type == GUMBO_TOKEN_CDATA);
   TextNodeBufferState* buffer_state = &parser->_parser_state->_text_node;
   if (buffer_state->_buffer.length == 0) {
@@ -3490,7 +3491,6 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
   switch (token->type) {
     case GUMBO_TOKEN_NULL:
       parser_add_parse_error(parser, token);
-      token->type = GUMBO_TOKEN_CHARACTER;
       token->v.character = kUtf8ReplacementChar;
       insert_text_token(parser, token);
       return false;
diff --git a/src/tokenizer.c b/src/tokenizer.c
index 89c22d13..8c9272c0 100644
--- a/src/tokenizer.c
+++ b/src/tokenizer.c
@@ -320,7 +320,7 @@ static int ensure_lowercase(int c) {
 }
 
 static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
-  if (is_in_cdata && c != -1) {
+  if (is_in_cdata && c > 0) {
     return GUMBO_TOKEN_CDATA;
   }
 
diff --git a/tests/parser.cc b/tests/parser.cc
index b87f60c0..590f549a 100644
--- a/tests/parser.cc
+++ b/tests/parser.cc
@@ -1554,7 +1554,8 @@ TEST_F(GumboParserTest, CDataUnsafe) {
   GumboNode* cdata = GetChild(svg, 0);
   ASSERT_EQ(GUMBO_NODE_CDATA, cdata->type);
   // \xEF\xBF\xBD = unicode replacement char
-  EXPECT_STREQ("fillertext", cdata->v.text.text);
+  EXPECT_STREQ("\xEF\xBF\xBD" "filler\xEF\xBF\xBD" "text\xEF\xBF\xBD",
+      cdata->v.text.text);
 }
 
 TEST_F(GumboParserTest, CDataInBody) {
diff --git a/tests/tokenizer.cc b/tests/tokenizer.cc
index 2e4b04ac..916494e2 100644
--- a/tests/tokenizer.cc
+++ b/tests/tokenizer.cc
@@ -459,7 +459,7 @@ TEST_F(GumboTokenizerTest, CData) {
   gumbo_tokenizer_set_is_current_node_foreign(&parser_, true);
 
   EXPECT_TRUE(gumbo_lex(&parser_, &token_));
-  EXPECT_EQ(GUMBO_TOKEN_CDATA, token_.type);
+  EXPECT_EQ(GUMBO_TOKEN_NULL, token_.type);
   EXPECT_EQ(0, token_.v.character);
 
   gumbo_token_destroy(&parser_, &token_);