From acb4146a3f1384adf3aaf9058ca967bd4ff6ecf5 Mon Sep 17 00:00:00 2001 From: Mykola Bilochub Date: Fri, 10 Feb 2017 17:56:24 +0200 Subject: [PATCH] parser: improve string parsing * Eliminate unnecessary assignments. * Encode control characters in place to avoid copying and memory allocations. * Fix possible memory leak. PR-URL: https://github.com/metarhia/JSTP/pull/66 --- src/jsrs_parser.cc | 70 +++++++++++++++++++++----------------------- src/unicode_utils.cc | 14 ++++----- src/unicode_utils.h | 6 ++-- 3 files changed, 42 insertions(+), 48 deletions(-) diff --git a/src/jsrs_parser.cc b/src/jsrs_parser.cc index e7706b6d..cb06e18d 100644 --- a/src/jsrs_parser.cc +++ b/src/jsrs_parser.cc @@ -350,10 +350,11 @@ Local ParseIntegerNumber(Isolate* isolate, return Integer::New(isolate, value); } -static char* GetControlChar(Isolate* isolate, - const char* str, - size_t* res_len, - size_t* size); +static bool GetControlChar(Isolate* isolate, + const char* str, + size_t* res_len, + size_t* size, + char* write_to); Local ParseString(Isolate* isolate, const char* begin, @@ -361,7 +362,6 @@ Local ParseString(Isolate* isolate, size_t* size) { *size = end - begin; char* result = new char[*size + 1]; - memset(result, 0, *size + 1); enum { kApostrophe = 0, kQMarks} string_mode = (*begin == '\'') ? 
kApostrophe : @@ -375,7 +375,6 @@ Local ParseString(Isolate* isolate, (string_mode == kApostrophe && begin[i] == '\'')) { is_ended = true; *size = i + 1; - result[res_index] = '\0'; break; } @@ -383,13 +382,12 @@ Local ParseString(Isolate* isolate, if (IsLineTerminatorSequence(begin + i + 1, &in_offset)) { i += in_offset; } else { - char* symb = - GetControlChar(isolate, begin + ++i, &out_offset, &in_offset); - if (!symb) { + bool ok = GetControlChar(isolate, begin + ++i, &out_offset, &in_offset, + result + res_index); + if (!ok) { + delete[] result; return String::Empty(isolate); } - strncpy(result + res_index, symb, out_offset); - delete[] symb; i += in_offset - 1; res_index += out_offset; } @@ -418,51 +416,51 @@ static unsigned int ReadHexNumber(const char* str, size_t len, bool* ok); // Parses a part of a JavaScript string representation after the backslash // character (i.e., an escape sequence without \) into an unescaped control -// character. -static char* GetControlChar(Isolate* isolate, - const char* str, - size_t* res_len, - size_t* size) { - char* result = new char[5]; +// character and writes it to `write_to`. +// Returns true if no error occurred, false otherwise. 
+static bool GetControlChar(Isolate* isolate, + const char* str, + size_t* res_len, + size_t* size, + char* write_to) { *size = 1; *res_len = 1; bool ok; switch (str[0]) { case 'b': { - *result = '\b'; + *write_to = '\b'; break; } case 'f': { - *result = '\f'; + *write_to = '\f'; break; } case 'n': { - *result = '\n'; + *write_to = '\n'; break; } case 'r': { - *result = '\r'; + *write_to = '\r'; break; } case 't': { - *result = '\t'; + *write_to = '\t'; break; } case 'v': { - *result = '\v'; + *write_to = '\v'; break; } case '0': { - *result = '\0'; + *write_to = '\0'; break; } case 'x': { - *result = ReadHexNumber(str + 1, 2, &ok); + *write_to = static_cast<char>(ReadHexNumber(str + 1, 2, &ok)); if (!ok) { - delete[] result; THROW_EXCEPTION(SyntaxError, "Invalid hexadecimal escape sequence"); - return nullptr; + return false; } *size = 3; break; @@ -479,9 +477,8 @@ static char* GetControlChar(Isolate* isolate, str[hex_size + 2] != '}' && hex_size <= 6; hex_size++) { if (str[hex_size + 2] == '\0') { - delete[] result; THROW_EXCEPTION(SyntaxError, "Invalid Unicode code point escape"); - return nullptr; + return false; } } symb_code = ReadHexNumber(str + 2, hex_size, &ok); @@ -491,20 +488,19 @@ static char* GetControlChar(Isolate* isolate, } if (!ok) { - delete[] result; THROW_EXCEPTION(SyntaxError, "Invalid Unicode escape sequence"); - return nullptr; + return false; } - char* unicode_symbol = CodePointToUtf8(symb_code, res_len); - delete[] result; - return unicode_symbol; + CodePointToUtf8(symb_code, res_len, write_to); + break; } - default: - *result = str[0]; + default: { + *write_to = str[0]; + } } - return result; + return true; } // Parses a hexadecimal number into unsigned int. 
Whether the parsing diff --git a/src/unicode_utils.cc b/src/unicode_utils.cc index 296b3dea..fa956d0b 100644 --- a/src/unicode_utils.cc +++ b/src/unicode_utils.cc @@ -78,9 +78,8 @@ bool IsWhiteSpaceCharacter(const char* str, size_t* size) { return false; } -char* CodePointToUtf8(unsigned int c, size_t* size) { - char* result = new char[4]; - char* b = result; +void CodePointToUtf8(unsigned int c, size_t* size, char* write_to) { + char* b = write_to; if (c < 0x80) { *b++ = c; *size = 1; @@ -89,8 +88,8 @@ char* CodePointToUtf8(unsigned int c, size_t* size) { *b++ = 128 + c % 64; *size = 2; } else if (c - 0xd800u < 0x800) { - delete[] result; - return CodePointToUtf8(0xFFFD, size); + CodePointToUtf8(0xFFFD, size, write_to); + return; } else if (c < 0x10000) { *b++ = 224 + c / 4096; *b++ = 128 + c / 64 % 64; @@ -103,10 +102,9 @@ char* CodePointToUtf8(unsigned int c, size_t* size) { *b++ = 128 + c % 64; *size = 4; } else { - delete[] result; - return CodePointToUtf8(0xFFFD, size); + CodePointToUtf8(0xFFFD, size, write_to); + return; } - return result; } } // namespace unicode_utils diff --git a/src/unicode_utils.h b/src/unicode_utils.h index 48f452a3..4a8ffa6a 100644 --- a/src/unicode_utils.h +++ b/src/unicode_utils.h @@ -20,9 +20,9 @@ bool IsLineTerminatorSequence(const char* str, std::size_t* size); // code point (1, 2, 3). bool IsWhiteSpaceCharacter(const char* str, std::size_t* size); -// Encodes a Unicode code point in UTF-8. `size` will receive the number of -// bytes used (1, 2, 3 or 4). -char* CodePointToUtf8(unsigned int c, std::size_t* size); +// Encodes a Unicode code point in UTF-8 and writes it to `write_to`. +// `size` will receive the number of bytes used (1, 2, 3 or 4). +void CodePointToUtf8(unsigned int c, std::size_t* size, char* write_to); } // namespace unicode_utils