Skip to content

Commit

Permalink
parser: improve string parsing
Browse files Browse the repository at this point in the history
* Eliminate unnecessary assignments.
* Encode control characters in place to avoid copying and memory
  allocations.
* Fix possible memory leak.

PR-URL: #66
  • Loading branch information
belochub authored and aqrln committed Feb 20, 2017
1 parent f311d31 commit acb4146
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 48 deletions.
70 changes: 33 additions & 37 deletions src/jsrs_parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -350,18 +350,18 @@ Local<Value> ParseIntegerNumber(Isolate* isolate,
return Integer::New(isolate, value);
}

static char* GetControlChar(Isolate* isolate,
const char* str,
size_t* res_len,
size_t* size);
static bool GetControlChar(Isolate* isolate,
const char* str,
size_t* res_len,
size_t* size,
char* write_to);

Local<Value> ParseString(Isolate* isolate,
const char* begin,
const char* end,
size_t* size) {
*size = end - begin;
char* result = new char[*size + 1];
memset(result, 0, *size + 1);

enum { kApostrophe = 0, kQMarks} string_mode = (*begin == '\'') ?
kApostrophe :
Expand All @@ -375,21 +375,19 @@ Local<Value> ParseString(Isolate* isolate,
(string_mode == kApostrophe && begin[i] == '\'')) {
is_ended = true;
*size = i + 1;
result[res_index] = '\0';
break;
}

if (begin[i] == '\\') {
if (IsLineTerminatorSequence(begin + i + 1, &in_offset)) {
i += in_offset;
} else {
char* symb =
GetControlChar(isolate, begin + ++i, &out_offset, &in_offset);
if (!symb) {
bool ok = GetControlChar(isolate, begin + ++i, &out_offset, &in_offset,
result + res_index);
if (!ok) {
delete[] result;
return String::Empty(isolate);
}
strncpy(result + res_index, symb, out_offset);
delete[] symb;
i += in_offset - 1;
res_index += out_offset;
}
Expand Down Expand Up @@ -418,51 +416,51 @@ static unsigned int ReadHexNumber(const char* str, size_t len, bool* ok);

// Parses a part of a JavaScript string representation after the backslash
// character (i.e., an escape sequence without \) into an unescaped control
// character.
static char* GetControlChar(Isolate* isolate,
const char* str,
size_t* res_len,
size_t* size) {
char* result = new char[5];
// character and writes it to `write_to`.
// Returns true if no error occurred, false otherwise.
static bool GetControlChar(Isolate* isolate,
const char* str,
size_t* res_len,
size_t* size,
char* write_to) {
*size = 1;
*res_len = 1;
bool ok;
switch (str[0]) {
case 'b': {
*result = '\b';
*write_to = '\b';
break;
}
case 'f': {
*result = '\f';
*write_to = '\f';
break;
}
case 'n': {
*result = '\n';
*write_to = '\n';
break;
}
case 'r': {
*result = '\r';
*write_to = '\r';
break;
}
case 't': {
*result = '\t';
*write_to = '\t';
break;
}
case 'v': {
*result = '\v';
*write_to = '\v';
break;
}
case '0': {
*result = '\0';
*write_to = '\0';
break;
}

case 'x': {
*result = ReadHexNumber(str + 1, 2, &ok);
*write_to = static_cast<char>(ReadHexNumber(str + 1, 2, &ok));
if (!ok) {
delete[] result;
THROW_EXCEPTION(SyntaxError, "Invalid hexadecimal escape sequence");
return nullptr;
return false;
}
*size = 3;
break;
Expand All @@ -479,9 +477,8 @@ static char* GetControlChar(Isolate* isolate,
str[hex_size + 2] != '}' && hex_size <= 6;
hex_size++) {
if (str[hex_size + 2] == '\0') {
delete[] result;
THROW_EXCEPTION(SyntaxError, "Invalid Unicode code point escape");
return nullptr;
return false;
}
}
symb_code = ReadHexNumber(str + 2, hex_size, &ok);
Expand All @@ -491,20 +488,19 @@ static char* GetControlChar(Isolate* isolate,
}

if (!ok) {
delete[] result;
THROW_EXCEPTION(SyntaxError, "Invalid Unicode escape sequence");
return nullptr;
return false;
}
char* unicode_symbol = CodePointToUtf8(symb_code, res_len);
delete[] result;
return unicode_symbol;
CodePointToUtf8(symb_code, res_len, write_to);
break;
}

default:
*result = str[0];
default: {
*write_to = str[0];
}
}

return result;
return true;
}

// Parses a hexadecimal number into unsigned int. Whether the parsing
Expand Down
14 changes: 6 additions & 8 deletions src/unicode_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,8 @@ bool IsWhiteSpaceCharacter(const char* str, size_t* size) {
return false;
}

char* CodePointToUtf8(unsigned int c, size_t* size) {
char* result = new char[4];
char* b = result;
void CodePointToUtf8(unsigned int c, size_t* size, char* write_to) {
char* b = write_to;
if (c < 0x80) {
*b++ = c;
*size = 1;
Expand All @@ -89,8 +88,8 @@ char* CodePointToUtf8(unsigned int c, size_t* size) {
*b++ = 128 + c % 64;
*size = 2;
} else if (c - 0xd800u < 0x800) {
delete[] result;
return CodePointToUtf8(0xFFFD, size);
CodePointToUtf8(0xFFFD, size, write_to);
return;
} else if (c < 0x10000) {
*b++ = 224 + c / 4096;
*b++ = 128 + c / 64 % 64;
Expand All @@ -103,10 +102,9 @@ char* CodePointToUtf8(unsigned int c, size_t* size) {
*b++ = 128 + c % 64;
*size = 4;
} else {
delete[] result;
return CodePointToUtf8(0xFFFD, size);
CodePointToUtf8(0xFFFD, size, write_to);
return;
}
return result;
}

} // namespace unicode_utils
Expand Down
6 changes: 3 additions & 3 deletions src/unicode_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ bool IsLineTerminatorSequence(const char* str, std::size_t* size);
// code point (1, 2, 3).
bool IsWhiteSpaceCharacter(const char* str, std::size_t* size);

// Encodes a Unicode code point in UTF-8. `size` will receive the number of
// bytes used (1, 2, 3 or 4).
char* CodePointToUtf8(unsigned int c, std::size_t* size);
// Encodes a Unicode code point in UTF-8 and writes it to `write_to`.
// `size` will receive the number of bytes used (1, 2, 3 or 4).
void CodePointToUtf8(unsigned int c, std::size_t* size, char* write_to);

} // namespace unicode_utils

Expand Down

0 comments on commit acb4146

Please sign in to comment.