Skip to content

Commit

Permalink
parser: make parser single-pass
Browse files Browse the repository at this point in the history
* Eliminate `jstp::parser::internal::PrepareString` function thus
  avoiding extra memory copying.
* Fix deleting comment blocks located inside of strings.
* Fix key parsing not throwing an error when a key included spaces
  but was not enclosed in quotes.

Fixes: #60
PR-URL: #61
  • Loading branch information
belochub authored and aqrln committed Feb 20, 2017
1 parent 886eda3 commit 336c7fe
Show file tree
Hide file tree
Showing 3 changed files with 132 additions and 101 deletions.
209 changes: 119 additions & 90 deletions src/jsrs_parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <vector>

#include "common.h"
#include "unicode_utils.h"

using std::atof;
using std::function;
using std::isalnum;
using std::isalpha;
using std::isdigit;
Expand Down Expand Up @@ -74,28 +76,30 @@ static constexpr Local<Value> (*kParseFunctions[])(Isolate*,
};

Local<Value> Parse(Isolate* isolate, const String::Utf8Value& in) {
size_t size;
const char* to_parse =
internal::PrepareString(isolate, *in, in.length(), &size);
if (!to_parse) {
return Undefined(isolate);
}
const char* str = *in;
const size_t length = in.length();
const char* end = str + length;

Type type;
if (!GetType(to_parse, to_parse + size, &type)) {

size_t start_pos = internal::SkipToNextToken(str, end);
if (!GetType(str + start_pos, end, &type)) {
THROW_EXCEPTION(TypeError, "Invalid type");
return Undefined(isolate);
}

size_t parsed_size = 0;
Local<Value> result =
(kParseFunctions[type])(isolate, to_parse, to_parse + size, &parsed_size);
if (size != parsed_size) {
(kParseFunctions[type])(isolate, str + start_pos, end, &parsed_size);

parsed_size += internal::SkipToNextToken(str + start_pos + parsed_size, end);
parsed_size += start_pos;

if (length != parsed_size) {
THROW_EXCEPTION(SyntaxError, "Invalid format");
return Undefined(isolate);
}

delete[] to_parse;
return result;
}

Expand Down Expand Up @@ -152,67 +156,76 @@ static bool GetType(const char* begin, const char* end, Type* type) {

namespace internal {

const char* PrepareString(Isolate* isolate,
const char* str,
size_t length,
size_t* new_length) {
char* result = new char[length + 1];
bool string_mode = false;
enum { kDisabled = 0, kOneline, kMultiline } comment_mode = kDisabled;
size_t j = 0;
size_t size = 0;

for (size_t i = 0; i < length; i++) {
if ((comment_mode == kDisabled) &&
(str[i] == '\"' || str[i] == '\'') &&
(i == 0 || str[i - 1] != '\\')) {
string_mode = !string_mode;
// Checks whether the two bytes starting at `str` are the `*/` sequence that
// terminates a multiline comment. On a match, the length of the terminator
// (2 bytes) is written to `*size`; otherwise `*size` is left untouched.
bool IsMultilineCommentEnd(const char* str, size_t* size) {
  // The second byte is inspected only when the first one is '*', so a string
  // terminator at `str[0]` is never stepped over.
  const bool at_terminator = (str[0] == '*') && (str[1] == '/');
  if (at_terminator) {
    *size = 2;
  }
  return at_terminator;
}

// Returns the number of bytes to skip to get past the comment that starts at
// `str`, or 0 if `str` does not point to the beginning of a comment.
size_t SkipToCommentEnd(const char* str, const char* end) {
bool is_single_line;

switch (str[1]) {
case '/': {
is_single_line = true;
break;
}
case '*': {
is_single_line = false;
break;
}
default: { // In case it is not a comment start
return 0;
}
}

if (!string_mode) {
if (comment_mode == kDisabled && str[i] == '/') {
switch (str[i + 1]) {
case '/': {
comment_mode = kOneline;
break;
}
case '*': {
comment_mode = kMultiline;
break;
}
}
}
function<bool(const char*, size_t*)> end_check_func;

if (comment_mode == kDisabled) {
if (IsWhiteSpaceCharacter(str + i, &size) ||
IsLineTerminatorSequence(str + i, &size)) {
i += size - 1;
} else {
result[j++] = str[i];
}
}
if (is_single_line) {
end_check_func = IsLineTerminatorSequence;
} else {
end_check_func = IsMultilineCommentEnd;
}

const size_t size = end - str;

if ((comment_mode == kOneline &&
IsLineTerminatorSequence(str + i, &size)) ||
(comment_mode == kMultiline &&
str[i - 1] == '*' && str[i] == '/')) {
comment_mode = kDisabled;
size_t pos = 2;
size_t current_size;

for (; pos < size; pos++) {
if (end_check_func(str + pos, &current_size)) {
return pos + current_size;
}
}

return pos;
}

size_t SkipToNextToken(const char* str, const char* end) {
size_t pos = 0;
size_t current_size;
const size_t size = end - str;

while (pos < size) {
if (IsWhiteSpaceCharacter(str + pos, &current_size) ||
IsLineTerminatorSequence(str + pos, &current_size)) {
pos += current_size;
} else if (str[pos] == '/') {
size_t to_skip = SkipToCommentEnd(str + pos, end);
if (!to_skip) {
break;
}
} else if (str[i] == '\\' && IsLineTerminatorSequence(str + i + 1, &size)) {
i += size;
} else if (IsLineTerminatorSequence(str + i, &size)) {
THROW_EXCEPTION(SyntaxError, "Unexpected line end in string");
delete[] result;
return nullptr;
pos += to_skip;
} else {
result[j++] = str[i];
break;
}
}

result[j] = '\0';
*new_length = j;

return result;
return pos;
}

Local<Value> ParseUndefined(Isolate* isolate,
Expand Down Expand Up @@ -367,15 +380,23 @@ Local<Value> ParseString(Isolate* isolate,
}

if (begin[i] == '\\') {
char* symb =
GetControlChar(isolate, begin + ++i, &out_offset, &in_offset);
if (!symb) {
return String::Empty(isolate);
if (IsLineTerminatorSequence(begin + i + 1, &in_offset)) {
i += in_offset;
} else {
char* symb =
GetControlChar(isolate, begin + ++i, &out_offset, &in_offset);
if (!symb) {
return String::Empty(isolate);
}
strncpy(result + res_index, symb, out_offset);
delete[] symb;
i += in_offset - 1;
res_index += out_offset;
}
strncpy(result + res_index, symb, out_offset);
delete[] symb;
i += in_offset - 1;
res_index += out_offset;
} else if (IsLineTerminatorSequence(begin + i, &in_offset)) {
delete[] result;
THROW_EXCEPTION(SyntaxError, "Unexpected line end in string");
return String::Empty(isolate);
} else {
result[res_index++] = begin[i];
}
Expand Down Expand Up @@ -524,25 +545,21 @@ Local<String> ParseKeyInObject(Isolate* isolate,
} else {
size_t current_length = 0;
for (size_t i = 0; i < *size; i++) {
if (begin[i] == ':') {
if (begin[i] == '_' || (i != 0 &&
isalnum(begin[i])) ||
isalpha(begin[i])) {
current_length++;
} else {
if (current_length != 0) {
result = String::NewFromUtf8(isolate, begin,
NewStringType::kInternalized,
static_cast<int>(current_length))
.ToLocalChecked();
break;
} else {
THROW_EXCEPTION(SyntaxError, "Unexpected token :");
THROW_EXCEPTION(SyntaxError, "Unexpected identifier");
return Local<String>();
}
} else if (begin[i] == '_' || (i != 0 ?
isalnum(begin[i]) :
isalpha(begin[i]))) {
current_length++;
} else {
THROW_EXCEPTION(SyntaxError,
"Invalid format in object: key has invalid type");
return Local<String>();
}
}
*size = current_length;
Expand Down Expand Up @@ -579,20 +596,23 @@ Local<Value> ParseObject(Isolate* isolate,

for (size_t i = 1; i < *size; i++) {
if (key_mode) {
i += SkipToNextToken(begin + i, end);
if (begin[i] == '}') {
if (begin[i - 1] != ',') { // In case of empty object
*size = 2;
} else { // In case of trailing comma
*size = i + 1;
}
*size = i + 1;
break;
}
current_key = ParseKeyInObject(isolate,
begin + i,
end,
&current_length);
i += current_length;
i += SkipToNextToken(begin + i, end);
if (begin[i] != ':') {
THROW_EXCEPTION(SyntaxError, "Unexpected token");
return Object::New(isolate);
}
} else {
i += SkipToNextToken(begin + i, end);
current_value = ParseValueInObject(isolate,
begin + i,
end,
Expand All @@ -606,6 +626,7 @@ Local<Value> ParseObject(Isolate* isolate,
}
}
i += current_length;
i += SkipToNextToken(begin + i, end);
if (begin[i] != ',' && begin[i] != '}') {
THROW_EXCEPTION(SyntaxError, "Invalid format in object");
return Object::New(isolate);
Expand All @@ -626,14 +647,19 @@ Local<Value> ParseArray(Isolate* isolate,
auto array = Array::New(isolate);
size_t current_length = 0;
*size = end - begin;
if (*begin == '[' && *(begin + 1) == ']') { // In case of empty array
*size = 2;
return array;
}

bool is_empty = true;

size_t current_element = 0;
Type current_type;

for (size_t i = 1; i < *size; i++) {
Type current_type;
i += SkipToNextToken(begin + i, end);
if (is_empty && begin[i] == ']') { // In case of empty array
*size = i + 1;
return array;
}

bool valid = GetType(begin + i, end, &current_type);
if (valid) {
auto t = kParseFunctions[current_type](isolate,
Expand All @@ -642,9 +668,12 @@ Local<Value> ParseArray(Isolate* isolate,
&current_length);
if (!(current_type == Type::kUndefined && begin[i] == ']')) {
array->Set(static_cast<uint32_t>(current_element++), t);
is_empty = false;
}

i += current_length;
i += SkipToNextToken(begin + i, end);

current_length = 0;

if (begin[i] != ',' && begin[i] != ']') {
Expand Down
7 changes: 2 additions & 5 deletions src/jsrs_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,8 @@ v8::Local<v8::Value> Parse(v8::Isolate* isolate,

namespace internal {

// Prepares a source string for parsing throwing out whitespace and comments.
const char* PrepareString(v8::Isolate* isolate,
const char* str,
std::size_t length,
std::size_t* new_length);
// Returns the number of bytes to skip from `str` to reach the next token,
// passing over whitespace, line terminators and comments.
size_t SkipToNextToken(const char* str, const char* end);

// Parses an undefined value from `begin` but never past `end` and returns the
// parsed JavaScript value. The `size` is incremented by the number of
Expand Down
17 changes: 11 additions & 6 deletions src/packet_parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ using v8::Isolate;
using v8::Local;
using v8::String;

using jstp::parser::internal::PrepareString;
using jstp::parser::internal::ParseObject;
using jstp::parser::internal::SkipToNextToken;

namespace jstp {

Expand All @@ -29,9 +29,9 @@ namespace packet_parser {
Local<String> ParseNetworkPackets(Isolate* isolate,
const String::Utf8Value& in,
Local<Array> out) {
size_t total_size = 0;
const size_t total_size = in.length();
size_t parsed_size = 0;
const char* source = PrepareString(isolate, *in, in.length(), &total_size);
const char* source = *in;
const char* curr_chunk = source;
int index = 0;

Expand All @@ -40,10 +40,16 @@ Local<String> ParseNetworkPackets(Isolate* isolate,
parsed_size += chunk_size + 1;

if (parsed_size <= total_size) {
size_t parsed_chunk_size = 0;
auto result = ParseObject(isolate, curr_chunk,
size_t skipped_size = SkipToNextToken(curr_chunk,
curr_chunk + chunk_size);
size_t parsed_chunk_size;
auto result = ParseObject(isolate, curr_chunk + skipped_size,
curr_chunk + chunk_size, &parsed_chunk_size);

parsed_chunk_size += skipped_size;
parsed_chunk_size += SkipToNextToken(curr_chunk + parsed_chunk_size,
curr_chunk + chunk_size);

if (parsed_chunk_size != chunk_size) {
delete[] source;
THROW_EXCEPTION(SyntaxError, "Invalid format");
Expand All @@ -56,7 +62,6 @@ Local<String> ParseNetworkPackets(Isolate* isolate,
}

auto rest = String::NewFromUtf8(isolate, curr_chunk);
delete[] source;
return rest;
}

Expand Down

0 comments on commit 336c7fe

Please sign in to comment.