parser: make parser single-pass

* Eliminate the `jstp::parser::internal::PrepareString` function, thus avoiding extra memory copying.
* Fix comment-like sequences located inside of strings being deleted.
* Fix key parsing not throwing an error when a key included spaces but was not enclosed in quotes.

Fixes: #60
PR-URL: #61
metarhia · Feb 20, 2017 · 336c7fe · 336c7fe
1 parent 886eda3
commit 336c7fe
Show file tree

Hide file tree

Showing 3 changed files with 132 additions and 101 deletions.
diff --git a/src/jsrs_parser.cc b/src/jsrs_parser.cc
@@ -7,12 +7,14 @@
 #include <cstddef>
 #include <cstdlib>
 #include <cstring>
+#include <functional>
 #include <vector>
 
 #include "common.h"
 #include "unicode_utils.h"
 
 using std::atof;
+using std::function;
 using std::isalnum;
 using std::isalpha;
 using std::isdigit;
@@ -74,28 +76,30 @@ static constexpr Local<Value> (*kParseFunctions[])(Isolate*,
 };
 
 Local<Value> Parse(Isolate* isolate, const String::Utf8Value& in) {
-  size_t size;
-  const char* to_parse =
-      internal::PrepareString(isolate, *in, in.length(), &size);
-  if (!to_parse) {
-    return Undefined(isolate);
-  }
+  const char* str = *in;  // The UTF-8 buffer is parsed in place; nothing is copied.
+  const size_t length = in.length();
+  const char* end = str + length;
 
   Type type;
-  if (!GetType(to_parse, to_parse + size, &type)) {
+  // Skip whitespace and comments that precede the value before detecting its type.
+  size_t start_pos = internal::SkipToNextToken(str, end);
+  if (!GetType(str + start_pos, end, &type)) {
     THROW_EXCEPTION(TypeError, "Invalid type");
     return Undefined(isolate);
   }
 
   size_t parsed_size = 0;
   Local<Value> result =
-      (kParseFunctions[type])(isolate, to_parse, to_parse + size, &parsed_size);
-  if (size != parsed_size) {
+      (kParseFunctions[type])(isolate, str + start_pos, end, &parsed_size);
+  // Whitespace and comments that follow the value count as consumed input.
+  parsed_size += internal::SkipToNextToken(str + start_pos + parsed_size, end);
+  parsed_size += start_pos;  // Make parsed_size relative to str again.
+  // The value plus the skipped bytes around it must account for the whole input.
+  if (length != parsed_size) {
     THROW_EXCEPTION(SyntaxError, "Invalid format");
     return Undefined(isolate);
   }
 
-  delete[] to_parse;
   return result;
 }
 
@@ -152,67 +156,76 @@ static bool GetType(const char* begin, const char* end, Type* type) {
 
 namespace internal {
 
-const char* PrepareString(Isolate*    isolate,
-                          const char* str,
-                          size_t      length,
-                          size_t*     new_length) {
-  char* result = new char[length + 1];
-  bool string_mode = false;
-  enum { kDisabled = 0, kOneline, kMultiline } comment_mode = kDisabled;
-  size_t j = 0;
-  size_t size = 0;
-
-  for (size_t i = 0; i < length; i++) {
-    if ((comment_mode == kDisabled) &&
-        (str[i] == '\"' || str[i] == '\'') &&
-        (i == 0 || str[i - 1] != '\\')) {
-      string_mode = !string_mode;
+// Returns true and stores 2 in `*size` if `str` points to a multiline comment ending; otherwise returns false and leaves `*size` unchanged.
+bool IsMultilineCommentEnd(const char* str, size_t* size) {
+  if (str[0] == '*' && str[1] == '/') {  // str[1] is read only when str[0] matches, and without a bounds check.
+    *size = 2;  // Byte length of the two-character terminator.
+    return true;
+  }
+  return false;
+}
+
+// Returns the count of bytes from `str` through the end of the comment that starts there, or 0 when `str[1]` is neither a slash nor an asterisk. Only `str[1]` is inspected, so `str[0]` is assumed to be a slash.
+size_t SkipToCommentEnd(const char* str, const char* end) {
+  bool is_single_line;
+  // str[1] selects the comment style; it is read without consulting `end`.
+  switch (str[1]) {
+    case '/': {
+      is_single_line = true;
+      break;
     }
+    case '*': {
+      is_single_line = false;
+      break;
+    }
+    default: {  // Not a comment start: report that nothing can be skipped
+      return 0;
+    }
+  }
 
-    if (!string_mode) {
-      if (comment_mode == kDisabled && str[i] == '/') {
-        switch (str[i + 1]) {
-          case '/': {
-            comment_mode = kOneline;
-            break;
-          }
-          case '*': {
-            comment_mode = kMultiline;
-            break;
-          }
-        }
-      }
+  function<bool(const char*, size_t*)> end_check_func;  // Predicate that recognises the terminator of the detected comment style.
 
-      if (comment_mode == kDisabled) {
-        if (IsWhiteSpaceCharacter(str + i, &size) ||
-            IsLineTerminatorSequence(str + i, &size)) {
-          i += size - 1;
-        } else {
-          result[j++] = str[i];
-        }
-      }
+  if (is_single_line) {
+    end_check_func = IsLineTerminatorSequence;
+  } else {
+    end_check_func = IsMultilineCommentEnd;
+  }
+  // Offsets below are relative to `str`, so `size` bounds the scan.
+  const size_t size = end - str;
 
-      if ((comment_mode == kOneline &&
-            IsLineTerminatorSequence(str + i, &size)) ||
-          (comment_mode == kMultiline &&
-            str[i - 1] == '*' && str[i] == '/')) {
-        comment_mode = kDisabled;
+  size_t pos = 2;  // Start right after the two-byte comment opener.
+  size_t current_size;  // Terminator length as reported by end_check_func.
+
+  for (; pos < size; pos++) {
+    if (end_check_func(str + pos, &current_size)) {
+      return pos + current_size;  // The terminator itself is part of the skipped span.
+    }
+  }
+  // No terminator matched before `end`: return pos, which started at 2 and only grows.
+  return pos;
+}
+
+size_t SkipToNextToken(const char* str, const char* end) {  // Returns how many bytes of whitespace, line terminators and comments start at `str`.
+  size_t pos = 0;  // Bytes skipped so far.
+  size_t current_size;  // Out-param of the Is* helpers; added to pos on a match.
+  const size_t size = end - str;
+  // Keep consuming until a byte that is none of the skippable kinds, or `size`, is reached.
+  while (pos < size) {
+    if (IsWhiteSpaceCharacter(str + pos, &current_size) ||
+        IsLineTerminatorSequence(str + pos, &current_size)) {
+      pos += current_size;
+    } else if (str[pos] == '/') {
+      size_t to_skip = SkipToCommentEnd(str + pos, end);
+      if (!to_skip) {  // 0 means the slash does not open a comment, so stop here.
+        break;
       }
-    } else if (str[i] == '\\' && IsLineTerminatorSequence(str + i + 1, &size)) {
-      i += size;
-    } else if (IsLineTerminatorSequence(str + i, &size)) {
-      THROW_EXCEPTION(SyntaxError, "Unexpected line end in string");
-      delete[] result;
-      return nullptr;
+      pos += to_skip;
     } else {
-      result[j++] = str[i];
+      break;  // Not whitespace, a line terminator or a slash: a token starts here.
     }
   }
 
-  result[j] = '\0';
-  *new_length = j;
-
-  return result;
+  return pos;
 }
 
 Local<Value> ParseUndefined(Isolate*    isolate,
@@ -367,15 +380,23 @@ Local<Value> ParseString(Isolate*    isolate,
     }
 
     if (begin[i] == '\\') {
-      char* symb =
-          GetControlChar(isolate, begin + ++i, &out_offset, &in_offset);
-      if (!symb) {
-        return String::Empty(isolate);
+      if (IsLineTerminatorSequence(begin + i + 1, &in_offset)) {
+        i += in_offset;
+      } else {
+        char* symb =
+            GetControlChar(isolate, begin + ++i, &out_offset, &in_offset);
+        if (!symb) {
+          return String::Empty(isolate);
+        }
+        strncpy(result + res_index, symb, out_offset);
+        delete[] symb;
+        i += in_offset - 1;
+        res_index += out_offset;
       }
-      strncpy(result + res_index, symb, out_offset);
-      delete[] symb;
-      i += in_offset - 1;
-      res_index += out_offset;
+    } else if (IsLineTerminatorSequence(begin + i, &in_offset)) {
+      delete[] result;
+      THROW_EXCEPTION(SyntaxError, "Unexpected line end in string");
+      return String::Empty(isolate);
     } else {
       result[res_index++] = begin[i];
     }
@@ -524,25 +545,21 @@ Local<String> ParseKeyInObject(Isolate*    isolate,
   } else {
     size_t current_length = 0;
     for (size_t i = 0; i < *size; i++) {
-      if (begin[i] == ':') {
+      if (begin[i] == '_' || (i != 0 &&
+                              isalnum(begin[i])) ||
+                              isalpha(begin[i])) {
+        current_length++;
+      } else {
         if (current_length != 0) {
           result = String::NewFromUtf8(isolate, begin,
                                        NewStringType::kInternalized,
                                        static_cast<int>(current_length))
                                            .ToLocalChecked();
           break;
         } else {
-          THROW_EXCEPTION(SyntaxError, "Unexpected token :");
+          THROW_EXCEPTION(SyntaxError, "Unexpected identifier");
           return Local<String>();
         }
-      } else if (begin[i] == '_' || (i != 0 ?
-                                     isalnum(begin[i]) :
-                                     isalpha(begin[i]))) {
-        current_length++;
-      } else {
-        THROW_EXCEPTION(SyntaxError,
-            "Invalid format in object: key has invalid type");
-        return Local<String>();
       }
     }
     *size = current_length;
@@ -579,20 +596,23 @@ Local<Value> ParseObject(Isolate*    isolate,
 
   for (size_t i = 1; i < *size; i++) {
     if (key_mode) {
+      i += SkipToNextToken(begin + i, end);
       if (begin[i] == '}') {
-        if (begin[i - 1] != ',') {  // In case of empty object
-          *size = 2;
-        } else {                    // In case of trailing comma
-          *size = i + 1;
-        }
+        *size = i + 1;
         break;
       }
       current_key = ParseKeyInObject(isolate,
                                      begin + i,
                                      end,
                                      &current_length);
       i += current_length;
+      i += SkipToNextToken(begin + i, end);
+      if (begin[i] != ':') {
+        THROW_EXCEPTION(SyntaxError, "Unexpected token");
+        return Object::New(isolate);
+      }
     } else {
+      i += SkipToNextToken(begin + i, end);
       current_value = ParseValueInObject(isolate,
                                          begin + i,
                                          end,
@@ -606,6 +626,7 @@ Local<Value> ParseObject(Isolate*    isolate,
         }
       }
       i += current_length;
+      i += SkipToNextToken(begin + i, end);
       if (begin[i] != ',' && begin[i] != '}') {
         THROW_EXCEPTION(SyntaxError, "Invalid format in object");
         return Object::New(isolate);
@@ -626,14 +647,19 @@ Local<Value> ParseArray(Isolate*    isolate,
   auto array = Array::New(isolate);
   size_t current_length = 0;
   *size = end - begin;
-  if (*begin == '[' && *(begin + 1) == ']') {  // In case of empty array
-    *size = 2;
-    return array;
-  }
+
+  bool is_empty = true;
 
   size_t current_element = 0;
+  Type current_type;
+
   for (size_t i = 1; i < *size; i++) {
-    Type current_type;
+    i += SkipToNextToken(begin + i, end);
+    if (is_empty && begin[i] == ']') { // In case of empty array
+      *size = i + 1;
+      return array;
+    }
+
     bool valid = GetType(begin + i, end, &current_type);
     if (valid) {
       auto t = kParseFunctions[current_type](isolate,
@@ -642,9 +668,12 @@ Local<Value> ParseArray(Isolate*    isolate,
                                              &current_length);
       if (!(current_type == Type::kUndefined && begin[i] == ']')) {
         array->Set(static_cast<uint32_t>(current_element++), t);
+        is_empty = false;
       }
 
       i += current_length;
+      i += SkipToNextToken(begin + i, end);
+
       current_length = 0;
 
       if (begin[i] != ',' && begin[i] != ']') {

diff --git a/src/jsrs_parser.h b/src/jsrs_parser.h
@@ -19,11 +19,8 @@ v8::Local<v8::Value> Parse(v8::Isolate* isolate,
 
 namespace internal {
 
-// Prepares a source string for parsing throwing out whitespace and comments.
-const char* PrepareString(v8::Isolate* isolate,
-                          const char*  str,
-                          std::size_t  length,
-                          std::size_t* new_length);
+// Returns the count of bytes to skip from `str` to reach the next token; whitespace, line terminators and comments are skipped.
+size_t SkipToNextToken(const char* str, const char* end);
 
 // Parses an undefined value from `begin` but never past `end` and returns the
 // parsed JavaScript value. The `size` is incremented by the number of

diff --git a/src/packet_parser.cc b/src/packet_parser.cc
@@ -19,8 +19,8 @@ using v8::Isolate;
 using v8::Local;
 using v8::String;
 
-using jstp::parser::internal::PrepareString;
 using jstp::parser::internal::ParseObject;
+using jstp::parser::internal::SkipToNextToken;
 
 namespace jstp {
 
@@ -29,9 +29,9 @@ namespace packet_parser {
 Local<String> ParseNetworkPackets(Isolate* isolate,
                                   const String::Utf8Value& in,
                                   Local<Array> out) {
-  size_t total_size = 0;
+  const size_t total_size = in.length();
   size_t parsed_size = 0;
-  const char* source = PrepareString(isolate, *in, in.length(), &total_size);
+  const char* source = *in;
   const char* curr_chunk = source;
   int index = 0;
 
@@ -40,10 +40,16 @@ Local<String> ParseNetworkPackets(Isolate* isolate,
     parsed_size += chunk_size + 1;
 
     if (parsed_size <= total_size) {
-      size_t parsed_chunk_size = 0;
-      auto result = ParseObject(isolate, curr_chunk,
+      size_t skipped_size = SkipToNextToken(curr_chunk,
+          curr_chunk + chunk_size);
+      size_t parsed_chunk_size;
+      auto result = ParseObject(isolate, curr_chunk + skipped_size,
           curr_chunk + chunk_size, &parsed_chunk_size);
 
+      parsed_chunk_size += skipped_size;
+      parsed_chunk_size += SkipToNextToken(curr_chunk + parsed_chunk_size,
+          curr_chunk + chunk_size);
+
       if (parsed_chunk_size != chunk_size) {
         delete[] source;
         THROW_EXCEPTION(SyntaxError, "Invalid format");
@@ -56,7 +62,6 @@ Local<String> ParseNetworkPackets(Isolate* isolate,
   }
 
   auto rest = String::NewFromUtf8(isolate, curr_chunk);
-  delete[] source;
   return rest;
 }