parser,tools: parse Unicode identifiers

* Add a tool that fetches and parses the required categories from the
  Unicode Character Database and generates a C++ header file with arrays
  of code points.
* Add a UTF-8 decoding function.
* Implement parsing of Unicode keys and add two possible options to use
  when checking whether a code point is an identifier character.

Refs: https://github.com/metarhia/jstp/issues/152
PR-URL: #218
Reviewed-By: Dmytro Nechai <nechaido@gmail.com>
Reviewed-By: Denys Otrishko <shishugi@gmail.com>
Reviewed-By: Alexey Orlenko <eaglexrlnk@gmail.com>
metarhia · Jan 22, 2018 · b2f75e5 · b2f75e5
1 parent e2a76b0
commit b2f75e5
Show file tree

Hide file tree

Showing 6 changed files with 1,024 additions and 5 deletions.
diff --git a/src/jsrs_parser.cc b/src/jsrs_parser.cc
@@ -51,6 +51,9 @@ using v8::Value;
 using jstp::unicode_utils::CodePointToUtf8;
 using jstp::unicode_utils::IsWhiteSpaceCharacter;
 using jstp::unicode_utils::IsLineTerminatorSequence;
+using jstp::unicode_utils::Utf8ToCodePoint;
+using jstp::unicode_utils::IsIdStartCodePoint;
+using jstp::unicode_utils::IsIdPartCodePoint;
 
 namespace jstp {
 
@@ -677,11 +680,13 @@ MaybeLocal<String> ParseKeyInObject(Isolate*    isolate,
     }
   } else {
     size_t current_length = 0;
-    for (size_t i = 0; i < *size; i++) {
-      if (begin[i] == '_' || (i != 0 &&
-                              isalnum(begin[i])) ||
-                              isalpha(begin[i])) {
-        current_length++;
+    size_t cp_size;
+    uint32_t cp;
+    while (current_length < *size) {
+      cp = Utf8ToCodePoint(begin + current_length, &cp_size);
+      if (current_length == 0 ? IsIdStartCodePoint(cp) :
+                                IsIdPartCodePoint(cp)) {
+        current_length += cp_size;
       } else {
         if (current_length != 0) {
           result = String::NewFromUtf8(isolate, begin,