diff --git a/llama.cpp b/llama.cpp index 4296eca3261e8..41b35b878ee73 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1,7 +1,8 @@ #define LLAMA_API_INTERNAL #include "llama.h" -#include "unicode.h" +// #include "unicode.h" +#include "unicode_regex.h" #include "ggml.h" #include "ggml-alloc.h" @@ -114,6 +115,13 @@ static void llama_log_callback_default(ggml_log_level level, const char * text, #define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__) #define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) +// +// unicode utilities +// + +static llm_regex regex_engine; +auto unicode_engine = regex_engine.get_unicode_engine(); + // // helpers // @@ -3314,7 +3322,7 @@ static void llm_load_vocab( for (int i = 0; i < n_merges; i++) { const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i); - GGML_ASSERT(codepoints_from_utf8(word).size() > 0); + GGML_ASSERT(unicode_engine.to_codepoints(word).size() > 0); std::string first; std::string second; @@ -3359,7 +3367,7 @@ static void llm_load_vocab( for (uint32_t i = 0; i < n_vocab; i++) { std::string word = gguf_get_arr_str(ctx, token_idx, i); - GGML_ASSERT(codepoints_from_utf8(word).size() > 0); + GGML_ASSERT(unicode_engine.to_codepoints(word).size() > 0); vocab.token_to_id[word] = i; @@ -7900,7 +7908,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) { } case LLAMA_VOCAB_TYPE_BPE: { GGML_ASSERT(false); - return unicode_to_bytes_bpe(token_data.text); + return unicode_engine.unicode_to_bytes_bpe(token_data.text); } case LLAMA_VOCAB_TYPE_WPM: { GGML_ASSERT(false); @@ -7925,7 +7933,7 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) { } case LLAMA_VOCAB_TYPE_WPM: case LLAMA_VOCAB_TYPE_BPE: { - return vocab.token_to_id.at(bytes_to_unicode_bpe(ch)); + return vocab.token_to_id.at(unicode_engine.bytes_to_unicode_bpe(ch)); } default: GGML_ASSERT(false); @@ -8249,137 +8257,13 @@ struct llm_tokenizer_bpe { } std::vector bpe_gpt2_preprocess(const std::string & text) { - std::vector bpe_words; std::vector bpe_encoded_words; - - std::string token = ""; - // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+ - bool collecting_numeric = false; - bool collecting_letter = false; - bool collecting_special = false; - bool collecting_whitespace_lookahead = false; - bool collecting = false; - - std::vector text_utf; - text_utf.reserve(text.size()); - bpe_words.reserve(text.size()); bpe_encoded_words.reserve(text.size()); - auto cps = codepoints_from_utf8(text); - for (size_t i = 0; i < cps.size(); ++i) - text_utf.emplace_back(codepoint_to_utf8(cps[i])); - - for (int i = 0; i < (int)text_utf.size(); i++) { - const std::string & utf_char = text_utf[i]; - bool split_condition = false; - int bytes_remain = text_utf.size() - i; - // forward backward lookups - const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : ""; - const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : ""; - - // handling contractions - if (!split_condition && bytes_remain >= 2) { - // 's|'t|'m|'d - if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) { - split_condition = true; - } - if (split_condition) { - if (token.size()) { - bpe_words.emplace_back(token); // push previous content as token - } - token = utf_char + utf_char_next; - bpe_words.emplace_back(token); - token = ""; - i++; - continue; - } - } - if (!split_condition && bytes_remain >= 3) { - // 're|'ve|'ll - if (utf_char == "\'" && ( - (utf_char_next == "r" && utf_char_next_next == "e") || - (utf_char_next == "v" && utf_char_next_next == "e") || - (utf_char_next == "l" && utf_char_next_next == "l")) - ) { - split_condition = true; - } - if (split_condition) { - // current token + next token can be defined - if (token.size()) { - bpe_words.emplace_back(token); // push previous content as token - } - token = utf_char + utf_char_next + utf_char_next_next; - bpe_words.emplace_back(token); // the contraction - token = ""; - i += 2; - continue; - } - } - - if (!split_condition && !collecting) { - if (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) { - collecting_letter = true; - collecting = true; - } - else if (codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) { - collecting_numeric = true; - collecting = true; - } - else if ( - ((codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (codepoint_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) || - (!token.size() && utf_char == " " && codepoint_type(utf_char_next) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) - ) { - collecting_special = true; - collecting = true; - } - else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && codepoint_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) { - collecting_whitespace_lookahead = true; - collecting = true; - } - else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) { - split_condition = true; - } - } - else if (!split_condition && collecting) { - if (collecting_letter && codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER) { - split_condition = true; - } - else if (collecting_numeric && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) { - split_condition = true; - } - else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) { - split_condition = true; - } - else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) { - split_condition = true; - } - } - - if (utf_char_next == "") { - split_condition = true; // final - token += utf_char; - } - - if (split_condition) { - if (token.size()) { - bpe_words.emplace_back(token); - } - token = utf_char; - collecting = false; - collecting_letter = false; - collecting_numeric = false; - collecting_special = false; - collecting_whitespace_lookahead = false; - } - else { - token += utf_char; - } - } - - for (std::string & word : bpe_words) { + for (std::string & word : regex_engine.falcon_style(text)) { std::string encoded_token = ""; for (char & c : word) { - encoded_token += bytes_to_unicode_bpe(c); + encoded_token += unicode_engine.bytes_to_unicode_bpe(c); } bpe_encoded_words.emplace_back(encoded_token); } @@ -12430,9 +12314,9 @@ int32_t llama_tokenize( static std::string llama_decode_text(const std::string & text) { std::string decoded_text; - auto unicode_sequences = codepoints_from_utf8(text); + auto unicode_sequences = unicode_engine.to_codepoints(text); for (auto& unicode_sequence : unicode_sequences) { - decoded_text += unicode_to_bytes_bpe(codepoint_to_utf8(unicode_sequence)); + decoded_text += unicode_engine.unicode_to_bytes_bpe(unicode_engine.to_string(unicode_sequence)); } return decoded_text; diff --git a/tests/test-tokenizer-0-falcon.cpp b/tests/test-tokenizer-0-falcon.cpp index 472b0b3a8f436..623b37b2960f7 100644 --- a/tests/test-tokenizer-0-falcon.cpp +++ b/tests/test-tokenizer-0-falcon.cpp @@ -38,6 +38,7 @@ static const std::map> & k_tests() { { " Hello\n Hello" , { 466, 23090, 742, 23090, }, }, { "\n =" , { 1212, 40, }, }, { "' era" , { 18, 4932, }, }, + { "12345678-1239-0fsjk" , { 10963, 27681, 5070, 24, 10963, 36, 24, 27, 5577, 85, 86, }, }, }; return _k_tests; diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp index 3596ce55af2ce..48d8e515c3f7b 100644 --- a/tests/test-tokenizer-1-bpe.cpp +++ b/tests/test-tokenizer-1-bpe.cpp @@ -64,7 +64,7 @@ int main(int argc, char **argv) { for (int i = 0; i < n_vocab; ++i) { std::string str = llama_detokenize_bpe(ctx, std::vector(1, i)); try { - auto cps = codepoints_from_utf8(str); + auto cps = (str); std::vector tokens = llama_tokenize(ctx, str, false); std::string check = llama_detokenize_bpe(ctx, tokens); if (check != str) { @@ -80,6 +80,7 @@ int main(int argc, char **argv) { // unicode { + static UNICODE unicode_engine; const int nthread = std::thread::hardware_concurrency(); std::vector threads(nthread); @@ -97,7 +98,7 @@ int main(int argc, char **argv) { continue; } - std::string str = codepoint_to_utf8(cp); + std::string str = unicode_engine.to_string(cp); std::vector tokens = llama_tokenize(ctx, str, false); std::string check = llama_detokenize_bpe(ctx, tokens); if (cp != 9601 && str != check) { diff --git a/tests/test-tokenizer-1-llama.cpp b/tests/test-tokenizer-1-llama.cpp index 9333f8686fa1c..179d23550c575 100644 --- a/tests/test-tokenizer-1-llama.cpp +++ b/tests/test-tokenizer-1-llama.cpp @@ -74,6 +74,7 @@ int main(int argc, char **argv) { // unicode { + static UNICODE unicode_engine; const int nthread = std::thread::hardware_concurrency(); std::vector threads(nthread); @@ -85,7 +86,7 @@ int main(int argc, char **argv) { continue; } - std::string str = codepoint_to_utf8(cp); + std::string str = unicode_engine.to_string(cp); std::vector tokens = llama_tokenize(ctx, str, false); std::string check = llama_detokenize_spm(ctx, tokens); if (cp != 9601 && str != check) { diff --git a/unicode.h b/unicode.h index 263260702e640..a711633bfd17e 100644 --- a/unicode.h +++ b/unicode.h @@ -1,475 +1,1315 @@ -#pragma once - -#include -#include -#include -#include -#include - -static const std::vector> digit_ranges = { -{0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F}, -{0xCE6, 0xCEF}, {0xD66, 0xD6F}, {0xDE6, 0xDEF}, {0xE50, 0xE59}, {0xED0, 0xED9}, {0xF20, 0xF29}, {0x1040, 0x1049}, {0x1090, 0x1099}, {0x1369, 0x1371}, {0x17E0, 0x17E9}, {0x1810, 0x1819}, {0x1946, 0x194F}, -{0x19D0, 0x19DA}, {0x1A80, 0x1A89}, {0x1A90, 0x1A99}, {0x1B50, 0x1B59}, {0x1BB0, 0x1BB9}, {0x1C40, 0x1C49}, {0x1C50, 0x1C59}, {0x2070, 0x2070}, {0x2074, 0x2079}, {0x2080, 0x2089}, {0x2460, 0x2468}, -{0x2474, 0x247C}, {0x2488, 0x2490}, {0x24EA, 0x24EA}, {0x24F5, 0x24FD}, {0x24FF, 0x24FF}, {0x2776, 0x277E}, {0x2780, 0x2788}, {0x278A, 0x2792}, {0xA620, 0xA629}, {0xA8D0, 0xA8D9}, {0xA900, 0xA909}, -{0xA9D0, 0xA9D9}, {0xA9F0, 0xA9F9}, {0xAA50, 0xAA59}, {0xABF0, 0xABF9}, {0xFF10, 0xFF19}, {0x104A0, 0x104A9}, {0x10A40, 0x10A43}, {0x10D30, 0x10D39}, {0x10E60, 0x10E68}, {0x11052, 0x1105A}, -{0x11066, 0x1106F}, {0x110F0, 0x110F9}, {0x11136, 0x1113F}, {0x111D0, 0x111D9}, {0x112F0, 0x112F9}, {0x11450, 0x11459}, {0x114D0, 0x114D9}, {0x11650, 0x11659}, {0x116C0, 0x116C9}, {0x11730, 0x11739}, -{0x118E0, 0x118E9}, {0x11950, 0x11959}, {0x11C50, 0x11C59}, {0x11D50, 0x11D59}, {0x11DA0, 0x11DA9}, {0x16A60, 0x16A69}, {0x16B50, 0x16B59}, {0x1D7CE, 0x1D7FF}, {0x1E140, 0x1E149}, {0x1E2F0, 0x1E2F9}, -{0x1E950, 0x1E959}, {0x1F100, 0x1F10A}, {0x1FBF0, 0x1FBF9}, -}; - -static const std::vector> letter_ranges = { -{0x41, 0x5A}, {0x61, 0x7A}, {0xAA, 0xAA}, {0xB5, 0xB5}, {0xBA, 0xBA}, {0xC0, 0xD6}, {0xD8, 0xF6}, {0xF8, 0x2C1}, {0x2C6, 0x2D1}, {0x2E0, 0x2E4}, {0x2EC, 0x2EC}, {0x2EE, 0x2EE}, {0x370, 0x374}, -{0x376, 0x377}, {0x37A, 0x37D}, {0x37F, 0x37F}, {0x386, 0x386}, {0x388, 0x38A}, {0x38C, 0x38C}, {0x38E, 0x3A1}, {0x3A3, 0x3F5}, {0x3F7, 0x481}, {0x48A, 0x52F}, {0x531, 0x556}, {0x559, 0x559}, -{0x560, 0x588}, {0x5D0, 0x5EA}, {0x5EF, 0x5F2}, {0x620, 0x64A}, {0x66E, 0x66F}, {0x671, 0x6D3}, {0x6D5, 0x6D5}, {0x6E5, 0x6E6}, {0x6EE, 0x6EF}, {0x6FA, 0x6FC}, {0x6FF, 0x6FF}, {0x710, 0x710}, -{0x712, 0x72F}, {0x74D, 0x7A5}, {0x7B1, 0x7B1}, {0x7CA, 0x7EA}, {0x7F4, 0x7F5}, {0x7FA, 0x7FA}, {0x800, 0x815}, {0x81A, 0x81A}, {0x824, 0x824}, {0x828, 0x828}, {0x840, 0x858}, {0x860, 0x86A}, -{0x8A0, 0x8B4}, {0x8B6, 0x8C7}, {0x904, 0x939}, {0x93D, 0x93D}, {0x950, 0x950}, {0x958, 0x961}, {0x971, 0x980}, {0x985, 0x98C}, {0x98F, 0x990}, {0x993, 0x9A8}, {0x9AA, 0x9B0}, {0x9B2, 0x9B2}, -{0x9B6, 0x9B9}, {0x9BD, 0x9BD}, {0x9CE, 0x9CE}, {0x9DC, 0x9DD}, {0x9DF, 0x9E1}, {0x9F0, 0x9F1}, {0x9FC, 0x9FC}, {0xA05, 0xA0A}, {0xA0F, 0xA10}, {0xA13, 0xA28}, {0xA2A, 0xA30}, {0xA32, 0xA33}, -{0xA35, 0xA36}, {0xA38, 0xA39}, {0xA59, 0xA5C}, {0xA5E, 0xA5E}, {0xA72, 0xA74}, {0xA85, 0xA8D}, {0xA8F, 0xA91}, {0xA93, 0xAA8}, {0xAAA, 0xAB0}, {0xAB2, 0xAB3}, {0xAB5, 0xAB9}, {0xABD, 0xABD}, -{0xAD0, 0xAD0}, {0xAE0, 0xAE1}, {0xAF9, 0xAF9}, {0xB05, 0xB0C}, {0xB0F, 0xB10}, {0xB13, 0xB28}, {0xB2A, 0xB30}, {0xB32, 0xB33}, {0xB35, 0xB39}, {0xB3D, 0xB3D}, {0xB5C, 0xB5D}, {0xB5F, 0xB61}, -{0xB71, 0xB71}, {0xB83, 0xB83}, {0xB85, 0xB8A}, {0xB8E, 0xB90}, {0xB92, 0xB95}, {0xB99, 0xB9A}, {0xB9C, 0xB9C}, {0xB9E, 0xB9F}, {0xBA3, 0xBA4}, {0xBA8, 0xBAA}, {0xBAE, 0xBB9}, {0xBD0, 0xBD0}, -{0xC05, 0xC0C}, {0xC0E, 0xC10}, {0xC12, 0xC28}, {0xC2A, 0xC39}, {0xC3D, 0xC3D}, {0xC58, 0xC5A}, {0xC60, 0xC61}, {0xC80, 0xC80}, {0xC85, 0xC8C}, {0xC8E, 0xC90}, {0xC92, 0xCA8}, {0xCAA, 0xCB3}, -{0xCB5, 0xCB9}, {0xCBD, 0xCBD}, {0xCDE, 0xCDE}, {0xCE0, 0xCE1}, {0xCF1, 0xCF2}, {0xD04, 0xD0C}, {0xD0E, 0xD10}, {0xD12, 0xD3A}, {0xD3D, 0xD3D}, {0xD4E, 0xD4E}, {0xD54, 0xD56}, {0xD5F, 0xD61}, -{0xD7A, 0xD7F}, {0xD85, 0xD96}, {0xD9A, 0xDB1}, {0xDB3, 0xDBB}, {0xDBD, 0xDBD}, {0xDC0, 0xDC6}, {0xE01, 0xE30}, {0xE32, 0xE33}, {0xE40, 0xE46}, {0xE81, 0xE82}, {0xE84, 0xE84}, {0xE86, 0xE8A}, -{0xE8C, 0xEA3}, {0xEA5, 0xEA5}, {0xEA7, 0xEB0}, {0xEB2, 0xEB3}, {0xEBD, 0xEBD}, {0xEC0, 0xEC4}, {0xEC6, 0xEC6}, {0xEDC, 0xEDF}, {0xF00, 0xF00}, {0xF40, 0xF47}, {0xF49, 0xF6C}, {0xF88, 0xF8C}, -{0x1000, 0x102A}, {0x103F, 0x103F}, {0x1050, 0x1055}, {0x105A, 0x105D}, {0x1061, 0x1061}, {0x1065, 0x1066}, {0x106E, 0x1070}, {0x1075, 0x1081}, {0x108E, 0x108E}, {0x10A0, 0x10C5}, {0x10C7, 0x10C7}, -{0x10CD, 0x10CD}, {0x10D0, 0x10FA}, {0x10FC, 0x1248}, {0x124A, 0x124D}, {0x1250, 0x1256}, {0x1258, 0x1258}, {0x125A, 0x125D}, {0x1260, 0x1288}, {0x128A, 0x128D}, {0x1290, 0x12B0}, {0x12B2, 0x12B5}, -{0x12B8, 0x12BE}, {0x12C0, 0x12C0}, {0x12C2, 0x12C5}, {0x12C8, 0x12D6}, {0x12D8, 0x1310}, {0x1312, 0x1315}, {0x1318, 0x135A}, {0x1380, 0x138F}, {0x13A0, 0x13F5}, {0x13F8, 0x13FD}, {0x1401, 0x166C}, -{0x166F, 0x167F}, {0x1681, 0x169A}, {0x16A0, 0x16EA}, {0x16F1, 0x16F8}, {0x1700, 0x170C}, {0x170E, 0x1711}, {0x1720, 0x1731}, {0x1740, 0x1751}, {0x1760, 0x176C}, {0x176E, 0x1770}, {0x1780, 0x17B3}, -{0x17D7, 0x17D7}, {0x17DC, 0x17DC}, {0x1820, 0x1878}, {0x1880, 0x1884}, {0x1887, 0x18A8}, {0x18AA, 0x18AA}, {0x18B0, 0x18F5}, {0x1900, 0x191E}, {0x1950, 0x196D}, {0x1970, 0x1974}, {0x1980, 0x19AB}, -{0x19B0, 0x19C9}, {0x1A00, 0x1A16}, {0x1A20, 0x1A54}, {0x1AA7, 0x1AA7}, {0x1B05, 0x1B33}, {0x1B45, 0x1B4B}, {0x1B83, 0x1BA0}, {0x1BAE, 0x1BAF}, {0x1BBA, 0x1BE5}, {0x1C00, 0x1C23}, {0x1C4D, 0x1C4F}, -{0x1C5A, 0x1C7D}, {0x1C80, 0x1C88}, {0x1C90, 0x1CBA}, {0x1CBD, 0x1CBF}, {0x1CE9, 0x1CEC}, {0x1CEE, 0x1CF3}, {0x1CF5, 0x1CF6}, {0x1CFA, 0x1CFA}, {0x1D00, 0x1DBF}, {0x1E00, 0x1F15}, {0x1F18, 0x1F1D}, -{0x1F20, 0x1F45}, {0x1F48, 0x1F4D}, {0x1F50, 0x1F57}, {0x1F59, 0x1F59}, {0x1F5B, 0x1F5B}, {0x1F5D, 0x1F5D}, {0x1F5F, 0x1F7D}, {0x1F80, 0x1FB4}, {0x1FB6, 0x1FBC}, {0x1FBE, 0x1FBE}, {0x1FC2, 0x1FC4}, -{0x1FC6, 0x1FCC}, {0x1FD0, 0x1FD3}, {0x1FD6, 0x1FDB}, {0x1FE0, 0x1FEC}, {0x1FF2, 0x1FF4}, {0x1FF6, 0x1FFC}, {0x2071, 0x2071}, {0x207F, 0x207F}, {0x2090, 0x209C}, {0x2102, 0x2102}, {0x2107, 0x2107}, -{0x210A, 0x2113}, {0x2115, 0x2115}, {0x2119, 0x211D}, {0x2124, 0x2124}, {0x2126, 0x2126}, {0x2128, 0x2128}, {0x212A, 0x212D}, {0x212F, 0x2139}, {0x213C, 0x213F}, {0x2145, 0x2149}, {0x214E, 0x214E}, -{0x2183, 0x2184}, {0x2C00, 0x2C2E}, {0x2C30, 0x2C5E}, {0x2C60, 0x2CE4}, {0x2CEB, 0x2CEE}, {0x2CF2, 0x2CF3}, {0x2D00, 0x2D25}, {0x2D27, 0x2D27}, {0x2D2D, 0x2D2D}, {0x2D30, 0x2D67}, {0x2D6F, 0x2D6F}, -{0x2D80, 0x2D96}, {0x2DA0, 0x2DA6}, {0x2DA8, 0x2DAE}, {0x2DB0, 0x2DB6}, {0x2DB8, 0x2DBE}, {0x2DC0, 0x2DC6}, {0x2DC8, 0x2DCE}, {0x2DD0, 0x2DD6}, {0x2DD8, 0x2DDE}, {0x2E2F, 0x2E2F}, {0x3005, 0x3006}, -{0x3031, 0x3035}, {0x303B, 0x303C}, {0x3041, 0x3096}, {0x309D, 0x309F}, {0x30A1, 0x30FA}, {0x30FC, 0x30FF}, {0x3105, 0x312F}, {0x3131, 0x318E}, {0x31A0, 0x31BF}, {0x31F0, 0x31FF}, {0x3400, 0x4DBF}, -{0x4E00, 0x9FFC}, {0xA000, 0xA48C}, {0xA4D0, 0xA4FD}, {0xA500, 0xA60C}, {0xA610, 0xA61F}, {0xA62A, 0xA62B}, {0xA640, 0xA66E}, {0xA67F, 0xA69D}, {0xA6A0, 0xA6E5}, {0xA717, 0xA71F}, {0xA722, 0xA788}, -{0xA78B, 0xA7BF}, {0xA7C2, 0xA7CA}, {0xA7F5, 0xA801}, {0xA803, 0xA805}, {0xA807, 0xA80A}, {0xA80C, 0xA822}, {0xA840, 0xA873}, {0xA882, 0xA8B3}, {0xA8F2, 0xA8F7}, {0xA8FB, 0xA8FB}, {0xA8FD, 0xA8FE}, -{0xA90A, 0xA925}, {0xA930, 0xA946}, {0xA960, 0xA97C}, {0xA984, 0xA9B2}, {0xA9CF, 0xA9CF}, {0xA9E0, 0xA9E4}, {0xA9E6, 0xA9EF}, {0xA9FA, 0xA9FE}, {0xAA00, 0xAA28}, {0xAA40, 0xAA42}, {0xAA44, 0xAA4B}, -{0xAA60, 0xAA76}, {0xAA7A, 0xAA7A}, {0xAA7E, 0xAAAF}, {0xAAB1, 0xAAB1}, {0xAAB5, 0xAAB6}, {0xAAB9, 0xAABD}, {0xAAC0, 0xAAC0}, {0xAAC2, 0xAAC2}, {0xAADB, 0xAADD}, {0xAAE0, 0xAAEA}, {0xAAF2, 0xAAF4}, -{0xAB01, 0xAB06}, {0xAB09, 0xAB0E}, {0xAB11, 0xAB16}, {0xAB20, 0xAB26}, {0xAB28, 0xAB2E}, {0xAB30, 0xAB5A}, {0xAB5C, 0xAB69}, {0xAB70, 0xABE2}, {0xAC00, 0xD7A3}, {0xD7B0, 0xD7C6}, {0xD7CB, 0xD7FB}, -{0xF900, 0xFA6D}, {0xFA70, 0xFAD9}, {0xFB00, 0xFB06}, {0xFB13, 0xFB17}, {0xFB1D, 0xFB1D}, {0xFB1F, 0xFB28}, {0xFB2A, 0xFB36}, {0xFB38, 0xFB3C}, {0xFB3E, 0xFB3E}, {0xFB40, 0xFB41}, {0xFB43, 0xFB44}, -{0xFB46, 0xFBB1}, {0xFBD3, 0xFD3D}, {0xFD50, 0xFD8F}, {0xFD92, 0xFDC7}, {0xFDF0, 0xFDFB}, {0xFE70, 0xFE74}, {0xFE76, 0xFEFC}, {0xFF21, 0xFF3A}, {0xFF41, 0xFF5A}, {0xFF66, 0xFFBE}, {0xFFC2, 0xFFC7}, -{0xFFCA, 0xFFCF}, {0xFFD2, 0xFFD7}, {0xFFDA, 0xFFDC}, {0x10000, 0x1000B}, {0x1000D, 0x10026}, {0x10028, 0x1003A}, {0x1003C, 0x1003D}, {0x1003F, 0x1004D}, {0x10050, 0x1005D}, {0x10080, 0x100FA}, -{0x10280, 0x1029C}, {0x102A0, 0x102D0}, {0x10300, 0x1031F}, {0x1032D, 0x10340}, {0x10342, 0x10349}, {0x10350, 0x10375}, {0x10380, 0x1039D}, {0x103A0, 0x103C3}, {0x103C8, 0x103CF}, {0x10400, 0x1049D}, -{0x104B0, 0x104D3}, {0x104D8, 0x104FB}, {0x10500, 0x10527}, {0x10530, 0x10563}, {0x10600, 0x10736}, {0x10740, 0x10755}, {0x10760, 0x10767}, {0x10800, 0x10805}, {0x10808, 0x10808}, {0x1080A, 0x10835}, -{0x10837, 0x10838}, {0x1083C, 0x1083C}, {0x1083F, 0x10855}, {0x10860, 0x10876}, {0x10880, 0x1089E}, {0x108E0, 0x108F2}, {0x108F4, 0x108F5}, {0x10900, 0x10915}, {0x10920, 0x10939}, {0x10980, 0x109B7}, -{0x109BE, 0x109BF}, {0x10A00, 0x10A00}, {0x10A10, 0x10A13}, {0x10A15, 0x10A17}, {0x10A19, 0x10A35}, {0x10A60, 0x10A7C}, {0x10A80, 0x10A9C}, {0x10AC0, 0x10AC7}, {0x10AC9, 0x10AE4}, {0x10B00, 0x10B35}, -{0x10B40, 0x10B55}, {0x10B60, 0x10B72}, {0x10B80, 0x10B91}, {0x10C00, 0x10C48}, {0x10C80, 0x10CB2}, {0x10CC0, 0x10CF2}, {0x10D00, 0x10D23}, {0x10E80, 0x10EA9}, {0x10EB0, 0x10EB1}, {0x10F00, 0x10F1C}, -{0x10F27, 0x10F27}, {0x10F30, 0x10F45}, {0x10FB0, 0x10FC4}, {0x10FE0, 0x10FF6}, {0x11003, 0x11037}, {0x11083, 0x110AF}, {0x110D0, 0x110E8}, {0x11103, 0x11126}, {0x11144, 0x11144}, {0x11147, 0x11147}, -{0x11150, 0x11172}, {0x11176, 0x11176}, {0x11183, 0x111B2}, {0x111C1, 0x111C4}, {0x111DA, 0x111DA}, {0x111DC, 0x111DC}, {0x11200, 0x11211}, {0x11213, 0x1122B}, {0x11280, 0x11286}, {0x11288, 0x11288}, -{0x1128A, 0x1128D}, {0x1128F, 0x1129D}, {0x1129F, 0x112A8}, {0x112B0, 0x112DE}, {0x11305, 0x1130C}, {0x1130F, 0x11310}, {0x11313, 0x11328}, {0x1132A, 0x11330}, {0x11332, 0x11333}, {0x11335, 0x11339}, -{0x1133D, 0x1133D}, {0x11350, 0x11350}, {0x1135D, 0x11361}, {0x11400, 0x11434}, {0x11447, 0x1144A}, {0x1145F, 0x11461}, {0x11480, 0x114AF}, {0x114C4, 0x114C5}, {0x114C7, 0x114C7}, {0x11580, 0x115AE}, -{0x115D8, 0x115DB}, {0x11600, 0x1162F}, {0x11644, 0x11644}, {0x11680, 0x116AA}, {0x116B8, 0x116B8}, {0x11700, 0x1171A}, {0x11800, 0x1182B}, {0x118A0, 0x118DF}, {0x118FF, 0x11906}, {0x11909, 0x11909}, -{0x1190C, 0x11913}, {0x11915, 0x11916}, {0x11918, 0x1192F}, {0x1193F, 0x1193F}, {0x11941, 0x11941}, {0x119A0, 0x119A7}, {0x119AA, 0x119D0}, {0x119E1, 0x119E1}, {0x119E3, 0x119E3}, {0x11A00, 0x11A00}, -{0x11A0B, 0x11A32}, {0x11A3A, 0x11A3A}, {0x11A50, 0x11A50}, {0x11A5C, 0x11A89}, {0x11A9D, 0x11A9D}, {0x11AC0, 0x11AF8}, {0x11C00, 0x11C08}, {0x11C0A, 0x11C2E}, {0x11C40, 0x11C40}, {0x11C72, 0x11C8F}, -{0x11D00, 0x11D06}, {0x11D08, 0x11D09}, {0x11D0B, 0x11D30}, {0x11D46, 0x11D46}, {0x11D60, 0x11D65}, {0x11D67, 0x11D68}, {0x11D6A, 0x11D89}, {0x11D98, 0x11D98}, {0x11EE0, 0x11EF2}, {0x11FB0, 0x11FB0}, -{0x12000, 0x12399}, {0x12480, 0x12543}, {0x13000, 0x1342E}, {0x14400, 0x14646}, {0x16800, 0x16A38}, {0x16A40, 0x16A5E}, {0x16AD0, 0x16AED}, {0x16B00, 0x16B2F}, {0x16B40, 0x16B43}, {0x16B63, 0x16B77}, -{0x16B7D, 0x16B8F}, {0x16E40, 0x16E7F}, {0x16F00, 0x16F4A}, {0x16F50, 0x16F50}, {0x16F93, 0x16F9F}, {0x16FE0, 0x16FE1}, {0x16FE3, 0x16FE3}, {0x17000, 0x187F7}, {0x18800, 0x18CD5}, {0x18D00, 0x18D08}, -{0x1B000, 0x1B11E}, {0x1B150, 0x1B152}, {0x1B164, 0x1B167}, {0x1B170, 0x1B2FB}, {0x1BC00, 0x1BC6A}, {0x1BC70, 0x1BC7C}, {0x1BC80, 0x1BC88}, {0x1BC90, 0x1BC99}, {0x1D400, 0x1D454}, {0x1D456, 0x1D49C}, -{0x1D49E, 0x1D49F}, {0x1D4A2, 0x1D4A2}, {0x1D4A5, 0x1D4A6}, {0x1D4A9, 0x1D4AC}, {0x1D4AE, 0x1D4B9}, {0x1D4BB, 0x1D4BB}, {0x1D4BD, 0x1D4C3}, {0x1D4C5, 0x1D505}, {0x1D507, 0x1D50A}, {0x1D50D, 0x1D514}, -{0x1D516, 0x1D51C}, {0x1D51E, 0x1D539}, {0x1D53B, 0x1D53E}, {0x1D540, 0x1D544}, {0x1D546, 0x1D546}, {0x1D54A, 0x1D550}, {0x1D552, 0x1D6A5}, {0x1D6A8, 0x1D6C0}, {0x1D6C2, 0x1D6DA}, {0x1D6DC, 0x1D6FA}, -{0x1D6FC, 0x1D714}, {0x1D716, 0x1D734}, {0x1D736, 0x1D74E}, {0x1D750, 0x1D76E}, {0x1D770, 0x1D788}, {0x1D78A, 0x1D7A8}, {0x1D7AA, 0x1D7C2}, {0x1D7C4, 0x1D7CB}, {0x1E100, 0x1E12C}, {0x1E137, 0x1E13D}, -{0x1E14E, 0x1E14E}, {0x1E2C0, 0x1E2EB}, {0x1E800, 0x1E8C4}, {0x1E900, 0x1E943}, {0x1E94B, 0x1E94B}, {0x1EE00, 0x1EE03}, {0x1EE05, 0x1EE1F}, {0x1EE21, 0x1EE22}, {0x1EE24, 0x1EE24}, {0x1EE27, 0x1EE27}, -{0x1EE29, 0x1EE32}, {0x1EE34, 0x1EE37}, {0x1EE39, 0x1EE39}, {0x1EE3B, 0x1EE3B}, {0x1EE42, 0x1EE42}, {0x1EE47, 0x1EE47}, {0x1EE49, 0x1EE49}, {0x1EE4B, 0x1EE4B}, {0x1EE4D, 0x1EE4F}, {0x1EE51, 0x1EE52}, -{0x1EE54, 0x1EE54}, {0x1EE57, 0x1EE57}, {0x1EE59, 0x1EE59}, {0x1EE5B, 0x1EE5B}, {0x1EE5D, 0x1EE5D}, {0x1EE5F, 0x1EE5F}, {0x1EE61, 0x1EE62}, {0x1EE64, 0x1EE64}, {0x1EE67, 0x1EE6A}, {0x1EE6C, 0x1EE72}, -{0x1EE74, 0x1EE77}, {0x1EE79, 0x1EE7C}, {0x1EE7E, 0x1EE7E}, {0x1EE80, 0x1EE89}, {0x1EE8B, 0x1EE9B}, {0x1EEA1, 0x1EEA3}, {0x1EEA5, 0x1EEA9}, {0x1EEAB, 0x1EEBB}, {0x20000, 0x2A6DD}, {0x2A700, 0x2B734}, -{0x2B740, 0x2B81D}, {0x2B820, 0x2CEA1}, {0x2CEB0, 0x2EBE0}, {0x2F800, 0x2FA1D}, {0x30000, 0x3134A}, -}; - -static const std::vector> whitespace_ranges = { -{0x9, 0xD}, {0x1C, 0x20}, {0x85, 0x85}, {0xA0, 0xA0}, {0x1680, 0x1680}, {0x2000, 0x200A}, {0x2028, 0x2029}, {0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000}, -}; - -static const std::vector> accent_mark_ranges = { -{0x300, 0x36F}, {0x483, 0x489}, {0x591, 0x5BD}, {0x5BF, 0x5BF}, {0x5C1, 0x5C2}, {0x5C4, 0x5C5}, {0x5C7, 0x5C7}, {0x610, 0x61A}, {0x64B, 0x65F}, {0x670, 0x670}, {0x6D6, 0x6DC}, {0x6DF, 0x6E4}, -{0x6E7, 0x6E8}, {0x6EA, 0x6ED}, {0x711, 0x711}, {0x730, 0x74A}, {0x7A6, 0x7B0}, {0x7EB, 0x7F3}, {0x7FD, 0x7FD}, {0x816, 0x819}, {0x81B, 0x823}, {0x825, 0x827}, {0x829, 0x82D}, {0x859, 0x85B}, -{0x8D3, 0x8E1}, {0x8E3, 0x903}, {0x93A, 0x93C}, {0x93E, 0x94F}, {0x951, 0x957}, {0x962, 0x963}, {0x981, 0x983}, {0x9BC, 0x9BC}, {0x9BE, 0x9C4}, {0x9C7, 0x9C8}, {0x9CB, 0x9CD}, {0x9D7, 0x9D7}, -{0x9E2, 0x9E3}, {0x9FE, 0x9FE}, {0xA01, 0xA03}, {0xA3C, 0xA3C}, {0xA3E, 0xA42}, {0xA47, 0xA48}, {0xA4B, 0xA4D}, {0xA51, 0xA51}, {0xA70, 0xA71}, {0xA75, 0xA75}, {0xA81, 0xA83}, {0xABC, 0xABC}, -{0xABE, 0xAC5}, {0xAC7, 0xAC9}, {0xACB, 0xACD}, {0xAE2, 0xAE3}, {0xAFA, 0xAFF}, {0xB01, 0xB03}, {0xB3C, 0xB3C}, {0xB3E, 0xB44}, {0xB47, 0xB48}, {0xB4B, 0xB4D}, {0xB55, 0xB57}, {0xB62, 0xB63}, -{0xB82, 0xB82}, {0xBBE, 0xBC2}, {0xBC6, 0xBC8}, {0xBCA, 0xBCD}, {0xBD7, 0xBD7}, {0xC00, 0xC04}, {0xC3E, 0xC44}, {0xC46, 0xC48}, {0xC4A, 0xC4D}, {0xC55, 0xC56}, {0xC62, 0xC63}, {0xC81, 0xC83}, -{0xCBC, 0xCBC}, {0xCBE, 0xCC4}, {0xCC6, 0xCC8}, {0xCCA, 0xCCD}, {0xCD5, 0xCD6}, {0xCE2, 0xCE3}, {0xD00, 0xD03}, {0xD3B, 0xD3C}, {0xD3E, 0xD44}, {0xD46, 0xD48}, {0xD4A, 0xD4D}, {0xD57, 0xD57}, -{0xD62, 0xD63}, {0xD81, 0xD83}, {0xDCA, 0xDCA}, {0xDCF, 0xDD4}, {0xDD6, 0xDD6}, {0xDD8, 0xDDF}, {0xDF2, 0xDF3}, {0xE31, 0xE31}, {0xE34, 0xE3A}, {0xE47, 0xE4E}, {0xEB1, 0xEB1}, {0xEB4, 0xEBC}, -{0xEC8, 0xECD}, {0xF18, 0xF19}, {0xF35, 0xF35}, {0xF37, 0xF37}, {0xF39, 0xF39}, {0xF3E, 0xF3F}, {0xF71, 0xF84}, {0xF86, 0xF87}, {0xF8D, 0xF97}, {0xF99, 0xFBC}, {0xFC6, 0xFC6}, {0x102B, 0x103E}, -{0x1056, 0x1059}, {0x105E, 0x1060}, {0x1062, 0x1064}, {0x1067, 0x106D}, {0x1071, 0x1074}, {0x1082, 0x108D}, {0x108F, 0x108F}, {0x109A, 0x109D}, {0x135D, 0x135F}, {0x1712, 0x1714}, {0x1732, 0x1734}, -{0x1752, 0x1753}, {0x1772, 0x1773}, {0x17B4, 0x17D3}, {0x17DD, 0x17DD}, {0x180B, 0x180D}, {0x1885, 0x1886}, {0x18A9, 0x18A9}, {0x1920, 0x192B}, {0x1930, 0x193B}, {0x1A17, 0x1A1B}, {0x1A55, 0x1A5E}, -{0x1A60, 0x1A7C}, {0x1A7F, 0x1A7F}, {0x1AB0, 0x1AC0}, {0x1B00, 0x1B04}, {0x1B34, 0x1B44}, {0x1B6B, 0x1B73}, {0x1B80, 0x1B82}, {0x1BA1, 0x1BAD}, {0x1BE6, 0x1BF3}, {0x1C24, 0x1C37}, {0x1CD0, 0x1CD2}, -{0x1CD4, 0x1CE8}, {0x1CED, 0x1CED}, {0x1CF4, 0x1CF4}, {0x1CF7, 0x1CF9}, {0x1DC0, 0x1DF9}, {0x1DFB, 0x1DFF}, {0x20D0, 0x20F0}, {0x2CEF, 0x2CF1}, {0x2D7F, 0x2D7F}, {0x2DE0, 0x2DFF}, {0x302A, 0x302F}, -{0x3099, 0x309A}, {0xA66F, 0xA672}, {0xA674, 0xA67D}, {0xA69E, 0xA69F}, {0xA6F0, 0xA6F1}, {0xA802, 0xA802}, {0xA806, 0xA806}, {0xA80B, 0xA80B}, {0xA823, 0xA827}, {0xA82C, 0xA82C}, {0xA880, 0xA881}, -{0xA8B4, 0xA8C5}, {0xA8E0, 0xA8F1}, {0xA8FF, 0xA8FF}, {0xA926, 0xA92D}, {0xA947, 0xA953}, {0xA980, 0xA983}, {0xA9B3, 0xA9C0}, {0xA9E5, 0xA9E5}, {0xAA29, 0xAA36}, {0xAA43, 0xAA43}, {0xAA4C, 0xAA4D}, -{0xAA7B, 0xAA7D}, {0xAAB0, 0xAAB0}, {0xAAB2, 0xAAB4}, {0xAAB7, 0xAAB8}, {0xAABE, 0xAABF}, {0xAAC1, 0xAAC1}, {0xAAEB, 0xAAEF}, {0xAAF5, 0xAAF6}, {0xABE3, 0xABEA}, {0xABEC, 0xABED}, {0xFB1E, 0xFB1E}, -{0xFE00, 0xFE0F}, {0xFE20, 0xFE2F}, {0x101FD, 0x101FD}, {0x102E0, 0x102E0}, {0x10376, 0x1037A}, {0x10A01, 0x10A03}, {0x10A05, 0x10A06}, {0x10A0C, 0x10A0F}, {0x10A38, 0x10A3A}, {0x10A3F, 0x10A3F}, -{0x10AE5, 0x10AE6}, {0x10D24, 0x10D27}, {0x10EAB, 0x10EAC}, {0x10F46, 0x10F50}, {0x11000, 0x11002}, {0x11038, 0x11046}, {0x1107F, 0x11082}, {0x110B0, 0x110BA}, {0x11100, 0x11102}, {0x11127, 0x11134}, -{0x11145, 0x11146}, {0x11173, 0x11173}, {0x11180, 0x11182}, {0x111B3, 0x111C0}, {0x111C9, 0x111CC}, {0x111CE, 0x111CF}, {0x1122C, 0x11237}, {0x1123E, 0x1123E}, {0x112DF, 0x112EA}, {0x11300, 0x11303}, -{0x1133B, 0x1133C}, {0x1133E, 0x11344}, {0x11347, 0x11348}, {0x1134B, 0x1134D}, {0x11357, 0x11357}, {0x11362, 0x11363}, {0x11366, 0x1136C}, {0x11370, 0x11374}, {0x11435, 0x11446}, {0x1145E, 0x1145E}, -{0x114B0, 0x114C3}, {0x115AF, 0x115B5}, {0x115B8, 0x115C0}, {0x115DC, 0x115DD}, {0x11630, 0x11640}, {0x116AB, 0x116B7}, {0x1171D, 0x1172B}, {0x1182C, 0x1183A}, {0x11930, 0x11935}, {0x11937, 0x11938}, -{0x1193B, 0x1193E}, {0x11940, 0x11940}, {0x11942, 0x11943}, {0x119D1, 0x119D7}, {0x119DA, 0x119E0}, {0x119E4, 0x119E4}, {0x11A01, 0x11A0A}, {0x11A33, 0x11A39}, {0x11A3B, 0x11A3E}, {0x11A47, 0x11A47}, -{0x11A51, 0x11A5B}, {0x11A8A, 0x11A99}, {0x11C2F, 0x11C36}, {0x11C38, 0x11C3F}, {0x11C92, 0x11CA7}, {0x11CA9, 0x11CB6}, {0x11D31, 0x11D36}, {0x11D3A, 0x11D3A}, {0x11D3C, 0x11D3D}, {0x11D3F, 0x11D45}, -{0x11D47, 0x11D47}, {0x11D8A, 0x11D8E}, {0x11D90, 0x11D91}, {0x11D93, 0x11D97}, {0x11EF3, 0x11EF6}, {0x16AF0, 0x16AF4}, {0x16B30, 0x16B36}, {0x16F4F, 0x16F4F}, {0x16F51, 0x16F87}, {0x16F8F, 0x16F92}, -{0x16FE4, 0x16FE4}, {0x16FF0, 0x16FF1}, {0x1BC9D, 0x1BC9E}, {0x1D165, 0x1D169}, {0x1D16D, 0x1D172}, {0x1D17B, 0x1D182}, {0x1D185, 0x1D18B}, {0x1D1AA, 0x1D1AD}, {0x1D242, 0x1D244}, {0x1DA00, 0x1DA36}, -{0x1DA3B, 0x1DA6C}, {0x1DA75, 0x1DA75}, {0x1DA84, 0x1DA84}, {0x1DA9B, 0x1DA9F}, {0x1DAA1, 0x1DAAF}, {0x1E000, 0x1E006}, {0x1E008, 0x1E018}, {0x1E01B, 0x1E021}, {0x1E023, 0x1E024}, {0x1E026, 0x1E02A}, -{0x1E130, 0x1E136}, {0x1E2EC, 0x1E2EF}, {0x1E8D0, 0x1E8D6}, {0x1E944, 0x1E94A}, {0xE0100, 0xE01EF}, -}; - -static const std::vector> punctuation_ranges = { -{0x21, 0x23}, {0x25, 0x2A}, {0x2C, 0x2F}, {0x3A, 0x3B}, {0x3F, 0x40}, {0x5B, 0x5D}, {0x5F, 0x5F}, {0x7B, 0x7B}, {0x7D, 0x7D}, {0xA1, 0xA1}, {0xA7, 0xA7}, {0xAB, 0xAB}, {0xB6, 0xB7}, {0xBB, 0xBB}, -{0xBF, 0xBF}, {0x37E, 0x37E}, {0x387, 0x387}, {0x55A, 0x55F}, {0x589, 0x58A}, {0x5BE, 0x5BE}, {0x5C0, 0x5C0}, {0x5C3, 0x5C3}, {0x5C6, 0x5C6}, {0x5F3, 0x5F4}, {0x609, 0x60A}, {0x60C, 0x60D}, -{0x61B, 0x61B}, {0x61E, 0x61F}, {0x66A, 0x66D}, {0x6D4, 0x6D4}, {0x700, 0x70D}, {0x7F7, 0x7F9}, {0x830, 0x83E}, {0x85E, 0x85E}, {0x964, 0x965}, {0x970, 0x970}, {0x9FD, 0x9FD}, {0xA76, 0xA76}, -{0xAF0, 0xAF0}, {0xC77, 0xC77}, {0xC84, 0xC84}, {0xDF4, 0xDF4}, {0xE4F, 0xE4F}, {0xE5A, 0xE5B}, {0xF04, 0xF12}, {0xF14, 0xF14}, {0xF3A, 0xF3D}, {0xF85, 0xF85}, {0xFD0, 0xFD4}, {0xFD9, 0xFDA}, -{0x104A, 0x104F}, {0x10FB, 0x10FB}, {0x1360, 0x1368}, {0x1400, 0x1400}, {0x166E, 0x166E}, {0x169B, 0x169C}, {0x16EB, 0x16ED}, {0x1735, 0x1736}, {0x17D4, 0x17D6}, {0x17D8, 0x17DA}, {0x1800, 0x180A}, -{0x1944, 0x1945}, {0x1A1E, 0x1A1F}, {0x1AA0, 0x1AA6}, {0x1AA8, 0x1AAD}, {0x1B5A, 0x1B60}, {0x1BFC, 0x1BFF}, {0x1C3B, 0x1C3F}, {0x1C7E, 0x1C7F}, {0x1CC0, 0x1CC7}, {0x1CD3, 0x1CD3}, {0x2010, 0x2027}, -{0x2030, 0x2043}, {0x2045, 0x2051}, {0x2053, 0x205E}, {0x207D, 0x207E}, {0x208D, 0x208E}, {0x2308, 0x230B}, {0x2329, 0x232A}, {0x2768, 0x2775}, {0x27C5, 0x27C6}, {0x27E6, 0x27EF}, {0x2983, 0x2998}, -{0x29D8, 0x29DB}, {0x29FC, 0x29FD}, {0x2CF9, 0x2CFC}, {0x2CFE, 0x2CFF}, {0x2D70, 0x2D70}, {0x2E00, 0x2E2E}, {0x2E30, 0x2E4F}, {0x2E52, 0x2E52}, {0x3001, 0x3003}, {0x3008, 0x3011}, {0x3014, 0x301F}, -{0x3030, 0x3030}, {0x303D, 0x303D}, {0x30A0, 0x30A0}, {0x30FB, 0x30FB}, {0xA4FE, 0xA4FF}, {0xA60D, 0xA60F}, {0xA673, 0xA673}, {0xA67E, 0xA67E}, {0xA6F2, 0xA6F7}, {0xA874, 0xA877}, {0xA8CE, 0xA8CF}, -{0xA8F8, 0xA8FA}, {0xA8FC, 0xA8FC}, {0xA92E, 0xA92F}, {0xA95F, 0xA95F}, {0xA9C1, 0xA9CD}, {0xA9DE, 0xA9DF}, {0xAA5C, 0xAA5F}, {0xAADE, 0xAADF}, {0xAAF0, 0xAAF1}, {0xABEB, 0xABEB}, {0xFD3E, 0xFD3F}, -{0xFE10, 0xFE19}, {0xFE30, 0xFE52}, {0xFE54, 0xFE61}, {0xFE63, 0xFE63}, {0xFE68, 0xFE68}, {0xFE6A, 0xFE6B}, {0xFF01, 0xFF03}, {0xFF05, 0xFF0A}, {0xFF0C, 0xFF0F}, {0xFF1A, 0xFF1B}, {0xFF1F, 0xFF20}, -{0xFF3B, 0xFF3D}, {0xFF3F, 0xFF3F}, {0xFF5B, 0xFF5B}, {0xFF5D, 0xFF5D}, {0xFF5F, 0xFF65}, {0x10100, 0x10102}, {0x1039F, 0x1039F}, {0x103D0, 0x103D0}, {0x1056F, 0x1056F}, {0x10857, 0x10857}, -{0x1091F, 0x1091F}, {0x1093F, 0x1093F}, {0x10A50, 0x10A58}, {0x10A7F, 0x10A7F}, {0x10AF0, 0x10AF6}, {0x10B39, 0x10B3F}, {0x10B99, 0x10B9C}, {0x10EAD, 0x10EAD}, {0x10F55, 0x10F59}, {0x11047, 0x1104D}, -{0x110BB, 0x110BC}, {0x110BE, 0x110C1}, {0x11140, 0x11143}, {0x11174, 0x11175}, {0x111C5, 0x111C8}, {0x111CD, 0x111CD}, {0x111DB, 0x111DB}, {0x111DD, 0x111DF}, {0x11238, 0x1123D}, {0x112A9, 0x112A9}, -{0x1144B, 0x1144F}, {0x1145A, 0x1145B}, {0x1145D, 0x1145D}, {0x114C6, 0x114C6}, {0x115C1, 0x115D7}, {0x11641, 0x11643}, {0x11660, 0x1166C}, {0x1173C, 0x1173E}, {0x1183B, 0x1183B}, {0x11944, 0x11946}, -{0x119E2, 0x119E2}, {0x11A3F, 0x11A46}, {0x11A9A, 0x11A9C}, {0x11A9E, 0x11AA2}, {0x11C41, 0x11C45}, {0x11C70, 0x11C71}, {0x11EF7, 0x11EF8}, {0x11FFF, 0x11FFF}, {0x12470, 0x12474}, {0x16A6E, 0x16A6F}, -{0x16AF5, 0x16AF5}, {0x16B37, 0x16B3B}, {0x16B44, 0x16B44}, {0x16E97, 0x16E9A}, {0x16FE2, 0x16FE2}, {0x1BC9F, 0x1BC9F}, {0x1DA87, 0x1DA8B}, {0x1E95E, 0x1E95F}, -}; - -static const std::vector> symbol_ranges = { -{0x24, 0x24}, {0x2B, 0x2B}, {0x3C, 0x3E}, {0x5E, 0x5E}, {0x60, 0x60}, {0x7C, 0x7C}, {0x7E, 0x7E}, {0xA2, 0xA6}, {0xA8, 0xA9}, {0xAC, 0xAC}, {0xAE, 0xB1}, {0xB4, 0xB4}, {0xB8, 0xB8}, {0xD7, 0xD7}, -{0xF7, 0xF7}, {0x2C2, 0x2C5}, {0x2D2, 0x2DF}, {0x2E5, 0x2EB}, {0x2ED, 0x2ED}, {0x2EF, 0x2FF}, {0x375, 0x375}, {0x384, 0x385}, {0x3F6, 0x3F6}, {0x482, 0x482}, {0x58D, 0x58F}, {0x606, 0x608}, -{0x60B, 0x60B}, {0x60E, 0x60F}, {0x6DE, 0x6DE}, {0x6E9, 0x6E9}, {0x6FD, 0x6FE}, {0x7F6, 0x7F6}, {0x7FE, 0x7FF}, {0x9F2, 0x9F3}, {0x9FA, 0x9FB}, {0xAF1, 0xAF1}, {0xB70, 0xB70}, {0xBF3, 0xBFA}, -{0xC7F, 0xC7F}, {0xD4F, 0xD4F}, {0xD79, 0xD79}, {0xE3F, 0xE3F}, {0xF01, 0xF03}, {0xF13, 0xF13}, {0xF15, 0xF17}, {0xF1A, 0xF1F}, {0xF34, 0xF34}, {0xF36, 0xF36}, {0xF38, 0xF38}, {0xFBE, 0xFC5}, -{0xFC7, 0xFCC}, {0xFCE, 0xFCF}, {0xFD5, 0xFD8}, {0x109E, 0x109F}, {0x1390, 0x1399}, {0x166D, 0x166D}, {0x17DB, 0x17DB}, {0x1940, 0x1940}, {0x19DE, 0x19FF}, {0x1B61, 0x1B6A}, {0x1B74, 0x1B7C}, -{0x1FBD, 0x1FBD}, {0x1FBF, 0x1FC1}, {0x1FCD, 0x1FCF}, {0x1FDD, 0x1FDF}, {0x1FED, 0x1FEF}, {0x1FFD, 0x1FFE}, {0x2044, 0x2044}, {0x2052, 0x2052}, {0x207A, 0x207C}, {0x208A, 0x208C}, {0x20A0, 0x20BF}, -{0x2100, 0x2101}, {0x2103, 0x2106}, {0x2108, 0x2109}, {0x2114, 0x2114}, {0x2116, 0x2118}, {0x211E, 0x2123}, {0x2125, 0x2125}, {0x2127, 0x2127}, {0x2129, 0x2129}, {0x212E, 0x212E}, {0x213A, 0x213B}, -{0x2140, 0x2144}, {0x214A, 0x214D}, {0x214F, 0x214F}, {0x218A, 0x218B}, {0x2190, 0x2307}, {0x230C, 0x2328}, {0x232B, 0x2426}, {0x2440, 0x244A}, {0x249C, 0x24E9}, {0x2500, 0x2767}, {0x2794, 0x27C4}, -{0x27C7, 0x27E5}, {0x27F0, 0x2982}, {0x2999, 0x29D7}, {0x29DC, 0x29FB}, {0x29FE, 0x2B73}, {0x2B76, 0x2B95}, {0x2B97, 0x2BFF}, {0x2CE5, 0x2CEA}, {0x2E50, 0x2E51}, {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3}, -{0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB}, {0x3004, 0x3004}, {0x3012, 0x3013}, {0x3020, 0x3020}, {0x3036, 0x3037}, {0x303E, 0x303F}, {0x309B, 0x309C}, {0x3190, 0x3191}, {0x3196, 0x319F}, {0x31C0, 0x31E3}, -{0x3200, 0x321E}, {0x322A, 0x3247}, {0x3250, 0x3250}, {0x3260, 0x327F}, {0x328A, 0x32B0}, {0x32C0, 0x33FF}, {0x4DC0, 0x4DFF}, {0xA490, 0xA4C6}, {0xA700, 0xA716}, {0xA720, 0xA721}, {0xA789, 0xA78A}, -{0xA828, 0xA82B}, {0xA836, 0xA839}, {0xAA77, 0xAA79}, {0xAB5B, 0xAB5B}, {0xAB6A, 0xAB6B}, {0xFB29, 0xFB29}, {0xFBB2, 0xFBC1}, {0xFDFC, 0xFDFD}, {0xFE62, 0xFE62}, {0xFE64, 0xFE66}, {0xFE69, 0xFE69}, -{0xFF04, 0xFF04}, {0xFF0B, 0xFF0B}, {0xFF1C, 0xFF1E}, {0xFF3E, 0xFF3E}, {0xFF40, 0xFF40}, {0xFF5C, 0xFF5C}, {0xFF5E, 0xFF5E}, {0xFFE0, 0xFFE6}, {0xFFE8, 0xFFEE}, {0xFFFC, 0xFFFD}, {0x10137, 0x1013F}, -{0x10179, 0x10189}, {0x1018C, 0x1018E}, {0x10190, 0x1019C}, {0x101A0, 0x101A0}, {0x101D0, 0x101FC}, {0x10877, 0x10878}, {0x10AC8, 0x10AC8}, {0x1173F, 0x1173F}, {0x11FD5, 0x11FF1}, {0x16B3C, 0x16B3F}, -{0x16B45, 0x16B45}, {0x1BC9C, 0x1BC9C}, {0x1D000, 0x1D0F5}, {0x1D100, 0x1D126}, {0x1D129, 0x1D164}, {0x1D16A, 0x1D16C}, {0x1D183, 0x1D184}, {0x1D18C, 0x1D1A9}, {0x1D1AE, 0x1D1E8}, {0x1D200, 0x1D241}, -{0x1D245, 0x1D245}, {0x1D300, 0x1D356}, {0x1D6C1, 0x1D6C1}, {0x1D6DB, 0x1D6DB}, {0x1D6FB, 0x1D6FB}, {0x1D715, 0x1D715}, {0x1D735, 0x1D735}, {0x1D74F, 0x1D74F}, {0x1D76F, 0x1D76F}, {0x1D789, 0x1D789}, -{0x1D7A9, 0x1D7A9}, {0x1D7C3, 0x1D7C3}, {0x1D800, 0x1D9FF}, {0x1DA37, 0x1DA3A}, {0x1DA6D, 0x1DA74}, {0x1DA76, 0x1DA83}, {0x1DA85, 0x1DA86}, {0x1E14F, 0x1E14F}, {0x1E2FF, 0x1E2FF}, {0x1ECAC, 0x1ECAC}, -{0x1ECB0, 0x1ECB0}, {0x1ED2E, 0x1ED2E}, {0x1EEF0, 0x1EEF1}, {0x1F000, 0x1F02B}, {0x1F030, 0x1F093}, {0x1F0A0, 0x1F0AE}, {0x1F0B1, 0x1F0BF}, {0x1F0C1, 0x1F0CF}, {0x1F0D1, 0x1F0F5}, {0x1F10D, 0x1F1AD}, -{0x1F1E6, 0x1F202}, {0x1F210, 0x1F23B}, {0x1F240, 0x1F248}, {0x1F250, 0x1F251}, {0x1F260, 0x1F265}, {0x1F300, 0x1F6D7}, {0x1F6E0, 0x1F6EC}, {0x1F6F0, 0x1F6FC}, {0x1F700, 0x1F773}, {0x1F780, 0x1F7D8}, -{0x1F7E0, 0x1F7EB}, {0x1F800, 0x1F80B}, {0x1F810, 0x1F847}, {0x1F850, 0x1F859}, {0x1F860, 0x1F887}, {0x1F890, 0x1F8AD}, {0x1F8B0, 0x1F8B1}, {0x1F900, 0x1F978}, {0x1F97A, 0x1F9CB}, {0x1F9CD, 0x1FA53}, -{0x1FA60, 0x1FA6D}, {0x1FA70, 0x1FA74}, {0x1FA78, 0x1FA7A}, {0x1FA80, 0x1FA86}, {0x1FA90, 0x1FAA8}, {0x1FAB0, 0x1FAB6}, {0x1FAC0, 0x1FAC2}, {0x1FAD0, 0x1FAD6}, {0x1FB00, 0x1FB92}, {0x1FB94, 0x1FBCA}, -}; - -static const std::vector> control_ranges = { -{0x0, 0x8}, {0xE, 0x1B}, {0x7F, 0x84}, {0x86, 0x9F}, {0xAD, 0xAD}, {0x378, 0x379}, {0x380, 0x383}, {0x38B, 0x38B}, {0x38D, 0x38D}, {0x3A2, 0x3A2}, {0x530, 0x530}, {0x557, 0x558}, {0x58B, 0x58C}, -{0x590, 0x590}, {0x5C8, 0x5CF}, {0x5EB, 0x5EE}, {0x5F5, 0x605}, {0x61C, 0x61D}, {0x6DD, 0x6DD}, {0x70E, 0x70F}, {0x74B, 0x74C}, {0x7B2, 0x7BF}, {0x7FB, 0x7FC}, {0x82E, 0x82F}, {0x83F, 0x83F}, -{0x85C, 0x85D}, {0x85F, 0x85F}, {0x86B, 0x89F}, {0x8B5, 0x8B5}, {0x8C8, 0x8D2}, {0x8E2, 0x8E2}, {0x984, 0x984}, {0x98D, 0x98E}, {0x991, 0x992}, {0x9A9, 0x9A9}, {0x9B1, 0x9B1}, {0x9B3, 0x9B5}, -{0x9BA, 0x9BB}, {0x9C5, 0x9C6}, {0x9C9, 0x9CA}, {0x9CF, 0x9D6}, {0x9D8, 0x9DB}, {0x9DE, 0x9DE}, {0x9E4, 0x9E5}, {0x9FF, 0xA00}, {0xA04, 0xA04}, {0xA0B, 0xA0E}, {0xA11, 0xA12}, {0xA29, 0xA29}, -{0xA31, 0xA31}, {0xA34, 0xA34}, {0xA37, 0xA37}, {0xA3A, 0xA3B}, {0xA3D, 0xA3D}, {0xA43, 0xA46}, {0xA49, 0xA4A}, {0xA4E, 0xA50}, {0xA52, 0xA58}, {0xA5D, 0xA5D}, {0xA5F, 0xA65}, {0xA77, 0xA80}, -{0xA84, 0xA84}, {0xA8E, 0xA8E}, {0xA92, 0xA92}, {0xAA9, 0xAA9}, {0xAB1, 0xAB1}, {0xAB4, 0xAB4}, {0xABA, 0xABB}, {0xAC6, 0xAC6}, {0xACA, 0xACA}, {0xACE, 0xACF}, {0xAD1, 0xADF}, {0xAE4, 0xAE5}, -{0xAF2, 0xAF8}, {0xB00, 0xB00}, {0xB04, 0xB04}, {0xB0D, 0xB0E}, {0xB11, 0xB12}, {0xB29, 0xB29}, {0xB31, 0xB31}, {0xB34, 0xB34}, {0xB3A, 0xB3B}, {0xB45, 0xB46}, {0xB49, 0xB4A}, {0xB4E, 0xB54}, -{0xB58, 0xB5B}, {0xB5E, 0xB5E}, {0xB64, 0xB65}, {0xB78, 0xB81}, {0xB84, 0xB84}, {0xB8B, 0xB8D}, {0xB91, 0xB91}, {0xB96, 0xB98}, {0xB9B, 0xB9B}, {0xB9D, 0xB9D}, {0xBA0, 0xBA2}, {0xBA5, 0xBA7}, -{0xBAB, 0xBAD}, {0xBBA, 0xBBD}, {0xBC3, 0xBC5}, {0xBC9, 0xBC9}, {0xBCE, 0xBCF}, {0xBD1, 0xBD6}, {0xBD8, 0xBE5}, {0xBFB, 0xBFF}, {0xC0D, 0xC0D}, {0xC11, 0xC11}, {0xC29, 0xC29}, {0xC3A, 0xC3C}, -{0xC45, 0xC45}, {0xC49, 0xC49}, {0xC4E, 0xC54}, {0xC57, 0xC57}, {0xC5B, 0xC5F}, {0xC64, 0xC65}, {0xC70, 0xC76}, {0xC8D, 0xC8D}, {0xC91, 0xC91}, {0xCA9, 0xCA9}, {0xCB4, 0xCB4}, {0xCBA, 0xCBB}, -{0xCC5, 0xCC5}, {0xCC9, 0xCC9}, {0xCCE, 0xCD4}, {0xCD7, 0xCDD}, {0xCDF, 0xCDF}, {0xCE4, 0xCE5}, {0xCF0, 0xCF0}, {0xCF3, 0xCFF}, {0xD0D, 0xD0D}, {0xD11, 0xD11}, {0xD45, 0xD45}, {0xD49, 0xD49}, -{0xD50, 0xD53}, {0xD64, 0xD65}, {0xD80, 0xD80}, {0xD84, 0xD84}, {0xD97, 0xD99}, {0xDB2, 0xDB2}, {0xDBC, 0xDBC}, {0xDBE, 0xDBF}, {0xDC7, 0xDC9}, {0xDCB, 0xDCE}, {0xDD5, 0xDD5}, {0xDD7, 0xDD7}, -{0xDE0, 0xDE5}, {0xDF0, 0xDF1}, {0xDF5, 0xE00}, {0xE3B, 0xE3E}, {0xE5C, 0xE80}, {0xE83, 0xE83}, {0xE85, 0xE85}, {0xE8B, 0xE8B}, {0xEA4, 0xEA4}, {0xEA6, 0xEA6}, {0xEBE, 0xEBF}, {0xEC5, 0xEC5}, -{0xEC7, 0xEC7}, {0xECE, 0xECF}, {0xEDA, 0xEDB}, {0xEE0, 0xEFF}, {0xF48, 0xF48}, {0xF6D, 0xF70}, {0xF98, 0xF98}, {0xFBD, 0xFBD}, {0xFCD, 0xFCD}, {0xFDB, 0xFFF}, {0x10C6, 0x10C6}, {0x10C8, 0x10CC}, -{0x10CE, 0x10CF}, {0x1249, 0x1249}, {0x124E, 0x124F}, {0x1257, 0x1257}, {0x1259, 0x1259}, {0x125E, 0x125F}, {0x1289, 0x1289}, {0x128E, 0x128F}, {0x12B1, 0x12B1}, {0x12B6, 0x12B7}, {0x12BF, 0x12BF}, -{0x12C1, 0x12C1}, {0x12C6, 0x12C7}, {0x12D7, 0x12D7}, {0x1311, 0x1311}, {0x1316, 0x1317}, {0x135B, 0x135C}, {0x137D, 0x137F}, {0x139A, 0x139F}, {0x13F6, 0x13F7}, {0x13FE, 0x13FF}, {0x169D, 0x169F}, -{0x16F9, 0x16FF}, {0x170D, 0x170D}, {0x1715, 0x171F}, {0x1737, 0x173F}, {0x1754, 0x175F}, {0x176D, 0x176D}, {0x1771, 0x1771}, {0x1774, 0x177F}, {0x17DE, 0x17DF}, {0x17EA, 0x17EF}, {0x17FA, 0x17FF}, -{0x180E, 0x180F}, {0x181A, 0x181F}, {0x1879, 0x187F}, {0x18AB, 0x18AF}, {0x18F6, 0x18FF}, {0x191F, 0x191F}, {0x192C, 0x192F}, {0x193C, 0x193F}, {0x1941, 0x1943}, {0x196E, 0x196F}, {0x1975, 0x197F}, -{0x19AC, 0x19AF}, {0x19CA, 0x19CF}, {0x19DB, 0x19DD}, {0x1A1C, 0x1A1D}, {0x1A5F, 0x1A5F}, {0x1A7D, 0x1A7E}, {0x1A8A, 0x1A8F}, {0x1A9A, 0x1A9F}, {0x1AAE, 0x1AAF}, {0x1AC1, 0x1AFF}, {0x1B4C, 0x1B4F}, -{0x1B7D, 0x1B7F}, {0x1BF4, 0x1BFB}, {0x1C38, 0x1C3A}, {0x1C4A, 0x1C4C}, {0x1C89, 0x1C8F}, {0x1CBB, 0x1CBC}, {0x1CC8, 0x1CCF}, {0x1CFB, 0x1CFF}, {0x1DFA, 0x1DFA}, {0x1F16, 0x1F17}, {0x1F1E, 0x1F1F}, -{0x1F46, 0x1F47}, {0x1F4E, 0x1F4F}, {0x1F58, 0x1F58}, {0x1F5A, 0x1F5A}, {0x1F5C, 0x1F5C}, {0x1F5E, 0x1F5E}, {0x1F7E, 0x1F7F}, {0x1FB5, 0x1FB5}, {0x1FC5, 0x1FC5}, {0x1FD4, 0x1FD5}, {0x1FDC, 0x1FDC}, -{0x1FF0, 0x1FF1}, {0x1FF5, 0x1FF5}, {0x1FFF, 0x1FFF}, {0x200B, 0x200F}, {0x202A, 0x202E}, {0x2060, 0x206F}, {0x2072, 0x2073}, {0x208F, 0x208F}, {0x209D, 0x209F}, {0x20C0, 0x20CF}, {0x20F1, 0x20FF}, -{0x218C, 0x218F}, {0x2427, 0x243F}, {0x244B, 0x245F}, {0x2B74, 0x2B75}, {0x2B96, 0x2B96}, {0x2C2F, 0x2C2F}, {0x2C5F, 0x2C5F}, {0x2CF4, 0x2CF8}, {0x2D26, 0x2D26}, {0x2D28, 0x2D2C}, {0x2D2E, 0x2D2F}, -{0x2D68, 0x2D6E}, {0x2D71, 0x2D7E}, {0x2D97, 0x2D9F}, {0x2DA7, 0x2DA7}, {0x2DAF, 0x2DAF}, {0x2DB7, 0x2DB7}, {0x2DBF, 0x2DBF}, {0x2DC7, 0x2DC7}, {0x2DCF, 0x2DCF}, {0x2DD7, 0x2DD7}, {0x2DDF, 0x2DDF}, -{0x2E53, 0x2E7F}, {0x2E9A, 0x2E9A}, {0x2EF4, 0x2EFF}, {0x2FD6, 0x2FEF}, {0x2FFC, 0x2FFF}, {0x3040, 0x3040}, {0x3097, 0x3098}, {0x3100, 0x3104}, {0x3130, 0x3130}, {0x318F, 0x318F}, {0x31E4, 0x31EF}, -{0x321F, 0x321F}, {0x9FFD, 0x9FFF}, {0xA48D, 0xA48F}, {0xA4C7, 0xA4CF}, {0xA62C, 0xA63F}, {0xA6F8, 0xA6FF}, {0xA7C0, 0xA7C1}, {0xA7CB, 0xA7F4}, {0xA82D, 0xA82F}, {0xA83A, 0xA83F}, {0xA878, 0xA87F}, -{0xA8C6, 0xA8CD}, {0xA8DA, 0xA8DF}, {0xA954, 0xA95E}, {0xA97D, 0xA97F}, {0xA9CE, 0xA9CE}, {0xA9DA, 0xA9DD}, {0xA9FF, 0xA9FF}, {0xAA37, 0xAA3F}, {0xAA4E, 0xAA4F}, {0xAA5A, 0xAA5B}, {0xAAC3, 0xAADA}, -{0xAAF7, 0xAB00}, {0xAB07, 0xAB08}, {0xAB0F, 0xAB10}, {0xAB17, 0xAB1F}, {0xAB27, 0xAB27}, {0xAB2F, 0xAB2F}, {0xAB6C, 0xAB6F}, {0xABEE, 0xABEF}, {0xABFA, 0xABFF}, {0xD7A4, 0xD7AF}, {0xD7C7, 0xD7CA}, -{0xD7FC, 0xF8FF}, {0xFA6E, 0xFA6F}, {0xFADA, 0xFAFF}, {0xFB07, 0xFB12}, {0xFB18, 0xFB1C}, {0xFB37, 0xFB37}, {0xFB3D, 0xFB3D}, {0xFB3F, 0xFB3F}, {0xFB42, 0xFB42}, {0xFB45, 0xFB45}, {0xFBC2, 0xFBD2}, -{0xFD40, 0xFD4F}, {0xFD90, 0xFD91}, {0xFDC8, 0xFDEF}, {0xFDFE, 0xFDFF}, {0xFE1A, 0xFE1F}, {0xFE53, 0xFE53}, {0xFE67, 0xFE67}, {0xFE6C, 0xFE6F}, {0xFE75, 0xFE75}, {0xFEFD, 0xFF00}, {0xFFBF, 0xFFC1}, -{0xFFC8, 0xFFC9}, {0xFFD0, 0xFFD1}, {0xFFD8, 0xFFD9}, {0xFFDD, 0xFFDF}, {0xFFE7, 0xFFE7}, {0xFFEF, 0xFFFB}, {0xFFFE, 0xFFFF}, {0x1000C, 0x1000C}, {0x10027, 0x10027}, {0x1003B, 0x1003B}, -{0x1003E, 0x1003E}, {0x1004E, 0x1004F}, {0x1005E, 0x1007F}, {0x100FB, 0x100FF}, {0x10103, 0x10106}, {0x10134, 0x10136}, {0x1018F, 0x1018F}, {0x1019D, 0x1019F}, {0x101A1, 0x101CF}, {0x101FE, 0x1027F}, -{0x1029D, 0x1029F}, {0x102D1, 0x102DF}, {0x102FC, 0x102FF}, {0x10324, 0x1032C}, {0x1034B, 0x1034F}, {0x1037B, 0x1037F}, {0x1039E, 0x1039E}, {0x103C4, 0x103C7}, {0x103D6, 0x103FF}, {0x1049E, 0x1049F}, -{0x104AA, 0x104AF}, {0x104D4, 0x104D7}, {0x104FC, 0x104FF}, {0x10528, 0x1052F}, {0x10564, 0x1056E}, {0x10570, 0x105FF}, {0x10737, 0x1073F}, {0x10756, 0x1075F}, {0x10768, 0x107FF}, {0x10806, 0x10807}, -{0x10809, 0x10809}, {0x10836, 0x10836}, {0x10839, 0x1083B}, {0x1083D, 0x1083E}, {0x10856, 0x10856}, {0x1089F, 0x108A6}, {0x108B0, 0x108DF}, {0x108F3, 0x108F3}, {0x108F6, 0x108FA}, {0x1091C, 0x1091E}, -{0x1093A, 0x1093E}, {0x10940, 0x1097F}, {0x109B8, 0x109BB}, {0x109D0, 0x109D1}, {0x10A04, 0x10A04}, {0x10A07, 0x10A0B}, {0x10A14, 0x10A14}, {0x10A18, 0x10A18}, {0x10A36, 0x10A37}, {0x10A3B, 0x10A3E}, -{0x10A49, 0x10A4F}, {0x10A59, 0x10A5F}, {0x10AA0, 0x10ABF}, {0x10AE7, 0x10AEA}, {0x10AF7, 0x10AFF}, {0x10B36, 0x10B38}, {0x10B56, 0x10B57}, {0x10B73, 0x10B77}, {0x10B92, 0x10B98}, {0x10B9D, 0x10BA8}, -{0x10BB0, 0x10BFF}, {0x10C49, 0x10C7F}, {0x10CB3, 0x10CBF}, {0x10CF3, 0x10CF9}, {0x10D28, 0x10D2F}, {0x10D3A, 0x10E5F}, {0x10E7F, 0x10E7F}, {0x10EAA, 0x10EAA}, {0x10EAE, 0x10EAF}, {0x10EB2, 0x10EFF}, -{0x10F28, 0x10F2F}, {0x10F5A, 0x10FAF}, {0x10FCC, 0x10FDF}, {0x10FF7, 0x10FFF}, {0x1104E, 0x11051}, {0x11070, 0x1107E}, {0x110BD, 0x110BD}, {0x110C2, 0x110CF}, {0x110E9, 0x110EF}, {0x110FA, 0x110FF}, -{0x11135, 0x11135}, {0x11148, 0x1114F}, {0x11177, 0x1117F}, {0x111E0, 0x111E0}, {0x111F5, 0x111FF}, {0x11212, 0x11212}, {0x1123F, 0x1127F}, {0x11287, 0x11287}, {0x11289, 0x11289}, {0x1128E, 0x1128E}, -{0x1129E, 0x1129E}, {0x112AA, 0x112AF}, {0x112EB, 0x112EF}, {0x112FA, 0x112FF}, {0x11304, 0x11304}, {0x1130D, 0x1130E}, {0x11311, 0x11312}, {0x11329, 0x11329}, {0x11331, 0x11331}, {0x11334, 0x11334}, -{0x1133A, 0x1133A}, {0x11345, 0x11346}, {0x11349, 0x1134A}, {0x1134E, 0x1134F}, {0x11351, 0x11356}, {0x11358, 0x1135C}, {0x11364, 0x11365}, {0x1136D, 0x1136F}, {0x11375, 0x113FF}, {0x1145C, 0x1145C}, -{0x11462, 0x1147F}, {0x114C8, 0x114CF}, {0x114DA, 0x1157F}, {0x115B6, 0x115B7}, {0x115DE, 0x115FF}, {0x11645, 0x1164F}, {0x1165A, 0x1165F}, {0x1166D, 0x1167F}, {0x116B9, 0x116BF}, {0x116CA, 0x116FF}, -{0x1171B, 0x1171C}, {0x1172C, 0x1172F}, {0x11740, 0x117FF}, {0x1183C, 0x1189F}, {0x118F3, 0x118FE}, {0x11907, 0x11908}, {0x1190A, 0x1190B}, {0x11914, 0x11914}, {0x11917, 0x11917}, {0x11936, 0x11936}, -{0x11939, 0x1193A}, {0x11947, 0x1194F}, {0x1195A, 0x1199F}, {0x119A8, 0x119A9}, {0x119D8, 0x119D9}, {0x119E5, 0x119FF}, {0x11A48, 0x11A4F}, {0x11AA3, 0x11ABF}, {0x11AF9, 0x11BFF}, {0x11C09, 0x11C09}, -{0x11C37, 0x11C37}, {0x11C46, 0x11C4F}, {0x11C6D, 0x11C6F}, {0x11C90, 0x11C91}, {0x11CA8, 0x11CA8}, {0x11CB7, 0x11CFF}, {0x11D07, 0x11D07}, {0x11D0A, 0x11D0A}, {0x11D37, 0x11D39}, {0x11D3B, 0x11D3B}, -{0x11D3E, 0x11D3E}, {0x11D48, 0x11D4F}, {0x11D5A, 0x11D5F}, {0x11D66, 0x11D66}, {0x11D69, 0x11D69}, {0x11D8F, 0x11D8F}, {0x11D92, 0x11D92}, {0x11D99, 0x11D9F}, {0x11DAA, 0x11EDF}, {0x11EF9, 0x11FAF}, -{0x11FB1, 0x11FBF}, {0x11FF2, 0x11FFE}, {0x1239A, 0x123FF}, {0x1246F, 0x1246F}, {0x12475, 0x1247F}, {0x12544, 0x12FFF}, {0x1342F, 0x143FF}, {0x14647, 0x167FF}, {0x16A39, 0x16A3F}, {0x16A5F, 0x16A5F}, -{0x16A6A, 0x16A6D}, {0x16A70, 0x16ACF}, {0x16AEE, 0x16AEF}, {0x16AF6, 0x16AFF}, {0x16B46, 0x16B4F}, {0x16B5A, 0x16B5A}, {0x16B62, 0x16B62}, {0x16B78, 0x16B7C}, {0x16B90, 0x16E3F}, {0x16E9B, 0x16EFF}, -{0x16F4B, 0x16F4E}, {0x16F88, 0x16F8E}, {0x16FA0, 0x16FDF}, {0x16FE5, 0x16FEF}, {0x16FF2, 0x16FFF}, {0x187F8, 0x187FF}, {0x18CD6, 0x18CFF}, {0x18D09, 0x1AFFF}, {0x1B11F, 0x1B14F}, {0x1B153, 0x1B163}, -{0x1B168, 0x1B16F}, {0x1B2FC, 0x1BBFF}, {0x1BC6B, 0x1BC6F}, {0x1BC7D, 0x1BC7F}, {0x1BC89, 0x1BC8F}, {0x1BC9A, 0x1BC9B}, {0x1BCA0, 0x1CFFF}, {0x1D0F6, 0x1D0FF}, {0x1D127, 0x1D128}, {0x1D173, 0x1D17A}, -{0x1D1E9, 0x1D1FF}, {0x1D246, 0x1D2DF}, {0x1D2F4, 0x1D2FF}, {0x1D357, 0x1D35F}, {0x1D379, 0x1D3FF}, {0x1D455, 0x1D455}, {0x1D49D, 0x1D49D}, {0x1D4A0, 0x1D4A1}, {0x1D4A3, 0x1D4A4}, {0x1D4A7, 0x1D4A8}, -{0x1D4AD, 0x1D4AD}, {0x1D4BA, 0x1D4BA}, {0x1D4BC, 0x1D4BC}, {0x1D4C4, 0x1D4C4}, {0x1D506, 0x1D506}, {0x1D50B, 0x1D50C}, {0x1D515, 0x1D515}, {0x1D51D, 0x1D51D}, {0x1D53A, 0x1D53A}, {0x1D53F, 0x1D53F}, -{0x1D545, 0x1D545}, {0x1D547, 0x1D549}, {0x1D551, 0x1D551}, {0x1D6A6, 0x1D6A7}, {0x1D7CC, 0x1D7CD}, {0x1DA8C, 0x1DA9A}, {0x1DAA0, 0x1DAA0}, {0x1DAB0, 0x1DFFF}, {0x1E007, 0x1E007}, {0x1E019, 0x1E01A}, -{0x1E022, 0x1E022}, {0x1E025, 0x1E025}, {0x1E02B, 0x1E0FF}, {0x1E12D, 0x1E12F}, {0x1E13E, 0x1E13F}, {0x1E14A, 0x1E14D}, {0x1E150, 0x1E2BF}, {0x1E2FA, 0x1E2FE}, {0x1E300, 0x1E7FF}, {0x1E8C5, 0x1E8C6}, -{0x1E8D7, 0x1E8FF}, {0x1E94C, 0x1E94F}, {0x1E95A, 0x1E95D}, {0x1E960, 0x1EC70}, {0x1ECB5, 0x1ED00}, {0x1ED3E, 0x1EDFF}, {0x1EE04, 0x1EE04}, {0x1EE20, 0x1EE20}, {0x1EE23, 0x1EE23}, {0x1EE25, 0x1EE26}, -{0x1EE28, 0x1EE28}, {0x1EE33, 0x1EE33}, {0x1EE38, 0x1EE38}, {0x1EE3A, 0x1EE3A}, {0x1EE3C, 0x1EE41}, {0x1EE43, 0x1EE46}, {0x1EE48, 0x1EE48}, {0x1EE4A, 0x1EE4A}, {0x1EE4C, 0x1EE4C}, {0x1EE50, 0x1EE50}, -{0x1EE53, 0x1EE53}, {0x1EE55, 0x1EE56}, {0x1EE58, 0x1EE58}, {0x1EE5A, 0x1EE5A}, {0x1EE5C, 0x1EE5C}, {0x1EE5E, 0x1EE5E}, {0x1EE60, 0x1EE60}, {0x1EE63, 0x1EE63}, {0x1EE65, 0x1EE66}, {0x1EE6B, 0x1EE6B}, -{0x1EE73, 0x1EE73}, {0x1EE78, 0x1EE78}, {0x1EE7D, 0x1EE7D}, {0x1EE7F, 0x1EE7F}, {0x1EE8A, 0x1EE8A}, {0x1EE9C, 0x1EEA0}, {0x1EEA4, 0x1EEA4}, {0x1EEAA, 0x1EEAA}, {0x1EEBC, 0x1EEEF}, {0x1EEF2, 0x1EFFF}, -{0x1F02C, 0x1F02F}, {0x1F094, 0x1F09F}, {0x1F0AF, 0x1F0B0}, {0x1F0C0, 0x1F0C0}, {0x1F0D0, 0x1F0D0}, {0x1F0F6, 0x1F0FF}, {0x1F1AE, 0x1F1E5}, {0x1F203, 0x1F20F}, {0x1F23C, 0x1F23F}, {0x1F249, 0x1F24F}, -{0x1F252, 0x1F25F}, {0x1F266, 0x1F2FF}, {0x1F6D8, 0x1F6DF}, {0x1F6ED, 0x1F6EF}, {0x1F6FD, 0x1F6FF}, {0x1F774, 0x1F77F}, {0x1F7D9, 0x1F7DF}, {0x1F7EC, 0x1F7FF}, {0x1F80C, 0x1F80F}, {0x1F848, 0x1F84F}, -{0x1F85A, 0x1F85F}, {0x1F888, 0x1F88F}, {0x1F8AE, 0x1F8AF}, {0x1F8B2, 0x1F8FF}, {0x1F979, 0x1F979}, {0x1F9CC, 0x1F9CC}, {0x1FA54, 0x1FA5F}, {0x1FA6E, 0x1FA6F}, {0x1FA75, 0x1FA77}, {0x1FA7B, 0x1FA7F}, -{0x1FA87, 0x1FA8F}, {0x1FAA9, 0x1FAAF}, {0x1FAB7, 0x1FABF}, {0x1FAC3, 0x1FACF}, {0x1FAD7, 0x1FAFF}, {0x1FB93, 0x1FB93}, {0x1FBCB, 0x1FBEF}, {0x1FBFA, 0x1FFFF}, {0x2A6DE, 0x2A6FF}, {0x2B735, 0x2B73F}, -{0x2B81E, 0x2B81F}, {0x2CEA2, 0x2CEAF}, {0x2EBE1, 0x2F7FF}, {0x2FA1E, 0x2FFFF}, {0x3134B, 0xE00FF}, {0xE01F0, 0x10FFFF}, -}; - -static std::string codepoint_to_utf8(uint32_t cp) { - std::string result; - if (/* 0x00 <= cp && */ cp <= 0x7f) { - result.push_back(cp); - } - else if (0x80 <= cp && cp <= 0x7ff) { - result.push_back(0xc0 | ((cp >> 6) & 0x1f)); - result.push_back(0x80 | (cp & 0x3f)); - } - else if (0x800 <= cp && cp <= 0xffff) { - result.push_back(0xe0 | ((cp >> 12) & 0x0f)); - result.push_back(0x80 | ((cp >> 6) & 0x3f)); - result.push_back(0x80 | (cp & 0x3f)); - } - else if (0x10000 <= cp && cp <= 0x10ffff) { - result.push_back(0xf0 | ((cp >> 18) & 0x07)); - result.push_back(0x80 | ((cp >> 12) & 0x3f)); - result.push_back(0x80 | ((cp >> 6) & 0x3f)); - result.push_back(0x80 | (cp & 0x3f)); - } - else { - throw std::invalid_argument("invalid codepoint"); - } - return result; -} - -static std::string codepoints_to_utf8(const std::vector & cps) { - std::string result; - for (size_t i = 0; i < cps.size(); ++i) { - result.append(codepoint_to_utf8(cps[i])); - } - return result; -} - -static uint32_t codepoint_from_utf8(const std::string & utf8, size_t & offset) { - assert(offset < utf8.size()); - if (!(utf8[offset + 0] & 0x80)) { - auto result = utf8[offset + 0]; - offset += 1; - return result; - } - if (!(utf8[offset + 0] & 0x40)) { - throw std::invalid_argument("invalid character"); - } - if (!(utf8[offset + 0] & 0x20)) { - if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80)) { - throw std::invalid_argument("invalid character"); - } - auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f); - offset += 2; - return result; - } - if (!(utf8[offset + 0] & 0x10)) { - if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80)) { - throw std::invalid_argument("invalid character"); - } - auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f); - offset += 3; - return result; - } - if (!(utf8[offset + 0] & 0x08)) { - if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80)) { - throw std::invalid_argument("invalid character"); - } - auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f); - offset += 4; - return result; - } - throw std::invalid_argument("invalid string"); -} - -static std::vector codepoints_from_utf8(const std::string & utf8) { - std::vector result; - size_t offset = 0; - while (offset < utf8.size()) { - result.push_back(codepoint_from_utf8(utf8, offset)); - } - return result; -} - -static std::vector codepoint_to_utf16(uint32_t cp) { - std::vector result; - if (/* 0x0000 <= cp && */ cp <= 0xffff) { - result.emplace_back(cp); - } - else if (0x10000 <= cp && cp <= 0x10ffff) { - result.emplace_back(0xd800 | ((cp - 0x10000) >> 10)); - result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff)); - } - else { - throw std::invalid_argument("invalid codepoint"); - } - return result; -} - -static std::vector codepoints_to_utf16(const std::vector & cps) { - std::vector result; - for (size_t i = 0; i < cps.size(); ++i) { - auto temp = codepoint_to_utf16(cps[i]); - result.insert(result.end(), temp.begin(), temp.end()); - } - return result; -} - -static uint32_t codepoint_from_utf16(const std::vector & utf16, size_t & offset) { - assert(offset < utf16.size()); - if (((utf16[0] >> 10) << 10) != 0xd800) { - auto result = utf16[offset + 0]; - offset += 1; - return result; - } - - if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) { - throw std::invalid_argument("invalid character"); - } - - auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff)); - offset += 2; - return result; -} - -static std::vector codepoints_from_utf16(const std::vector & utf16) { - std::vector result; - size_t offset = 0; - while (offset < utf16.size()) { - result.push_back(codepoint_from_utf16(utf16, offset)); - } - return result; -} - -#define CODEPOINT_TYPE_UNIDENTIFIED 0 -#define CODEPOINT_TYPE_DIGIT 1 -#define CODEPOINT_TYPE_LETTER 2 -#define CODEPOINT_TYPE_WHITESPACE 3 -#define CODEPOINT_TYPE_ACCENT_MARK 4 -#define CODEPOINT_TYPE_PUNCTUATION 5 -#define CODEPOINT_TYPE_SYMBOL 6 -#define CODEPOINT_TYPE_CONTROL 7 - -static std::unordered_map codepoint_type_map() { - std::unordered_map codepoint_types; - for (auto p : digit_ranges) { - for (auto i = p.first; i <= p.second; ++ i) { - codepoint_types[i] = CODEPOINT_TYPE_DIGIT; - } - } - for (auto p : letter_ranges) { - for (auto i = p.first; i <= p.second; ++ i) { - codepoint_types[i] = CODEPOINT_TYPE_LETTER; - } - } - for (auto p : whitespace_ranges) { - for (auto i = p.first; i <= p.second; ++ i) { - codepoint_types[i] = CODEPOINT_TYPE_WHITESPACE; - } - } - for (auto p : accent_mark_ranges) { - for (auto i = p.first; i <= p.second; ++ i) { - codepoint_types[i] = CODEPOINT_TYPE_ACCENT_MARK; - } - } - for (auto p : punctuation_ranges) { - for (auto i = p.first; i <= p.second; ++ i) { - codepoint_types[i] = CODEPOINT_TYPE_PUNCTUATION; - } - } - for (auto p : symbol_ranges) { - for (auto i = p.first; i <= p.second; ++i) { - codepoint_types[i] = CODEPOINT_TYPE_SYMBOL; - } - } - for (auto p : control_ranges) { - for (auto i = p.first; i <= p.second; ++ i) { - codepoint_types[i] = CODEPOINT_TYPE_CONTROL; - } - } - return codepoint_types; -} - -static int codepoint_type(uint32_t cp) { - static std::unordered_map codepoint_types = codepoint_type_map(); - return codepoint_types.find(cp) == codepoint_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : codepoint_types.at(cp); -} - -static int codepoint_type(const std::string & utf8) { - if (utf8.length() == 0) { - return CODEPOINT_TYPE_UNIDENTIFIED; - } - size_t offset = 0; - return codepoint_type(codepoint_from_utf8(utf8, offset)); -} - -static std::unordered_map bytes_to_unicode_map_bpe() { - std::unordered_map map; - for (int ch = u'!'; ch <= u'~'; ++ch) { - assert(0 <= ch && ch < 256); - map[ch] = codepoint_to_utf8(ch); - } - for (int ch = u'¡'; ch <= u'¬'; ++ch) { - assert(0 <= ch && ch < 256); - map[ch] = codepoint_to_utf8(ch); - } - for (int ch = u'®'; ch <= u'ÿ'; ++ch) { - assert(0 <= ch && ch < 256); - map[ch] = codepoint_to_utf8(ch); - } - auto n = 0; - for (int ch = 0; ch < 256; ++ch) { - if (map.find(ch) == map.end()) { - map[ch] = codepoint_to_utf8(256 + n); - ++n; - } - } - return map; -} - -static std::string bytes_to_unicode_bpe(uint8_t byte) { - static std::unordered_map map = bytes_to_unicode_map_bpe(); - return map.at(byte); -} - -static std::unordered_map unicode_to_bytes_map_bpe() { - std::unordered_map map; - for (int ch = u'!'; ch <= u'~'; ++ch) { - assert(0 <= ch && ch < 256); - map[codepoint_to_utf8(ch)] = ch; - } - for (int ch = u'¡'; ch <= u'¬'; ++ch) { - assert(0 <= ch && ch < 256); - map[codepoint_to_utf8(ch)] = ch; - } - for (int ch = u'®'; ch <= u'ÿ'; ++ch) { - assert(0 <= ch && ch < 256); - map[codepoint_to_utf8(ch)] = ch; - } - auto n = 0; - for (int ch = 0; ch < 256; ++ch) { - if (map.find(codepoint_to_utf8(ch)) == map.end()) { - map[codepoint_to_utf8(256 + n)] = ch; - ++n; - } - } - return map; -} - -static uint8_t unicode_to_bytes_bpe(const std::string & utf8) { - static std::unordered_map map = unicode_to_bytes_map_bpe(); - return map.at(utf8); -} - +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#define UNICODE_INVALID 0 +#define UNICODE_LETTER 10 +#define UNICODE_MARK 20 +#define UNICODE_NUMBER 30 +#define UNICODE_PUNCTUATION 40 +#define UNICODE_SYMBOL 50 +#define UNICODE_SEPARATOR 60 +#define UNICODE_OTHER 70 + +namespace UNICODE_RANGES { + namespace Letter { + // Letter, uppercase (11) + static const std::vector> Lu = { + {0x41,0x5a},{0xc0,0xd6},{0xd8,0xde},{0x100,0x100},{0x102,0x102},{0x104,0x104},{0x106,0x106}, + {0x108,0x108},{0x10a,0x10a},{0x10c,0x10c},{0x10e,0x10e},{0x110,0x110},{0x112,0x112}, + {0x114,0x114},{0x116,0x116},{0x118,0x118},{0x11a,0x11a},{0x11c,0x11c},{0x11e,0x11e}, + {0x120,0x120},{0x122,0x122},{0x124,0x124},{0x126,0x126},{0x128,0x128},{0x12a,0x12a}, + {0x12c,0x12c},{0x12e,0x12e},{0x130,0x130},{0x132,0x132},{0x134,0x134},{0x136,0x136}, + {0x139,0x139},{0x13b,0x13b},{0x13d,0x13d},{0x13f,0x13f},{0x141,0x141},{0x143,0x143}, + {0x145,0x145},{0x147,0x147},{0x14a,0x14a},{0x14c,0x14c},{0x14e,0x14e},{0x150,0x150}, + {0x152,0x152},{0x154,0x154},{0x156,0x156},{0x158,0x158},{0x15a,0x15a},{0x15c,0x15c}, + {0x15e,0x15e},{0x160,0x160},{0x162,0x162},{0x164,0x164},{0x166,0x166},{0x168,0x168}, + {0x16a,0x16a},{0x16c,0x16c},{0x16e,0x16e},{0x170,0x170},{0x172,0x172},{0x174,0x174}, + {0x176,0x176},{0x178,0x179},{0x17b,0x17b},{0x17d,0x17d},{0x181,0x182},{0x184,0x184}, + {0x186,0x187},{0x189,0x18b},{0x18e,0x191},{0x193,0x194},{0x196,0x198},{0x19c,0x19d}, + {0x19f,0x1a0},{0x1a2,0x1a2},{0x1a4,0x1a4},{0x1a6,0x1a7},{0x1a9,0x1a9},{0x1ac,0x1ac}, + {0x1ae,0x1af},{0x1b1,0x1b3},{0x1b5,0x1b5},{0x1b7,0x1b8},{0x1bc,0x1bc},{0x1c4,0x1c4}, + {0x1c7,0x1c7},{0x1ca,0x1ca},{0x1cd,0x1cd},{0x1cf,0x1cf},{0x1d1,0x1d1},{0x1d3,0x1d3}, + {0x1d5,0x1d5},{0x1d7,0x1d7},{0x1d9,0x1d9},{0x1db,0x1db},{0x1de,0x1de},{0x1e0,0x1e0}, + {0x1e2,0x1e2},{0x1e4,0x1e4},{0x1e6,0x1e6},{0x1e8,0x1e8},{0x1ea,0x1ea},{0x1ec,0x1ec}, + {0x1ee,0x1ee},{0x1f1,0x1f1},{0x1f4,0x1f4},{0x1f6,0x1f8},{0x1fa,0x1fa},{0x1fc,0x1fc}, + {0x1fe,0x1fe},{0x200,0x200},{0x202,0x202},{0x204,0x204},{0x206,0x206},{0x208,0x208}, + {0x20a,0x20a},{0x20c,0x20c},{0x20e,0x20e},{0x210,0x210},{0x212,0x212},{0x214,0x214}, + {0x216,0x216},{0x218,0x218},{0x21a,0x21a},{0x21c,0x21c},{0x21e,0x21e},{0x220,0x220}, + {0x222,0x222},{0x224,0x224},{0x226,0x226},{0x228,0x228},{0x22a,0x22a},{0x22c,0x22c}, + {0x22e,0x22e},{0x230,0x230},{0x232,0x232},{0x23a,0x23b},{0x23d,0x23e},{0x241,0x241}, + {0x243,0x246},{0x248,0x248},{0x24a,0x24a},{0x24c,0x24c},{0x24e,0x24e},{0x370,0x370}, + {0x372,0x372},{0x376,0x376},{0x37f,0x37f},{0x386,0x386},{0x388,0x38a},{0x38c,0x38c}, + {0x38e,0x38f},{0x391,0x3a1},{0x3a3,0x3ab},{0x3cf,0x3cf},{0x3d2,0x3d4},{0x3d8,0x3d8}, + {0x3da,0x3da},{0x3dc,0x3dc},{0x3de,0x3de},{0x3e0,0x3e0},{0x3e2,0x3e2},{0x3e4,0x3e4}, + {0x3e6,0x3e6},{0x3e8,0x3e8},{0x3ea,0x3ea},{0x3ec,0x3ec},{0x3ee,0x3ee},{0x3f4,0x3f4}, + {0x3f7,0x3f7},{0x3f9,0x3fa},{0x3fd,0x42f},{0x460,0x460},{0x462,0x462},{0x464,0x464}, + {0x466,0x466},{0x468,0x468},{0x46a,0x46a},{0x46c,0x46c},{0x46e,0x46e},{0x470,0x470}, + {0x472,0x472},{0x474,0x474},{0x476,0x476},{0x478,0x478},{0x47a,0x47a},{0x47c,0x47c}, + {0x47e,0x47e},{0x480,0x480},{0x48a,0x48a},{0x48c,0x48c},{0x48e,0x48e},{0x490,0x490}, + {0x492,0x492},{0x494,0x494},{0x496,0x496},{0x498,0x498},{0x49a,0x49a},{0x49c,0x49c}, + {0x49e,0x49e},{0x4a0,0x4a0},{0x4a2,0x4a2},{0x4a4,0x4a4},{0x4a6,0x4a6},{0x4a8,0x4a8}, + {0x4aa,0x4aa},{0x4ac,0x4ac},{0x4ae,0x4ae},{0x4b0,0x4b0},{0x4b2,0x4b2},{0x4b4,0x4b4}, + {0x4b6,0x4b6},{0x4b8,0x4b8},{0x4ba,0x4ba},{0x4bc,0x4bc},{0x4be,0x4be},{0x4c0,0x4c1}, + {0x4c3,0x4c3},{0x4c5,0x4c5},{0x4c7,0x4c7},{0x4c9,0x4c9},{0x4cb,0x4cb},{0x4cd,0x4cd}, + {0x4d0,0x4d0},{0x4d2,0x4d2},{0x4d4,0x4d4},{0x4d6,0x4d6},{0x4d8,0x4d8},{0x4da,0x4da}, + {0x4dc,0x4dc},{0x4de,0x4de},{0x4e0,0x4e0},{0x4e2,0x4e2},{0x4e4,0x4e4},{0x4e6,0x4e6}, + {0x4e8,0x4e8},{0x4ea,0x4ea},{0x4ec,0x4ec},{0x4ee,0x4ee},{0x4f0,0x4f0},{0x4f2,0x4f2}, + {0x4f4,0x4f4},{0x4f6,0x4f6},{0x4f8,0x4f8},{0x4fa,0x4fa},{0x4fc,0x4fc},{0x4fe,0x4fe}, + {0x500,0x500},{0x502,0x502},{0x504,0x504},{0x506,0x506},{0x508,0x508},{0x50a,0x50a}, + {0x50c,0x50c},{0x50e,0x50e},{0x510,0x510},{0x512,0x512},{0x514,0x514},{0x516,0x516}, + {0x518,0x518},{0x51a,0x51a},{0x51c,0x51c},{0x51e,0x51e},{0x520,0x520},{0x522,0x522}, + {0x524,0x524},{0x526,0x526},{0x528,0x528},{0x52a,0x52a},{0x52c,0x52c},{0x52e,0x52e}, + {0x531,0x556},{0x10a0,0x10c5},{0x10c7,0x10c7},{0x10cd,0x10cd},{0x13a0,0x13f5},{0x1c90,0x1cba}, + {0x1cbd,0x1cbf},{0x1e00,0x1e00},{0x1e02,0x1e02},{0x1e04,0x1e04},{0x1e06,0x1e06},{0x1e08,0x1e08}, + {0x1e0a,0x1e0a},{0x1e0c,0x1e0c},{0x1e0e,0x1e0e},{0x1e10,0x1e10},{0x1e12,0x1e12},{0x1e14,0x1e14}, + {0x1e16,0x1e16},{0x1e18,0x1e18},{0x1e1a,0x1e1a},{0x1e1c,0x1e1c},{0x1e1e,0x1e1e},{0x1e20,0x1e20}, + {0x1e22,0x1e22},{0x1e24,0x1e24},{0x1e26,0x1e26},{0x1e28,0x1e28},{0x1e2a,0x1e2a},{0x1e2c,0x1e2c}, + {0x1e2e,0x1e2e},{0x1e30,0x1e30},{0x1e32,0x1e32},{0x1e34,0x1e34},{0x1e36,0x1e36},{0x1e38,0x1e38}, + {0x1e3a,0x1e3a},{0x1e3c,0x1e3c},{0x1e3e,0x1e3e},{0x1e40,0x1e40},{0x1e42,0x1e42},{0x1e44,0x1e44}, + {0x1e46,0x1e46},{0x1e48,0x1e48},{0x1e4a,0x1e4a},{0x1e4c,0x1e4c},{0x1e4e,0x1e4e},{0x1e50,0x1e50}, + {0x1e52,0x1e52},{0x1e54,0x1e54},{0x1e56,0x1e56},{0x1e58,0x1e58},{0x1e5a,0x1e5a},{0x1e5c,0x1e5c}, + {0x1e5e,0x1e5e},{0x1e60,0x1e60},{0x1e62,0x1e62},{0x1e64,0x1e64},{0x1e66,0x1e66},{0x1e68,0x1e68}, + {0x1e6a,0x1e6a},{0x1e6c,0x1e6c},{0x1e6e,0x1e6e},{0x1e70,0x1e70},{0x1e72,0x1e72},{0x1e74,0x1e74}, + {0x1e76,0x1e76},{0x1e78,0x1e78},{0x1e7a,0x1e7a},{0x1e7c,0x1e7c},{0x1e7e,0x1e7e},{0x1e80,0x1e80}, + {0x1e82,0x1e82},{0x1e84,0x1e84},{0x1e86,0x1e86},{0x1e88,0x1e88},{0x1e8a,0x1e8a},{0x1e8c,0x1e8c}, + {0x1e8e,0x1e8e},{0x1e90,0x1e90},{0x1e92,0x1e92},{0x1e94,0x1e94},{0x1e9e,0x1e9e},{0x1ea0,0x1ea0}, + {0x1ea2,0x1ea2},{0x1ea4,0x1ea4},{0x1ea6,0x1ea6},{0x1ea8,0x1ea8},{0x1eaa,0x1eaa},{0x1eac,0x1eac}, + {0x1eae,0x1eae},{0x1eb0,0x1eb0},{0x1eb2,0x1eb2},{0x1eb4,0x1eb4},{0x1eb6,0x1eb6},{0x1eb8,0x1eb8}, + {0x1eba,0x1eba},{0x1ebc,0x1ebc},{0x1ebe,0x1ebe},{0x1ec0,0x1ec0},{0x1ec2,0x1ec2},{0x1ec4,0x1ec4}, + {0x1ec6,0x1ec6},{0x1ec8,0x1ec8},{0x1eca,0x1eca},{0x1ecc,0x1ecc},{0x1ece,0x1ece},{0x1ed0,0x1ed0}, + {0x1ed2,0x1ed2},{0x1ed4,0x1ed4},{0x1ed6,0x1ed6},{0x1ed8,0x1ed8},{0x1eda,0x1eda},{0x1edc,0x1edc}, + {0x1ede,0x1ede},{0x1ee0,0x1ee0},{0x1ee2,0x1ee2},{0x1ee4,0x1ee4},{0x1ee6,0x1ee6},{0x1ee8,0x1ee8}, + {0x1eea,0x1eea},{0x1eec,0x1eec},{0x1eee,0x1eee},{0x1ef0,0x1ef0},{0x1ef2,0x1ef2},{0x1ef4,0x1ef4}, + {0x1ef6,0x1ef6},{0x1ef8,0x1ef8},{0x1efa,0x1efa},{0x1efc,0x1efc},{0x1efe,0x1efe},{0x1f08,0x1f0f}, + {0x1f18,0x1f1d},{0x1f28,0x1f2f},{0x1f38,0x1f3f},{0x1f48,0x1f4d},{0x1f59,0x1f59},{0x1f5b,0x1f5b}, + {0x1f5d,0x1f5d},{0x1f5f,0x1f5f},{0x1f68,0x1f6f},{0x1fb8,0x1fbb},{0x1fc8,0x1fcb},{0x1fd8,0x1fdb}, + {0x1fe8,0x1fec},{0x1ff8,0x1ffb},{0x2102,0x2102},{0x2107,0x2107},{0x210b,0x210d},{0x2110,0x2112}, + {0x2115,0x2115},{0x2119,0x211d},{0x2124,0x2124},{0x2126,0x2126},{0x2128,0x2128},{0x212a,0x212d}, + {0x2130,0x2133},{0x213e,0x213f},{0x2145,0x2145},{0x2183,0x2183},{0x2c00,0x2c2f},{0x2c60,0x2c60}, + {0x2c62,0x2c64},{0x2c67,0x2c67},{0x2c69,0x2c69},{0x2c6b,0x2c6b},{0x2c6d,0x2c70},{0x2c72,0x2c72}, + {0x2c75,0x2c75},{0x2c7e,0x2c80},{0x2c82,0x2c82},{0x2c84,0x2c84},{0x2c86,0x2c86},{0x2c88,0x2c88}, + {0x2c8a,0x2c8a},{0x2c8c,0x2c8c},{0x2c8e,0x2c8e},{0x2c90,0x2c90},{0x2c92,0x2c92},{0x2c94,0x2c94}, + {0x2c96,0x2c96},{0x2c98,0x2c98},{0x2c9a,0x2c9a},{0x2c9c,0x2c9c},{0x2c9e,0x2c9e},{0x2ca0,0x2ca0}, + {0x2ca2,0x2ca2},{0x2ca4,0x2ca4},{0x2ca6,0x2ca6},{0x2ca8,0x2ca8},{0x2caa,0x2caa},{0x2cac,0x2cac}, + {0x2cae,0x2cae},{0x2cb0,0x2cb0},{0x2cb2,0x2cb2},{0x2cb4,0x2cb4},{0x2cb6,0x2cb6},{0x2cb8,0x2cb8}, + {0x2cba,0x2cba},{0x2cbc,0x2cbc},{0x2cbe,0x2cbe},{0x2cc0,0x2cc0},{0x2cc2,0x2cc2},{0x2cc4,0x2cc4}, + {0x2cc6,0x2cc6},{0x2cc8,0x2cc8},{0x2cca,0x2cca},{0x2ccc,0x2ccc},{0x2cce,0x2cce},{0x2cd0,0x2cd0}, + {0x2cd2,0x2cd2},{0x2cd4,0x2cd4},{0x2cd6,0x2cd6},{0x2cd8,0x2cd8},{0x2cda,0x2cda},{0x2cdc,0x2cdc}, + {0x2cde,0x2cde},{0x2ce0,0x2ce0},{0x2ce2,0x2ce2},{0x2ceb,0x2ceb},{0x2ced,0x2ced},{0x2cf2,0x2cf2}, + {0xa640,0xa640},{0xa642,0xa642},{0xa644,0xa644},{0xa646,0xa646},{0xa648,0xa648},{0xa64a,0xa64a}, + {0xa64c,0xa64c},{0xa64e,0xa64e},{0xa650,0xa650},{0xa652,0xa652},{0xa654,0xa654},{0xa656,0xa656}, + {0xa658,0xa658},{0xa65a,0xa65a},{0xa65c,0xa65c},{0xa65e,0xa65e},{0xa660,0xa660},{0xa662,0xa662}, + {0xa664,0xa664},{0xa666,0xa666},{0xa668,0xa668},{0xa66a,0xa66a},{0xa66c,0xa66c},{0xa680,0xa680}, + {0xa682,0xa682},{0xa684,0xa684},{0xa686,0xa686},{0xa688,0xa688},{0xa68a,0xa68a},{0xa68c,0xa68c}, + {0xa68e,0xa68e},{0xa690,0xa690},{0xa692,0xa692},{0xa694,0xa694},{0xa696,0xa696},{0xa698,0xa698}, + {0xa69a,0xa69a},{0xa722,0xa722},{0xa724,0xa724},{0xa726,0xa726},{0xa728,0xa728},{0xa72a,0xa72a}, + {0xa72c,0xa72c},{0xa72e,0xa72e},{0xa732,0xa732},{0xa734,0xa734},{0xa736,0xa736},{0xa738,0xa738}, + {0xa73a,0xa73a},{0xa73c,0xa73c},{0xa73e,0xa73e},{0xa740,0xa740},{0xa742,0xa742},{0xa744,0xa744}, + {0xa746,0xa746},{0xa748,0xa748},{0xa74a,0xa74a},{0xa74c,0xa74c},{0xa74e,0xa74e},{0xa750,0xa750}, + {0xa752,0xa752},{0xa754,0xa754},{0xa756,0xa756},{0xa758,0xa758},{0xa75a,0xa75a},{0xa75c,0xa75c}, + {0xa75e,0xa75e},{0xa760,0xa760},{0xa762,0xa762},{0xa764,0xa764},{0xa766,0xa766},{0xa768,0xa768}, + {0xa76a,0xa76a},{0xa76c,0xa76c},{0xa76e,0xa76e},{0xa779,0xa779},{0xa77b,0xa77b},{0xa77d,0xa77e}, + {0xa780,0xa780},{0xa782,0xa782},{0xa784,0xa784},{0xa786,0xa786},{0xa78b,0xa78b},{0xa78d,0xa78d}, + {0xa790,0xa790},{0xa792,0xa792},{0xa796,0xa796},{0xa798,0xa798},{0xa79a,0xa79a},{0xa79c,0xa79c}, + {0xa79e,0xa79e},{0xa7a0,0xa7a0},{0xa7a2,0xa7a2},{0xa7a4,0xa7a4},{0xa7a6,0xa7a6},{0xa7a8,0xa7a8}, + {0xa7aa,0xa7ae},{0xa7b0,0xa7b4},{0xa7b6,0xa7b6},{0xa7b8,0xa7b8},{0xa7ba,0xa7ba},{0xa7bc,0xa7bc}, + {0xa7be,0xa7be},{0xa7c0,0xa7c0},{0xa7c2,0xa7c2},{0xa7c4,0xa7c7},{0xa7c9,0xa7c9},{0xa7d0,0xa7d0}, + {0xa7d6,0xa7d6},{0xa7d8,0xa7d8},{0xa7f5,0xa7f5},{0xff21,0xff3a},{0x10400,0x10427},{0x104b0,0x104d3}, + {0x10570,0x1057a},{0x1057c,0x1058a},{0x1058c,0x10592},{0x10594,0x10595},{0x10c80,0x10cb2},{0x118a0,0x118bf}, + {0x16e40,0x16e5f},{0x1d400,0x1d419},{0x1d434,0x1d44d},{0x1d468,0x1d481},{0x1d49c,0x1d49c},{0x1d49e,0x1d49f}, + {0x1d4a2,0x1d4a2},{0x1d4a5,0x1d4a6},{0x1d4a9,0x1d4ac},{0x1d4ae,0x1d4b5},{0x1d4d0,0x1d4e9},{0x1d504,0x1d505}, + {0x1d507,0x1d50a},{0x1d50d,0x1d514},{0x1d516,0x1d51c},{0x1d538,0x1d539},{0x1d53b,0x1d53e},{0x1d540,0x1d544}, + {0x1d546,0x1d546},{0x1d54a,0x1d550},{0x1d56c,0x1d585},{0x1d5a0,0x1d5b9},{0x1d5d4,0x1d5ed},{0x1d608,0x1d621}, + {0x1d63c,0x1d655},{0x1d670,0x1d689},{0x1d6a8,0x1d6c0},{0x1d6e2,0x1d6fa},{0x1d71c,0x1d734},{0x1d756,0x1d76e}, + {0x1d790,0x1d7a8},{0x1d7ca,0x1d7ca},{0x1e900,0x1e921} + }; + + // Letter, lowercase (12) + static const std::vector> Ll = { + {0x61,0x7a},{0xb5,0xb5},{0xdf,0xf6},{0xf8,0xff},{0x101,0x101},{0x103,0x103},{0x105,0x105}, + {0x107,0x107},{0x109,0x109},{0x10b,0x10b},{0x10d,0x10d},{0x10f,0x10f},{0x111,0x111}, + {0x113,0x113},{0x115,0x115},{0x117,0x117},{0x119,0x119},{0x11b,0x11b},{0x11d,0x11d}, + {0x11f,0x11f},{0x121,0x121},{0x123,0x123},{0x125,0x125},{0x127,0x127},{0x129,0x129}, + {0x12b,0x12b},{0x12d,0x12d},{0x12f,0x12f},{0x131,0x131},{0x133,0x133},{0x135,0x135}, + {0x137,0x138},{0x13a,0x13a},{0x13c,0x13c},{0x13e,0x13e},{0x140,0x140},{0x142,0x142}, + {0x144,0x144},{0x146,0x146},{0x148,0x149},{0x14b,0x14b},{0x14d,0x14d},{0x14f,0x14f}, + {0x151,0x151},{0x153,0x153},{0x155,0x155},{0x157,0x157},{0x159,0x159},{0x15b,0x15b}, + {0x15d,0x15d},{0x15f,0x15f},{0x161,0x161},{0x163,0x163},{0x165,0x165},{0x167,0x167}, + {0x169,0x169},{0x16b,0x16b},{0x16d,0x16d},{0x16f,0x16f},{0x171,0x171},{0x173,0x173}, + {0x175,0x175},{0x177,0x177},{0x17a,0x17a},{0x17c,0x17c},{0x17e,0x180},{0x183,0x183}, + {0x185,0x185},{0x188,0x188},{0x18c,0x18d},{0x192,0x192},{0x195,0x195},{0x199,0x19b}, + {0x19e,0x19e},{0x1a1,0x1a1},{0x1a3,0x1a3},{0x1a5,0x1a5},{0x1a8,0x1a8},{0x1aa,0x1ab}, + {0x1ad,0x1ad},{0x1b0,0x1b0},{0x1b4,0x1b4},{0x1b6,0x1b6},{0x1b9,0x1ba},{0x1bd,0x1bf}, + {0x1c6,0x1c6},{0x1c9,0x1c9},{0x1cc,0x1cc},{0x1ce,0x1ce},{0x1d0,0x1d0},{0x1d2,0x1d2}, + {0x1d4,0x1d4},{0x1d6,0x1d6},{0x1d8,0x1d8},{0x1da,0x1da},{0x1dc,0x1dd},{0x1df,0x1df}, + {0x1e1,0x1e1},{0x1e3,0x1e3},{0x1e5,0x1e5},{0x1e7,0x1e7},{0x1e9,0x1e9},{0x1eb,0x1eb}, + {0x1ed,0x1ed},{0x1ef,0x1f0},{0x1f3,0x1f3},{0x1f5,0x1f5},{0x1f9,0x1f9},{0x1fb,0x1fb}, + {0x1fd,0x1fd},{0x1ff,0x1ff},{0x201,0x201},{0x203,0x203},{0x205,0x205},{0x207,0x207}, + {0x209,0x209},{0x20b,0x20b},{0x20d,0x20d},{0x20f,0x20f},{0x211,0x211},{0x213,0x213}, + {0x215,0x215},{0x217,0x217},{0x219,0x219},{0x21b,0x21b},{0x21d,0x21d},{0x21f,0x21f}, + {0x221,0x221},{0x223,0x223},{0x225,0x225},{0x227,0x227},{0x229,0x229},{0x22b,0x22b}, + {0x22d,0x22d},{0x22f,0x22f},{0x231,0x231},{0x233,0x239},{0x23c,0x23c},{0x23f,0x240}, + {0x242,0x242},{0x247,0x247},{0x249,0x249},{0x24b,0x24b},{0x24d,0x24d},{0x24f,0x293}, + {0x295,0x2af},{0x371,0x371},{0x373,0x373},{0x377,0x377},{0x37b,0x37d},{0x390,0x390}, + {0x3ac,0x3ce},{0x3d0,0x3d1},{0x3d5,0x3d7},{0x3d9,0x3d9},{0x3db,0x3db},{0x3dd,0x3dd}, + {0x3df,0x3df},{0x3e1,0x3e1},{0x3e3,0x3e3},{0x3e5,0x3e5},{0x3e7,0x3e7},{0x3e9,0x3e9}, + {0x3eb,0x3eb},{0x3ed,0x3ed},{0x3ef,0x3f3},{0x3f5,0x3f5},{0x3f8,0x3f8},{0x3fb,0x3fc}, + {0x430,0x45f},{0x461,0x461},{0x463,0x463},{0x465,0x465},{0x467,0x467},{0x469,0x469}, + {0x46b,0x46b},{0x46d,0x46d},{0x46f,0x46f},{0x471,0x471},{0x473,0x473},{0x475,0x475}, + {0x477,0x477},{0x479,0x479},{0x47b,0x47b},{0x47d,0x47d},{0x47f,0x47f},{0x481,0x481}, + {0x48b,0x48b},{0x48d,0x48d},{0x48f,0x48f},{0x491,0x491},{0x493,0x493},{0x495,0x495}, + {0x497,0x497},{0x499,0x499},{0x49b,0x49b},{0x49d,0x49d},{0x49f,0x49f},{0x4a1,0x4a1}, + {0x4a3,0x4a3},{0x4a5,0x4a5},{0x4a7,0x4a7},{0x4a9,0x4a9},{0x4ab,0x4ab},{0x4ad,0x4ad}, + {0x4af,0x4af},{0x4b1,0x4b1},{0x4b3,0x4b3},{0x4b5,0x4b5},{0x4b7,0x4b7},{0x4b9,0x4b9}, + {0x4bb,0x4bb},{0x4bd,0x4bd},{0x4bf,0x4bf},{0x4c2,0x4c2},{0x4c4,0x4c4},{0x4c6,0x4c6}, + {0x4c8,0x4c8},{0x4ca,0x4ca},{0x4cc,0x4cc},{0x4ce,0x4cf},{0x4d1,0x4d1},{0x4d3,0x4d3}, + {0x4d5,0x4d5},{0x4d7,0x4d7},{0x4d9,0x4d9},{0x4db,0x4db},{0x4dd,0x4dd},{0x4df,0x4df}, + {0x4e1,0x4e1},{0x4e3,0x4e3},{0x4e5,0x4e5},{0x4e7,0x4e7},{0x4e9,0x4e9},{0x4eb,0x4eb}, + {0x4ed,0x4ed},{0x4ef,0x4ef},{0x4f1,0x4f1},{0x4f3,0x4f3},{0x4f5,0x4f5},{0x4f7,0x4f7}, + {0x4f9,0x4f9},{0x4fb,0x4fb},{0x4fd,0x4fd},{0x4ff,0x4ff},{0x501,0x501},{0x503,0x503}, + {0x505,0x505},{0x507,0x507},{0x509,0x509},{0x50b,0x50b},{0x50d,0x50d},{0x50f,0x50f}, + {0x511,0x511},{0x513,0x513},{0x515,0x515},{0x517,0x517},{0x519,0x519},{0x51b,0x51b}, + {0x51d,0x51d},{0x51f,0x51f},{0x521,0x521},{0x523,0x523},{0x525,0x525},{0x527,0x527}, + {0x529,0x529},{0x52b,0x52b},{0x52d,0x52d},{0x52f,0x52f},{0x560,0x588},{0x10d0,0x10fa}, + {0x10fd,0x10ff},{0x13f8,0x13fd},{0x1c80,0x1c88},{0x1d00,0x1d2b},{0x1d6b,0x1d77},{0x1d79,0x1d9a}, + {0x1e01,0x1e01},{0x1e03,0x1e03},{0x1e05,0x1e05},{0x1e07,0x1e07},{0x1e09,0x1e09},{0x1e0b,0x1e0b}, + {0x1e0d,0x1e0d},{0x1e0f,0x1e0f},{0x1e11,0x1e11},{0x1e13,0x1e13},{0x1e15,0x1e15},{0x1e17,0x1e17}, + {0x1e19,0x1e19},{0x1e1b,0x1e1b},{0x1e1d,0x1e1d},{0x1e1f,0x1e1f},{0x1e21,0x1e21},{0x1e23,0x1e23}, + {0x1e25,0x1e25},{0x1e27,0x1e27},{0x1e29,0x1e29},{0x1e2b,0x1e2b},{0x1e2d,0x1e2d},{0x1e2f,0x1e2f}, + {0x1e31,0x1e31},{0x1e33,0x1e33},{0x1e35,0x1e35},{0x1e37,0x1e37},{0x1e39,0x1e39},{0x1e3b,0x1e3b}, + {0x1e3d,0x1e3d},{0x1e3f,0x1e3f},{0x1e41,0x1e41},{0x1e43,0x1e43},{0x1e45,0x1e45},{0x1e47,0x1e47}, + {0x1e49,0x1e49},{0x1e4b,0x1e4b},{0x1e4d,0x1e4d},{0x1e4f,0x1e4f},{0x1e51,0x1e51},{0x1e53,0x1e53}, + {0x1e55,0x1e55},{0x1e57,0x1e57},{0x1e59,0x1e59},{0x1e5b,0x1e5b},{0x1e5d,0x1e5d},{0x1e5f,0x1e5f}, + {0x1e61,0x1e61},{0x1e63,0x1e63},{0x1e65,0x1e65},{0x1e67,0x1e67},{0x1e69,0x1e69},{0x1e6b,0x1e6b}, + {0x1e6d,0x1e6d},{0x1e6f,0x1e6f},{0x1e71,0x1e71},{0x1e73,0x1e73},{0x1e75,0x1e75},{0x1e77,0x1e77}, + {0x1e79,0x1e79},{0x1e7b,0x1e7b},{0x1e7d,0x1e7d},{0x1e7f,0x1e7f},{0x1e81,0x1e81},{0x1e83,0x1e83}, + {0x1e85,0x1e85},{0x1e87,0x1e87},{0x1e89,0x1e89},{0x1e8b,0x1e8b},{0x1e8d,0x1e8d},{0x1e8f,0x1e8f}, + {0x1e91,0x1e91},{0x1e93,0x1e93},{0x1e95,0x1e9d},{0x1e9f,0x1e9f},{0x1ea1,0x1ea1},{0x1ea3,0x1ea3}, + {0x1ea5,0x1ea5},{0x1ea7,0x1ea7},{0x1ea9,0x1ea9},{0x1eab,0x1eab},{0x1ead,0x1ead},{0x1eaf,0x1eaf}, + {0x1eb1,0x1eb1},{0x1eb3,0x1eb3},{0x1eb5,0x1eb5},{0x1eb7,0x1eb7},{0x1eb9,0x1eb9},{0x1ebb,0x1ebb}, + {0x1ebd,0x1ebd},{0x1ebf,0x1ebf},{0x1ec1,0x1ec1},{0x1ec3,0x1ec3},{0x1ec5,0x1ec5},{0x1ec7,0x1ec7}, + {0x1ec9,0x1ec9},{0x1ecb,0x1ecb},{0x1ecd,0x1ecd},{0x1ecf,0x1ecf},{0x1ed1,0x1ed1},{0x1ed3,0x1ed3}, + {0x1ed5,0x1ed5},{0x1ed7,0x1ed7},{0x1ed9,0x1ed9},{0x1edb,0x1edb},{0x1edd,0x1edd},{0x1edf,0x1edf}, + {0x1ee1,0x1ee1},{0x1ee3,0x1ee3},{0x1ee5,0x1ee5},{0x1ee7,0x1ee7},{0x1ee9,0x1ee9},{0x1eeb,0x1eeb}, + {0x1eed,0x1eed},{0x1eef,0x1eef},{0x1ef1,0x1ef1},{0x1ef3,0x1ef3},{0x1ef5,0x1ef5},{0x1ef7,0x1ef7}, + {0x1ef9,0x1ef9},{0x1efb,0x1efb},{0x1efd,0x1efd},{0x1eff,0x1f07},{0x1f10,0x1f15},{0x1f20,0x1f27}, + {0x1f30,0x1f37},{0x1f40,0x1f45},{0x1f50,0x1f57},{0x1f60,0x1f67},{0x1f70,0x1f7d},{0x1f80,0x1f87}, + {0x1f90,0x1f97},{0x1fa0,0x1fa7},{0x1fb0,0x1fb4},{0x1fb6,0x1fb7},{0x1fbe,0x1fbe},{0x1fc2,0x1fc4}, + {0x1fc6,0x1fc7},{0x1fd0,0x1fd3},{0x1fd6,0x1fd7},{0x1fe0,0x1fe7},{0x1ff2,0x1ff4},{0x1ff6,0x1ff7}, + {0x210a,0x210a},{0x210e,0x210f},{0x2113,0x2113},{0x212f,0x212f},{0x2134,0x2134},{0x2139,0x2139}, + {0x213c,0x213d},{0x2146,0x2149},{0x214e,0x214e},{0x2184,0x2184},{0x2c30,0x2c5f},{0x2c61,0x2c61}, + {0x2c65,0x2c66},{0x2c68,0x2c68},{0x2c6a,0x2c6a},{0x2c6c,0x2c6c},{0x2c71,0x2c71},{0x2c73,0x2c74}, + {0x2c76,0x2c7b},{0x2c81,0x2c81},{0x2c83,0x2c83},{0x2c85,0x2c85},{0x2c87,0x2c87},{0x2c89,0x2c89}, + {0x2c8b,0x2c8b},{0x2c8d,0x2c8d},{0x2c8f,0x2c8f},{0x2c91,0x2c91},{0x2c93,0x2c93},{0x2c95,0x2c95}, + {0x2c97,0x2c97},{0x2c99,0x2c99},{0x2c9b,0x2c9b},{0x2c9d,0x2c9d},{0x2c9f,0x2c9f},{0x2ca1,0x2ca1}, + {0x2ca3,0x2ca3},{0x2ca5,0x2ca5},{0x2ca7,0x2ca7},{0x2ca9,0x2ca9},{0x2cab,0x2cab},{0x2cad,0x2cad}, + {0x2caf,0x2caf},{0x2cb1,0x2cb1},{0x2cb3,0x2cb3},{0x2cb5,0x2cb5},{0x2cb7,0x2cb7},{0x2cb9,0x2cb9}, + {0x2cbb,0x2cbb},{0x2cbd,0x2cbd},{0x2cbf,0x2cbf},{0x2cc1,0x2cc1},{0x2cc3,0x2cc3},{0x2cc5,0x2cc5}, + {0x2cc7,0x2cc7},{0x2cc9,0x2cc9},{0x2ccb,0x2ccb},{0x2ccd,0x2ccd},{0x2ccf,0x2ccf},{0x2cd1,0x2cd1}, + {0x2cd3,0x2cd3},{0x2cd5,0x2cd5},{0x2cd7,0x2cd7},{0x2cd9,0x2cd9},{0x2cdb,0x2cdb},{0x2cdd,0x2cdd}, + {0x2cdf,0x2cdf},{0x2ce1,0x2ce1},{0x2ce3,0x2ce4},{0x2cec,0x2cec},{0x2cee,0x2cee},{0x2cf3,0x2cf3}, + {0x2d00,0x2d25},{0x2d27,0x2d27},{0x2d2d,0x2d2d},{0xa641,0xa641},{0xa643,0xa643},{0xa645,0xa645}, + {0xa647,0xa647},{0xa649,0xa649},{0xa64b,0xa64b},{0xa64d,0xa64d},{0xa64f,0xa64f},{0xa651,0xa651}, + {0xa653,0xa653},{0xa655,0xa655},{0xa657,0xa657},{0xa659,0xa659},{0xa65b,0xa65b},{0xa65d,0xa65d}, + {0xa65f,0xa65f},{0xa661,0xa661},{0xa663,0xa663},{0xa665,0xa665},{0xa667,0xa667},{0xa669,0xa669}, + {0xa66b,0xa66b},{0xa66d,0xa66d},{0xa681,0xa681},{0xa683,0xa683},{0xa685,0xa685},{0xa687,0xa687}, + {0xa689,0xa689},{0xa68b,0xa68b},{0xa68d,0xa68d},{0xa68f,0xa68f},{0xa691,0xa691},{0xa693,0xa693}, + {0xa695,0xa695},{0xa697,0xa697},{0xa699,0xa699},{0xa69b,0xa69b},{0xa723,0xa723},{0xa725,0xa725}, + {0xa727,0xa727},{0xa729,0xa729},{0xa72b,0xa72b},{0xa72d,0xa72d},{0xa72f,0xa731},{0xa733,0xa733}, + {0xa735,0xa735},{0xa737,0xa737},{0xa739,0xa739},{0xa73b,0xa73b},{0xa73d,0xa73d},{0xa73f,0xa73f}, + {0xa741,0xa741},{0xa743,0xa743},{0xa745,0xa745},{0xa747,0xa747},{0xa749,0xa749},{0xa74b,0xa74b}, + {0xa74d,0xa74d},{0xa74f,0xa74f},{0xa751,0xa751},{0xa753,0xa753},{0xa755,0xa755},{0xa757,0xa757}, + {0xa759,0xa759},{0xa75b,0xa75b},{0xa75d,0xa75d},{0xa75f,0xa75f},{0xa761,0xa761},{0xa763,0xa763}, + {0xa765,0xa765},{0xa767,0xa767},{0xa769,0xa769},{0xa76b,0xa76b},{0xa76d,0xa76d},{0xa76f,0xa76f}, + {0xa771,0xa778},{0xa77a,0xa77a},{0xa77c,0xa77c},{0xa77f,0xa77f},{0xa781,0xa781},{0xa783,0xa783}, + {0xa785,0xa785},{0xa787,0xa787},{0xa78c,0xa78c},{0xa78e,0xa78e},{0xa791,0xa791},{0xa793,0xa795}, + {0xa797,0xa797},{0xa799,0xa799},{0xa79b,0xa79b},{0xa79d,0xa79d},{0xa79f,0xa79f},{0xa7a1,0xa7a1}, + {0xa7a3,0xa7a3},{0xa7a5,0xa7a5},{0xa7a7,0xa7a7},{0xa7a9,0xa7a9},{0xa7af,0xa7af},{0xa7b5,0xa7b5}, + {0xa7b7,0xa7b7},{0xa7b9,0xa7b9},{0xa7bb,0xa7bb},{0xa7bd,0xa7bd},{0xa7bf,0xa7bf},{0xa7c1,0xa7c1}, + {0xa7c3,0xa7c3},{0xa7c8,0xa7c8},{0xa7ca,0xa7ca},{0xa7d1,0xa7d1},{0xa7d3,0xa7d3},{0xa7d5,0xa7d5}, + {0xa7d7,0xa7d7},{0xa7d9,0xa7d9},{0xa7f6,0xa7f6},{0xa7fa,0xa7fa},{0xab30,0xab5a},{0xab60,0xab68}, + {0xab70,0xabbf},{0xfb00,0xfb06},{0xfb13,0xfb17},{0xff41,0xff5a},{0x10428,0x1044f},{0x104d8,0x104fb}, + {0x10597,0x105a1},{0x105a3,0x105b1},{0x105b3,0x105b9},{0x105bb,0x105bc},{0x10cc0,0x10cf2},{0x118c0,0x118df}, + {0x16e60,0x16e7f},{0x1d41a,0x1d433},{0x1d44e,0x1d454},{0x1d456,0x1d467},{0x1d482,0x1d49b},{0x1d4b6,0x1d4b9}, + {0x1d4bb,0x1d4bb},{0x1d4bd,0x1d4c3},{0x1d4c5,0x1d4cf},{0x1d4ea,0x1d503},{0x1d51e,0x1d537},{0x1d552,0x1d56b}, + {0x1d586,0x1d59f},{0x1d5ba,0x1d5d3},{0x1d5ee,0x1d607},{0x1d622,0x1d63b},{0x1d656,0x1d66f},{0x1d68a,0x1d6a5}, + {0x1d6c2,0x1d6da},{0x1d6dc,0x1d6e1},{0x1d6fc,0x1d714},{0x1d716,0x1d71b},{0x1d736,0x1d74e},{0x1d750,0x1d755}, + {0x1d770,0x1d788},{0x1d78a,0x1d78f},{0x1d7aa,0x1d7c2},{0x1d7c4,0x1d7c9},{0x1d7cb,0x1d7cb},{0x1df00,0x1df09}, + {0x1df0b,0x1df1e},{0x1e922,0x1e943} + }; + + // Letter, titlecase (13) + static const std::vector> Lt = { + {0x1c5,0x1c5},{0x1c8,0x1c8},{0x1cb,0x1cb},{0x1f2,0x1f2},{0x1f88,0x1f8f},{0x1f98,0x1f9f},{0x1fa8,0x1faf}, + {0x1fbc,0x1fbc},{0x1fcc,0x1fcc},{0x1ffc,0x1ffc} + }; + + // Letter, modifier (14) + static const std::vector> Lm = { + {0x2b0,0x2c1},{0x2c6,0x2d1},{0x2e0,0x2e4},{0x2ec,0x2ec},{0x2ee,0x2ee},{0x374,0x374},{0x37a,0x37a}, + {0x559,0x559},{0x640,0x640},{0x6e5,0x6e6},{0x7f4,0x7f5},{0x7fa,0x7fa},{0x81a,0x81a}, + {0x824,0x824},{0x828,0x828},{0x8c9,0x8c9},{0x971,0x971},{0xe46,0xe46},{0xec6,0xec6}, + {0x10fc,0x10fc},{0x17d7,0x17d7},{0x1843,0x1843},{0x1aa7,0x1aa7},{0x1c78,0x1c7d},{0x1d2c,0x1d6a}, + {0x1d78,0x1d78},{0x1d9b,0x1dbf},{0x2071,0x2071},{0x207f,0x207f},{0x2090,0x209c},{0x2c7c,0x2c7d}, + {0x2d6f,0x2d6f},{0x2e2f,0x2e2f},{0x3005,0x3005},{0x3031,0x3035},{0x303b,0x303b},{0x309d,0x309e}, + {0x30fc,0x30fe},{0xa015,0xa015},{0xa4f8,0xa4fd},{0xa60c,0xa60c},{0xa67f,0xa67f},{0xa69c,0xa69d}, + {0xa717,0xa71f},{0xa770,0xa770},{0xa788,0xa788},{0xa7f2,0xa7f4},{0xa7f8,0xa7f9},{0xa9cf,0xa9cf}, + {0xa9e6,0xa9e6},{0xaa70,0xaa70},{0xaadd,0xaadd},{0xaaf3,0xaaf4},{0xab5c,0xab5f},{0xab69,0xab69}, + {0xff70,0xff70},{0xff9e,0xff9f},{0x10780,0x10785},{0x10787,0x107b0},{0x107b2,0x107ba},{0x16b40,0x16b43}, + {0x16f93,0x16f9f},{0x16fe0,0x16fe1},{0x16fe3,0x16fe3},{0x1aff0,0x1aff3},{0x1aff5,0x1affb},{0x1affd,0x1affe}, + {0x1e137,0x1e13d},{0x1e94b,0x1e94b} + }; + + // Letter, other (15) + static const std::vector> Lo = { + {0xaa,0xaa},{0xba,0xba},{0x1bb,0x1bb},{0x1c0,0x1c3},{0x294,0x294},{0x5d0,0x5ea},{0x5ef,0x5f2}, + {0x620,0x63f},{0x641,0x64a},{0x66e,0x66f},{0x671,0x6d3},{0x6d5,0x6d5},{0x6ee,0x6ef}, + {0x6fa,0x6fc},{0x6ff,0x6ff},{0x710,0x710},{0x712,0x72f},{0x74d,0x7a5},{0x7b1,0x7b1}, + {0x7ca,0x7ea},{0x800,0x815},{0x840,0x858},{0x860,0x86a},{0x870,0x887},{0x889,0x88e}, + {0x8a0,0x8c8},{0x904,0x939},{0x93d,0x93d},{0x950,0x950},{0x958,0x961},{0x972,0x980}, + {0x985,0x98c},{0x98f,0x990},{0x993,0x9a8},{0x9aa,0x9b0},{0x9b2,0x9b2},{0x9b6,0x9b9}, + {0x9bd,0x9bd},{0x9ce,0x9ce},{0x9dc,0x9dd},{0x9df,0x9e1},{0x9f0,0x9f1},{0x9fc,0x9fc}, + {0xa05,0xa0a},{0xa0f,0xa10},{0xa13,0xa28},{0xa2a,0xa30},{0xa32,0xa33},{0xa35,0xa36}, + {0xa38,0xa39},{0xa59,0xa5c},{0xa5e,0xa5e},{0xa72,0xa74},{0xa85,0xa8d},{0xa8f,0xa91}, + {0xa93,0xaa8},{0xaaa,0xab0},{0xab2,0xab3},{0xab5,0xab9},{0xabd,0xabd},{0xad0,0xad0}, + {0xae0,0xae1},{0xaf9,0xaf9},{0xb05,0xb0c},{0xb0f,0xb10},{0xb13,0xb28},{0xb2a,0xb30}, + {0xb32,0xb33},{0xb35,0xb39},{0xb3d,0xb3d},{0xb5c,0xb5d},{0xb5f,0xb61},{0xb71,0xb71}, + {0xb83,0xb83},{0xb85,0xb8a},{0xb8e,0xb90},{0xb92,0xb95},{0xb99,0xb9a},{0xb9c,0xb9c}, + {0xb9e,0xb9f},{0xba3,0xba4},{0xba8,0xbaa},{0xbae,0xbb9},{0xbd0,0xbd0},{0xc05,0xc0c}, + {0xc0e,0xc10},{0xc12,0xc28},{0xc2a,0xc39},{0xc3d,0xc3d},{0xc58,0xc5a},{0xc5d,0xc5d}, + {0xc60,0xc61},{0xc80,0xc80},{0xc85,0xc8c},{0xc8e,0xc90},{0xc92,0xca8},{0xcaa,0xcb3}, + {0xcb5,0xcb9},{0xcbd,0xcbd},{0xcdd,0xcde},{0xce0,0xce1},{0xcf1,0xcf2},{0xd04,0xd0c}, + {0xd0e,0xd10},{0xd12,0xd3a},{0xd3d,0xd3d},{0xd4e,0xd4e},{0xd54,0xd56},{0xd5f,0xd61}, + {0xd7a,0xd7f},{0xd85,0xd96},{0xd9a,0xdb1},{0xdb3,0xdbb},{0xdbd,0xdbd},{0xdc0,0xdc6}, + {0xe01,0xe30},{0xe32,0xe33},{0xe40,0xe45},{0xe81,0xe82},{0xe84,0xe84},{0xe86,0xe8a}, + {0xe8c,0xea3},{0xea5,0xea5},{0xea7,0xeb0},{0xeb2,0xeb3},{0xebd,0xebd},{0xec0,0xec4}, + {0xedc,0xedf},{0xf00,0xf00},{0xf40,0xf47},{0xf49,0xf6c},{0xf88,0xf8c},{0x1000,0x102a}, + {0x103f,0x103f},{0x1050,0x1055},{0x105a,0x105d},{0x1061,0x1061},{0x1065,0x1066},{0x106e,0x1070}, + {0x1075,0x1081},{0x108e,0x108e},{0x1100,0x1248},{0x124a,0x124d},{0x1250,0x1256},{0x1258,0x1258}, + {0x125a,0x125d},{0x1260,0x1288},{0x128a,0x128d},{0x1290,0x12b0},{0x12b2,0x12b5},{0x12b8,0x12be}, + {0x12c0,0x12c0},{0x12c2,0x12c5},{0x12c8,0x12d6},{0x12d8,0x1310},{0x1312,0x1315},{0x1318,0x135a}, + {0x1380,0x138f},{0x1401,0x166c},{0x166f,0x167f},{0x1681,0x169a},{0x16a0,0x16ea},{0x16f1,0x16f8}, + {0x1700,0x1711},{0x171f,0x1731},{0x1740,0x1751},{0x1760,0x176c},{0x176e,0x1770},{0x1780,0x17b3}, + {0x17dc,0x17dc},{0x1820,0x1842},{0x1844,0x1878},{0x1880,0x1884},{0x1887,0x18a8},{0x18aa,0x18aa}, + {0x18b0,0x18f5},{0x1900,0x191e},{0x1950,0x196d},{0x1970,0x1974},{0x1980,0x19ab},{0x19b0,0x19c9}, + {0x1a00,0x1a16},{0x1a20,0x1a54},{0x1b05,0x1b33},{0x1b45,0x1b4c},{0x1b83,0x1ba0},{0x1bae,0x1baf}, + {0x1bba,0x1be5},{0x1c00,0x1c23},{0x1c4d,0x1c4f},{0x1c5a,0x1c77},{0x1ce9,0x1cec},{0x1cee,0x1cf3}, + {0x1cf5,0x1cf6},{0x1cfa,0x1cfa},{0x2135,0x2138},{0x2d30,0x2d67},{0x2d80,0x2d96},{0x2da0,0x2da6}, + {0x2da8,0x2dae},{0x2db0,0x2db6},{0x2db8,0x2dbe},{0x2dc0,0x2dc6},{0x2dc8,0x2dce},{0x2dd0,0x2dd6}, + {0x2dd8,0x2dde},{0x3006,0x3006},{0x303c,0x303c},{0x3041,0x3096},{0x309f,0x309f},{0x30a1,0x30fa}, + {0x30ff,0x30ff},{0x3105,0x312f},{0x3131,0x318e},{0x31a0,0x31bf},{0x31f0,0x31ff},{0x3400,0x4dbf}, + {0x4e00,0xa014},{0xa016,0xa48c},{0xa4d0,0xa4f7},{0xa500,0xa60b},{0xa610,0xa61f},{0xa62a,0xa62b}, + {0xa66e,0xa66e},{0xa6a0,0xa6e5},{0xa78f,0xa78f},{0xa7f7,0xa7f7},{0xa7fb,0xa801},{0xa803,0xa805}, + {0xa807,0xa80a},{0xa80c,0xa822},{0xa840,0xa873},{0xa882,0xa8b3},{0xa8f2,0xa8f7},{0xa8fb,0xa8fb}, + {0xa8fd,0xa8fe},{0xa90a,0xa925},{0xa930,0xa946},{0xa960,0xa97c},{0xa984,0xa9b2},{0xa9e0,0xa9e4}, + {0xa9e7,0xa9ef},{0xa9fa,0xa9fe},{0xaa00,0xaa28},{0xaa40,0xaa42},{0xaa44,0xaa4b},{0xaa60,0xaa6f}, + {0xaa71,0xaa76},{0xaa7a,0xaa7a},{0xaa7e,0xaaaf},{0xaab1,0xaab1},{0xaab5,0xaab6},{0xaab9,0xaabd}, + {0xaac0,0xaac0},{0xaac2,0xaac2},{0xaadb,0xaadc},{0xaae0,0xaaea},{0xaaf2,0xaaf2},{0xab01,0xab06}, + {0xab09,0xab0e},{0xab11,0xab16},{0xab20,0xab26},{0xab28,0xab2e},{0xabc0,0xabe2},{0xac00,0xd7a3}, + {0xd7b0,0xd7c6},{0xd7cb,0xd7fb},{0xf900,0xfa6d},{0xfa70,0xfad9},{0xfb1d,0xfb1d},{0xfb1f,0xfb28}, + {0xfb2a,0xfb36},{0xfb38,0xfb3c},{0xfb3e,0xfb3e},{0xfb40,0xfb41},{0xfb43,0xfb44},{0xfb46,0xfbb1}, + {0xfbd3,0xfd3d},{0xfd50,0xfd8f},{0xfd92,0xfdc7},{0xfdf0,0xfdfb},{0xfe70,0xfe74},{0xfe76,0xfefc}, + {0xff66,0xff6f},{0xff71,0xff9d},{0xffa0,0xffbe},{0xffc2,0xffc7},{0xffca,0xffcf},{0xffd2,0xffd7}, + {0xffda,0xffdc},{0x10000,0x1000b},{0x1000d,0x10026},{0x10028,0x1003a},{0x1003c,0x1003d},{0x1003f,0x1004d}, + {0x10050,0x1005d},{0x10080,0x100fa},{0x10280,0x1029c},{0x102a0,0x102d0},{0x10300,0x1031f},{0x1032d,0x10340}, + {0x10342,0x10349},{0x10350,0x10375},{0x10380,0x1039d},{0x103a0,0x103c3},{0x103c8,0x103cf},{0x10450,0x1049d}, + {0x10500,0x10527},{0x10530,0x10563},{0x10600,0x10736},{0x10740,0x10755},{0x10760,0x10767},{0x10800,0x10805}, + {0x10808,0x10808},{0x1080a,0x10835},{0x10837,0x10838},{0x1083c,0x1083c},{0x1083f,0x10855},{0x10860,0x10876}, + {0x10880,0x1089e},{0x108e0,0x108f2},{0x108f4,0x108f5},{0x10900,0x10915},{0x10920,0x10939},{0x10980,0x109b7}, + {0x109be,0x109bf},{0x10a00,0x10a00},{0x10a10,0x10a13},{0x10a15,0x10a17},{0x10a19,0x10a35},{0x10a60,0x10a7c}, + {0x10a80,0x10a9c},{0x10ac0,0x10ac7},{0x10ac9,0x10ae4},{0x10b00,0x10b35},{0x10b40,0x10b55},{0x10b60,0x10b72}, + {0x10b80,0x10b91},{0x10c00,0x10c48},{0x10d00,0x10d23},{0x10e80,0x10ea9},{0x10eb0,0x10eb1},{0x10f00,0x10f1c}, + {0x10f27,0x10f27},{0x10f30,0x10f45},{0x10f70,0x10f81},{0x10fb0,0x10fc4},{0x10fe0,0x10ff6},{0x11003,0x11037}, + {0x11071,0x11072},{0x11075,0x11075},{0x11083,0x110af},{0x110d0,0x110e8},{0x11103,0x11126},{0x11144,0x11144}, + {0x11147,0x11147},{0x11150,0x11172},{0x11176,0x11176},{0x11183,0x111b2},{0x111c1,0x111c4},{0x111da,0x111da}, + {0x111dc,0x111dc},{0x11200,0x11211},{0x11213,0x1122b},{0x11280,0x11286},{0x11288,0x11288},{0x1128a,0x1128d}, + {0x1128f,0x1129d},{0x1129f,0x112a8},{0x112b0,0x112de},{0x11305,0x1130c},{0x1130f,0x11310},{0x11313,0x11328}, + {0x1132a,0x11330},{0x11332,0x11333},{0x11335,0x11339},{0x1133d,0x1133d},{0x11350,0x11350},{0x1135d,0x11361}, + {0x11400,0x11434},{0x11447,0x1144a},{0x1145f,0x11461},{0x11480,0x114af},{0x114c4,0x114c5},{0x114c7,0x114c7}, + {0x11580,0x115ae},{0x115d8,0x115db},{0x11600,0x1162f},{0x11644,0x11644},{0x11680,0x116aa},{0x116b8,0x116b8}, + {0x11700,0x1171a},{0x11740,0x11746},{0x11800,0x1182b},{0x118ff,0x11906},{0x11909,0x11909},{0x1190c,0x11913}, + {0x11915,0x11916},{0x11918,0x1192f},{0x1193f,0x1193f},{0x11941,0x11941},{0x119a0,0x119a7},{0x119aa,0x119d0}, + {0x119e1,0x119e1},{0x119e3,0x119e3},{0x11a00,0x11a00},{0x11a0b,0x11a32},{0x11a3a,0x11a3a},{0x11a50,0x11a50}, + {0x11a5c,0x11a89},{0x11a9d,0x11a9d},{0x11ab0,0x11af8},{0x11c00,0x11c08},{0x11c0a,0x11c2e},{0x11c40,0x11c40}, + {0x11c72,0x11c8f},{0x11d00,0x11d06},{0x11d08,0x11d09},{0x11d0b,0x11d30},{0x11d46,0x11d46},{0x11d60,0x11d65}, + {0x11d67,0x11d68},{0x11d6a,0x11d89},{0x11d98,0x11d98},{0x11ee0,0x11ef2},{0x11fb0,0x11fb0},{0x12000,0x12399}, + {0x12480,0x12543},{0x12f90,0x12ff0},{0x13000,0x1342e},{0x14400,0x14646},{0x16800,0x16a38},{0x16a40,0x16a5e}, + {0x16a70,0x16abe},{0x16ad0,0x16aed},{0x16b00,0x16b2f},{0x16b63,0x16b77},{0x16b7d,0x16b8f},{0x16f00,0x16f4a}, + {0x16f50,0x16f50},{0x17000,0x187f7},{0x18800,0x18cd5},{0x18d00,0x18d08},{0x1b000,0x1b122},{0x1b150,0x1b152}, + {0x1b164,0x1b167},{0x1b170,0x1b2fb},{0x1bc00,0x1bc6a},{0x1bc70,0x1bc7c},{0x1bc80,0x1bc88},{0x1bc90,0x1bc99}, + {0x1df0a,0x1df0a},{0x1e100,0x1e12c},{0x1e14e,0x1e14e},{0x1e290,0x1e2ad},{0x1e2c0,0x1e2eb},{0x1e7e0,0x1e7e6}, + {0x1e7e8,0x1e7eb},{0x1e7ed,0x1e7ee},{0x1e7f0,0x1e7fe},{0x1e800,0x1e8c4},{0x1ee00,0x1ee03},{0x1ee05,0x1ee1f}, + {0x1ee21,0x1ee22},{0x1ee24,0x1ee24},{0x1ee27,0x1ee27},{0x1ee29,0x1ee32},{0x1ee34,0x1ee37},{0x1ee39,0x1ee39}, + {0x1ee3b,0x1ee3b},{0x1ee42,0x1ee42},{0x1ee47,0x1ee47},{0x1ee49,0x1ee49},{0x1ee4b,0x1ee4b},{0x1ee4d,0x1ee4f}, + {0x1ee51,0x1ee52},{0x1ee54,0x1ee54},{0x1ee57,0x1ee57},{0x1ee59,0x1ee59},{0x1ee5b,0x1ee5b},{0x1ee5d,0x1ee5d}, + {0x1ee5f,0x1ee5f},{0x1ee61,0x1ee62},{0x1ee64,0x1ee64},{0x1ee67,0x1ee6a},{0x1ee6c,0x1ee72},{0x1ee74,0x1ee77}, + {0x1ee79,0x1ee7c},{0x1ee7e,0x1ee7e},{0x1ee80,0x1ee89},{0x1ee8b,0x1ee9b},{0x1eea1,0x1eea3},{0x1eea5,0x1eea9}, + {0x1eeab,0x1eebb},{0x20000,0x2a6df},{0x2a700,0x2b738},{0x2b740,0x2b81d},{0x2b820,0x2cea1},{0x2ceb0,0x2ebe0}, + {0x2f800,0x2fa1d},{0x30000,0x3134a} + }; +} + + namespace Mark { + // Mark, nonspacing (21) + static const std::vector> Mn = { + {0x300,0x36f},{0x483,0x487},{0x591,0x5bd},{0x5bf,0x5bf},{0x5c1,0x5c2},{0x5c4,0x5c5},{0x5c7,0x5c7}, + {0x610,0x61a},{0x64b,0x65f},{0x670,0x670},{0x6d6,0x6dc},{0x6df,0x6e4},{0x6e7,0x6e8}, + {0x6ea,0x6ed},{0x711,0x711},{0x730,0x74a},{0x7a6,0x7b0},{0x7eb,0x7f3},{0x7fd,0x7fd}, + {0x816,0x819},{0x81b,0x823},{0x825,0x827},{0x829,0x82d},{0x859,0x85b},{0x898,0x89f}, + {0x8ca,0x8e1},{0x8e3,0x902},{0x93a,0x93a},{0x93c,0x93c},{0x941,0x948},{0x94d,0x94d}, + {0x951,0x957},{0x962,0x963},{0x981,0x981},{0x9bc,0x9bc},{0x9c1,0x9c4},{0x9cd,0x9cd}, + {0x9e2,0x9e3},{0x9fe,0x9fe},{0xa01,0xa02},{0xa3c,0xa3c},{0xa41,0xa42},{0xa47,0xa48}, + {0xa4b,0xa4d},{0xa51,0xa51},{0xa70,0xa71},{0xa75,0xa75},{0xa81,0xa82},{0xabc,0xabc}, + {0xac1,0xac5},{0xac7,0xac8},{0xacd,0xacd},{0xae2,0xae3},{0xafa,0xaff},{0xb01,0xb01}, + {0xb3c,0xb3c},{0xb3f,0xb3f},{0xb41,0xb44},{0xb4d,0xb4d},{0xb55,0xb56},{0xb62,0xb63}, + {0xb82,0xb82},{0xbc0,0xbc0},{0xbcd,0xbcd},{0xc00,0xc00},{0xc04,0xc04},{0xc3c,0xc3c}, + {0xc3e,0xc40},{0xc46,0xc48},{0xc4a,0xc4d},{0xc55,0xc56},{0xc62,0xc63},{0xc81,0xc81}, + {0xcbc,0xcbc},{0xcbf,0xcbf},{0xcc6,0xcc6},{0xccc,0xccd},{0xce2,0xce3},{0xd00,0xd01}, + {0xd3b,0xd3c},{0xd41,0xd44},{0xd4d,0xd4d},{0xd62,0xd63},{0xd81,0xd81},{0xdca,0xdca}, + {0xdd2,0xdd4},{0xdd6,0xdd6},{0xe31,0xe31},{0xe34,0xe3a},{0xe47,0xe4e},{0xeb1,0xeb1}, + {0xeb4,0xebc},{0xec8,0xecd},{0xf18,0xf19},{0xf35,0xf35},{0xf37,0xf37},{0xf39,0xf39}, + {0xf71,0xf7e},{0xf80,0xf84},{0xf86,0xf87},{0xf8d,0xf97},{0xf99,0xfbc},{0xfc6,0xfc6}, + {0x102d,0x1030},{0x1032,0x1037},{0x1039,0x103a},{0x103d,0x103e},{0x1058,0x1059},{0x105e,0x1060}, + {0x1071,0x1074},{0x1082,0x1082},{0x1085,0x1086},{0x108d,0x108d},{0x109d,0x109d},{0x135d,0x135f}, + {0x1712,0x1714},{0x1732,0x1733},{0x1752,0x1753},{0x1772,0x1773},{0x17b4,0x17b5},{0x17b7,0x17bd}, + {0x17c6,0x17c6},{0x17c9,0x17d3},{0x17dd,0x17dd},{0x180b,0x180d},{0x180f,0x180f},{0x1885,0x1886}, + {0x18a9,0x18a9},{0x1920,0x1922},{0x1927,0x1928},{0x1932,0x1932},{0x1939,0x193b},{0x1a17,0x1a18}, + {0x1a1b,0x1a1b},{0x1a56,0x1a56},{0x1a58,0x1a5e},{0x1a60,0x1a60},{0x1a62,0x1a62},{0x1a65,0x1a6c}, + {0x1a73,0x1a7c},{0x1a7f,0x1a7f},{0x1ab0,0x1abd},{0x1abf,0x1ace},{0x1b00,0x1b03},{0x1b34,0x1b34}, + {0x1b36,0x1b3a},{0x1b3c,0x1b3c},{0x1b42,0x1b42},{0x1b6b,0x1b73},{0x1b80,0x1b81},{0x1ba2,0x1ba5}, + {0x1ba8,0x1ba9},{0x1bab,0x1bad},{0x1be6,0x1be6},{0x1be8,0x1be9},{0x1bed,0x1bed},{0x1bef,0x1bf1}, + {0x1c2c,0x1c33},{0x1c36,0x1c37},{0x1cd0,0x1cd2},{0x1cd4,0x1ce0},{0x1ce2,0x1ce8},{0x1ced,0x1ced}, + {0x1cf4,0x1cf4},{0x1cf8,0x1cf9},{0x1dc0,0x1dff},{0x20d0,0x20dc},{0x20e1,0x20e1},{0x20e5,0x20f0}, + {0x2cef,0x2cf1},{0x2d7f,0x2d7f},{0x2de0,0x2dff},{0x302a,0x302d},{0x3099,0x309a},{0xa66f,0xa66f}, + {0xa674,0xa67d},{0xa69e,0xa69f},{0xa6f0,0xa6f1},{0xa802,0xa802},{0xa806,0xa806},{0xa80b,0xa80b}, + {0xa825,0xa826},{0xa82c,0xa82c},{0xa8c4,0xa8c5},{0xa8e0,0xa8f1},{0xa8ff,0xa8ff},{0xa926,0xa92d}, + {0xa947,0xa951},{0xa980,0xa982},{0xa9b3,0xa9b3},{0xa9b6,0xa9b9},{0xa9bc,0xa9bd},{0xa9e5,0xa9e5}, + {0xaa29,0xaa2e},{0xaa31,0xaa32},{0xaa35,0xaa36},{0xaa43,0xaa43},{0xaa4c,0xaa4c},{0xaa7c,0xaa7c}, + {0xaab0,0xaab0},{0xaab2,0xaab4},{0xaab7,0xaab8},{0xaabe,0xaabf},{0xaac1,0xaac1},{0xaaec,0xaaed}, + {0xaaf6,0xaaf6},{0xabe5,0xabe5},{0xabe8,0xabe8},{0xabed,0xabed},{0xfb1e,0xfb1e},{0xfe00,0xfe0f}, + {0xfe20,0xfe2f},{0x101fd,0x101fd},{0x102e0,0x102e0},{0x10376,0x1037a},{0x10a01,0x10a03},{0x10a05,0x10a06}, + {0x10a0c,0x10a0f},{0x10a38,0x10a3a},{0x10a3f,0x10a3f},{0x10ae5,0x10ae6},{0x10d24,0x10d27},{0x10eab,0x10eac}, + {0x10f46,0x10f50},{0x10f82,0x10f85},{0x11001,0x11001},{0x11038,0x11046},{0x11070,0x11070},{0x11073,0x11074}, + {0x1107f,0x11081},{0x110b3,0x110b6},{0x110b9,0x110ba},{0x110c2,0x110c2},{0x11100,0x11102},{0x11127,0x1112b}, + {0x1112d,0x11134},{0x11173,0x11173},{0x11180,0x11181},{0x111b6,0x111be},{0x111c9,0x111cc},{0x111cf,0x111cf}, + {0x1122f,0x11231},{0x11234,0x11234},{0x11236,0x11237},{0x1123e,0x1123e},{0x112df,0x112df},{0x112e3,0x112ea}, + {0x11300,0x11301},{0x1133b,0x1133c},{0x11340,0x11340},{0x11366,0x1136c},{0x11370,0x11374},{0x11438,0x1143f}, + {0x11442,0x11444},{0x11446,0x11446},{0x1145e,0x1145e},{0x114b3,0x114b8},{0x114ba,0x114ba},{0x114bf,0x114c0}, + {0x114c2,0x114c3},{0x115b2,0x115b5},{0x115bc,0x115bd},{0x115bf,0x115c0},{0x115dc,0x115dd},{0x11633,0x1163a}, + {0x1163d,0x1163d},{0x1163f,0x11640},{0x116ab,0x116ab},{0x116ad,0x116ad},{0x116b0,0x116b5},{0x116b7,0x116b7}, + {0x1171d,0x1171f},{0x11722,0x11725},{0x11727,0x1172b},{0x1182f,0x11837},{0x11839,0x1183a},{0x1193b,0x1193c}, + {0x1193e,0x1193e},{0x11943,0x11943},{0x119d4,0x119d7},{0x119da,0x119db},{0x119e0,0x119e0},{0x11a01,0x11a0a}, + {0x11a33,0x11a38},{0x11a3b,0x11a3e},{0x11a47,0x11a47},{0x11a51,0x11a56},{0x11a59,0x11a5b},{0x11a8a,0x11a96}, + {0x11a98,0x11a99},{0x11c30,0x11c36},{0x11c38,0x11c3d},{0x11c3f,0x11c3f},{0x11c92,0x11ca7},{0x11caa,0x11cb0}, + {0x11cb2,0x11cb3},{0x11cb5,0x11cb6},{0x11d31,0x11d36},{0x11d3a,0x11d3a},{0x11d3c,0x11d3d},{0x11d3f,0x11d45}, + {0x11d47,0x11d47},{0x11d90,0x11d91},{0x11d95,0x11d95},{0x11d97,0x11d97},{0x11ef3,0x11ef4},{0x16af0,0x16af4}, + {0x16b30,0x16b36},{0x16f4f,0x16f4f},{0x16f8f,0x16f92},{0x16fe4,0x16fe4},{0x1bc9d,0x1bc9e},{0x1cf00,0x1cf2d}, + {0x1cf30,0x1cf46},{0x1d167,0x1d169},{0x1d17b,0x1d182},{0x1d185,0x1d18b},{0x1d1aa,0x1d1ad},{0x1d242,0x1d244}, + {0x1da00,0x1da36},{0x1da3b,0x1da6c},{0x1da75,0x1da75},{0x1da84,0x1da84},{0x1da9b,0x1da9f},{0x1daa1,0x1daaf}, + {0x1e000,0x1e006},{0x1e008,0x1e018},{0x1e01b,0x1e021},{0x1e023,0x1e024},{0x1e026,0x1e02a},{0x1e130,0x1e136}, + {0x1e2ae,0x1e2ae},{0x1e2ec,0x1e2ef},{0x1e8d0,0x1e8d6},{0x1e944,0x1e94a},{0xe0100,0xe01ef} + }; + + // Mark, spacing combining (22) + static const std::vector> Mc = { + {0x903,0x903},{0x93b,0x93b},{0x93e,0x940},{0x949,0x94c},{0x94e,0x94f},{0x982,0x983},{0x9be,0x9c0}, + {0x9c7,0x9c8},{0x9cb,0x9cc},{0x9d7,0x9d7},{0xa03,0xa03},{0xa3e,0xa40},{0xa83,0xa83}, + {0xabe,0xac0},{0xac9,0xac9},{0xacb,0xacc},{0xb02,0xb03},{0xb3e,0xb3e},{0xb40,0xb40}, + {0xb47,0xb48},{0xb4b,0xb4c},{0xb57,0xb57},{0xbbe,0xbbf},{0xbc1,0xbc2},{0xbc6,0xbc8}, + {0xbca,0xbcc},{0xbd7,0xbd7},{0xc01,0xc03},{0xc41,0xc44},{0xc82,0xc83},{0xcbe,0xcbe}, + {0xcc0,0xcc4},{0xcc7,0xcc8},{0xcca,0xccb},{0xcd5,0xcd6},{0xd02,0xd03},{0xd3e,0xd40}, + {0xd46,0xd48},{0xd4a,0xd4c},{0xd57,0xd57},{0xd82,0xd83},{0xdcf,0xdd1},{0xdd8,0xddf}, + {0xdf2,0xdf3},{0xf3e,0xf3f},{0xf7f,0xf7f},{0x102b,0x102c},{0x1031,0x1031},{0x1038,0x1038}, + {0x103b,0x103c},{0x1056,0x1057},{0x1062,0x1064},{0x1067,0x106d},{0x1083,0x1084},{0x1087,0x108c}, + {0x108f,0x108f},{0x109a,0x109c},{0x1715,0x1715},{0x1734,0x1734},{0x17b6,0x17b6},{0x17be,0x17c5}, + {0x17c7,0x17c8},{0x1923,0x1926},{0x1929,0x192b},{0x1930,0x1931},{0x1933,0x1938},{0x1a19,0x1a1a}, + {0x1a55,0x1a55},{0x1a57,0x1a57},{0x1a61,0x1a61},{0x1a63,0x1a64},{0x1a6d,0x1a72},{0x1b04,0x1b04}, + {0x1b35,0x1b35},{0x1b3b,0x1b3b},{0x1b3d,0x1b41},{0x1b43,0x1b44},{0x1b82,0x1b82},{0x1ba1,0x1ba1}, + {0x1ba6,0x1ba7},{0x1baa,0x1baa},{0x1be7,0x1be7},{0x1bea,0x1bec},{0x1bee,0x1bee},{0x1bf2,0x1bf3}, + {0x1c24,0x1c2b},{0x1c34,0x1c35},{0x1ce1,0x1ce1},{0x1cf7,0x1cf7},{0x302e,0x302f},{0xa823,0xa824}, + {0xa827,0xa827},{0xa880,0xa881},{0xa8b4,0xa8c3},{0xa952,0xa953},{0xa983,0xa983},{0xa9b4,0xa9b5}, + {0xa9ba,0xa9bb},{0xa9be,0xa9c0},{0xaa2f,0xaa30},{0xaa33,0xaa34},{0xaa4d,0xaa4d},{0xaa7b,0xaa7b}, + {0xaa7d,0xaa7d},{0xaaeb,0xaaeb},{0xaaee,0xaaef},{0xaaf5,0xaaf5},{0xabe3,0xabe4},{0xabe6,0xabe7}, + {0xabe9,0xabea},{0xabec,0xabec},{0x11000,0x11000},{0x11002,0x11002},{0x11082,0x11082},{0x110b0,0x110b2}, + {0x110b7,0x110b8},{0x1112c,0x1112c},{0x11145,0x11146},{0x11182,0x11182},{0x111b3,0x111b5},{0x111bf,0x111c0}, + {0x111ce,0x111ce},{0x1122c,0x1122e},{0x11232,0x11233},{0x11235,0x11235},{0x112e0,0x112e2},{0x11302,0x11303}, + {0x1133e,0x1133f},{0x11341,0x11344},{0x11347,0x11348},{0x1134b,0x1134d},{0x11357,0x11357},{0x11362,0x11363}, + {0x11435,0x11437},{0x11440,0x11441},{0x11445,0x11445},{0x114b0,0x114b2},{0x114b9,0x114b9},{0x114bb,0x114be}, + {0x114c1,0x114c1},{0x115af,0x115b1},{0x115b8,0x115bb},{0x115be,0x115be},{0x11630,0x11632},{0x1163b,0x1163c}, + {0x1163e,0x1163e},{0x116ac,0x116ac},{0x116ae,0x116af},{0x116b6,0x116b6},{0x11720,0x11721},{0x11726,0x11726}, + {0x1182c,0x1182e},{0x11838,0x11838},{0x11930,0x11935},{0x11937,0x11938},{0x1193d,0x1193d},{0x11940,0x11940}, + {0x11942,0x11942},{0x119d1,0x119d3},{0x119dc,0x119df},{0x119e4,0x119e4},{0x11a39,0x11a39},{0x11a57,0x11a58}, + {0x11a97,0x11a97},{0x11c2f,0x11c2f},{0x11c3e,0x11c3e},{0x11ca9,0x11ca9},{0x11cb1,0x11cb1},{0x11cb4,0x11cb4}, + {0x11d8a,0x11d8e},{0x11d93,0x11d94},{0x11d96,0x11d96},{0x11ef5,0x11ef6},{0x16f51,0x16f87},{0x16ff0,0x16ff1}, + {0x1d165,0x1d166},{0x1d16d,0x1d172} + }; + + // Mark, enclosing (23) + static const std::vector> Me = { + {0x488,0x489},{0x1abe,0x1abe},{0x20dd,0x20e0},{0x20e2,0x20e4},{0xa670,0xa672} + }; +} + + namespace Number { + // Number, decimal digit (31) + static const std::vector> Nd = { + {0x30,0x39},{0x660,0x669},{0x6f0,0x6f9},{0x7c0,0x7c9},{0x966,0x96f},{0x9e6,0x9ef},{0xa66,0xa6f}, + {0xae6,0xaef},{0xb66,0xb6f},{0xbe6,0xbef},{0xc66,0xc6f},{0xce6,0xcef},{0xd66,0xd6f}, + {0xde6,0xdef},{0xe50,0xe59},{0xed0,0xed9},{0xf20,0xf29},{0x1040,0x1049},{0x1090,0x1099}, + {0x17e0,0x17e9},{0x1810,0x1819},{0x1946,0x194f},{0x19d0,0x19d9},{0x1a80,0x1a89},{0x1a90,0x1a99}, + {0x1b50,0x1b59},{0x1bb0,0x1bb9},{0x1c40,0x1c49},{0x1c50,0x1c59},{0xa620,0xa629},{0xa8d0,0xa8d9}, + {0xa900,0xa909},{0xa9d0,0xa9d9},{0xa9f0,0xa9f9},{0xaa50,0xaa59},{0xabf0,0xabf9},{0xff10,0xff19}, + {0x104a0,0x104a9},{0x10d30,0x10d39},{0x11066,0x1106f},{0x110f0,0x110f9},{0x11136,0x1113f},{0x111d0,0x111d9}, + {0x112f0,0x112f9},{0x11450,0x11459},{0x114d0,0x114d9},{0x11650,0x11659},{0x116c0,0x116c9},{0x11730,0x11739}, + {0x118e0,0x118e9},{0x11950,0x11959},{0x11c50,0x11c59},{0x11d50,0x11d59},{0x11da0,0x11da9},{0x16a60,0x16a69}, + {0x16ac0,0x16ac9},{0x16b50,0x16b59},{0x1d7ce,0x1d7ff},{0x1e140,0x1e149},{0x1e2f0,0x1e2f9},{0x1e950,0x1e959}, + {0x1fbf0,0x1fbf9} + }; + + // Number, letter (32) + static const std::vector> Nl = { + {0x16ee,0x16f0},{0x2160,0x2182},{0x2185,0x2188},{0x3007,0x3007},{0x3021,0x3029},{0x3038,0x303a},{0xa6e6,0xa6ef}, + {0x10140,0x10174},{0x10341,0x10341},{0x1034a,0x1034a},{0x103d1,0x103d5},{0x12400,0x1246e} + }; + + // Number, other (33) + static const std::vector> No = { + {0xb2,0xb3},{0xb9,0xb9},{0xbc,0xbe},{0x9f4,0x9f9},{0xb72,0xb77},{0xbf0,0xbf2},{0xc78,0xc7e}, + {0xd58,0xd5e},{0xd70,0xd78},{0xf2a,0xf33},{0x1369,0x137c},{0x17f0,0x17f9},{0x19da,0x19da}, + {0x2070,0x2070},{0x2074,0x2079},{0x2080,0x2089},{0x2150,0x215f},{0x2189,0x2189},{0x2460,0x249b}, + {0x24ea,0x24ff},{0x2776,0x2793},{0x2cfd,0x2cfd},{0x3192,0x3195},{0x3220,0x3229},{0x3248,0x324f}, + {0x3251,0x325f},{0x3280,0x3289},{0x32b1,0x32bf},{0xa830,0xa835},{0x10107,0x10133},{0x10175,0x10178}, + {0x1018a,0x1018b},{0x102e1,0x102fb},{0x10320,0x10323},{0x10858,0x1085f},{0x10879,0x1087f},{0x108a7,0x108af}, + {0x108fb,0x108ff},{0x10916,0x1091b},{0x109bc,0x109bd},{0x109c0,0x109cf},{0x109d2,0x109ff},{0x10a40,0x10a48}, + {0x10a7d,0x10a7e},{0x10a9d,0x10a9f},{0x10aeb,0x10aef},{0x10b58,0x10b5f},{0x10b78,0x10b7f},{0x10ba9,0x10baf}, + {0x10cfa,0x10cff},{0x10e60,0x10e7e},{0x10f1d,0x10f26},{0x10f51,0x10f54},{0x10fc5,0x10fcb},{0x11052,0x11065}, + {0x111e1,0x111f4},{0x1173a,0x1173b},{0x118ea,0x118f2},{0x11c5a,0x11c6c},{0x11fc0,0x11fd4},{0x16b5b,0x16b61}, + {0x16e80,0x16e96},{0x1d2e0,0x1d2f3},{0x1d360,0x1d378},{0x1e8c7,0x1e8cf},{0x1ec71,0x1ecab},{0x1ecad,0x1ecaf}, + {0x1ecb1,0x1ecb4},{0x1ed01,0x1ed2d},{0x1ed2f,0x1ed3d},{0x1f100,0x1f10c} + }; +} + + namespace Punctuation { + // Punctuation, connector (41) + static const std::vector> Pc = { + {0x5f,0x5f},{0x203f,0x2040},{0x2054,0x2054},{0xfe33,0xfe34},{0xfe4d,0xfe4f},{0xff3f,0xff3f} + }; + + // Punctuation, dash (42) + static const std::vector> Pd = { + {0x2d,0x2d},{0x58a,0x58a},{0x5be,0x5be},{0x1400,0x1400},{0x1806,0x1806},{0x2010,0x2015},{0x2e17,0x2e17}, + {0x2e1a,0x2e1a},{0x2e3a,0x2e3b},{0x2e40,0x2e40},{0x2e5d,0x2e5d},{0x301c,0x301c},{0x3030,0x3030}, + {0x30a0,0x30a0},{0xfe31,0xfe32},{0xfe58,0xfe58},{0xfe63,0xfe63},{0xff0d,0xff0d},{0x10ead,0x10ead} + }; + + // Punctuation, open (43) + static const std::vector> Ps = { + {0x28,0x28},{0x5b,0x5b},{0x7b,0x7b},{0xf3a,0xf3a},{0xf3c,0xf3c},{0x169b,0x169b},{0x201a,0x201a}, + {0x201e,0x201e},{0x2045,0x2045},{0x207d,0x207d},{0x208d,0x208d},{0x2308,0x2308},{0x230a,0x230a}, + {0x2329,0x2329},{0x2768,0x2768},{0x276a,0x276a},{0x276c,0x276c},{0x276e,0x276e},{0x2770,0x2770}, + {0x2772,0x2772},{0x2774,0x2774},{0x27c5,0x27c5},{0x27e6,0x27e6},{0x27e8,0x27e8},{0x27ea,0x27ea}, + {0x27ec,0x27ec},{0x27ee,0x27ee},{0x2983,0x2983},{0x2985,0x2985},{0x2987,0x2987},{0x2989,0x2989}, + {0x298b,0x298b},{0x298d,0x298d},{0x298f,0x298f},{0x2991,0x2991},{0x2993,0x2993},{0x2995,0x2995}, + {0x2997,0x2997},{0x29d8,0x29d8},{0x29da,0x29da},{0x29fc,0x29fc},{0x2e22,0x2e22},{0x2e24,0x2e24}, + {0x2e26,0x2e26},{0x2e28,0x2e28},{0x2e42,0x2e42},{0x2e55,0x2e55},{0x2e57,0x2e57},{0x2e59,0x2e59}, + {0x2e5b,0x2e5b},{0x3008,0x3008},{0x300a,0x300a},{0x300c,0x300c},{0x300e,0x300e},{0x3010,0x3010}, + {0x3014,0x3014},{0x3016,0x3016},{0x3018,0x3018},{0x301a,0x301a},{0x301d,0x301d},{0xfd3f,0xfd3f}, + {0xfe17,0xfe17},{0xfe35,0xfe35},{0xfe37,0xfe37},{0xfe39,0xfe39},{0xfe3b,0xfe3b},{0xfe3d,0xfe3d}, + {0xfe3f,0xfe3f},{0xfe41,0xfe41},{0xfe43,0xfe43},{0xfe47,0xfe47},{0xfe59,0xfe59},{0xfe5b,0xfe5b}, + {0xfe5d,0xfe5d},{0xff08,0xff08},{0xff3b,0xff3b},{0xff5b,0xff5b},{0xff5f,0xff5f},{0xff62,0xff62} + }; + + // Punctuation, close (44) + static const std::vector> Pe = { + {0x29,0x29},{0x5d,0x5d},{0x7d,0x7d},{0xf3b,0xf3b},{0xf3d,0xf3d},{0x169c,0x169c},{0x2046,0x2046}, + {0x207e,0x207e},{0x208e,0x208e},{0x2309,0x2309},{0x230b,0x230b},{0x232a,0x232a},{0x2769,0x2769}, + {0x276b,0x276b},{0x276d,0x276d},{0x276f,0x276f},{0x2771,0x2771},{0x2773,0x2773},{0x2775,0x2775}, + {0x27c6,0x27c6},{0x27e7,0x27e7},{0x27e9,0x27e9},{0x27eb,0x27eb},{0x27ed,0x27ed},{0x27ef,0x27ef}, + {0x2984,0x2984},{0x2986,0x2986},{0x2988,0x2988},{0x298a,0x298a},{0x298c,0x298c},{0x298e,0x298e}, + {0x2990,0x2990},{0x2992,0x2992},{0x2994,0x2994},{0x2996,0x2996},{0x2998,0x2998},{0x29d9,0x29d9}, + {0x29db,0x29db},{0x29fd,0x29fd},{0x2e23,0x2e23},{0x2e25,0x2e25},{0x2e27,0x2e27},{0x2e29,0x2e29}, + {0x2e56,0x2e56},{0x2e58,0x2e58},{0x2e5a,0x2e5a},{0x2e5c,0x2e5c},{0x3009,0x3009},{0x300b,0x300b}, + {0x300d,0x300d},{0x300f,0x300f},{0x3011,0x3011},{0x3015,0x3015},{0x3017,0x3017},{0x3019,0x3019}, + {0x301b,0x301b},{0x301e,0x301f},{0xfd3e,0xfd3e},{0xfe18,0xfe18},{0xfe36,0xfe36},{0xfe38,0xfe38}, + {0xfe3a,0xfe3a},{0xfe3c,0xfe3c},{0xfe3e,0xfe3e},{0xfe40,0xfe40},{0xfe42,0xfe42},{0xfe44,0xfe44}, + {0xfe48,0xfe48},{0xfe5a,0xfe5a},{0xfe5c,0xfe5c},{0xfe5e,0xfe5e},{0xff09,0xff09},{0xff3d,0xff3d}, + {0xff5d,0xff5d},{0xff60,0xff60},{0xff63,0xff63} + }; + + // Punctuation, initial quote (45) + static const std::vector> Pi = { + {0xab,0xab},{0x2018,0x2018},{0x201b,0x201c},{0x201f,0x201f},{0x2039,0x2039},{0x2e02,0x2e02},{0x2e04,0x2e04}, + {0x2e09,0x2e09},{0x2e0c,0x2e0c},{0x2e1c,0x2e1c},{0x2e20,0x2e20} + }; + + // Punctuation, final quote (46) + static const std::vector> Pf = { + {0xbb,0xbb},{0x2019,0x2019},{0x201d,0x201d},{0x203a,0x203a},{0x2e03,0x2e03},{0x2e05,0x2e05},{0x2e0a,0x2e0a}, + {0x2e0d,0x2e0d},{0x2e1d,0x2e1d},{0x2e21,0x2e21} + }; + + // Punctuation, other (47) + static const std::vector> Po = { + {0x21,0x23},{0x25,0x27},{0x2a,0x2a},{0x2c,0x2c},{0x2e,0x2f},{0x3a,0x3b},{0x3f,0x40}, + {0x5c,0x5c},{0xa1,0xa1},{0xa7,0xa7},{0xb6,0xb7},{0xbf,0xbf},{0x37e,0x37e}, + {0x387,0x387},{0x55a,0x55f},{0x589,0x589},{0x5c0,0x5c0},{0x5c3,0x5c3},{0x5c6,0x5c6}, + {0x5f3,0x5f4},{0x609,0x60a},{0x60c,0x60d},{0x61b,0x61b},{0x61d,0x61f},{0x66a,0x66d}, + {0x6d4,0x6d4},{0x700,0x70d},{0x7f7,0x7f9},{0x830,0x83e},{0x85e,0x85e},{0x964,0x965}, + {0x970,0x970},{0x9fd,0x9fd},{0xa76,0xa76},{0xaf0,0xaf0},{0xc77,0xc77},{0xc84,0xc84}, + {0xdf4,0xdf4},{0xe4f,0xe4f},{0xe5a,0xe5b},{0xf04,0xf12},{0xf14,0xf14},{0xf85,0xf85}, + {0xfd0,0xfd4},{0xfd9,0xfda},{0x104a,0x104f},{0x10fb,0x10fb},{0x1360,0x1368},{0x166e,0x166e}, + {0x16eb,0x16ed},{0x1735,0x1736},{0x17d4,0x17d6},{0x17d8,0x17da},{0x1800,0x1805},{0x1807,0x180a}, + {0x1944,0x1945},{0x1a1e,0x1a1f},{0x1aa0,0x1aa6},{0x1aa8,0x1aad},{0x1b5a,0x1b60},{0x1b7d,0x1b7e}, + {0x1bfc,0x1bff},{0x1c3b,0x1c3f},{0x1c7e,0x1c7f},{0x1cc0,0x1cc7},{0x1cd3,0x1cd3},{0x2016,0x2017}, + {0x2020,0x2027},{0x2030,0x2038},{0x203b,0x203e},{0x2041,0x2043},{0x2047,0x2051},{0x2053,0x2053}, + {0x2055,0x205e},{0x2cf9,0x2cfc},{0x2cfe,0x2cff},{0x2d70,0x2d70},{0x2e00,0x2e01},{0x2e06,0x2e08}, + {0x2e0b,0x2e0b},{0x2e0e,0x2e16},{0x2e18,0x2e19},{0x2e1b,0x2e1b},{0x2e1e,0x2e1f},{0x2e2a,0x2e2e}, + {0x2e30,0x2e39},{0x2e3c,0x2e3f},{0x2e41,0x2e41},{0x2e43,0x2e4f},{0x2e52,0x2e54},{0x3001,0x3003}, + {0x303d,0x303d},{0x30fb,0x30fb},{0xa4fe,0xa4ff},{0xa60d,0xa60f},{0xa673,0xa673},{0xa67e,0xa67e}, + {0xa6f2,0xa6f7},{0xa874,0xa877},{0xa8ce,0xa8cf},{0xa8f8,0xa8fa},{0xa8fc,0xa8fc},{0xa92e,0xa92f}, + {0xa95f,0xa95f},{0xa9c1,0xa9cd},{0xa9de,0xa9df},{0xaa5c,0xaa5f},{0xaade,0xaadf},{0xaaf0,0xaaf1}, + {0xabeb,0xabeb},{0xfe10,0xfe16},{0xfe19,0xfe19},{0xfe30,0xfe30},{0xfe45,0xfe46},{0xfe49,0xfe4c}, + {0xfe50,0xfe52},{0xfe54,0xfe57},{0xfe5f,0xfe61},{0xfe68,0xfe68},{0xfe6a,0xfe6b},{0xff01,0xff03}, + {0xff05,0xff07},{0xff0a,0xff0a},{0xff0c,0xff0c},{0xff0e,0xff0f},{0xff1a,0xff1b},{0xff1f,0xff20}, + {0xff3c,0xff3c},{0xff61,0xff61},{0xff64,0xff65},{0x10100,0x10102},{0x1039f,0x1039f},{0x103d0,0x103d0}, + {0x1056f,0x1056f},{0x10857,0x10857},{0x1091f,0x1091f},{0x1093f,0x1093f},{0x10a50,0x10a58},{0x10a7f,0x10a7f}, + {0x10af0,0x10af6},{0x10b39,0x10b3f},{0x10b99,0x10b9c},{0x10f55,0x10f59},{0x10f86,0x10f89},{0x11047,0x1104d}, + {0x110bb,0x110bc},{0x110be,0x110c1},{0x11140,0x11143},{0x11174,0x11175},{0x111c5,0x111c8},{0x111cd,0x111cd}, + {0x111db,0x111db},{0x111dd,0x111df},{0x11238,0x1123d},{0x112a9,0x112a9},{0x1144b,0x1144f},{0x1145a,0x1145b}, + {0x1145d,0x1145d},{0x114c6,0x114c6},{0x115c1,0x115d7},{0x11641,0x11643},{0x11660,0x1166c},{0x116b9,0x116b9}, + {0x1173c,0x1173e},{0x1183b,0x1183b},{0x11944,0x11946},{0x119e2,0x119e2},{0x11a3f,0x11a46},{0x11a9a,0x11a9c}, + {0x11a9e,0x11aa2},{0x11c41,0x11c45},{0x11c70,0x11c71},{0x11ef7,0x11ef8},{0x11fff,0x11fff},{0x12470,0x12474}, + {0x12ff1,0x12ff2},{0x16a6e,0x16a6f},{0x16af5,0x16af5},{0x16b37,0x16b3b},{0x16b44,0x16b44},{0x16e97,0x16e9a}, + {0x16fe2,0x16fe2},{0x1bc9f,0x1bc9f},{0x1da87,0x1da8b},{0x1e95e,0x1e95f} + }; +} + + namespace Symbol { + // Symbol, math (51) + static const std::vector> Sm = { + {0x2b,0x2b},{0x3c,0x3e},{0x7c,0x7c},{0x7e,0x7e},{0xac,0xac},{0xb1,0xb1},{0xd7,0xd7}, + {0xf7,0xf7},{0x3f6,0x3f6},{0x606,0x608},{0x2044,0x2044},{0x2052,0x2052},{0x207a,0x207c}, + {0x208a,0x208c},{0x2118,0x2118},{0x2140,0x2144},{0x214b,0x214b},{0x2190,0x2194},{0x219a,0x219b}, + {0x21a0,0x21a0},{0x21a3,0x21a3},{0x21a6,0x21a6},{0x21ae,0x21ae},{0x21ce,0x21cf},{0x21d2,0x21d2}, + {0x21d4,0x21d4},{0x21f4,0x22ff},{0x2320,0x2321},{0x237c,0x237c},{0x239b,0x23b3},{0x23dc,0x23e1}, + {0x25b7,0x25b7},{0x25c1,0x25c1},{0x25f8,0x25ff},{0x266f,0x266f},{0x27c0,0x27c4},{0x27c7,0x27e5}, + {0x27f0,0x27ff},{0x2900,0x2982},{0x2999,0x29d7},{0x29dc,0x29fb},{0x29fe,0x2aff},{0x2b30,0x2b44}, + {0x2b47,0x2b4c},{0xfb29,0xfb29},{0xfe62,0xfe62},{0xfe64,0xfe66},{0xff0b,0xff0b},{0xff1c,0xff1e}, + {0xff5c,0xff5c},{0xff5e,0xff5e},{0xffe2,0xffe2},{0xffe9,0xffec},{0x1d6c1,0x1d6c1},{0x1d6db,0x1d6db}, + {0x1d6fb,0x1d6fb},{0x1d715,0x1d715},{0x1d735,0x1d735},{0x1d74f,0x1d74f},{0x1d76f,0x1d76f},{0x1d789,0x1d789}, + {0x1d7a9,0x1d7a9},{0x1d7c3,0x1d7c3},{0x1eef0,0x1eef1} + }; + + // Symbol, currency (52) + static const std::vector> Sc = { + {0x24,0x24},{0xa2,0xa5},{0x58f,0x58f},{0x60b,0x60b},{0x7fe,0x7ff},{0x9f2,0x9f3},{0x9fb,0x9fb}, + {0xaf1,0xaf1},{0xbf9,0xbf9},{0xe3f,0xe3f},{0x17db,0x17db},{0x20a0,0x20c0},{0xa838,0xa838}, + {0xfdfc,0xfdfc},{0xfe69,0xfe69},{0xff04,0xff04},{0xffe0,0xffe1},{0xffe5,0xffe6},{0x11fdd,0x11fe0}, + {0x1e2ff,0x1e2ff},{0x1ecb0,0x1ecb0} + }; + + // Symbol, modifier (53) + static const std::vector> Sk = { + {0x5e,0x5e},{0x60,0x60},{0xa8,0xa8},{0xaf,0xaf},{0xb4,0xb4},{0xb8,0xb8},{0x2c2,0x2c5}, + {0x2d2,0x2df},{0x2e5,0x2eb},{0x2ed,0x2ed},{0x2ef,0x2ff},{0x375,0x375},{0x384,0x385}, + {0x888,0x888},{0x1fbd,0x1fbd},{0x1fbf,0x1fc1},{0x1fcd,0x1fcf},{0x1fdd,0x1fdf},{0x1fed,0x1fef}, + {0x1ffd,0x1ffe},{0x309b,0x309c},{0xa700,0xa716},{0xa720,0xa721},{0xa789,0xa78a},{0xab5b,0xab5b}, + {0xab6a,0xab6b},{0xfbb2,0xfbc2},{0xff3e,0xff3e},{0xff40,0xff40},{0xffe3,0xffe3},{0x1f3fb,0x1f3ff} + }; + + // Symbol, other (54) + static const std::vector> So = { + {0xa6,0xa6},{0xa9,0xa9},{0xae,0xae},{0xb0,0xb0},{0x482,0x482},{0x58d,0x58e},{0x60e,0x60f}, + {0x6de,0x6de},{0x6e9,0x6e9},{0x6fd,0x6fe},{0x7f6,0x7f6},{0x9fa,0x9fa},{0xb70,0xb70}, + {0xbf3,0xbf8},{0xbfa,0xbfa},{0xc7f,0xc7f},{0xd4f,0xd4f},{0xd79,0xd79},{0xf01,0xf03}, + {0xf13,0xf13},{0xf15,0xf17},{0xf1a,0xf1f},{0xf34,0xf34},{0xf36,0xf36},{0xf38,0xf38}, + {0xfbe,0xfc5},{0xfc7,0xfcc},{0xfce,0xfcf},{0xfd5,0xfd8},{0x109e,0x109f},{0x1390,0x1399}, + {0x166d,0x166d},{0x1940,0x1940},{0x19de,0x19ff},{0x1b61,0x1b6a},{0x1b74,0x1b7c},{0x2100,0x2101}, + {0x2103,0x2106},{0x2108,0x2109},{0x2114,0x2114},{0x2116,0x2117},{0x211e,0x2123},{0x2125,0x2125}, + {0x2127,0x2127},{0x2129,0x2129},{0x212e,0x212e},{0x213a,0x213b},{0x214a,0x214a},{0x214c,0x214d}, + {0x214f,0x214f},{0x218a,0x218b},{0x2195,0x2199},{0x219c,0x219f},{0x21a1,0x21a2},{0x21a4,0x21a5}, + {0x21a7,0x21ad},{0x21af,0x21cd},{0x21d0,0x21d1},{0x21d3,0x21d3},{0x21d5,0x21f3},{0x2300,0x2307}, + {0x230c,0x231f},{0x2322,0x2328},{0x232b,0x237b},{0x237d,0x239a},{0x23b4,0x23db},{0x23e2,0x2426}, + {0x2440,0x244a},{0x249c,0x24e9},{0x2500,0x25b6},{0x25b8,0x25c0},{0x25c2,0x25f7},{0x2600,0x266e}, + {0x2670,0x2767},{0x2794,0x27bf},{0x2800,0x28ff},{0x2b00,0x2b2f},{0x2b45,0x2b46},{0x2b4d,0x2b73}, + {0x2b76,0x2b95},{0x2b97,0x2bff},{0x2ce5,0x2cea},{0x2e50,0x2e51},{0x2e80,0x2e99},{0x2e9b,0x2ef3}, + {0x2f00,0x2fd5},{0x2ff0,0x2ffb},{0x3004,0x3004},{0x3012,0x3013},{0x3020,0x3020},{0x3036,0x3037}, + {0x303e,0x303f},{0x3190,0x3191},{0x3196,0x319f},{0x31c0,0x31e3},{0x3200,0x321e},{0x322a,0x3247}, + {0x3250,0x3250},{0x3260,0x327f},{0x328a,0x32b0},{0x32c0,0x33ff},{0x4dc0,0x4dff},{0xa490,0xa4c6}, + {0xa828,0xa82b},{0xa836,0xa837},{0xa839,0xa839},{0xaa77,0xaa79},{0xfd40,0xfd4f},{0xfdcf,0xfdcf}, + {0xfdfd,0xfdff},{0xffe4,0xffe4},{0xffe8,0xffe8},{0xffed,0xffee},{0xfffc,0xfffd},{0x10137,0x1013f}, + {0x10179,0x10189},{0x1018c,0x1018e},{0x10190,0x1019c},{0x101a0,0x101a0},{0x101d0,0x101fc},{0x10877,0x10878}, + {0x10ac8,0x10ac8},{0x1173f,0x1173f},{0x11fd5,0x11fdc},{0x11fe1,0x11ff1},{0x16b3c,0x16b3f},{0x16b45,0x16b45}, + {0x1bc9c,0x1bc9c},{0x1cf50,0x1cfc3},{0x1d000,0x1d0f5},{0x1d100,0x1d126},{0x1d129,0x1d164},{0x1d16a,0x1d16c}, + {0x1d183,0x1d184},{0x1d18c,0x1d1a9},{0x1d1ae,0x1d1ea},{0x1d200,0x1d241},{0x1d245,0x1d245},{0x1d300,0x1d356}, + {0x1d800,0x1d9ff},{0x1da37,0x1da3a},{0x1da6d,0x1da74},{0x1da76,0x1da83},{0x1da85,0x1da86},{0x1e14f,0x1e14f}, + {0x1ecac,0x1ecac},{0x1ed2e,0x1ed2e},{0x1f000,0x1f02b},{0x1f030,0x1f093},{0x1f0a0,0x1f0ae},{0x1f0b1,0x1f0bf}, + {0x1f0c1,0x1f0cf},{0x1f0d1,0x1f0f5},{0x1f10d,0x1f1ad},{0x1f1e6,0x1f202},{0x1f210,0x1f23b},{0x1f240,0x1f248}, + {0x1f250,0x1f251},{0x1f260,0x1f265},{0x1f300,0x1f3fa},{0x1f400,0x1f6d7},{0x1f6dd,0x1f6ec},{0x1f6f0,0x1f6fc}, + {0x1f700,0x1f773},{0x1f780,0x1f7d8},{0x1f7e0,0x1f7eb},{0x1f7f0,0x1f7f0},{0x1f800,0x1f80b},{0x1f810,0x1f847}, + {0x1f850,0x1f859},{0x1f860,0x1f887},{0x1f890,0x1f8ad},{0x1f8b0,0x1f8b1},{0x1f900,0x1fa53},{0x1fa60,0x1fa6d}, + {0x1fa70,0x1fa74},{0x1fa78,0x1fa7c},{0x1fa80,0x1fa86},{0x1fa90,0x1faac},{0x1fab0,0x1faba},{0x1fac0,0x1fac5}, + {0x1fad0,0x1fad9},{0x1fae0,0x1fae7},{0x1faf0,0x1faf6},{0x1fb00,0x1fb92},{0x1fb94,0x1fbca} + }; +} + + namespace Separator { + // Separator, space (61) + static const std::vector> Zs = { + {0x20,0x20},{0xa0,0xa0},{0x1680,0x1680},{0x2000,0x200a},{0x202f,0x202f},{0x205f,0x205f},{0x3000,0x3000} + }; + + // Separator, line (62) + static const std::vector> Zl = { + {0x2028,0x2028} + }; + + // Separator, paragraph (63) + static const std::vector> Zp = { + {0x2029,0x2029} + }; +} + + namespace Other { + // Other, control (71) + static const std::vector> Cc = { + {0x0,0x1f},{0x7f,0x9f} + }; + + // Other, format (72) + static const std::vector> Cf = { + {0xad,0xad},{0x600,0x605},{0x61c,0x61c},{0x6dd,0x6dd},{0x70f,0x70f},{0x890,0x891},{0x8e2,0x8e2}, + {0x180e,0x180e},{0x200b,0x200f},{0x202a,0x202e},{0x2060,0x2064},{0x2066,0x206f},{0xfeff,0xfeff}, + {0xfff9,0xfffb},{0x110bd,0x110bd},{0x110cd,0x110cd},{0x13430,0x13438},{0x1bca0,0x1bca3},{0x1d173,0x1d17a}, + {0xe0001,0xe0001},{0xe0020,0xe007f} + }; + + // Other, surrogate (73) + static const std::vector> Cs = { + {0xd800,0xdfff} + }; + + // Other, private use (74) + static const std::vector> Co = { + {0xe000,0xf8ff},{0xf0000,0xffffd},{0x100000,0x10fffd} + }; + + // Other, not assigned (75) + static const std::vector> Cn = { + {0x378,0x379},{0x380,0x383},{0x38b,0x38b},{0x38d,0x38d},{0x3a2,0x3a2},{0x530,0x530},{0x557,0x558}, + {0x58b,0x58c},{0x590,0x590},{0x5c8,0x5cf},{0x5eb,0x5ee},{0x5f5,0x5ff},{0x70e,0x70e}, + {0x74b,0x74c},{0x7b2,0x7bf},{0x7fb,0x7fc},{0x82e,0x82f},{0x83f,0x83f},{0x85c,0x85d}, + {0x85f,0x85f},{0x86b,0x86f},{0x88f,0x88f},{0x892,0x897},{0x984,0x984},{0x98d,0x98e}, + {0x991,0x992},{0x9a9,0x9a9},{0x9b1,0x9b1},{0x9b3,0x9b5},{0x9ba,0x9bb},{0x9c5,0x9c6}, + {0x9c9,0x9ca},{0x9cf,0x9d6},{0x9d8,0x9db},{0x9de,0x9de},{0x9e4,0x9e5},{0x9ff,0xa00}, + {0xa04,0xa04},{0xa0b,0xa0e},{0xa11,0xa12},{0xa29,0xa29},{0xa31,0xa31},{0xa34,0xa34}, + {0xa37,0xa37},{0xa3a,0xa3b},{0xa3d,0xa3d},{0xa43,0xa46},{0xa49,0xa4a},{0xa4e,0xa50}, + {0xa52,0xa58},{0xa5d,0xa5d},{0xa5f,0xa65},{0xa77,0xa80},{0xa84,0xa84},{0xa8e,0xa8e}, + {0xa92,0xa92},{0xaa9,0xaa9},{0xab1,0xab1},{0xab4,0xab4},{0xaba,0xabb},{0xac6,0xac6}, + {0xaca,0xaca},{0xace,0xacf},{0xad1,0xadf},{0xae4,0xae5},{0xaf2,0xaf8},{0xb00,0xb00}, + {0xb04,0xb04},{0xb0d,0xb0e},{0xb11,0xb12},{0xb29,0xb29},{0xb31,0xb31},{0xb34,0xb34}, + {0xb3a,0xb3b},{0xb45,0xb46},{0xb49,0xb4a},{0xb4e,0xb54},{0xb58,0xb5b},{0xb5e,0xb5e}, + {0xb64,0xb65},{0xb78,0xb81},{0xb84,0xb84},{0xb8b,0xb8d},{0xb91,0xb91},{0xb96,0xb98}, + {0xb9b,0xb9b},{0xb9d,0xb9d},{0xba0,0xba2},{0xba5,0xba7},{0xbab,0xbad},{0xbba,0xbbd}, + {0xbc3,0xbc5},{0xbc9,0xbc9},{0xbce,0xbcf},{0xbd1,0xbd6},{0xbd8,0xbe5},{0xbfb,0xbff}, + {0xc0d,0xc0d},{0xc11,0xc11},{0xc29,0xc29},{0xc3a,0xc3b},{0xc45,0xc45},{0xc49,0xc49}, + {0xc4e,0xc54},{0xc57,0xc57},{0xc5b,0xc5c},{0xc5e,0xc5f},{0xc64,0xc65},{0xc70,0xc76}, + {0xc8d,0xc8d},{0xc91,0xc91},{0xca9,0xca9},{0xcb4,0xcb4},{0xcba,0xcbb},{0xcc5,0xcc5}, + {0xcc9,0xcc9},{0xcce,0xcd4},{0xcd7,0xcdc},{0xcdf,0xcdf},{0xce4,0xce5},{0xcf0,0xcf0}, + {0xcf3,0xcff},{0xd0d,0xd0d},{0xd11,0xd11},{0xd45,0xd45},{0xd49,0xd49},{0xd50,0xd53}, + {0xd64,0xd65},{0xd80,0xd80},{0xd84,0xd84},{0xd97,0xd99},{0xdb2,0xdb2},{0xdbc,0xdbc}, + {0xdbe,0xdbf},{0xdc7,0xdc9},{0xdcb,0xdce},{0xdd5,0xdd5},{0xdd7,0xdd7},{0xde0,0xde5}, + {0xdf0,0xdf1},{0xdf5,0xe00},{0xe3b,0xe3e},{0xe5c,0xe80},{0xe83,0xe83},{0xe85,0xe85}, + {0xe8b,0xe8b},{0xea4,0xea4},{0xea6,0xea6},{0xebe,0xebf},{0xec5,0xec5},{0xec7,0xec7}, + {0xece,0xecf},{0xeda,0xedb},{0xee0,0xeff},{0xf48,0xf48},{0xf6d,0xf70},{0xf98,0xf98}, + {0xfbd,0xfbd},{0xfcd,0xfcd},{0xfdb,0xfff},{0x10c6,0x10c6},{0x10c8,0x10cc},{0x10ce,0x10cf}, + {0x1249,0x1249},{0x124e,0x124f},{0x1257,0x1257},{0x1259,0x1259},{0x125e,0x125f},{0x1289,0x1289}, + {0x128e,0x128f},{0x12b1,0x12b1},{0x12b6,0x12b7},{0x12bf,0x12bf},{0x12c1,0x12c1},{0x12c6,0x12c7}, + {0x12d7,0x12d7},{0x1311,0x1311},{0x1316,0x1317},{0x135b,0x135c},{0x137d,0x137f},{0x139a,0x139f}, + {0x13f6,0x13f7},{0x13fe,0x13ff},{0x169d,0x169f},{0x16f9,0x16ff},{0x1716,0x171e},{0x1737,0x173f}, + {0x1754,0x175f},{0x176d,0x176d},{0x1771,0x1771},{0x1774,0x177f},{0x17de,0x17df},{0x17ea,0x17ef}, + {0x17fa,0x17ff},{0x181a,0x181f},{0x1879,0x187f},{0x18ab,0x18af},{0x18f6,0x18ff},{0x191f,0x191f}, + {0x192c,0x192f},{0x193c,0x193f},{0x1941,0x1943},{0x196e,0x196f},{0x1975,0x197f},{0x19ac,0x19af}, + {0x19ca,0x19cf},{0x19db,0x19dd},{0x1a1c,0x1a1d},{0x1a5f,0x1a5f},{0x1a7d,0x1a7e},{0x1a8a,0x1a8f}, + {0x1a9a,0x1a9f},{0x1aae,0x1aaf},{0x1acf,0x1aff},{0x1b4d,0x1b4f},{0x1b7f,0x1b7f},{0x1bf4,0x1bfb}, + {0x1c38,0x1c3a},{0x1c4a,0x1c4c},{0x1c89,0x1c8f},{0x1cbb,0x1cbc},{0x1cc8,0x1ccf},{0x1cfb,0x1cff}, + {0x1f16,0x1f17},{0x1f1e,0x1f1f},{0x1f46,0x1f47},{0x1f4e,0x1f4f},{0x1f58,0x1f58},{0x1f5a,0x1f5a}, + {0x1f5c,0x1f5c},{0x1f5e,0x1f5e},{0x1f7e,0x1f7f},{0x1fb5,0x1fb5},{0x1fc5,0x1fc5},{0x1fd4,0x1fd5}, + {0x1fdc,0x1fdc},{0x1ff0,0x1ff1},{0x1ff5,0x1ff5},{0x1fff,0x1fff},{0x2065,0x2065},{0x2072,0x2073}, + {0x208f,0x208f},{0x209d,0x209f},{0x20c1,0x20cf},{0x20f1,0x20ff},{0x218c,0x218f},{0x2427,0x243f}, + {0x244b,0x245f},{0x2b74,0x2b75},{0x2b96,0x2b96},{0x2cf4,0x2cf8},{0x2d26,0x2d26},{0x2d28,0x2d2c}, + {0x2d2e,0x2d2f},{0x2d68,0x2d6e},{0x2d71,0x2d7e},{0x2d97,0x2d9f},{0x2da7,0x2da7},{0x2daf,0x2daf}, + {0x2db7,0x2db7},{0x2dbf,0x2dbf},{0x2dc7,0x2dc7},{0x2dcf,0x2dcf},{0x2dd7,0x2dd7},{0x2ddf,0x2ddf}, + {0x2e5e,0x2e7f},{0x2e9a,0x2e9a},{0x2ef4,0x2eff},{0x2fd6,0x2fef},{0x2ffc,0x2fff},{0x3040,0x3040}, + {0x3097,0x3098},{0x3100,0x3104},{0x3130,0x3130},{0x318f,0x318f},{0x31e4,0x31ef},{0x321f,0x321f}, + {0xa48d,0xa48f},{0xa4c7,0xa4cf},{0xa62c,0xa63f},{0xa6f8,0xa6ff},{0xa7cb,0xa7cf},{0xa7d2,0xa7d2}, + {0xa7d4,0xa7d4},{0xa7da,0xa7f1},{0xa82d,0xa82f},{0xa83a,0xa83f},{0xa878,0xa87f},{0xa8c6,0xa8cd}, + {0xa8da,0xa8df},{0xa954,0xa95e},{0xa97d,0xa97f},{0xa9ce,0xa9ce},{0xa9da,0xa9dd},{0xa9ff,0xa9ff}, + {0xaa37,0xaa3f},{0xaa4e,0xaa4f},{0xaa5a,0xaa5b},{0xaac3,0xaada},{0xaaf7,0xab00},{0xab07,0xab08}, + {0xab0f,0xab10},{0xab17,0xab1f},{0xab27,0xab27},{0xab2f,0xab2f},{0xab6c,0xab6f},{0xabee,0xabef}, + {0xabfa,0xabff},{0xd7a4,0xd7af},{0xd7c7,0xd7ca},{0xd7fc,0xd7ff},{0xfa6e,0xfa6f},{0xfada,0xfaff}, + {0xfb07,0xfb12},{0xfb18,0xfb1c},{0xfb37,0xfb37},{0xfb3d,0xfb3d},{0xfb3f,0xfb3f},{0xfb42,0xfb42}, + {0xfb45,0xfb45},{0xfbc3,0xfbd2},{0xfd90,0xfd91},{0xfdc8,0xfdce},{0xfdd0,0xfdef},{0xfe1a,0xfe1f}, + {0xfe53,0xfe53},{0xfe67,0xfe67},{0xfe6c,0xfe6f},{0xfe75,0xfe75},{0xfefd,0xfefe},{0xff00,0xff00}, + {0xffbf,0xffc1},{0xffc8,0xffc9},{0xffd0,0xffd1},{0xffd8,0xffd9},{0xffdd,0xffdf},{0xffe7,0xffe7}, + {0xffef,0xfff8},{0xfffe,0xffff},{0x1000c,0x1000c},{0x10027,0x10027},{0x1003b,0x1003b},{0x1003e,0x1003e}, + {0x1004e,0x1004f},{0x1005e,0x1007f},{0x100fb,0x100ff},{0x10103,0x10106},{0x10134,0x10136},{0x1018f,0x1018f}, + {0x1019d,0x1019f},{0x101a1,0x101cf},{0x101fe,0x1027f},{0x1029d,0x1029f},{0x102d1,0x102df},{0x102fc,0x102ff}, + {0x10324,0x1032c},{0x1034b,0x1034f},{0x1037b,0x1037f},{0x1039e,0x1039e},{0x103c4,0x103c7},{0x103d6,0x103ff}, + {0x1049e,0x1049f},{0x104aa,0x104af},{0x104d4,0x104d7},{0x104fc,0x104ff},{0x10528,0x1052f},{0x10564,0x1056e}, + {0x1057b,0x1057b},{0x1058b,0x1058b},{0x10593,0x10593},{0x10596,0x10596},{0x105a2,0x105a2},{0x105b2,0x105b2}, + {0x105ba,0x105ba},{0x105bd,0x105ff},{0x10737,0x1073f},{0x10756,0x1075f},{0x10768,0x1077f},{0x10786,0x10786}, + {0x107b1,0x107b1},{0x107bb,0x107ff},{0x10806,0x10807},{0x10809,0x10809},{0x10836,0x10836},{0x10839,0x1083b}, + {0x1083d,0x1083e},{0x10856,0x10856},{0x1089f,0x108a6},{0x108b0,0x108df},{0x108f3,0x108f3},{0x108f6,0x108fa}, + {0x1091c,0x1091e},{0x1093a,0x1093e},{0x10940,0x1097f},{0x109b8,0x109bb},{0x109d0,0x109d1},{0x10a04,0x10a04}, + {0x10a07,0x10a0b},{0x10a14,0x10a14},{0x10a18,0x10a18},{0x10a36,0x10a37},{0x10a3b,0x10a3e},{0x10a49,0x10a4f}, + {0x10a59,0x10a5f},{0x10aa0,0x10abf},{0x10ae7,0x10aea},{0x10af7,0x10aff},{0x10b36,0x10b38},{0x10b56,0x10b57}, + {0x10b73,0x10b77},{0x10b92,0x10b98},{0x10b9d,0x10ba8},{0x10bb0,0x10bff},{0x10c49,0x10c7f},{0x10cb3,0x10cbf}, + {0x10cf3,0x10cf9},{0x10d28,0x10d2f},{0x10d3a,0x10e5f},{0x10e7f,0x10e7f},{0x10eaa,0x10eaa},{0x10eae,0x10eaf}, + {0x10eb2,0x10eff},{0x10f28,0x10f2f},{0x10f5a,0x10f6f},{0x10f8a,0x10faf},{0x10fcc,0x10fdf},{0x10ff7,0x10fff}, + {0x1104e,0x11051},{0x11076,0x1107e},{0x110c3,0x110cc},{0x110ce,0x110cf},{0x110e9,0x110ef},{0x110fa,0x110ff}, + {0x11135,0x11135},{0x11148,0x1114f},{0x11177,0x1117f},{0x111e0,0x111e0},{0x111f5,0x111ff},{0x11212,0x11212}, + {0x1123f,0x1127f},{0x11287,0x11287},{0x11289,0x11289},{0x1128e,0x1128e},{0x1129e,0x1129e},{0x112aa,0x112af}, + {0x112eb,0x112ef},{0x112fa,0x112ff},{0x11304,0x11304},{0x1130d,0x1130e},{0x11311,0x11312},{0x11329,0x11329}, + {0x11331,0x11331},{0x11334,0x11334},{0x1133a,0x1133a},{0x11345,0x11346},{0x11349,0x1134a},{0x1134e,0x1134f}, + {0x11351,0x11356},{0x11358,0x1135c},{0x11364,0x11365},{0x1136d,0x1136f},{0x11375,0x113ff},{0x1145c,0x1145c}, + {0x11462,0x1147f},{0x114c8,0x114cf},{0x114da,0x1157f},{0x115b6,0x115b7},{0x115de,0x115ff},{0x11645,0x1164f}, + {0x1165a,0x1165f},{0x1166d,0x1167f},{0x116ba,0x116bf},{0x116ca,0x116ff},{0x1171b,0x1171c},{0x1172c,0x1172f}, + {0x11747,0x117ff},{0x1183c,0x1189f},{0x118f3,0x118fe},{0x11907,0x11908},{0x1190a,0x1190b},{0x11914,0x11914}, + {0x11917,0x11917},{0x11936,0x11936},{0x11939,0x1193a},{0x11947,0x1194f},{0x1195a,0x1199f},{0x119a8,0x119a9}, + {0x119d8,0x119d9},{0x119e5,0x119ff},{0x11a48,0x11a4f},{0x11aa3,0x11aaf},{0x11af9,0x11bff},{0x11c09,0x11c09}, + {0x11c37,0x11c37},{0x11c46,0x11c4f},{0x11c6d,0x11c6f},{0x11c90,0x11c91},{0x11ca8,0x11ca8},{0x11cb7,0x11cff}, + {0x11d07,0x11d07},{0x11d0a,0x11d0a},{0x11d37,0x11d39},{0x11d3b,0x11d3b},{0x11d3e,0x11d3e},{0x11d48,0x11d4f}, + {0x11d5a,0x11d5f},{0x11d66,0x11d66},{0x11d69,0x11d69},{0x11d8f,0x11d8f},{0x11d92,0x11d92},{0x11d99,0x11d9f}, + {0x11daa,0x11edf},{0x11ef9,0x11faf},{0x11fb1,0x11fbf},{0x11ff2,0x11ffe},{0x1239a,0x123ff},{0x1246f,0x1246f}, + {0x12475,0x1247f},{0x12544,0x12f8f},{0x12ff3,0x12fff},{0x1342f,0x1342f},{0x13439,0x143ff},{0x14647,0x167ff}, + {0x16a39,0x16a3f},{0x16a5f,0x16a5f},{0x16a6a,0x16a6d},{0x16abf,0x16abf},{0x16aca,0x16acf},{0x16aee,0x16aef}, + {0x16af6,0x16aff},{0x16b46,0x16b4f},{0x16b5a,0x16b5a},{0x16b62,0x16b62},{0x16b78,0x16b7c},{0x16b90,0x16e3f}, + {0x16e9b,0x16eff},{0x16f4b,0x16f4e},{0x16f88,0x16f8e},{0x16fa0,0x16fdf},{0x16fe5,0x16fef},{0x16ff2,0x16fff}, + {0x187f8,0x187ff},{0x18cd6,0x18cff},{0x18d09,0x1afef},{0x1aff4,0x1aff4},{0x1affc,0x1affc},{0x1afff,0x1afff}, + {0x1b123,0x1b14f},{0x1b153,0x1b163},{0x1b168,0x1b16f},{0x1b2fc,0x1bbff},{0x1bc6b,0x1bc6f},{0x1bc7d,0x1bc7f}, + {0x1bc89,0x1bc8f},{0x1bc9a,0x1bc9b},{0x1bca4,0x1ceff},{0x1cf2e,0x1cf2f},{0x1cf47,0x1cf4f},{0x1cfc4,0x1cfff}, + {0x1d0f6,0x1d0ff},{0x1d127,0x1d128},{0x1d1eb,0x1d1ff},{0x1d246,0x1d2df},{0x1d2f4,0x1d2ff},{0x1d357,0x1d35f}, + {0x1d379,0x1d3ff},{0x1d455,0x1d455},{0x1d49d,0x1d49d},{0x1d4a0,0x1d4a1},{0x1d4a3,0x1d4a4},{0x1d4a7,0x1d4a8}, + {0x1d4ad,0x1d4ad},{0x1d4ba,0x1d4ba},{0x1d4bc,0x1d4bc},{0x1d4c4,0x1d4c4},{0x1d506,0x1d506},{0x1d50b,0x1d50c}, + {0x1d515,0x1d515},{0x1d51d,0x1d51d},{0x1d53a,0x1d53a},{0x1d53f,0x1d53f},{0x1d545,0x1d545},{0x1d547,0x1d549}, + {0x1d551,0x1d551},{0x1d6a6,0x1d6a7},{0x1d7cc,0x1d7cd},{0x1da8c,0x1da9a},{0x1daa0,0x1daa0},{0x1dab0,0x1deff}, + {0x1df1f,0x1dfff},{0x1e007,0x1e007},{0x1e019,0x1e01a},{0x1e022,0x1e022},{0x1e025,0x1e025},{0x1e02b,0x1e0ff}, + {0x1e12d,0x1e12f},{0x1e13e,0x1e13f},{0x1e14a,0x1e14d},{0x1e150,0x1e28f},{0x1e2af,0x1e2bf},{0x1e2fa,0x1e2fe}, + {0x1e300,0x1e7df},{0x1e7e7,0x1e7e7},{0x1e7ec,0x1e7ec},{0x1e7ef,0x1e7ef},{0x1e7ff,0x1e7ff},{0x1e8c5,0x1e8c6}, + {0x1e8d7,0x1e8ff},{0x1e94c,0x1e94f},{0x1e95a,0x1e95d},{0x1e960,0x1ec70},{0x1ecb5,0x1ed00},{0x1ed3e,0x1edff}, + {0x1ee04,0x1ee04},{0x1ee20,0x1ee20},{0x1ee23,0x1ee23},{0x1ee25,0x1ee26},{0x1ee28,0x1ee28},{0x1ee33,0x1ee33}, + {0x1ee38,0x1ee38},{0x1ee3a,0x1ee3a},{0x1ee3c,0x1ee41},{0x1ee43,0x1ee46},{0x1ee48,0x1ee48},{0x1ee4a,0x1ee4a}, + {0x1ee4c,0x1ee4c},{0x1ee50,0x1ee50},{0x1ee53,0x1ee53},{0x1ee55,0x1ee56},{0x1ee58,0x1ee58},{0x1ee5a,0x1ee5a}, + {0x1ee5c,0x1ee5c},{0x1ee5e,0x1ee5e},{0x1ee60,0x1ee60},{0x1ee63,0x1ee63},{0x1ee65,0x1ee66},{0x1ee6b,0x1ee6b}, + {0x1ee73,0x1ee73},{0x1ee78,0x1ee78},{0x1ee7d,0x1ee7d},{0x1ee7f,0x1ee7f},{0x1ee8a,0x1ee8a},{0x1ee9c,0x1eea0}, + {0x1eea4,0x1eea4},{0x1eeaa,0x1eeaa},{0x1eebc,0x1eeef},{0x1eef2,0x1efff},{0x1f02c,0x1f02f},{0x1f094,0x1f09f}, + {0x1f0af,0x1f0b0},{0x1f0c0,0x1f0c0},{0x1f0d0,0x1f0d0},{0x1f0f6,0x1f0ff},{0x1f1ae,0x1f1e5},{0x1f203,0x1f20f}, + {0x1f23c,0x1f23f},{0x1f249,0x1f24f},{0x1f252,0x1f25f},{0x1f266,0x1f2ff},{0x1f6d8,0x1f6dc},{0x1f6ed,0x1f6ef}, + {0x1f6fd,0x1f6ff},{0x1f774,0x1f77f},{0x1f7d9,0x1f7df},{0x1f7ec,0x1f7ef},{0x1f7f1,0x1f7ff},{0x1f80c,0x1f80f}, + {0x1f848,0x1f84f},{0x1f85a,0x1f85f},{0x1f888,0x1f88f},{0x1f8ae,0x1f8af},{0x1f8b2,0x1f8ff},{0x1fa54,0x1fa5f}, + {0x1fa6e,0x1fa6f},{0x1fa75,0x1fa77},{0x1fa7d,0x1fa7f},{0x1fa87,0x1fa8f},{0x1faad,0x1faaf},{0x1fabb,0x1fabf}, + {0x1fac6,0x1facf},{0x1fada,0x1fadf},{0x1fae8,0x1faef},{0x1faf7,0x1faff},{0x1fb93,0x1fb93},{0x1fbcb,0x1fbef}, + {0x1fbfa,0x1ffff},{0x2a6e0,0x2a6ff},{0x2b739,0x2b73f},{0x2b81e,0x2b81f},{0x2cea2,0x2ceaf},{0x2ebe1,0x2f7ff}, + {0x2fa1e,0x2ffff},{0x3134b,0xe0000},{0xe0002,0xe001f},{0xe0080,0xe00ff},{0xe01f0,0xeffff},{0xffffe,0xfffff}, + {0x10fffe,0x10ffff} + }; +} +} + +namespace REGEX_RANGES { + // \s whitespace (81) + static const std::vector> Whitespace = { + {0x9,0xd},{0x20,0x20},{0x85,0x85},{0xa0,0xa0},{0x1680,0x1680},{0x2000,0x200a},{0x2028,0x2029}, + {0x202f,0x202f},{0x205f,0x205f},{0x3000,0x3000} + }; +} + +class UNICODE { +public: + std::vector to_codepoints(const std::string & str) { + std::vector result; + result.reserve(str.size()); + size_t offset = 0; + while (offset < str.size()) { + result.push_back(utf8_to_codepoint(str, offset)); + } + return result; + } + + std::vector> to_codepoints(const std::vector & str) { + std::vector> result; + result.reserve(str.size()); + for (auto & i : str) { + result.push_back(to_codepoints(i)); + } + return result; + } + + std::string to_string(const std::vector & codepoints) { + std::string result; + result.reserve(codepoints.size()*4); + for (auto & i : codepoints) { + result += codepoint_to_utf8(i); + } + return result; + } + + std::string to_string(const uint32_t & codepoints) { + return codepoint_to_utf8(codepoints); + } + + bool is_category(const uint32_t & codepoint, const std::string & UNICODE_TYPE) { + auto it = category_name_to_category_code.find(UNICODE_TYPE); + if (it != category_name_to_category_code.end()) { + return is_category_implement(codepoint, it->second); + } else { + throw std::runtime_error("Invalid UNICODE_TYPE"); + } + } + + bool is_category(const uint32_t & codepoint, const std::vector & UNICODE_TYPES) { + std::vector category_codes; + category_codes.reserve(UNICODE_TYPES.size()); + + for (auto & UNICODE_TYPE : UNICODE_TYPES) { + auto it = category_name_to_category_code.find(UNICODE_TYPE); + if (it != category_name_to_category_code.end()) { + category_codes.push_back(it->second); + } else { + throw std::runtime_error("Invalid UNICODE_TYPE"); + } + } + + return is_category_implement_batched(codepoint, category_codes); + } + + bool is_category(const uint32_t & codepoint, const std::vector & UNICODE_TYPES) { + return is_category_implement_batched(codepoint, UNICODE_TYPES); + } + + std::string to_category_name(const uint32_t & UNICODE_TYPE) { + auto it = category_code_to_category_name.find(UNICODE_TYPE); + if (it != category_code_to_category_name.end()) { + return it->second; + } else { + throw std::runtime_error("Invalid UNICODE_TYPE"); + } + } + + std::vector to_category_code(const std::vector & UNICODE_TYPES) { + std::vector result; + result.reserve(UNICODE_TYPES.size()); + for (auto & UNICODE_TYPE : UNICODE_TYPES) { + result.push_back(to_category_name(UNICODE_TYPE)); + } + return result; + } + + uint32_t to_category_code(const std::string & UNICODE_TYPE) { + auto it = category_name_to_category_code.find(UNICODE_TYPE); + if (it != category_name_to_category_code.end()) { + return it->second; + } else { + throw std::runtime_error("Invalid UNICODE_TYPE"); + } + } + + std::vector to_category_code(const std::vector & UNICODE_TYPES) { + std::vector result; + result.reserve(UNICODE_TYPES.size()); + for (auto & UNICODE_TYPE : UNICODE_TYPES) { + result.push_back(to_category_code(UNICODE_TYPE)); + } + return result; + } + + uint32_t get_category(const uint32_t & codepoint) { + return category_implement(codepoint); + } + + bool overload_category(const std::vector> & range, const std::string & UNICODE_TYPE) { + auto it = category_name_to_category_code.find(UNICODE_TYPE); + if (it != category_name_to_category_code.end()) { + for (const auto & i : range) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code[index] = it->second; + } + } + } else { + throw std::runtime_error("Invalid UNICODE_TYPE"); + } + category_overloaded = true; + return true; + } + + static std::string bytes_to_unicode_bpe(uint8_t byte) { + static std::unordered_map map = bytes_to_unicode_map_bpe(); + return map.at(byte); + } + + static uint8_t unicode_to_bytes_bpe(const std::string & utf8) { + static std::unordered_map map = unicode_to_bytes_map_bpe(); + return map.at(utf8); + } + + UNICODE () { + initialize(); + } + +private: + std::unordered_map category_name_to_category_code; + std::unordered_map category_code_to_category_name; + std::unordered_map codepoint_to_category_code; + std::vector> codepoint_ranges_low_frequency; + std::map, uint32_t> codepoint_ranges_to_category_code; + bool category_overloaded = false; + + static size_t binary_search_implement(const uint32_t & codepoint, const std::vector> & ranges) { + size_t left = 0; + size_t right = ranges.size() - 1; + + while (left <= right && right < ranges.size()) { + size_t mid = left + (right - left) / 2; + const auto& range = ranges[mid]; + + if (codepoint >= range.first && codepoint <= range.second) { + // Target is within the range of the current pair. + return mid; + } else if (codepoint < range.first) { + // Target is less than the start of the range, search in the left half. + right = mid - 1; + } else { + // Target is greater than the end of the range, search in the right half. + left = mid + 1; + } + } + throw std::runtime_error("Target out of range!"); + } + + static uint32_t utf8_to_codepoint(const std::string & utf8, size_t & offset) { + const auto byte1 = utf8[offset]; + + if (!(byte1 & 0x80)) { // 1-byte sequence + return utf8[offset++]; + } + if ((byte1 & 0xC0) != 0xC0) { // Checks for 10xx xxxx which is invalid at start + throw std::invalid_argument("invalid character"); + } + + // Pre-compute size based on the first byte + const size_t size = + !(byte1 & 0x20) ? 2 : + !(byte1 & 0x10) ? 3 : + !(byte1 & 0x08) ? 4 : 0; + + if (size == 0 || (offset + size > utf8.size())) { // Check for invalid size or string length + throw std::invalid_argument("invalid character"); + } + + uint32_t result = byte1 & (0xFF >> (size + 1)); // Initialize result with bits from the first byte + + for (size_t i = 1; i < size; ++i) { + const auto byte = utf8[offset + i]; + if ((byte & 0xC0) != 0x80) { // Following bytes must match 10xx xxxx + throw std::invalid_argument("invalid character"); + } + result = (result << 6) | (byte & 0x3F); + } + + offset += size; + return result; + } + + + static std::string codepoint_to_utf8(uint32_t codepoint) { + std::string result; + result.reserve(4); + if (/* 0x00 <= cp && */ codepoint <= 0x7f) { + result.push_back(codepoint); + } + else if (0x80 <= codepoint && codepoint <= 0x7ff) { + result.push_back(0xc0 | ((codepoint >> 6) & 0x1f)); + result.push_back(0x80 | (codepoint & 0x3f)); + } + else if (0x800 <= codepoint && codepoint <= 0xffff) { + result.push_back(0xe0 | ((codepoint >> 12) & 0x0f)); + result.push_back(0x80 | ((codepoint >> 6) & 0x3f)); + result.push_back(0x80 | (codepoint & 0x3f)); + } + else if (0x10000 <= codepoint && codepoint <= 0x10ffff) { + result.push_back(0xf0 | ((codepoint >> 18) & 0x07)); + result.push_back(0x80 | ((codepoint >> 12) & 0x3f)); + result.push_back(0x80 | ((codepoint >> 6) & 0x3f)); + result.push_back(0x80 | (codepoint & 0x3f)); + } + else { + throw std::invalid_argument("invalid codepoint"); + } + return result; + } + + uint32_t category_implement(const uint32_t & codepoint) { + if (!category_overloaded && codepoint >= 97 && codepoint <= 122) { + return 12; + } + + auto it = codepoint_to_category_code.find(codepoint); + if (it != codepoint_to_category_code.end()) { + return it->second; + } else if (codepoint < 0x110000){ + auto index = binary_search_implement(codepoint, codepoint_ranges_low_frequency); + return codepoint_ranges_to_category_code.at(codepoint_ranges_low_frequency[index]); + } else { + return UNICODE_INVALID; + } + } + + bool is_category_implement(const uint32_t & codepoint, const uint32_t & UNICODE_TYPE) { + uint32_t difference = category_implement(codepoint) - UNICODE_TYPE; + if (UNICODE_TYPE % 10 == 0) { + return (difference > 0) && (difference < 10); + } else { + return difference == 0; + } + } + + // If the codepoint satisfies at least one of the categories, then return true + bool is_category_implement_batched(const uint32_t & codepoint, const std::vector & UNICODE_TYPES) { + uint32_t codepoint_category = category_implement(codepoint); + for (auto & UNICODE_TYPE : UNICODE_TYPES) { + auto difference = codepoint_category - UNICODE_TYPE; + if (UNICODE_TYPE % 10 == 0 && difference > 0 && difference < 10) { + return true; + } + else if (difference == 0) { + return true; + } + } + return false; + } + + static std::unordered_map unicode_to_bytes_map_bpe() { + std::unordered_map map; + for (int ch = 33; ch <= 126; ++ch) { + assert(0 <= ch && ch < 256); + map[codepoint_to_utf8(ch)] = ch; + } + for (int ch = 161; ch <= 172; ++ch) { + assert(0 <= ch && ch < 256); + map[codepoint_to_utf8(ch)] = ch; + } + for (int ch = 174; ch <= 255; ++ch) { + assert(0 <= ch && ch < 256); + map[codepoint_to_utf8(ch)] = ch; + } + auto n = 0; + for (int ch = 0; ch < 256; ++ch) { + if (map.find(codepoint_to_utf8(ch)) == map.end()) { + map[codepoint_to_utf8(256 + n)] = ch; + ++n; + } + } + return map; + } + + static std::unordered_map bytes_to_unicode_map_bpe() { + std::unordered_map map; + for (int ch = 33; ch <= 126; ++ch) { + assert(0 <= ch && ch < 256); + map[ch] = codepoint_to_utf8(ch); + } + for (int ch = 161; ch <= 172; ++ch) { + assert(0 <= ch && ch < 256); + map[ch] = codepoint_to_utf8(ch); + } + for (int ch = 174; ch <= 255; ++ch) { + assert(0 <= ch && ch < 256); + map[ch] = codepoint_to_utf8(ch); + } + auto n = 0; + for (int ch = 0; ch < 256; ++ch) { + if (map.find(ch) == map.end()) { + map[ch] = codepoint_to_utf8(256 + n); + ++n; + } + } + return map; + } + + bool initialize() { + category_name_to_category_code = {{"LETTER",10}, {"MARK", 20}, {"NUMBER", 30}, + {"PUNCTUATION", 40}, {"SYMBOL", 50}, {"SEPARATOR", 60}, + {"OTHER", 70}, {"WHITESPACE", 81}, {"Lu", 11}, + {"Ll", 12}, {"Lt", 13}, {"Lm", 14}, {"Lo", 15}, + {"Mn", 21}, {"Mc", 22}, {"Me", 23}, {"Nd", 31}, + {"Nl", 32}, {"No", 33}, {"Pc", 41}, {"Pd", 42}, + {"Ps", 43}, {"Pe", 44}, {"Pi", 45}, {"Pf", 46}, + {"Po", 47}, {"Sm", 51}, {"Sc", 52}, {"Sk", 53}, + {"So", 54}, {"Zs", 61}, {"Zl", 62}, {"Zp", 63}, + {"Cc", 71}, {"Cf", 72}, {"Cs", 73}, {"Co", 74}, + {"Cn",75}}; + for (const auto & i : category_name_to_category_code) { + category_code_to_category_name[i.second] = i.first; + } + + for (const auto & i : UNICODE_RANGES::Letter::Lu) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 11); + } + } + for (const auto & i : UNICODE_RANGES::Letter::Ll) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 12); + } + } + for (const auto & i : UNICODE_RANGES::Letter::Lt) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 13); + } + } + for (const auto & i : UNICODE_RANGES::Letter::Lm) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 14); + } + } + for (const auto & i : UNICODE_RANGES::Letter::Lo) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 15); + } + } + for (const auto & i : UNICODE_RANGES::Mark::Mn) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 21); + } + } + for (const auto & i : UNICODE_RANGES::Mark::Mc) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 22); + } + } + for (const auto & i : UNICODE_RANGES::Mark::Me) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 23); + } + } + for (const auto & i : UNICODE_RANGES::Number::Nd) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 31); + } + } + for (const auto & i : UNICODE_RANGES::Number::Nl) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 32); + } + } + for (const auto & i : UNICODE_RANGES::Number::No) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 33); + } + } + for (const auto & i : UNICODE_RANGES::Punctuation::Pc) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 41); + } + } + for (const auto & i : UNICODE_RANGES::Punctuation::Pd) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 42); + } + } + for (const auto & i : UNICODE_RANGES::Punctuation::Ps) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 43); + } + } + for (const auto & i : UNICODE_RANGES::Punctuation::Pe) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 44); + } + } + for (const auto & i : UNICODE_RANGES::Punctuation::Pi) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 45); + } + } + for (const auto & i : UNICODE_RANGES::Punctuation::Pf) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 46); + } + } + for (const auto & i : UNICODE_RANGES::Punctuation::Po) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 47); + } + } + for (const auto & i : UNICODE_RANGES::Symbol::Sm) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 51); + } + } + for (const auto & i : UNICODE_RANGES::Symbol::Sc) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 52); + } + } + for (const auto & i : UNICODE_RANGES::Symbol::Sk) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 53); + } + } + for (const auto & i : UNICODE_RANGES::Symbol::So) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 54); + } + } + for (const auto & i : UNICODE_RANGES::Separator::Zs) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 61); + } + } + for (const auto & i : UNICODE_RANGES::Separator::Zl) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 62); + } + } + for (const auto & i : UNICODE_RANGES::Separator::Zp) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 63); + } + } + for (const auto & i : UNICODE_RANGES::Other::Cc) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 71); + } + } + for (const auto & i : UNICODE_RANGES::Other::Cf) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 72); + } + } + for (const auto & i : UNICODE_RANGES::Other::Cs) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 73); + } + } + for (const auto & i : UNICODE_RANGES::Other::Co) { + for (uint32_t index = i.first; index <= i.second; index++) { + codepoint_to_category_code.emplace(index, 74); + } + } + for (const auto & i : UNICODE_RANGES::Other::Cn) { + codepoint_ranges_low_frequency.push_back(i); + codepoint_ranges_to_category_code.emplace(i, 75); + } + std::sort(codepoint_ranges_low_frequency.begin(), codepoint_ranges_low_frequency.end()); + return true; + } +}; \ No newline at end of file diff --git a/unicode_regex.h b/unicode_regex.h new file mode 100644 index 0000000000000..bb9a668c835a7 --- /dev/null +++ b/unicode_regex.h @@ -0,0 +1,252 @@ +#pragma once + +#include "unicode.h" +#include "unordered_set" + +class llm_regex { +public: + std::vector gpt2_style(const std::string & str) { + std::vector results; + results.reserve(str.size()); + + auto codepoints = unicode_engine.to_codepoints(str); + + for (auto & cp : gpt2_style_implement(codepoints)) { + results.push_back(unicode_engine.to_string(cp)); + } + + return results; + } + + std::vector falcon_style(const std::string & str) { + std::vector results; + results.reserve(str.size()); + + auto codepoints = unicode_engine.to_codepoints(str); + + for (auto & cp_1 : split_punctuation_unicode_ascii(codepoints)) { + for (auto & cp_2 : gpt2_style_implement(cp_1)) { + for (auto & cp_3 : split_digits_unicode(cp_2)) { + for (auto & cp_4 : split_continuous_digits_ascii(cp_3)) { + results.push_back(unicode_engine.to_string(cp_4)); + } + } + } + } + + return results; + } + + UNICODE & get_unicode_engine() { + return unicode_engine; + } + + llm_regex() { + unicode_engine.overload_category(REGEX_RANGES::Whitespace, "WHITESPACE"); + } +private: + UNICODE unicode_engine; + + // Very basic match no metacharacter support + static bool basic_match(const std::vector> & codepoint_rules, + const std::vector & codepoints, + std::vector> & output, + size_t & offset) { + + for (auto & codepoint_rule : codepoint_rules) { + bool satisfy = true; + for (size_t ru_index = 0; ru_index < codepoint_rule.size(); ru_index++) { + if (offset + ru_index >= codepoints.size() || codepoint_rule[ru_index] != codepoints[offset + ru_index]) { + satisfy = false; + break; + } + } + if (satisfy) { + output.push_back(codepoint_rule); + offset += codepoint_rule.size(); + return true; + } + } + + return false; + } + + // "behavior": "Isolated" + // separate any continuous digits longer than 2 + static std::vector> split_continuous_digits_ascii(const std::vector & codepoints) { + std::vector> results; + results.reserve(codepoints.size()); + std::vector codepoints_buffer; + codepoints_buffer.reserve(codepoints.size()); + + size_t offset = 0; + + while (offset < codepoints.size()) { + codepoints_buffer.clear(); + uint32_t codepoint = codepoints[offset]; + uint32_t counter = 0; + + if (codepoint >= 48 && codepoint <= 57) { + while (offset < codepoints.size() && codepoints[offset] >= 48 && codepoints[offset] <= 57 && counter < 3) { + codepoints_buffer.push_back(codepoints[offset]); + offset++; + counter++; + } + } else { + while (offset < codepoints.size() && (codepoints[offset] < 48 || codepoints[offset] > 57)) { + codepoints_buffer.push_back(codepoints[offset]); + offset++; + } + } + + if (!codepoints_buffer.empty()) { + results.push_back(codepoints_buffer); + } + } + + return results; + } + + // "individual_digits": false + std::vector> split_digits_unicode(const std::vector & codepoints) { + std::vector> results; + results.reserve(codepoints.size()); + std::vector codepoints_buffer; + codepoints_buffer.reserve(codepoints.size()); + + size_t offset = 0; + + while (offset < codepoints.size()) { + codepoints_buffer.clear(); + uint32_t codepoint = codepoints[offset]; + + if (unicode_engine.is_category(codepoint, "NUMBER")) { + while (offset < codepoints.size() && unicode_engine.is_category(codepoints[offset], "NUMBER")) { + codepoints_buffer.push_back(codepoints[offset]); + offset++; + } + } else { + while (offset < codepoints.size() && !unicode_engine.is_category(codepoints[offset], "NUMBER")) { + codepoints_buffer.push_back(codepoints[offset]); + offset++; + } + } + + if (!codepoints_buffer.empty()) { + results.push_back(codepoints_buffer); + } + } + + return results; + } + + // contiguous mode only + std::vector> split_punctuation_unicode_ascii(const std::vector & codepoints) { + std::vector> results; + results.reserve(codepoints.size()); + std::vector codepoints_buffer; + codepoints_buffer.reserve(codepoints.size()); + + size_t offset = 0; + + while (offset < codepoints.size()) { + codepoints_buffer.clear(); + uint32_t codepoint = codepoints[offset]; + + if (is_ascii_punctuation(codepoint) || unicode_engine.is_category(codepoint, "PUNCTUATION")) { + while (offset < codepoints.size() && (is_ascii_punctuation(codepoints[offset]) || unicode_engine.is_category(codepoints[offset], "PUNCTUATION"))) { + codepoints_buffer.push_back(codepoints[offset]); + offset++; + } + } else { + while (offset < codepoints.size() && !(is_ascii_punctuation(codepoints[offset]) || unicode_engine.is_category(codepoints[offset], "PUNCTUATION"))) { + codepoints_buffer.push_back(codepoints[offset]); + offset++; + } + } + + if (!codepoints_buffer.empty()) { + results.push_back(codepoints_buffer); + } + } + + return results; + } + + std::vector> gpt2_style_implement(const std::vector & codepoints) { + std::vector> results; + results.reserve(codepoints.size()); + std::vector codepoints_buffer; + codepoints_buffer.reserve(codepoints.size()); + + size_t offset = 0; + + static auto codepoint_rules_1 = unicode_engine.to_codepoints({"'s", "'t", "'re", "'ve", "'m", "ll", "'d"}); + static auto codepoint_rules_2 = unicode_engine.to_category_code({"WHITESPACE", "LETTER", "NUMBER"}); + + while (offset < codepoints.size()) { + codepoints_buffer.clear(); + uint32_t codepoint = codepoints[offset]; + uint32_t codepoint_next = (offset + 1 < codepoints.size()) ? codepoints[offset + 1] : 0xFFFFFFFF; + + //'s|'t|'re|'ve|'m|'ll|'d + if (basic_match(codepoint_rules_1, codepoints, results, offset)) { + continue; + } + // ?\p{L}+ + else if (unicode_engine.is_category(codepoint, "LETTER") || (codepoint == 32 && unicode_engine.is_category(codepoint_next, "LETTER"))) { + codepoints_buffer.push_back(codepoint); + offset++; + while (offset < codepoints.size() && unicode_engine.is_category(codepoints[offset], "LETTER")) { + codepoints_buffer.push_back(codepoints[offset]); + offset++; + } + } + // ?\p{N}+ + else if (unicode_engine.is_category(codepoint, "NUMBER") || (codepoint == 32 && unicode_engine.is_category(codepoint_next, "NUMBER"))) { + codepoints_buffer.push_back(codepoint); + offset++; + while (offset < codepoints.size() && unicode_engine.is_category(codepoints[offset], "NUMBER")) { + codepoints_buffer.push_back(codepoints[offset]); + offset++; + } + } + // ?[^\s\p{L}\p{N}]+ + else if (!unicode_engine.is_category(codepoint, codepoint_rules_2) || (codepoint == 32 && !unicode_engine.is_category(codepoint_next, codepoint_rules_2))) { + codepoints_buffer.push_back(codepoint); + offset++; + while (offset < codepoints.size() && !unicode_engine.is_category(codepoints[offset], codepoint_rules_2)) { + codepoints_buffer.push_back(codepoints[offset]); + offset++; + } + } + //\s+(?!\S)|\s+ + else if (unicode_engine.is_category(codepoint, "WHITESPACE")) { + codepoints_buffer.push_back(codepoint); + offset++; + while (offset < codepoints.size() && unicode_engine.is_category(codepoints[offset], "WHITESPACE")) { + if (offset + 1 < codepoints.size() && !unicode_engine.is_category(codepoints[offset+1], "WHITESPACE")) { break;} + codepoints_buffer.push_back(codepoints[offset]); + offset++; + } + } else { + offset++; + } + + if (!codepoints_buffer.empty()) { + results.push_back(codepoints_buffer); + } + } + + return results; + } + + static bool is_ascii_punctuation(const uint32_t & codepoint) { + static std::unordered_set ascii_punctuation = {33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, + 47, 58, 59, 60, 61, 62, 63, 64, 91, 92, 93, 94, 95, 96, + 123, 124, 125, 126}; + auto it = ascii_punctuation.find(codepoint); + + return it != ascii_punctuation.end(); + } +};