@@ -71,15 +71,15 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
7171 throw std::invalid_argument (" failed to convert utf8 to codepoint" );
7272}
7373
74- // static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp ) {
74+ // static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cpt ) {
7575// std::vector<uint16_t> result;
76- // if (/* 0x0000 <= cp && */ cp <= 0xffff) {
77- // result.emplace_back(cp );
76+ // if (/* 0x0000 <= cpt && */ cpt <= 0xffff) {
77+ // result.emplace_back(cpt );
7878// return result;
7979// }
80- // if (0x10000 <= cp && cp <= 0x10ffff) {
81- // result.emplace_back(0xd800 | ((cp - 0x10000) >> 10));
82- // result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff));
80+ // if (0x10000 <= cpt && cpt <= 0x10ffff) {
81+ // result.emplace_back(0xd800 | ((cpt - 0x10000) >> 10));
82+ // result.emplace_back(0xdc00 | ((cpt - 0x10000) & 0x03ff));
8383// return result;
8484// }
8585// throw std::invalid_argument("failed to convert codepoint to utf16");
@@ -120,8 +120,8 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
120120// return result;
121121// }
122122
123- static std::vector<codepoint_flags > unicode_cpt_flags_array () {
124- std::vector<codepoint_flags > cpt_flags (MAX_CODEPOINTS, codepoint_flags ::UNDEFINED);
123+ static std::vector<unicode_cpt_flags > unicode_cpt_flags_array () {
124+ std::vector<unicode_cpt_flags > cpt_flags (MAX_CODEPOINTS, unicode_cpt_flags ::UNDEFINED);
125125
126126 assert (unicode_ranges_flags.begin ()[0 ].first == 0 );
127127 assert (unicode_ranges_flags.begin ()[unicode_ranges_flags.size ()-1 ].first == MAX_CODEPOINTS);
@@ -253,8 +253,8 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
253253 return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
254254 };
255255
256- auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
257- return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags (cpts[pos]) : codepoint_flags {};
256+ auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
257+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt (cpts[pos]) : unicode_cpt_flags {};
258258 };
259259
260260 size_t _prev_end = offset_ini;
@@ -371,8 +371,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
371371 return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
372372 };
373373
374- auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
375- return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags (cpts[pos]) : codepoint_flags {};
374+ auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
375+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt (cpts[pos]) : unicode_cpt_flags {};
376376 };
377377
378378 size_t _prev_end = offset_ini;
@@ -572,29 +572,29 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
572572// interface
573573//
574574
575- std::string unicode_cpt_to_utf8 (uint32_t cp ) {
575+ std::string unicode_cpt_to_utf8 (uint32_t cpt ) {
576576 std::string result;
577577
578- if (/* 0x00 <= cp && */ cp <= 0x7f ) {
579- result.push_back (cp );
578+ if (/* 0x00 <= cpt && */ cpt <= 0x7f ) {
579+ result.push_back (cpt );
580580 return result;
581581 }
582- if (0x80 <= cp && cp <= 0x7ff ) {
583- result.push_back (0xc0 | ((cp >> 6 ) & 0x1f ));
584- result.push_back (0x80 | (cp & 0x3f ));
582+ if (0x80 <= cpt && cpt <= 0x7ff ) {
583+ result.push_back (0xc0 | ((cpt >> 6 ) & 0x1f ));
584+ result.push_back (0x80 | (cpt & 0x3f ));
585585 return result;
586586 }
587- if (0x800 <= cp && cp <= 0xffff ) {
588- result.push_back (0xe0 | ((cp >> 12 ) & 0x0f ));
589- result.push_back (0x80 | ((cp >> 6 ) & 0x3f ));
590- result.push_back (0x80 | (cp & 0x3f ));
587+ if (0x800 <= cpt && cpt <= 0xffff ) {
588+ result.push_back (0xe0 | ((cpt >> 12 ) & 0x0f ));
589+ result.push_back (0x80 | ((cpt >> 6 ) & 0x3f ));
590+ result.push_back (0x80 | (cpt & 0x3f ));
591591 return result;
592592 }
593- if (0x10000 <= cp && cp <= 0x10ffff ) {
594- result.push_back (0xf0 | ((cp >> 18 ) & 0x07 ));
595- result.push_back (0x80 | ((cp >> 12 ) & 0x3f ));
596- result.push_back (0x80 | ((cp >> 6 ) & 0x3f ));
597- result.push_back (0x80 | (cp & 0x3f ));
593+ if (0x10000 <= cpt && cpt <= 0x10ffff ) {
594+ result.push_back (0xf0 | ((cpt >> 18 ) & 0x07 ));
595+ result.push_back (0x80 | ((cpt >> 12 ) & 0x3f ));
596+ result.push_back (0x80 | ((cpt >> 6 ) & 0x3f ));
597+ result.push_back (0x80 | (cpt & 0x3f ));
598598 return result;
599599 }
600600
@@ -624,19 +624,19 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
624624 return result;
625625}
626626
627- codepoint_flags unicode_cpt_flags (const uint32_t cp ) {
628- static const codepoint_flags undef (codepoint_flags ::UNDEFINED);
627+ unicode_cpt_flags unicode_cpt_flags_from_cpt (const uint32_t cpt ) {
628+ static const unicode_cpt_flags undef (unicode_cpt_flags ::UNDEFINED);
629629 static const auto cpt_flags = unicode_cpt_flags_array ();
630- return cp < cpt_flags.size () ? cpt_flags[cp ] : undef;
630+ return cpt < cpt_flags.size () ? cpt_flags[cpt ] : undef;
631631}
632632
633- codepoint_flags unicode_cpt_flags (const std::string & utf8) {
634- static const codepoint_flags undef (codepoint_flags ::UNDEFINED);
633+ unicode_cpt_flags unicode_cpt_flags_from_utf8 (const std::string & utf8) {
634+ static const unicode_cpt_flags undef (unicode_cpt_flags ::UNDEFINED);
635635 if (utf8.empty ()) {
636636 return undef; // undefined
637637 }
638638 size_t offset = 0 ;
639- return unicode_cpt_flags (unicode_cpt_from_utf8 (utf8, offset));
639+ return unicode_cpt_flags_from_cpt (unicode_cpt_from_utf8 (utf8, offset));
640640}
641641
642642std::string unicode_byte_to_utf8 (uint8_t byte) {
@@ -649,41 +649,41 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
649649 return map.at (utf8);
650650}
651651
652- uint32_t unicode_tolower (uint32_t cp ) {
652+ uint32_t unicode_tolower (uint32_t cpt ) {
653653 // binary search
654- auto it = std::lower_bound (unicode_map_lowercase.begin (), unicode_map_lowercase.end (), cp ,
654+ auto it = std::lower_bound (unicode_map_lowercase.begin (), unicode_map_lowercase.end (), cpt ,
655655 [](const std::pair<uint32_t , uint32_t > & pair, uint32_t value) {
656656 return pair.first < value;
657657 });
658- if (it != unicode_map_lowercase.end () && it->first == cp ) {
658+ if (it != unicode_map_lowercase.end () && it->first == cpt ) {
659659 return it->second ;
660660 }
661- return cp ; // Return the original code point if no lowercase mapping is found
661+ return cpt ; // Return the original code point if no lowercase mapping is found
662662}
663663
664664std::vector<std::string> unicode_regex_split (const std::string & text, const std::vector<std::string> & regex_exprs) {
665665 // unicode categories
666666 static const std::map<std::string, int > k_ucat_enum = {
667- { " \\ p{N}" , codepoint_flags ::NUMBER },
668- { " \\ p{L}" , codepoint_flags ::LETTER },
669- { " \\ p{P}" , codepoint_flags ::PUNCTUATION },
667+ { " \\ p{N}" , unicode_cpt_flags ::NUMBER },
668+ { " \\ p{L}" , unicode_cpt_flags ::LETTER },
669+ { " \\ p{P}" , unicode_cpt_flags ::PUNCTUATION },
670670 };
671671
672672 static const std::map<int , int > k_ucat_cpt = {
673- { codepoint_flags ::NUMBER, 0xD1 },
674- { codepoint_flags ::LETTER, 0xD2 },
675- { codepoint_flags ::PUNCTUATION, 0xD3 },
673+ { unicode_cpt_flags ::NUMBER, 0xD1 },
674+ { unicode_cpt_flags ::LETTER, 0xD2 },
675+ { unicode_cpt_flags ::PUNCTUATION, 0xD3 },
676676 };
677677
678678 static const std::map<int , std::string> k_ucat_map = {
679- { codepoint_flags ::NUMBER, " \x30 -\x39 " }, // 0-9
680- { codepoint_flags ::LETTER, " \x41 -\x5A\x61 -\x7A " }, // A-Za-z
681- { codepoint_flags ::PUNCTUATION, " \x21 -\x23\x25 -\x2A\x2C -\x2F\x3A -\x3B\x3F -\x40\\\x5B -\\\x5D\x5F\\\x7B\\\x7D " }, // !-#%-*,-/:-;?-@\[-\]_\{\}
679+ { unicode_cpt_flags ::NUMBER, " \x30 -\x39 " }, // 0-9
680+ { unicode_cpt_flags ::LETTER, " \x41 -\x5A\x61 -\x7A " }, // A-Za-z
681+ { unicode_cpt_flags ::PUNCTUATION, " \x21 -\x23\x25 -\x2A\x2C -\x2F\x3A -\x3B\x3F -\x40\\\x5B -\\\x5D\x5F\\\x7B\\\x7D " }, // !-#%-*,-/:-;?-@\[-\]_\{\}
682682 };
683683
684684 // compute collapsed codepoints only if needed by at least one regex
685685 bool need_collapse = false ;
686- for (auto & regex_expr : regex_exprs) {
686+ for (const auto & regex_expr : regex_exprs) {
687687 // search for unicode categories
688688 for (const auto & ucat : k_ucat_enum) {
689689 if (std::string::npos != regex_expr.find (ucat.first )) {
@@ -709,7 +709,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
709709 continue ;
710710 }
711711
712- const auto flags = unicode_cpt_flags (cpts[i]);
712+ const auto flags = unicode_cpt_flags_from_cpt (cpts[i]);
713713
714714 if (flags.is_whitespace ) {
715715 // NOTE: C++ std::regex \s does not match 0x85, Rust and Python regex do.
@@ -725,7 +725,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
725725
726726 std::vector<size_t > bpe_offsets = { cpts.size () };
727727
728- for (auto & regex_expr : regex_exprs) {
728+ for (const auto & regex_expr : regex_exprs) {
729729 // first, see if we have an efficient custom regex implementation
730730 auto tmp = unicode_regex_split_custom (text, regex_expr, bpe_offsets);
731731
@@ -739,7 +739,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
739739 // if a unicode category is used in the regex, we use the collapsed text and replace the unicode category
740740 // with the corresponding collapsed representation
741741 bool use_collapsed = false ;
742- for (auto & ucat : k_ucat_enum) {
742+ for (const auto & ucat : k_ucat_enum) {
743743 if (std::string::npos != regex_expr.find (ucat.first )) {
744744 use_collapsed = true ;
745745 break ;
@@ -805,7 +805,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
805805 // std::wregex \s does not match non-ASCII whitespace, using 0x0B as fallback
806806 std::wstring wtext (cpts.begin (), cpts.end ());
807807 for (size_t i = 0 ; i < wtext.size (); ++i) {
808- if (wtext[i] > 0x7F && unicode_cpt_flags (wtext[i]).is_whitespace ) {
808+ if (wtext[i] > 0x7F && unicode_cpt_flags_from_cpt (wtext[i]).is_whitespace ) {
809809 wtext[i] = 0x0B ;
810810 }
811811 }
0 commit comments