Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix #5 Encoding text that contains certain emojis. #6

Merged
merged 2 commits into from
Jan 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"source": "https://github.com/Gioni06/GPT3Tokenizer"
},
"scripts": {
"test": "vendor/bin/phpunit -d memory_limit=-1 tests"
},
"authors": [
{
Expand Down
290 changes: 264 additions & 26 deletions src/Gpt3Tokenizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ public function __construct(Gpt3TokenizerConfig $config)
private function cacheSet($key, $val): void
{
if ($this->apcuAvailable) {
/** @noinspection PhpComposerExtensionStubsInspection */
apcu_store($key, $val);
} else {
$this->cache[$key] = $val;
Expand All @@ -46,6 +47,7 @@ private function cacheSet($key, $val): void
private function cacheGet($key): mixed
{
if ($this->apcuAvailable) {
/** @noinspection PhpComposerExtensionStubsInspection */
return apcu_fetch($key);
} else {
return $this->cache[$key] ?? null;
Expand All @@ -55,6 +57,7 @@ private function cacheGet($key): mixed
private function cacheExists($key): array|bool
{
if ($this->apcuAvailable) {
/** @noinspection PhpComposerExtensionStubsInspection */
return apcu_exists($key);
} else {
return isset($this->cache[$key]);
Expand All @@ -63,32 +66,267 @@ private function cacheExists($key): array|bool

public static function bytes_to_unicode(): array
{
    // Maps every possible byte value (0-255) to a printable unicode character,
    // exactly as in OpenAI's GPT-2/GPT-3 byte-level BPE tokenizer.
    //
    // The table is hard-coded rather than computed at runtime because:
    //  - it is much faster than decoding/deriving the mapping on every call, and
    //  - it sidesteps the inclusive-range bug of the previous computed version
    //    (PHP's range() includes its end value, so `mb_ord('~') + 1` wrongly
    //    pulled byte 127 into the "printable" set), which broke encoding of
    //    certain emojis (issue #5).
    //
    // Layout of the map:
    //  - "printable" bytes (33-126, 161-172, 174-255) map to themselves;
    //  - the remaining 68 bytes (0-32, 127-160, 173) map to code points 256+n
    //    ('Ā', 'ā', ... 'Ń') so every byte has a visible, unambiguous character.
    // Output matches OpenAI's reference tokenizer: https://beta.openai.com/tokenizer
    return [
        0 => 'Ā', 1 => 'ā', 2 => 'Ă', 3 => 'ă', 4 => 'Ą', 5 => 'ą', 6 => 'Ć', 7 => 'ć',
        8 => 'Ĉ', 9 => 'ĉ', 10 => 'Ċ', 11 => 'ċ', 12 => 'Č', 13 => 'č', 14 => 'Ď', 15 => 'ď',
        16 => 'Đ', 17 => 'đ', 18 => 'Ē', 19 => 'ē', 20 => 'Ĕ', 21 => 'ĕ', 22 => 'Ė', 23 => 'ė',
        24 => 'Ę', 25 => 'ę', 26 => 'Ě', 27 => 'ě', 28 => 'Ĝ', 29 => 'ĝ', 30 => 'Ğ', 31 => 'ğ',
        32 => 'Ġ', 33 => '!', 34 => '"', 35 => '#', 36 => '$', 37 => '%', 38 => '&', 39 => '\'',
        40 => '(', 41 => ')', 42 => '*', 43 => '+', 44 => ',', 45 => '-', 46 => '.', 47 => '/',
        48 => '0', 49 => '1', 50 => '2', 51 => '3', 52 => '4', 53 => '5', 54 => '6', 55 => '7',
        56 => '8', 57 => '9', 58 => ':', 59 => ';', 60 => '<', 61 => '=', 62 => '>', 63 => '?',
        64 => '@', 65 => 'A', 66 => 'B', 67 => 'C', 68 => 'D', 69 => 'E', 70 => 'F', 71 => 'G',
        72 => 'H', 73 => 'I', 74 => 'J', 75 => 'K', 76 => 'L', 77 => 'M', 78 => 'N', 79 => 'O',
        80 => 'P', 81 => 'Q', 82 => 'R', 83 => 'S', 84 => 'T', 85 => 'U', 86 => 'V', 87 => 'W',
        88 => 'X', 89 => 'Y', 90 => 'Z', 91 => '[', 92 => '\\', 93 => ']', 94 => '^', 95 => '_',
        96 => '`', 97 => 'a', 98 => 'b', 99 => 'c', 100 => 'd', 101 => 'e', 102 => 'f', 103 => 'g',
        104 => 'h', 105 => 'i', 106 => 'j', 107 => 'k', 108 => 'l', 109 => 'm', 110 => 'n', 111 => 'o',
        112 => 'p', 113 => 'q', 114 => 'r', 115 => 's', 116 => 't', 117 => 'u', 118 => 'v', 119 => 'w',
        120 => 'x', 121 => 'y', 122 => 'z', 123 => '{', 124 => '|', 125 => '}', 126 => '~', 127 => 'ġ',
        128 => 'Ģ', 129 => 'ģ', 130 => 'Ĥ', 131 => 'ĥ', 132 => 'Ħ', 133 => 'ħ', 134 => 'Ĩ', 135 => 'ĩ',
        136 => 'Ī', 137 => 'ī', 138 => 'Ĭ', 139 => 'ĭ', 140 => 'Į', 141 => 'į', 142 => 'İ', 143 => 'ı',
        144 => 'IJ', 145 => 'ij', 146 => 'Ĵ', 147 => 'ĵ', 148 => 'Ķ', 149 => 'ķ', 150 => 'ĸ', 151 => 'Ĺ',
        152 => 'ĺ', 153 => 'Ļ', 154 => 'ļ', 155 => 'Ľ', 156 => 'ľ', 157 => 'Ŀ', 158 => 'ŀ', 159 => 'Ł',
        160 => 'ł', 161 => '¡', 162 => '¢', 163 => '£', 164 => '¤', 165 => '¥', 166 => '¦', 167 => '§',
        168 => '¨', 169 => '©', 170 => 'ª', 171 => '«', 172 => '¬', 173 => 'Ń', 174 => '®', 175 => '¯',
        176 => '°', 177 => '±', 178 => '²', 179 => '³', 180 => '´', 181 => 'µ', 182 => '¶', 183 => '·',
        184 => '¸', 185 => '¹', 186 => 'º', 187 => '»', 188 => '¼', 189 => '½', 190 => '¾', 191 => '¿',
        192 => 'À', 193 => 'Á', 194 => 'Â', 195 => 'Ã', 196 => 'Ä', 197 => 'Å', 198 => 'Æ', 199 => 'Ç',
        200 => 'È', 201 => 'É', 202 => 'Ê', 203 => 'Ë', 204 => 'Ì', 205 => 'Í', 206 => 'Î', 207 => 'Ï',
        208 => 'Ð', 209 => 'Ñ', 210 => 'Ò', 211 => 'Ó', 212 => 'Ô', 213 => 'Õ', 214 => 'Ö', 215 => '×',
        216 => 'Ø', 217 => 'Ù', 218 => 'Ú', 219 => 'Û', 220 => 'Ü', 221 => 'Ý', 222 => 'Þ', 223 => 'ß',
        224 => 'à', 225 => 'á', 226 => 'â', 227 => 'ã', 228 => 'ä', 229 => 'å', 230 => 'æ', 231 => 'ç',
        232 => 'è', 233 => 'é', 234 => 'ê', 235 => 'ë', 236 => 'ì', 237 => 'í', 238 => 'î', 239 => 'ï',
        240 => 'ð', 241 => 'ñ', 242 => 'ò', 243 => 'ó', 244 => 'ô', 245 => 'õ', 246 => 'ö', 247 => '÷',
        248 => 'ø', 249 => 'ù', 250 => 'ú', 251 => 'û', 252 => 'ü', 253 => 'ý', 254 => 'þ', 255 => 'ÿ',
    ];
}

public static function encodeStr(string $str): array {
Expand Down
14 changes: 13 additions & 1 deletion tests/Gpt3TokenizerTest.php
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
<?php
<?php /** @noinspection SpellCheckingInspection */

namespace Gioni06\Gpt3Tokenizer\Tests;

use Gioni06\Gpt3Tokenizer\Gpt3Tokenizer;
// Verifies encodeStr() converts a string to its decimal UTF-8 byte values,
// including multi-byte emoji (the telescope case is the issue-#5 regression input).
public function test_encodeStr_function(): void
{
    $this->assertEquals([ '32', '119', '111', '114', '108', '100' ], Gpt3Tokenizer::encodeStr(" world"));
    $this->assertEquals([ '32', '240', '159', '140', '141' ], Gpt3Tokenizer::encodeStr(" 🌍"));
    $this->assertEquals([ '240', '159', '148', '173' ], Gpt3Tokenizer::encodeStr("🔭"));
}


// Verifies decodeStr() reassembles a string from its decimal UTF-8 byte values,
// including multi-byte emoji sequences.
public function test_decodeStr_function(): void
{
    $cases = [
        " world" => [ '32', '119', '111', '114', '108', '100' ],
        " 🌍" => [ '32', '240', '159', '140', '141' ],
        "🔭" => [ '240', '159', '148', '173' ],
    ];
    foreach ($cases as $expected => $bytes) {
        $this->assertEquals($expected, Gpt3Tokenizer::decodeStr($bytes));
    }
}

public function test_get_pairs_function()
Expand Down Expand Up @@ -58,6 +61,7 @@ public function test_bytes_to_unicode_function()
$this->assertEquals("d", Gpt3Tokenizer::bytes_to_unicode()[100]);
$this->assertEquals("È", Gpt3Tokenizer::bytes_to_unicode()[200]);
$this->assertEquals("ÿ", Gpt3Tokenizer::bytes_to_unicode()[255]);
$this->assertCount(256, Gpt3Tokenizer::bytes_to_unicode());
}

/*
Expand Down Expand Up @@ -138,4 +142,12 @@ public function test_config()
$this->assertStringEndsWith('vocab.json', $config->getConfig()['vocabPath']);
$this->assertFalse($config->getConfig()['useCache']);
}

/**
 * Regression test for issue #5: encoding text that contains certain emojis
 * (telescope emoji, U+1F52D) must not fail, and the resulting token ids must
 * match OpenAI's reference tokenizer output.
 */
public function test_regression_issue_5(): void
{
    $config = new Gpt3TokenizerConfig();
    $tokenizer = new Gpt3Tokenizer($config);
    $tokens = $tokenizer->encode("🔭");
    $this->assertEquals([8582, 242, 255], $tokens);
}
}