Skip to content

Commit

Permalink
Merge pull request #3 from Gioni06/optimizations
Browse files Browse the repository at this point in the history
optimizations
  • Loading branch information
Gioni06 authored Jan 19, 2023
2 parents dce81d6 + bc746af commit 2070dc4
Show file tree
Hide file tree
Showing 9 changed files with 1,416 additions and 97 deletions.
21 changes: 17 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,14 @@ Install the package from [Packagist](https://packagist.org/packages/gioni06/gpt3
composer require gioni06/gpt3-tokenizer
```

What's next:
## Testing
Loading the vocabulary files consumes a lot of memory, so you may need to raise the PHPUnit memory limit.
https://stackoverflow.com/questions/46448294/phpunit-coverage-allowed-memory-size-of-536870912-bytes-exhausted
```bash
-d memory_limit=-1
```

## What's next
- Caching for performance improvements
- Ability to work with user provided vocabulary files

Expand All @@ -23,8 +30,10 @@ What's next:
```php
use Gioni06\Gpt3Tokenizer\Gpt3Tokenizer;

$config = new Gpt3TokenizerConfig();
$tokenizer = new Gpt3Tokenizer($config);
$text = "This is some text";
$tokens = GPT3Tokenizer::encode($text);
$tokens = $tokenizer->encode($text);
// [1212,318,617,2420]
```

Expand All @@ -33,8 +42,10 @@ $tokens = GPT3Tokenizer::encode($text);
```php
use Gioni06\Gpt3Tokenizer\Gpt3Tokenizer;

$config = new Gpt3TokenizerConfig();
$tokenizer = new Gpt3Tokenizer($config);
$tokens = [1212,318,617,2420];
$text = GPT3Tokenizer::decode($tokens);
$text = $tokenizer->decode($tokens);
// "This is some text"
```

Expand All @@ -43,8 +54,10 @@ $text = GPT3Tokenizer::decode($tokens);
```php
use Gioni06\Gpt3Tokenizer\Gpt3Tokenizer;

$config = new Gpt3TokenizerConfig();
$tokenizer = new Gpt3Tokenizer($config);
$text = "This is some text";
$numberOfTokens = GPT3Tokenizer::count($text);
$numberOfTokens = $tokenizer->count($text);
// 4
```

Expand Down
3 changes: 2 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
}
],
"require": {
"php": "^8.0.2"
"php": "^8.0.2",
"ext-mbstring": "*"
},
"require-dev": {
"phpunit/phpunit": "^9.5.8"
Expand Down
5 changes: 3 additions & 2 deletions composer.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

115 changes: 80 additions & 35 deletions src/Gpt3Tokenizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,70 @@

class Gpt3Tokenizer
{
private mixed $vocab;
private array $bpeMerges;
private array $bpe_ranks;
private bool $apcuAvailable;

private array $cache = [];

private bool $useCache;


public function __construct(Gpt3TokenizerConfig $config)
{
$vocabPath = $config->getConfig()['vocabPath'];
$vocab = new Vocab($vocabPath);
$this->vocab = $vocab->data();
// Free memory that is no longer needed
unset($vocab);

$mergesPath = $config->getConfig()['mergesPath'];
$merges = new Merges($mergesPath);
$this->bpeMerges = $merges->bpeMerges();
$this->bpe_ranks = array_combine(Gpt3Tokenizer::zipBpe($this->bpeMerges), range(0, count($this->bpeMerges) - 1));
// Free memory that is no longer needed
unset($this->bpeMerges);
unset($merges);

$this->apcuAvailable = function_exists('apcu_enabled') && apcu_enabled();
$this->useCache = $config->getConfig()['useCache'];
}

private function cacheSet($key, $val): void
{
if ($this->apcuAvailable) {
apcu_store($key, $val);
} else {
$this->cache[$key] = $val;
}
}

private function cacheGet($key): mixed
{
if ($this->apcuAvailable) {
return apcu_fetch($key);
} else {
return $this->cache[$key] ?? null;
}
}

private function cacheExists($key): array|bool
{
if ($this->apcuAvailable) {
return apcu_exists($key);
} else {
return isset($this->cache[$key]);
}
}

public static function bytes_to_unicode(): array
{
$bs = array_merge(range(mb_ord('!'), mb_ord('~') + 1), range(mb_ord('¡'), mb_ord('¬') + 1), range(mb_ord('®'), mb_ord('ÿ') + 1));

$cs = $bs;
$n = 0;
for ($b = 0; $b < 2 ** 8; $b++) {
foreach (range(0, 2 ** 8 - 1) as $b) {
if (!in_array($b, $bs)) {
$bs[] = $b;
$cs[] = 2 ** 8 + $n;
Expand All @@ -26,6 +83,7 @@ public static function bytes_to_unicode(): array
array_map(function($_, $i) use(&$result, $bs, $cs) {
$result[$bs[$i]] = $cs[$i];
}, $bs, array_keys($cs));

if (array_key_exists(256, $result)) {
unset($result[256]);
}
Expand All @@ -47,13 +105,6 @@ public static function decodeStr(array $codes): string {
return implode($bytes);
}

public static function bpeMerges(array $lines): array
{
return array_map(function($x) {
return array_filter(preg_split("/(\s+)/", $x), function($e) { return strlen(trim($e)) > 0; });
}, array_slice($lines, 1, count($lines) - 1));
}

public static function get_pairs($input_arr): array
{
$pairs = array();
Expand All @@ -73,20 +124,13 @@ public static function zipBpe(array $bpeMerges): array
return $bpe;
}

public static function dictZip(array $x, array $y): array
public function bpe(string $token): string
{
return array_combine($x, $y);
}
if($this->useCache && $this->cacheExists($token)) {
return $this->cacheGet($token);
}

public static function splitString($string): array|bool|null
{
return mb_str_split($string);
}
public static function bpe(string $token): string
{
$bpeMerges = Gpt3Tokenizer::bpeMerges((new Merges())->lines());
$bpe_ranks = Gpt3Tokenizer::dictZip(Gpt3Tokenizer::zipBpe($bpeMerges), range(0, count($bpeMerges) - 1));
$chars = self::splitString($token);
$chars = mb_str_split($token);
$pairs = self::get_pairs($chars);
if(!count($pairs)) {
return implode(" ", $chars);
Expand All @@ -96,8 +140,8 @@ public static function bpe(string $token): string
$minPairs = [];
foreach ($pairs as $pair) {
$pairStr = implode(",", $pair);
if (array_key_exists($pairStr, $bpe_ranks)) {
$minPairs[$bpe_ranks[$pairStr]] = $pair;
if (array_key_exists($pairStr, $this->bpe_ranks)) {
$minPairs[$this->bpe_ranks[$pairStr]] = $pair;
} else {
$minPairs[10e10] = $pair;
}
Expand All @@ -109,7 +153,7 @@ public static function bpe(string $token): string
}, array_keys($minPairs)))];

$bigramStr = implode(",", $bigram);
if (!array_key_exists($bigramStr, $bpe_ranks)) {
if (!array_key_exists($bigramStr, $this->bpe_ranks)) {
break;
}

Expand Down Expand Up @@ -142,12 +186,15 @@ public static function bpe(string $token): string
$pairs = self::get_pairs($chars);
}
}
return implode(" ", $chars);
$result = implode(" ", $chars);
if($this->useCache) {
$this->cacheSet($token, $result);
}
return $result;
}

public static function encode(string $text): array
public function encode(string $text): array
{
$encoder = (new Vocab())->data();
$byte_encoder = self::bytes_to_unicode();
$pat = "/'s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^[:space:]\pL\pN]+|\s+(?!\S)|\s+/u";
$bpe_tokens = array();
Expand All @@ -158,37 +205,35 @@ public static function encode(string $text): array
return $byte_encoder[$x];
}, self::encodeStr($token)));

$new_tokens = array_map(function($x) use ($encoder) {
return $encoder[$x];
}, explode(' ', self::bpe($token)));
$new_tokens = array_map(function($x) {
return $this->vocab[$x];
}, explode(' ', $this->bpe($token)));
$bpe_tokens = array_merge($bpe_tokens, $new_tokens);
}
return $bpe_tokens;
}

public static function decode(array $tokens): string
public function decode(array $tokens): string
{
$encoder = (new Vocab())->data();
$decoder = array_flip($encoder);
$decoder = array_flip($this->vocab);
$byte_decoder = array_flip(self::bytes_to_unicode());

$text = array_map(function($x) use ($decoder) {
return $decoder[$x];
}, $tokens);

$text = implode($text);
$chars = self::splitString($text);
$chars = mb_str_split($text);
$decodedChars = array();
for ($i = 0; $i < count($chars); $i++) {
$decodedChars[] = $byte_decoder[$chars[$i]];
}
return self::decodeStr($decodedChars);
}

public static function count(string $text): int
public function count(string $text): int
{
$tokens = self::encode($text);
return count($tokens);
}
public function __construct(){}
}
36 changes: 36 additions & 0 deletions src/Gpt3TokenizerConfig.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
<?php

namespace Gioni06\Gpt3Tokenizer;

/**
 * Fluent configuration object for Gpt3Tokenizer.
 *
 * Holds the paths to the BPE merges / vocabulary files and the cache toggle.
 * All setters return $this so calls can be chained before passing the config
 * to the tokenizer constructor.
 */
class Gpt3TokenizerConfig
{
    // Defaults point at the pretrained GPT-3 vocabulary files shipped with
    // the package; 'useCache' enables BPE-result memoization in the tokenizer.
    private array $config = [
        'mergesPath' => __DIR__ . '/pretrained_vocab_files/merges.txt',
        'vocabPath' => __DIR__ . '/pretrained_vocab_files/vocab.json',
        'useCache' => true,
    ];

    /**
     * Override the path to the BPE merges file.
     *
     * @param string $path Filesystem path to a merges.txt-style file.
     */
    public function mergesPath(string $path): Gpt3TokenizerConfig
    {
        $this->config['mergesPath'] = $path;
        return $this;
    }

    /**
     * Override the path to the vocabulary (token => id) JSON file.
     *
     * @param string $path Filesystem path to a vocab.json-style file.
     */
    public function vocabPath(string $path): Gpt3TokenizerConfig
    {
        $this->config['vocabPath'] = $path;
        return $this;
    }

    /**
     * Enable or disable caching of BPE results (APCu when available,
     * otherwise an in-memory array inside the tokenizer).
     *
     * @param bool $useCache
     */
    public function useCache(bool $useCache): Gpt3TokenizerConfig
    {
        $this->config['useCache'] = $useCache;
        return $this;
    }

    /**
     * Return the assembled configuration.
     *
     * @return array{mergesPath: string, vocabPath: string, useCache: bool}
     */
    public function getConfig(): array
    {
        return $this->config;
    }
}
25 changes: 19 additions & 6 deletions src/Merges.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,28 @@

namespace Gioni06\Gpt3Tokenizer;
class Merges {
private array $merges;

public function __construct(string $path = __DIR__ . '/pretrained_vocab_files/merges.txt')
public function __construct(private string $path = __DIR__ . '/pretrained_vocab_files/merges.txt')
{
$this->merges = file($path, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
}

public function lines()
public function bpeMerges(): array
{
return $this->merges;
$lines = [];
$fp = @fopen($this->path, "r");
if ($fp) {
// drop the first line of the buffer
fgets($fp, 300);
while (($buffer = fgets($fp, 300)) !== false) {
$line = array_filter(preg_split("/(\s+)/", $buffer), function($e) {
return strlen(trim($e)) > 0;
});
$lines[] = $line;
}
if (!feof($fp)) {
throw new Exception("Error: unexpected fgets() fail\n");
}
fclose($fp);
}
return $lines;
}
}
Loading

0 comments on commit 2070dc4

Please sign in to comment.