Skip to content

Commit

Permalink
Merge pull request #3 from Gioni06/optimizations
Browse files Browse the repository at this point in the history
optimizations
  • Loading branch information
Gioni06 authored Jan 19, 2023
2 parents dce81d6 + bc746af commit 2070dc4
Show file tree
Hide file tree
Showing 9 changed files with 1,416 additions and 97 deletions.
21 changes: 17 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,14 @@ Install the package from [Packagist](https://packagist.org/packages/gioni06/gpt3
composer require gioni06/gpt3-tokenizer
```

What's next:
## Testing
Loading the vocabulary files consumes a lot of memory, so you may need to raise the PHPUnit memory limit.
https://stackoverflow.com/questions/46448294/phpunit-coverage-allowed-memory-size-of-536870912-bytes-exhausted
```bash
-d memory_limit=-1
```

## What's next
- Caching for performance improvements
- Ability to work with user provided vocabulary files

Expand All @@ -23,8 +30,10 @@ What's next:
```php
use Gioni06\Gpt3Tokenizer\Gpt3Tokenizer;

$config = new Gpt3TokenizerConfig();
$tokenizer = new Gpt3Tokenizer($config);
$text = "This is some text";
$tokens = GPT3Tokenizer::encode($text);
$tokens = $tokenizer->encode($text);
// [1212,318,617,2420]
```

Expand All @@ -33,8 +42,10 @@ $tokens = GPT3Tokenizer::encode($text);
```php
use Gioni06\Gpt3Tokenizer\Gpt3Tokenizer;

$config = new Gpt3TokenizerConfig();
$tokenizer = new Gpt3Tokenizer($config);
$tokens = [1212,318,617,2420];
$text = GPT3Tokenizer::decode($tokens);
$text = $tokenizer->decode($tokens);
// "This is some text"
```

Expand All @@ -43,8 +54,10 @@ $text = GPT3Tokenizer::decode($tokens);
```php
use Gioni06\Gpt3Tokenizer\Gpt3Tokenizer;

$config = new Gpt3TokenizerConfig();
$tokenizer = new Gpt3Tokenizer($config);
$text = "This is some text";
$numberOfTokens = GPT3Tokenizer::count($text);
$numberOfTokens = $tokenizer->count($text);
// 4
```

Expand Down
3 changes: 2 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
}
],
"require": {
"php": "^8.0.2"
"php": "^8.0.2",
"ext-mbstring": "*"
},
"require-dev": {
"phpunit/phpunit": "^9.5.8"
Expand Down
5 changes: 3 additions & 2 deletions composer.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

115 changes: 80 additions & 35 deletions src/Gpt3Tokenizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,70 @@

class Gpt3Tokenizer
{
private mixed $vocab;
private array $bpeMerges;
private array $bpe_ranks;
private bool $apcuAvailable;

private array $cache = [];

private bool $useCache;


public function __construct(Gpt3TokenizerConfig $config)
{
$vocabPath = $config->getConfig()['vocabPath'];
$vocab = new Vocab($vocabPath);
$this->vocab = $vocab->data();
// Free memory that is no longer needed
unset($vocab);

$mergesPath = $config->getConfig()['mergesPath'];
$merges = new Merges($mergesPath);
$this->bpeMerges = $merges->bpeMerges();
$this->bpe_ranks = array_combine(Gpt3Tokenizer::zipBpe($this->bpeMerges), range(0, count($this->bpeMerges) - 1));
// Free memory that is no longer needed
unset($this->bpeMerges);
unset($merges);

$this->apcuAvailable = function_exists('apcu_enabled') && apcu_enabled();
$this->useCache = $config->getConfig()['useCache'];
}

private function cacheSet($key, $val): void
{
if ($this->apcuAvailable) {
apcu_store($key, $val);
} else {
$this->cache[$key] = $val;
}
}

private function cacheGet($key): mixed
{
if ($this->apcuAvailable) {
return apcu_fetch($key);
} else {
return $this->cache[$key] ?? null;
}
}

private function cacheExists($key): array|bool
{
if ($this->apcuAvailable) {
return apcu_exists($key);
} else {
return isset($this->cache[$key]);
}
}

public static function bytes_to_unicode(): array
{
$bs = array_merge(range(mb_ord('!'), mb_ord('~') + 1), range(mb_ord('¡'), mb_ord('¬') + 1), range(mb_ord('®'), mb_ord('ÿ') + 1));

$cs = $bs;
$n = 0;
for ($b = 0; $b < 2 ** 8; $b++) {
foreach (range(0, 2 ** 8 - 1) as $b) {
if (!in_array($b, $bs)) {
$bs[] = $b;
$cs[] = 2 ** 8 + $n;
Expand All @@ -26,6 +83,7 @@ public static function bytes_to_unicode(): array
array_map(function($_, $i) use(&$result, $bs, $cs) {
$result[$bs[$i]] = $cs[$i];
}, $bs, array_keys($cs));

if (array_key_exists(256, $result)) {
unset($result[256]);
}
Expand All @@ -47,13 +105,6 @@ public static function decodeStr(array $codes): string {
return implode($bytes);
}

public static function bpeMerges(array $lines): array
{
return array_map(function($x) {
return array_filter(preg_split("/(\s+)/", $x), function($e) { return strlen(trim($e)) > 0; });
}, array_slice($lines, 1, count($lines) - 1));
}

public static function get_pairs($input_arr): array
{
$pairs = array();
Expand All @@ -73,20 +124,13 @@ public static function zipBpe(array $bpeMerges): array
return $bpe;
}

public static function dictZip(array $x, array $y): array
public function bpe(string $token): string
{
return array_combine($x, $y);
}
if($this->useCache && $this->cacheExists($token)) {
return $this->cacheGet($token);
}

public static function splitString($string): array|bool|null
{
return mb_str_split($string);
}
public static function bpe(string $token): string
{
$bpeMerges = Gpt3Tokenizer::bpeMerges((new Merges())->lines());
$bpe_ranks = Gpt3Tokenizer::dictZip(Gpt3Tokenizer::zipBpe($bpeMerges), range(0, count($bpeMerges) - 1));
$chars = self::splitString($token);
$chars = mb_str_split($token);
$pairs = self::get_pairs($chars);
if(!count($pairs)) {
return implode(" ", $chars);
Expand All @@ -96,8 +140,8 @@ public static function bpe(string $token): string
$minPairs = [];
foreach ($pairs as $pair) {
$pairStr = implode(",", $pair);
if (array_key_exists($pairStr, $bpe_ranks)) {
$minPairs[$bpe_ranks[$pairStr]] = $pair;
if (array_key_exists($pairStr, $this->bpe_ranks)) {
$minPairs[$this->bpe_ranks[$pairStr]] = $pair;
} else {
$minPairs[10e10] = $pair;
}
Expand All @@ -109,7 +153,7 @@ public static function bpe(string $token): string
}, array_keys($minPairs)))];

$bigramStr = implode(",", $bigram);
if (!array_key_exists($bigramStr, $bpe_ranks)) {
if (!array_key_exists($bigramStr, $this->bpe_ranks)) {
break;
}

Expand Down Expand Up @@ -142,12 +186,15 @@ public static function bpe(string $token): string
$pairs = self::get_pairs($chars);
}
}
return implode(" ", $chars);
$result = implode(" ", $chars);
if($this->useCache) {
$this->cacheSet($token, $result);
}
return $result;
}

public static function encode(string $text): array
public function encode(string $text): array
{
$encoder = (new Vocab())->data();
$byte_encoder = self::bytes_to_unicode();
$pat = "/'s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^[:space:]\pL\pN]+|\s+(?!\S)|\s+/u";
$bpe_tokens = array();
Expand All @@ -158,37 +205,35 @@ public static function encode(string $text): array
return $byte_encoder[$x];
}, self::encodeStr($token)));

$new_tokens = array_map(function($x) use ($encoder) {
return $encoder[$x];
}, explode(' ', self::bpe($token)));
$new_tokens = array_map(function($x) {
return $this->vocab[$x];
}, explode(' ', $this->bpe($token)));
$bpe_tokens = array_merge($bpe_tokens, $new_tokens);
}
return $bpe_tokens;
}

public static function decode(array $tokens): string
public function decode(array $tokens): string
{
$encoder = (new Vocab())->data();
$decoder = array_flip($encoder);
$decoder = array_flip($this->vocab);
$byte_decoder = array_flip(self::bytes_to_unicode());

$text = array_map(function($x) use ($decoder) {
return $decoder[$x];
}, $tokens);

$text = implode($text);
$chars = self::splitString($text);
$chars = mb_str_split($text);
$decodedChars = array();
for ($i = 0; $i < count($chars); $i++) {
$decodedChars[] = $byte_decoder[$chars[$i]];
}
return self::decodeStr($decodedChars);
}

public static function count(string $text): int
public function count(string $text): int
{
$tokens = self::encode($text);
return count($tokens);
}
public function __construct(){}
}
36 changes: 36 additions & 0 deletions src/Gpt3TokenizerConfig.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
<?php

namespace Gioni06\Gpt3Tokenizer;

/**
 * Fluent configuration object for Gpt3Tokenizer.
 *
 * Holds the paths to the BPE merges / vocabulary files and the cache toggle.
 * All setters return $this so calls can be chained before passing the config
 * to the tokenizer constructor.
 */
class Gpt3TokenizerConfig
{
    // Defaults point at the pretrained GPT-3 vocabulary files shipped with
    // the package; 'useCache' enables BPE-result memoization in the tokenizer.
    private array $config = [
        'mergesPath' => __DIR__ . '/pretrained_vocab_files/merges.txt',
        'vocabPath' => __DIR__ . '/pretrained_vocab_files/vocab.json',
        'useCache' => true,
    ];

    /**
     * Override the path to the BPE merges file.
     *
     * @param string $path Filesystem path to a merges.txt-style file.
     */
    public function mergesPath(string $path): Gpt3TokenizerConfig
    {
        $this->config['mergesPath'] = $path;
        return $this;
    }

    /**
     * Override the path to the vocabulary (token => id) JSON file.
     *
     * @param string $path Filesystem path to a vocab.json-style file.
     */
    public function vocabPath(string $path): Gpt3TokenizerConfig
    {
        $this->config['vocabPath'] = $path;
        return $this;
    }

    /**
     * Enable or disable caching of BPE results (APCu when available,
     * otherwise an in-memory array inside the tokenizer).
     *
     * @param bool $useCache
     */
    public function useCache(bool $useCache): Gpt3TokenizerConfig
    {
        $this->config['useCache'] = $useCache;
        return $this;
    }

    /**
     * Return the assembled configuration.
     *
     * @return array{mergesPath: string, vocabPath: string, useCache: bool}
     */
    public function getConfig(): array
    {
        return $this->config;
    }
}
25 changes: 19 additions & 6 deletions src/Merges.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,28 @@

namespace Gioni06\Gpt3Tokenizer;
class Merges {
private array $merges;

public function __construct(string $path = __DIR__ . '/pretrained_vocab_files/merges.txt')
public function __construct(private string $path = __DIR__ . '/pretrained_vocab_files/merges.txt')
{
$this->merges = file($path, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
}

public function lines()
public function bpeMerges(): array
{
return $this->merges;
$lines = [];
$fp = @fopen($this->path, "r");
if ($fp) {
// drop the first line of the buffer
fgets($fp, 300);
while (($buffer = fgets($fp, 300)) !== false) {
$line = array_filter(preg_split("/(\s+)/", $buffer), function($e) {
return strlen(trim($e)) > 0;
});
$lines[] = $line;
}
if (!feof($fp)) {
throw new Exception("Error: unexpected fgets() fail\n");
}
fclose($fp);
}
return $lines;
}
}
Loading

0 comments on commit 2070dc4

Please sign in to comment.