feat: improve performance, memory usage & initialization time (#50)
fixes #18
fixes #35

feat: add missing models
chore: preliminary benchmarking tool
feat: update tokenization format to make the package smaller
docs: update README
chore: add npmignore
niieani authored Sep 20, 2024
1 parent fcbf48a commit e2c560a
Showing 72 changed files with 5,168 additions and 580 deletions.
57 changes: 57 additions & 0 deletions .npmignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
.vscode
.eslintignore
.eslintrc.js
.prettierignore
.editorconfig
tsconfig.json
tsconfig.*.json
babel.config.js
jest.config.js
prettier.config.js
vite.config.ts
webpack.config.js
*.tsbuildinfo

/.pnp.*

# Logs
logs/
*.log

# Cache
.eslintcache
.idea
.npm
.vscode
.yarnclean

# Directories
coverage/
dist/
dts/
esm/
mjs/
cjs/
node_modules/

# Custom
*.min.js
*.map

.DS_Store
*.local

tsconfig.json
jest.config.js

benchmark/
.yarn/
.config/
.github/
.swc/
.vscode/
data/
docs/
.yarnrc.yml
*.config.js
tsconfig*.json
94 changes: 40 additions & 54 deletions README.md
@@ -12,6 +12,7 @@ As of 2023, it is the most feature-complete, open-source GPT tokenizer on NPM. I

- Support for easily tokenizing chats thanks to the `encodeChat` function
- Support for all current OpenAI models (available encodings: `r50k_base`, `p50k_base`, `p50k_edit`, `cl100k_base` and `o200k_base`)
- Can be loaded and used synchronously (i.e. in non-async/await contexts)
- Generator function versions of both the decoder and encoder functions
- Provides the ability to decode an asynchronous stream of data (using `decodeAsyncGenerator` and `decodeGenerator` with any iterable input)
- No global cache (no accidental memory leaks, as with the original GPT-3-Encoder implementation)
@@ -45,11 +46,11 @@ npm install gpt-tokenizer

If you wish to use a custom encoding, fetch the relevant script.

- https://unpkg.com/gpt-tokenizer/dist/o200k_base.js (for `gpt-4o`)
- https://unpkg.com/gpt-tokenizer/dist/cl100k_base.js (for `gpt-4-*` and `gpt-3.5-turbo`)
- https://unpkg.com/gpt-tokenizer/dist/p50k_base.js
- https://unpkg.com/gpt-tokenizer/dist/p50k_edit.js
- https://unpkg.com/gpt-tokenizer/dist/r50k_base.js

The global name is a concatenation: `GPTTokenizer_${encoding}`.

@@ -131,7 +132,8 @@ import {
encode,
decode,
isWithinTokenLimit,
// etc...
} from 'gpt-tokenizer/model/gpt-3.5-turbo'
```

If you're dealing with a resolver that doesn't support package.json `exports` resolution, you might need to import from the respective `cjs` or `esm` directory, e.g.:
@@ -141,65 +143,47 @@ import {
encode,
decode,
isWithinTokenLimit,
// etc...
} from 'gpt-tokenizer/cjs/model/gpt-3.5-turbo'
```

#### Lazy loading

If you don't mind loading the tokenizer asynchronously, you can use a dynamic import inside your function, like so:
```ts
const {
encode,
decode,
isWithinTokenLimit,
// etc...
} = await import('gpt-tokenizer/model/gpt-3.5-turbo')
```

#### Loading an encoding

If your model isn't supported by the package, but you know which BPE encoding it uses, you can load the encoding directly, e.g.:

```ts
import {
encode,
decode,
isWithinTokenLimit,
// etc...
} from 'gpt-tokenizer/encoding/cl100k_base'
```

### Supported models and their encodings

- `gpt-4o` (`o200k_base`)
- `gpt-4-*` (`cl100k_base`)
- `gpt-3.5-turbo` (`cl100k_base`)
- `text-davinci-003` (`p50k_base`)
- `text-davinci-002` (`p50k_base`)
- `text-davinci-001` (`r50k_base`)
- `text-curie-001` (`r50k_base`)
- `text-babbage-001` (`r50k_base`)
- `text-ada-001` (`r50k_base`)
- `davinci` (`r50k_base`)
- `curie` (`r50k_base`)
- `babbage` (`r50k_base`)
- `ada` (`r50k_base`)

- ...and many other models, see [mapping](./src/mapping.ts) for an up-to-date list of supported models and their encodings.

Note: if you're using `gpt-3.5-*` or `gpt-4-*` and don't see the model you're looking for, use the `cl100k_base` encoding directly.

## API

@@ -261,6 +245,8 @@ const chat = [
const tokens = encodeChat(chat)
```

Note that encoding an empty chat still produces the minimum number of special tokens required by the chat format.

### `encodeGenerator(text: string): Generator<number[], void, undefined>`

Encodes the given text using a generator, yielding chunks of tokens.
@@ -353,10 +339,10 @@ Similarly, you can specify custom sets of disallowed special tokens when encoding text, by passing a `Set`
containing the disallowed special tokens as a parameter to the `encode` function:

```ts
import { encode } from 'gpt-tokenizer'
import { encode, EndOfText } from 'gpt-tokenizer'

const inputText = `Some Text`
const disallowedSpecial = new Set(['Some'])
const inputText = `Some Text ${EndOfText}`
const disallowedSpecial = new Set([EndOfText])
// throws an error:
const encoded = encode(inputText, undefined, disallowedSpecial)
```
3 changes: 3 additions & 0 deletions benchmark/.gitignore
@@ -0,0 +1,3 @@
!tsconfig.json
.yarn
results/
1 change: 1 addition & 0 deletions benchmark/README.md
@@ -0,0 +1 @@
# benchmark
24 changes: 24 additions & 0 deletions benchmark/package.json
@@ -0,0 +1,24 @@
{
"name": "benchmark",
"packageManager": "yarn@4.5.0",
"type": "module",
"scripts": {
"start": "yarn tsc && node dist/benchmarkRunner.js",
"profile": "yarn tsc && node --inspect dist/benchmarkWorker.js --run"
},
"devDependencies": {
"@types/node": "^22.5.5",
"chalk": "^5.3.0",
"cli-table3": "^0.6.5",
"inquirer": "^11.0.2",
"typescript": "^5.6.2"
},
"dependencies": {
"gpt-3-encoder": "^1.1.4",
"gpt-tokenizer": "^2.2.3",
"gpt3-tokenizer": "^1.1.5",
"js-tiktoken": "^1.0.14",
"tiktoken": "^1.0.16",
"tiktoken-node": "^0.0.7"
}
}
