From 9d01e5670ff50eb74cdb96406c7f3d9add0ae2f8 Mon Sep 17 00:00:00 2001 From: Shantanu Date: Mon, 13 May 2024 10:05:33 -0700 Subject: [PATCH] Sync codebase --- CHANGELOG.md | 4 ++++ Cargo.toml | 2 +- pyproject.toml | 3 +-- tiktoken/model.py | 2 ++ tiktoken_ext/openai_public.py | 30 ++++++++++++++++++++++++++++++ 5 files changed, 38 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 049fdb4d..f94795ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ This is the changelog for the open source version of tiktoken. +## [v0.7.0] +- Support for `gpt-4o` +- Performance improvements + ## [v0.6.0] - Optimise regular expressions for a 20% performance improvement, thanks to @paplorinc! - Add `text-embedding-3-*` models to `encoding_for_model` diff --git a/Cargo.toml b/Cargo.toml index 14588580..4efb156f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tiktoken" -version = "0.6.0" +version = "0.7.0" edition = "2021" rust-version = "1.57.0" diff --git a/pyproject.toml b/pyproject.toml index 47aada31..7cc7cb10 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "tiktoken" -version = "0.6.0" +version = "0.7.0" description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models" readme = "README.md" license = {file = "LICENSE"} @@ -42,4 +42,3 @@ test-command = "pytest {project}/tests --import-mode=append" [[tool.cibuildwheel.overrides]] select = "*linux_aarch64" test-command = """python -c 'import tiktoken; enc = tiktoken.get_encoding("gpt2"); assert enc.encode("hello world") == [31373, 995]'""" - diff --git a/tiktoken/model.py b/tiktoken/model.py index 17532aee..6ecd7232 100644 --- a/tiktoken/model.py +++ b/tiktoken/model.py @@ -6,6 +6,7 @@ # TODO: these will likely be replaced by an API endpoint MODEL_PREFIX_TO_ENCODING: dict[str, str] = { # chat + "gpt-4o-": "o200k_base", # e.g., gpt-4o-2024-05-13 "gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k "gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc. "gpt-35-turbo-": "cl100k_base", # Azure deployment name @@ -18,6 +19,7 @@ MODEL_TO_ENCODING: dict[str, str] = { # chat + "gpt-4o": "o200k_base", "gpt-4": "cl100k_base", "gpt-3.5-turbo": "cl100k_base", "gpt-3.5": "cl100k_base", # Common shorthand diff --git a/tiktoken_ext/openai_public.py b/tiktoken_ext/openai_public.py index 330ecabb..6b29a711 100644 --- a/tiktoken_ext/openai_public.py +++ b/tiktoken_ext/openai_public.py @@ -88,10 +88,40 @@ def cl100k_base(): } +def o200k_base(): + mergeable_ranks = load_tiktoken_bpe( + "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken", + expected_hash="446a9538cb6c348e3516120d7c08b09f57c36495e2acfffe59a5bf8b0cfb1a2d", + ) + special_tokens = { + ENDOFTEXT: 199999, + ENDOFPROMPT: 200018, + } + # This regex could be made more efficient + pat_str = "|".join( + [ + r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""", + r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""", + r"""\p{N}{1,3}""", + r""" ?[^\s\p{L}\p{N}]+[\r\n/]*""", + r"""\s*[\r\n]+""", + r"""\s+(?!\S)""", + r"""\s+""", + ] + ) + return { + "name": "o200k_base", + "pat_str": pat_str, + "mergeable_ranks": mergeable_ranks, + "special_tokens": special_tokens, + } + + ENCODING_CONSTRUCTORS = { "gpt2": gpt2, "r50k_base": r50k_base, "p50k_base": p50k_base, "p50k_edit": p50k_edit, "cl100k_base": cl100k_base, + "o200k_base": o200k_base, }