-
Notifications
You must be signed in to change notification settings - Fork 913
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
✨ feat(rust): Convert project to a multi-crate workspace
This commit restructures the project from a single-crate workspace into a multi-crate workspace, dividing it into 'rs-tiktoken' and 'py-tiktoken'. This is done to improve the clarity of the organization of the codebase and make the Rust and Python modules separate for easier code maintenance. The setup.py is also updated to reflect these changes in the directory structure. Refs: #24
- Loading branch information
Showing
11 changed files
with
128 additions
and
28 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,21 +1,5 @@ | ||
[package] | ||
name = "tiktoken" | ||
version = "0.4.0" | ||
edition = "2021" | ||
rust-version = "1.57.0" | ||
|
||
[lib] | ||
name = "_tiktoken" | ||
crate-type = ["cdylib"] | ||
|
||
[dependencies] | ||
pyo3 = { version = "0.19.0", features = ["extension-module"] } | ||
|
||
# tiktoken dependencies | ||
fancy-regex = "0.11.0" | ||
regex = "1.8.3" | ||
rustc-hash = "1.1.0" | ||
bstr = "1.5.0" | ||
|
||
[profile.release] | ||
incremental = true | ||
[workspace] | ||
members = [ | ||
"rs-tiktoken", | ||
"py-tiktoken", | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# Manifest for the Python extension crate (built as the `_tiktoken` cdylib).
[package]
name = "py-tiktoken"
version = "0.4.0"
edition = "2021"
rust-version = "1.57.0"

[lib]
name = "_tiktoken"
crate-type = ["cdylib"]

[dependencies]
tiktoken = { path = "../rs-tiktoken" }
pyo3 = { version = "0.19.0", features = ["extension-module"] }

# tiktoken dependencies
fancy-regex = "0.11.0"
regex = "1.8.3"
rustc-hash = "1.1.0"
bstr = "1.5.0"

# No [profile.release] section here: Cargo only honours profile settings from the
# workspace root manifest and warns about (then ignores) profiles in member crates.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
pub mod tiktoken_py; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# Manifest for the pure-Rust `tiktoken` library crate.
[package]
name = "tiktoken"
version = "0.4.0"
edition = "2021"
rust-version = "1.57.0"

[dependencies]
fancy-regex = "0.11.0"
regex = "1.8.3"
rustc-hash = "1.1.0"
bstr = "1.5.0"
once_cell = "1.18.0"

# No [profile.release] section here: Cargo only honours profile settings from the
# workspace root manifest and warns about (then ignores) profiles in member crates.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
//! WARNING: This code is under active development. Functionality, | ||
//! behavior, and the interface may change in future updates. | ||
use std::collections::HashMap; | ||
use once_cell::sync::Lazy; | ||
use regex::Regex; | ||
|
||
|
||
pub struct Encoding { | ||
/// The name of the encoding. It should be clear from the name of the encoding | ||
/// what behaviour to expect, in particular, encodings with different special tokens | ||
/// should have different names. | ||
pub name: &'static str, | ||
/// A regex pattern string that is used to split the input text. | ||
pub pat_str: Regex, | ||
/// A dictionary mapping mergeable token bytes to their ranks. The ranks | ||
/// must correspond to merge priority. | ||
pub mergeable_ranks: HashMap<&'static str, u32>, | ||
/// A dictionary mapping special token strings to their token values. | ||
pub special_tokens: HashMap<&'static str, u32>, | ||
/// The number of tokens in the vocabulary. If provided, it is checked | ||
/// that the number of mergeable tokens and special tokens is equal to this number. | ||
pub explicit_n_vocab: Option<u32>, | ||
} | ||
|
||
pub static GPT2: Lazy<Encoding> = Lazy::new(|| { | ||
let mergeable_ranks = Default::default(); | ||
let special_tokens = [ | ||
("<|endoftext|>", 50256) | ||
].iter().cloned().collect(); | ||
|
||
Encoding{ | ||
name: "gpt2", | ||
pat_str: Regex::new(r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+").unwrap(), | ||
mergeable_ranks, | ||
special_tokens, | ||
explicit_n_vocab: Some(50257), | ||
} | ||
}); | ||
|
||
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks the metadata exposed by the lazily built GPT-2 definition.
    ///
    /// Reading any field forces `GPT2` to be constructed, so a split pattern that
    /// fails to compile makes this test fail as well.
    #[test]
    fn gpt2_metadata() {
        assert_eq!(GPT2.name, "gpt2");
        assert_eq!(GPT2.special_tokens.get("<|endoftext|>"), Some(&50256));
        assert_eq!(GPT2.explicit_n_vocab, Some(50257));
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
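// NOTE(review): stale comment; no check follows it in this file, so it looks left over from code that did not move here. Original text: "This check is new and seems buggy (possibly with PyO3 interaction)". TODO confirm and remove. | ||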
pub mod core; | ||
pub mod encoding; | ||
mod model; | ||
|
||
/// Placeholder for looking up an encoding definition.
///
/// Currently a stub: it takes no arguments, does nothing and returns `()`.
// NOTE(review): presumably meant to mirror Python's `tiktoken.get_encoding(name)`;
// confirm the intended signature before callers start depending on this one.
pub fn get_encoding() {}
|
||
#[cfg(test)]
mod test {
    /// Placeholder for the basic encode/decode round-trip checks.
    ///
    /// The body asserts nothing yet; the comments inside are the Python reference
    /// assertions that still have to be ported.
    #[test]
    fn test_simple() {
        // enc = tiktoken.get_encoding("gpt2")
        // assert enc.encode("hello world") == [31373, 995]
        // assert enc.decode([31373, 995]) == "hello world"
        // assert enc.encode("hello <|endoftext|>", allowed_special="all") == [31373, 220, 50256]
        //
        // enc = tiktoken.get_encoding("cl100k_base")
        // assert enc.encode("hello world") == [15339, 1917]
        // assert enc.decode([15339, 1917]) == "hello world"
        // assert enc.encode("hello <|endoftext|>", allowed_special="all") == [15339, 220, 100257]
        //
        // for enc_name in tiktoken.list_encoding_names():
        //     enc = tiktoken.get_encoding(enc_name)
        //     for token in range(10_000):
        //         assert enc.encode_single_token(enc.decode_single_token_bytes(token)) == token
    }
}
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.