Use binary search in TextChunks (#71)
* Use a binary search to fill TextChunks

Co-authored-by: Richard Bradfield <richard.bradfield@platformed.com>

* Update snapshots for binary search

* Limit next section based on the encoded offsets to limit search space

* Use iterator based approach for chunk size to avoid extra allocations

* remove unneeded flag in regex matches

* Update changelog, bump version, and use higher version of tiktoken

* Bump required versions of both tokenizer crates

* add back onig feature for tokenizers

---------

Co-authored-by: Richard Bradfield <richard.bradfield@platformed.com>
benbrandt and bradfier authored Dec 27, 2023
1 parent 6637a46 commit e716aa9
Showing 20 changed files with 19,765 additions and 17,811 deletions.
14 changes: 14 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,19 @@
 # Changelog
 
+## v0.5.0
+
+### What's New
+
+- Significant performance improvements for generating chunks with the `tokenizers` or `tiktoken-rs` crates by applying binary search when attempting to find the next matching chunk size.
+
+### Breaking Changes
+
+- Minimum required version of `tokenizers` is now `0.15.0`
+- Minimum required version of `tiktoken-rs` is now `0.5.6`
+- Due to using binary search, there are some slight differences at the edges of chunks where the algorithm was a little greedier before. If two candidates would tokenize to the same number of tokens that fit within the capacity, it will now choose the shorter text. Due to the nature of tokenizers, this happens more often with whitespace at the end of a chunk, and rarely affects users who have set `with_trim_chunks(true)`. It is a tradeoff, but keeping the exact same behavior would have made the binary search code much more complicated.
+- The `chunk_size` method on `ChunkSizer` now needs to accept a `ChunkCapacity` argument and return a `ChunkSize` struct instead of a `usize`. This was to help support the new binary search method in chunking, and should only affect users who implemented custom `ChunkSizer`s and weren't using one of the provided ones.
+  - New signature: `fn chunk_size(&self, chunk: &str, capacity: &impl ChunkCapacity) -> ChunkSize;`
+
 ## v0.4.5
 
 ### What's New
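The changelog entry above describes the binary search only at a high level. Below is a minimal, self-contained sketch of the idea, not the crate's internal code: instead of growing a chunk one section at a time and re-measuring after every addition, binary-search over how many sections can be joined while the measured size still fits the capacity. The names `largest_fit` and `size_of` are purely illustrative and are not part of the text-splitter API.

```rust
// Illustrative sketch: find the largest prefix of sections that still fits a
// capacity, assuming `size_of` is (close to) monotonic in the text length.
// This monotonicity assumption is also why ties now resolve to shorter text.
fn largest_fit(sections: &[&str], capacity: usize, size_of: impl Fn(&str) -> usize) -> usize {
    let (mut low, mut high) = (0, sections.len());
    while low < high {
        // Bias the midpoint upward so the loop always makes progress.
        let mid = (low + high + 1) / 2;
        let candidate = sections[..mid].concat();
        if size_of(candidate.as_str()) <= capacity {
            low = mid; // everything up to `mid` fits; try joining more sections
        } else {
            high = mid - 1; // too large; retry with fewer sections
        }
    }
    low // number of sections that can be joined into one chunk
}

fn main() {
    let sections = ["Hello ", "world. ", "Another sentence."];
    // With a character-based size function and a capacity of 14, only the
    // first two sections fit ("Hello world. " is 13 characters).
    assert_eq!(largest_fit(&sections, 14, |s| s.chars().count()), 2);
}
```

Each probe tokenizes one candidate, so the number of (expensive) tokenizer calls grows logarithmically with the number of sections rather than linearly, which is where the performance improvement comes from.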
10 changes: 4 additions & 6 deletions Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "text-splitter"
-version = "0.4.5"
+version = "0.5.0"
 authors = ["Ben Brandt <benjamin.j.brandt@gmail.com>"]
 edition = "2021"
 description = "Split text into semantic chunks, up to a desired chunk size. Supports calculating length by characters and tokens (when used with large language models)."
@@ -23,18 +23,16 @@ either = "1.9.0"
 itertools = "0.12.0"
 once_cell = "1.18.0"
 regex = "1.10.2"
-tiktoken-rs = { version = ">=0.2.0, <0.6.0", optional = true }
-tokenizers = { version = ">=0.13.3, <0.16.0", default_features = false, features = [
-    "onig",
-], optional = true }
+tiktoken-rs = { version = "0.5.6", optional = true }
+tokenizers = { version = "0.15.0", default_features = false, features = ["onig"], optional = true }
 unicode-segmentation = "1.10.1"
 
 [dev-dependencies]
 criterion = "0.5.1"
 fake = "2.9.1"
 insta = { version = "1.34.0", features = ["glob", "yaml"] }
 more-asserts = "0.3.1"
-tokenizers = { version = ">=0.13.3, <0.16.0", default-features = false, features = [
+tokenizers = { version = "0.15.0", default-features = false, features = [
     "onig",
     "http",
 ] }
30 changes: 22 additions & 8 deletions src/characters.rs
@@ -1,4 +1,6 @@
-use crate::ChunkSizer;
+use std::ops::Range;
+
+use crate::{ChunkCapacity, ChunkSize, ChunkSizer};
 
 /// Used for splitting a piece of text into chunks based on the number of
 /// characters in each chunk.
@@ -11,14 +13,26 @@ use crate::ChunkSizer;
 #[derive(Debug)]
 pub struct Characters;
 
+impl Characters {
+    fn encoded_offsets(chunk: &str) -> impl Iterator<Item = Range<usize>> + '_ {
+        chunk.char_indices().map(|(i, c)| i..(i + c.len_utf8()))
+    }
+}
+
 impl ChunkSizer for Characters {
     /// Determine the size of a given chunk to use for validation.
-    ///
-    /// ```
-    /// use text_splitter::{Characters, ChunkSizer};
-    ///
-    /// assert_eq!(Characters.chunk_size("hello"), 5);
-    fn chunk_size(&self, chunk: &str) -> usize {
-        chunk.chars().count()
+    fn chunk_size(&self, chunk: &str, capacity: &impl ChunkCapacity) -> ChunkSize {
+        ChunkSize::from_offsets(Self::encoded_offsets(chunk), capacity)
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn returns_offsets() {
+        let offsets = Characters::encoded_offsets("eé").collect::<Vec<_>>();
+        assert_eq!(offsets, vec![0..1, 1..3]);
+    }
+}
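For anyone hit by the `ChunkSizer` breaking change in the changelog, the new `Characters` implementation above doubles as a template. Below is a hedged sketch of a custom sizer that measures chunks in Unicode words; the `Words` type is purely illustrative, and it assumes `ChunkSize::from_offsets` is exported from the crate root the same way it is used in src/characters.rs.

```rust
use std::ops::Range;

use text_splitter::{ChunkCapacity, ChunkSize, ChunkSizer};
use unicode_segmentation::UnicodeSegmentation;

/// Hypothetical sizer that measures chunks in Unicode words rather than
/// characters or tokens (illustrative only, not part of the crate).
struct Words;

impl Words {
    fn encoded_offsets(chunk: &str) -> impl Iterator<Item = Range<usize>> + '_ {
        // One byte range per word, analogous to Characters::encoded_offsets.
        chunk
            .unicode_word_indices()
            .map(|(i, word)| i..(i + word.len()))
    }
}

impl ChunkSizer for Words {
    fn chunk_size(&self, chunk: &str, capacity: &impl ChunkCapacity) -> ChunkSize {
        // Mirror the Characters implementation: hand per-"token" byte ranges
        // to ChunkSize so the splitter's binary search can work with them.
        ChunkSize::from_offsets(Self::encoded_offsets(chunk), capacity)
    }
}
```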
65 changes: 56 additions & 9 deletions src/huggingface.rs
@@ -1,6 +1,8 @@
+use std::ops::Range;
+
 use tokenizers::Tokenizer;
 
-use crate::ChunkSizer;
+use crate::{ChunkCapacity, ChunkSize, ChunkSizer};
 
 impl ChunkSizer for Tokenizer {
     /// Returns the number of tokens in a given text after tokenization.
@@ -9,8 +11,8 @@ impl ChunkSizer for Tokenizer {
     ///
     /// Will panic if you don't have a byte-level tokenizer and the splitter
     /// encounters text it can't tokenize.
-    fn chunk_size(&self, chunk: &str) -> usize {
-        chunk_size(self, chunk)
+    fn chunk_size(&self, chunk: &str, capacity: &impl ChunkCapacity) -> ChunkSize {
+        ChunkSize::from_offsets(encoded_offsets(self, chunk), capacity)
     }
 }
 
@@ -21,14 +23,59 @@ impl ChunkSizer for &Tokenizer {
     ///
     /// Will panic if you don't have a byte-level tokenizer and the splitter
     /// encounters text it can't tokenize.
-    fn chunk_size(&self, chunk: &str) -> usize {
-        chunk_size(self, chunk)
+    fn chunk_size(&self, chunk: &str, capacity: &impl ChunkCapacity) -> ChunkSize {
+        ChunkSize::from_offsets(encoded_offsets(self, chunk), capacity)
     }
 }
 
-fn chunk_size(tokenizer: &Tokenizer, chunk: &str) -> usize {
-    tokenizer
+fn encoded_offsets<'text>(
+    tokenizer: &Tokenizer,
+    chunk: &'text str,
+) -> impl Iterator<Item = Range<usize>> + 'text {
+    let encoding = tokenizer
         .encode(chunk, false)
-        .map(|enc| enc.len())
-        .expect("Unable to tokenize the following string {str}")
+        .expect("Unable to tokenize the following string {chunk}");
+    let mut offsets = encoding
+        .get_offsets()
+        .iter()
+        .map(|(start, end)| {
+            let end = *end + 1;
+            *start..end
+        })
+        .collect::<Vec<_>>();
+    // Sometimes the offsets are off by one because of whitespace prefixing
+    let prefixed = offsets
+        .last()
+        .map(|r| r.end != chunk.len())
+        .unwrap_or_default();
+
+    if prefixed {
+        for range in &mut offsets {
+            if range.start != 0 {
+                range.start -= 1;
+            }
+            range.end -= 1;
+        }
+    }
+
+    offsets.into_iter()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn returns_offsets() {
+        let tokenizer = Tokenizer::from_pretrained("bert-base-cased", None).unwrap();
+        let offsets = encoded_offsets(&tokenizer, " An apple a").collect::<Vec<_>>();
+        assert_eq!(offsets, vec![0..3, 3..9, 9..11]);
+    }
+
+    #[test]
+    fn returns_offsets_handles_prefix() {
+        let tokenizer = Tokenizer::from_pretrained("bert-base-cased", None).unwrap();
+        let offsets = encoded_offsets(&tokenizer, "An apple a").collect::<Vec<_>>();
+        assert_eq!(offsets, vec![0..2, 2..8, 8..10]);
+    }
+}
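A quick end-to-end sketch of how the tokenizer-backed sizing is exercised from the public API. This assumes the splitter-facing API of the 0.4.x line (`TextSplitter::new`, `with_trim_chunks`, and `chunks(text, capacity)`) is unchanged in 0.5.0, and it downloads `bert-base-cased` via the `http` feature, as the tests above do.

```rust
use text_splitter::TextSplitter;
use tokenizers::Tokenizer;

fn main() {
    // The tokenizer itself acts as the ChunkSizer, so capacity is measured in
    // tokens; chunk boundaries come from the binary search in this commit.
    let tokenizer = Tokenizer::from_pretrained("bert-base-cased", None).unwrap();
    let splitter = TextSplitter::new(tokenizer).with_trim_chunks(true);

    let text = "An apple a day keeps the doctor away.";
    let max_tokens = 5;

    // Each returned chunk should tokenize to at most `max_tokens` tokens.
    let chunks: Vec<&str> = splitter.chunks(text, max_tokens).collect();
    println!("{chunks:?}");
}
```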