diff --git a/CHANGELOG.md b/CHANGELOG.md index f616a62..b573a25 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## v0.19.1 + +### What's New + +- Python splitters have new `chunk_all` and `chunk_all_indices` methods so that multiple texts can be processed in parallel. (For Rust, you should be able to use `rayon` to do this already) + ## v0.19.0 ### Breaking Changes diff --git a/Cargo.lock b/Cargo.lock index c213a91..e7e8db5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2013,9 +2013,10 @@ dependencies = [ [[package]] name = "semantic-text-splitter" -version = "0.19.0" +version = "0.19.1" dependencies = [ "pyo3", + "rayon", "text-splitter", "tiktoken-rs", "tokenizers", @@ -2280,7 +2281,7 @@ dependencies = [ [[package]] name = "text-splitter" -version = "0.19.0" +version = "0.19.1" dependencies = [ "ahash", "auto_enums", diff --git a/Cargo.toml b/Cargo.toml index d18df6e..fb703e3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ members = ["bindings/*"] [workspace.package] -version = "0.19.0" +version = "0.19.1" authors = ["Ben Brandt "] edition = "2021" description = "Split text into semantic chunks, up to a desired chunk size. Supports calculating length by characters and tokens, and is callable from Rust and Python." diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml index d0ea947..e64d2ff 100644 --- a/bindings/python/Cargo.toml +++ b/bindings/python/Cargo.toml @@ -16,6 +16,7 @@ crate-type = ["cdylib"] [dependencies] pyo3 = { version = "0.23", features = ["abi3-py39"] } +rayon = "1.10" text-splitter = { path = "../..", features = [ "code", "markdown", diff --git a/bindings/python/semantic_text_splitter.pyi b/bindings/python/semantic_text_splitter.pyi index 78f25fa..c2ca1fd 100644 --- a/bindings/python/semantic_text_splitter.pyi +++ b/bindings/python/semantic_text_splitter.pyi @@ -269,6 +269,38 @@ class TextSplitter: trimmed as well. 
""" + def chunk_all(self, texts: List[str]) -> List[List[str]]: + """ + Generate a list of chunks for a given set of texts. Each chunk will be up to the `capacity`. + + See `chunks` for more information. + + Args: + texts (list(str)): Texts to split. + + Returns: + A list of lists of strings, one list for each text, and one item for each chunk. + If `trim` was specified in the text splitter, then each chunk will already be + trimmed as well. + """ + + def chunk_all_indices(self, texts: List[str]) -> List[List[Tuple[int, str]]]: + """ + Generate a list of chunks for a given set of texts, along with their character offsets in the original text. Each chunk will be up to the `capacity`. + + See `chunks` for more information. + + Args: + texts (list(str)): Texts to split. + + Returns: + A list of lists of tuples, one list for each text, and one tuple for each chunk. + The first tuple item will be the character offset relative + to the original text. The second tuple item is the chunk itself. + If `trim` was specified in the text splitter, then each chunk will already be + trimmed as well. + """ + @final class MarkdownSplitter: """Markdown splitter. Recursively splits chunks into the largest semantic units that fit within the chunk size. Also will attempt to merge neighboring chunks if they can fit within the given chunk size. @@ -543,6 +575,38 @@ class MarkdownSplitter: trimmed as well. """ + def chunk_all(self, texts: List[str]) -> List[List[str]]: + """ + Generate a list of chunks for a given set of texts. Each chunk will be up to the `capacity`. + + See `chunks` for more information. + + Args: + texts (list(str)): Texts to split. + + Returns: + A list of lists of strings, one list for each text, and one item for each chunk. + If `trim` was specified in the text splitter, then each chunk will already be + trimmed as well. 
+ """ + + def chunk_all_indices(self, texts: List[str]) -> List[List[Tuple[int, str]]]: + """ + Generate a list of chunks for a given set of texts, along with their character offsets in the original text. Each chunk will be up to the `capacity`. + + See `chunks` for more information. + + Args: + texts (list(str)): Texts to split. + + Returns: + A list of lists of tuples, one list for each text, and one tuple for each chunk. + The first tuple item will be the character offset relative + to the original text. The second tuple item is the chunk itself. + If `trim` was specified in the text splitter, then each chunk will already be + trimmed as well. + """ + @final class CodeSplitter: """Code splitter. Recursively splits chunks into the largest semantic units that fit within the chunk size. Also will attempt to merge neighboring chunks if they can fit within the given chunk size. @@ -841,3 +905,35 @@ class CodeSplitter: If `trim` was specified in the text splitter, then each chunk will already be trimmed as well. """ + + def chunk_all(self, texts: List[str]) -> List[List[str]]: + """ + Generate a list of chunks for a given set of texts. Each chunk will be up to the `capacity`. + + See `chunks` for more information. + + Args: + texts (list(str)): Texts to split. + + Returns: + A list of lists of strings, one list for each text, and one item for each chunk. + If `trim` was specified in the text splitter, then each chunk will already be + trimmed as well. + """ + + def chunk_all_indices(self, texts: List[str]) -> List[List[Tuple[int, str]]]: + """ + Generate a list of chunks for a given set of texts, along with their character offsets in the original text. Each chunk will be up to the `capacity`. + + See `chunks` for more information. + + Args: + texts (list(str)): Texts to split. + + Returns: + A list of lists of tuples, one list for each text, and one tuple for each chunk. + The first tuple item will be the character offset relative + to the original text. 
The second tuple item is the chunk itself. + If `trim` was specified in the text splitter, then each chunk will already be + trimmed as well. + """ diff --git a/bindings/python/src/lib.rs b/bindings/python/src/lib.rs index 3a0e41d..30ac551 100644 --- a/bindings/python/src/lib.rs +++ b/bindings/python/src/lib.rs @@ -11,6 +11,7 @@ use pyo3::{ prelude::*, pybacked::PyBackedStr, }; +use rayon::iter::{IntoParallelIterator, ParallelIterator}; use text_splitter::{ Characters, ChunkCapacity, ChunkCapacityError, ChunkConfig, ChunkConfigError, ChunkSizer, CodeSplitter, CodeSplitterError, MarkdownSplitter, TextSplitter, @@ -512,6 +513,55 @@ impl PyTextSplitter { .map(|c| offsets.map_byte_to_char(c)) .collect() } + + /** + Generate a list of chunks for a given set of texts. Each chunk will be up to the `capacity`. + + See `chunks` for more information. + + Args: + texts (list(str)): Texts to split. + + Returns: + A list of lists of strings, one list for each text, and one item for each chunk. + If `trim` was specified in the text splitter, then each chunk will already be + trimmed as well. + */ + fn chunk_all(&self, texts: Vec) -> Vec> { + texts + .into_par_iter() + .map(|text| self.splitter.chunks(&text).map(ToOwned::to_owned).collect()) + .collect() + } + + /** + Generate a list of chunks for a given set of texts, along with their character offsets in the original text. Each chunk will be up to the `capacity`. + + See `chunks` for more information. + + Args: + texts (list(str)): Texts to split. + + Returns: + A list of lists of tuples, one list for each text, and one tuple for each chunk. + The first tuple item will be the character offset relative + to the original text. The second tuple item is the chunk itself. + If `trim` was specified in the text splitter, then each chunk will already be + trimmed as well. 
+ */ + fn chunk_all_indices(&self, texts: Vec) -> Vec> { + texts + .into_par_iter() + .map(|text| { + let mut offsets = ByteToCharOffsetTracker::new(&text); + self.splitter + .chunk_indices(&text) + .map(|c| offsets.map_byte_to_char(c)) + .map(|(i, c)| (i, c.to_owned())) + .collect() + }) + .collect() + } } /** @@ -890,6 +940,55 @@ impl PyMarkdownSplitter { .map(|c| offsets.map_byte_to_char(c)) .collect() } + + /** + Generate a list of chunks for a given set of texts. Each chunk will be up to the `capacity`. + + See `chunks` for more information. + + Args: + texts (list(str)): Texts to split. + + Returns: + A list of lists of strings, one list for each text, and one item for each chunk. + If `trim` was specified in the text splitter, then each chunk will already be + trimmed as well. + */ + fn chunk_all(&self, texts: Vec) -> Vec> { + texts + .into_par_iter() + .map(|text| self.splitter.chunks(&text).map(ToOwned::to_owned).collect()) + .collect() + } + + /** + Generate a list of chunks for a given set of texts, along with their character offsets in the original text. Each chunk will be up to the `capacity`. + + See `chunks` for more information. + + Args: + texts (list(str)): Texts to split. + + Returns: + A list of lists of tuples, one list for each text, and one tuple for each chunk. + The first tuple item will be the character offset relative + to the original text. The second tuple item is the chunk itself. + If `trim` was specified in the text splitter, then each chunk will already be + trimmed as well. 
+ */ + fn chunk_all_indices(&self, texts: Vec) -> Vec> { + texts + .into_par_iter() + .map(|text| { + let mut offsets = ByteToCharOffsetTracker::new(&text); + self.splitter + .chunk_indices(&text) + .map(|c| offsets.map_byte_to_char(c)) + .map(|(i, c)| (i, c.to_owned())) + .collect() + }) + .collect() + } } /** @@ -1325,6 +1424,55 @@ impl PyCodeSplitter { .map(|c| offsets.map_byte_to_char(c)) .collect() } + + /** + Generate a list of chunks for a given set of texts. Each chunk will be up to the `capacity`. + + See `chunks` for more information. + + Args: + texts (list(str)): Texts to split. + + Returns: + A list of lists of strings, one list for each text, and one item for each chunk. + If `trim` was specified in the text splitter, then each chunk will already be + trimmed as well. + */ + fn chunk_all(&self, texts: Vec) -> Vec> { + texts + .into_par_iter() + .map(|text| self.splitter.chunks(&text).map(ToOwned::to_owned).collect()) + .collect() + } + + /** + Generate a list of chunks for a given set of texts, along with their character offsets in the original text. Each chunk will be up to the `capacity`. + + See `chunks` for more information. + + Args: + texts (list(str)): Texts to split. + + Returns: + A list of lists of tuples, one list for each text, and one tuple for each chunk. + The first tuple item will be the character offset relative + to the original text. The second tuple item is the chunk itself. + If `trim` was specified in the text splitter, then each chunk will already be + trimmed as well. 
+ */ + fn chunk_all_indices(&self, texts: Vec) -> Vec> { + texts + .into_par_iter() + .map(|text| { + let mut offsets = ByteToCharOffsetTracker::new(&text); + self.splitter + .chunk_indices(&text) + .map(|c| offsets.map_byte_to_char(c)) + .map(|(i, c)| (i, c.to_owned())) + .collect() + }) + .collect() + } } #[doc = include_str!("../README.md")] diff --git a/bindings/python/tests/test_integration.py b/bindings/python/tests/test_integration.py index b281c87..2720312 100644 --- a/bindings/python/tests/test_integration.py +++ b/bindings/python/tests/test_integration.py @@ -298,3 +298,45 @@ def test_code_char_indices_with_multibyte_character() -> None: (4, "12ü"), (8, "12ü"), ] + + +def test_chunk_all() -> None: + splitter = TextSplitter(4) + texts = ["123\n123", "456\n456"] + chunks = splitter.chunk_all(texts) + assert chunks == [["123", "123"], ["456", "456"]] + + +def test_chunk_all_indices() -> None: + splitter = TextSplitter(4) + texts = ["123\n123", "456\n456"] + chunks = splitter.chunk_all_indices(texts) + assert chunks == [[(0, "123"), (4, "123")], [(0, "456"), (4, "456")]] + + +def test_chunk_all_markdown() -> None: + splitter = MarkdownSplitter(4) + texts = ["123\n123", "456\n456"] + chunks = splitter.chunk_all(texts) + assert chunks == [["123", "123"], ["456", "456"]] + + +def test_chunk_all_indices_markdown() -> None: + splitter = MarkdownSplitter(4) + texts = ["123\n123", "456\n456"] + chunks = splitter.chunk_all_indices(texts) + assert chunks == [[(0, "123"), (4, "123")], [(0, "456"), (4, "456")]] + + +def test_chunk_all_code() -> None: + splitter = CodeSplitter(tree_sitter_python.language(), 4) + texts = ["123\n123", "456\n456"] + chunks = splitter.chunk_all(texts) + assert chunks == [["123", "123"], ["456", "456"]] + + +def test_chunk_all_indices_code() -> None: + splitter = CodeSplitter(tree_sitter_python.language(), 4) + texts = ["123\n123", "456\n456"] + chunks = splitter.chunk_all_indices(texts) + assert chunks == [[(0, "123"), (4, "123")], [(0, 
"456"), (4, "456")]]