From b62d5570235032f6d39b04c77a33fe3b87e657ee Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Thu, 12 Dec 2024 03:13:08 +0530
Subject: [PATCH 1/4] [Fix] indexing logic for TokenChunker for fn chunk

---
 src/chonkie/chunker/token.py | 42 +++++++++++++++++++++++-------------
 1 file changed, 27 insertions(+), 15 deletions(-)

diff --git a/src/chonkie/chunker/token.py b/src/chonkie/chunker/token.py
index df889a9..78f8691 100644
--- a/src/chonkie/chunker/token.py
+++ b/src/chonkie/chunker/token.py
@@ -48,6 +48,32 @@ def __init__(
             if isinstance(chunk_overlap, int)
             else int(chunk_overlap * chunk_size)
         )
+
+    def _create_chunks(
+        self,
+        chunk_texts: List[str],
+        token_counts: List[int],
+        decoded_text: str,
+    ) -> List[Chunk]:
+        """Create chunks from a list of texts."""
+        # package everything as Chunk objects and send out the result
+        chunks = []
+        current_index = 0
+        for chunk_text, token_count in zip(chunk_texts, token_counts):
+            start_index = decoded_text.find(
+                chunk_text, current_index
+            )  # Find needs to be run every single time because of unknown overlap length
+            end_index = start_index + len(chunk_text)
+            chunks.append(
+                Chunk(
+                    text=chunk_text,
+                    start_index=start_index,
+                    end_index=end_index,
+                    token_count=token_count,
+                )
+            )
+            current_index = end_index
+        return chunks
 
     def chunk(self, text: str) -> List[Chunk]:
         """Split text into overlapping chunks of specified token size.
@@ -85,21 +111,7 @@ def chunk(self, text: str) -> List[Chunk]:
             token_groups
         )  # decrease the time by decoding in one go (?)
 
-        # package everything as Chunk objects and send out the result
-        chunks = []
-        for chunk_text, token_count in zip(chunk_texts, token_counts):
-            start_index = decoded_text.find(
-                chunk_text
-            )  # Find needs to be run every single time because of unknown overlap length
-            end_index = start_index + len(chunk_text)
-            chunks.append(
-                Chunk(
-                    text=chunk_text,
-                    start_index=start_index,
-                    end_index=end_index,
-                    token_count=token_count,
-                )
-            )
+        chunks = self._process_batch(token_groups, token_counts, decoded_text)
 
         return chunks


From 23f72b0948ae2e3b14cf3aa7827eef2b14fce20e Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Thu, 12 Dec 2024 03:15:48 +0530
Subject: [PATCH 2/4] Refactor TokenChunker to use _create_chunks method

- Updated the TokenChunker class to replace the leftover _process_batch call
  with the new _create_chunks method, fixing the call introduced in the
  previous commit.
- This change enhances the overall structure of the code and aligns with
  recent refactoring efforts in the chunking classes.
---
 src/chonkie/chunker/token.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/chonkie/chunker/token.py b/src/chonkie/chunker/token.py
index 78f8691..b6f9ee0 100644
--- a/src/chonkie/chunker/token.py
+++ b/src/chonkie/chunker/token.py
@@ -111,7 +111,7 @@ def chunk(self, text: str) -> List[Chunk]:
             token_groups
         )  # decrease the time by decoding in one go (?)
 
-        chunks = self._process_batch(token_groups, token_counts, decoded_text)
+        chunks = self._create_chunks(chunk_texts, token_counts, decoded_text)
 
         return chunks
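
The bug the first two patches address is easiest to see in isolation. Below is a minimal, self-contained sketch, not chonkie code: the text and chunk strings are hypothetical, and the chunks are contiguous rather than overlapping for simplicity. Without a start offset, str.find pins every repeated chunk text to its first occurrence; carrying a search floor forward resolves each chunk to its own position.

    # Standalone sketch of the indexing bug (hypothetical text and chunks;
    # only str.find semantics matter here).
    text = "spam spam spam spam"
    chunk_texts = ["spam spam", "spam spam"]  # identical texts, different positions

    # Before the fix: every search restarts at index 0, so both chunks
    # resolve to the first occurrence.
    print([text.find(c) for c in chunk_texts])  # [0, 0]

    # After the fix: carry the previous chunk's end forward as a search floor.
    positions, current_index = [], 0
    for c in chunk_texts:
        start = text.find(c, current_index)
        positions.append(start)
        current_index = start + len(c)
    print(positions)  # [0, 10]

One caveat worth noting: when chunks genuinely overlap, the next chunk's true start precedes the previous chunk's end, so an end_index search floor can overshoot; the sketch stays on the contiguous, well-defined case.
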
From 5760031a26aa19e9d47d2062ebb2c3b32df3f13f Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Thu, 12 Dec 2024 03:30:14 +0530
Subject: [PATCH 3/4] Refactor WordChunker to improve chunk creation logic

- Updated the _create_chunk method to accept current_index as a parameter
  for better control over the chunk's starting index.
- Adjusted the chunking loop to pass the new current_index parameter,
  improving the accuracy of chunk creation.
- This refactor improves code clarity and keeps WordChunker consistent with
  recent changes in the other chunking classes.
---
 src/chonkie/chunker/word.py | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/src/chonkie/chunker/word.py b/src/chonkie/chunker/word.py
index f60d2f8..8649853 100644
--- a/src/chonkie/chunker/word.py
+++ b/src/chonkie/chunker/word.py
@@ -51,21 +51,26 @@ def _split_into_words(self, text: str) -> List[str]:
         return words
 
     def _create_chunk(
-        self, words: List[str], text: str, token_count: int
+        self,
+        words: List[str],
+        text: str,
+        token_count: int,
+        current_index: int = 0,
     ) -> Tuple[Chunk, int]:
         """Create a chunk from a list of words.
 
         Args:
             words: List of words to create chunk from
-            start_idx: Starting index in original text
-            end_idx: Ending index in original text
+            text: The original text
+            token_count: Number of tokens in the chunk
+            current_index: The index in the text to start searching from
 
         Returns:
             Tuple of (Chunk object, number of tokens in chunk)
 
         """
         chunk_text = "".join(words)
-        start_index = text.find(chunk_text)
+        start_index = text.find(chunk_text, current_index)
         return Chunk(
             text=chunk_text,
             start_index=start_index,
@@ -110,19 +115,23 @@ def chunk(self, text: str) -> List[Chunk]:
 
         current_chunk = []
         current_chunk_length = 0
+        current_index = 0
+
         for i, (word, length) in enumerate(zip(words, lengths)):
             if current_chunk_length + length <= self.chunk_size:
                 current_chunk.append(word)
                 current_chunk_length += length
             else:
-                chunk = self._create_chunk(current_chunk, text, current_chunk_length)
+                chunk = self._create_chunk(
+                    current_chunk,
+                    text,
+                    current_chunk_length,
+                    current_index,
+                )
                 chunks.append(chunk)
 
-                # update the current_chunk and previous chunk
                 previous_chunk_length = current_chunk_length
-
-                current_chunk = []
-                current_chunk_length = 0
+                current_index = chunk.end_index
 
                 overlap = []
                 overlap_length = 0
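
The same pattern now threads through WordChunker. A minimal stand-alone sketch of the idea, assuming a hypothetical Chunk dataclass in place of chonkie's own and contiguous word groups for simplicity: each chunk's search begins where the previous chunk ended, and the invariant the fix establishes is that start_index and end_index slice the original text back to the chunk text.

    from dataclasses import dataclass

    @dataclass
    class Chunk:  # hypothetical stand-in for chonkie's Chunk
        text: str
        start_index: int
        end_index: int
        token_count: int

    def create_chunk(words, text, token_count, current_index=0):
        # Same idea as the patched WordChunker._create_chunk: start the search
        # at current_index so repeated chunk texts map to the right occurrence.
        chunk_text = "".join(words)
        start_index = text.find(chunk_text, current_index)
        return Chunk(chunk_text, start_index, start_index + len(chunk_text), token_count)

    text = "one two one two "
    groups = (["one ", "two "], ["one ", "two "])  # both join to "one two "

    chunks, current_index = [], 0
    for group in groups:
        chunk = create_chunk(group, text, token_count=len(group), current_index=current_index)
        chunks.append(chunk)
        current_index = chunk.end_index  # mirrors the patched loop above

    # Without the search floor both chunks would land at index 0; with it,
    # the indices always slice back to the chunk text.
    assert [c.start_index for c in chunks] == [0, 8]
    assert all(text[c.start_index:c.end_index] == c.text for c in chunks)
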
From 289cdc2155d40c12692e1b123673e8ce52078013 Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Thu, 12 Dec 2024 03:30:22 +0530
Subject: [PATCH 4/4] Refactor SentenceChunker to improve sentence position
 calculation

- Removed the unnecessary space adjustment in the position calculation for
  sentences, as they are already separated by spaces.
- Commented out the duplicate _prepare_sentences definition, which shadowed
  the corrected one above it, to streamline the class and focus on the
  essential functionality.
- This change enhances code clarity and prepares for future improvements in
  sentence processing.
---
 src/chonkie/chunker/sentence.py | 58 ++++++++++++++++++++-----------------
 1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/src/chonkie/chunker/sentence.py b/src/chonkie/chunker/sentence.py
index 8c0b6d2..4936918 100644
--- a/src/chonkie/chunker/sentence.py
+++ b/src/chonkie/chunker/sentence.py
@@ -236,7 +236,7 @@ def _prepare_sentences(self, text: str) -> List[Sentence]:
         current_pos = 0
         for sent in sentence_texts:
             positions.append(current_pos)
-            current_pos += len(sent) + 1  # +1 for space/separator
+            current_pos += len(sent)  # No +1 space because sentences are already separated by spaces
 
         if not self.approximate:
             # Get accurate token counts in batch
@@ -253,34 +253,34 @@ def _prepare_sentences(self, text: str) -> List[Sentence]:
             for sent, pos, count in zip(sentence_texts, positions, token_counts)
         ]
 
-    def _prepare_sentences(self, text: str) -> List[Sentence]:
-        """Prepare sentences with either estimated or accurate token counts."""
-        # Split text into sentences
-        sentence_texts = self._split_sentences(text)
-        if not sentence_texts:
-            return []
-
-        # Calculate positions once
-        positions = []
-        current_pos = 0
-        for sent in sentence_texts:
-            positions.append(current_pos)
-            current_pos += len(sent) + 1  # +1 for space/separator
-
-        if not self.approximate:
-            # Get accurate token counts in batch
-            token_counts = self._get_token_counts(sentence_texts)
-        else:
-            # Estimate token counts using character length
-            token_counts = self._estimate_token_counts(sentence_texts)
-
-        # Create sentence objects
-        return [
-            Sentence(
-                text=sent, start_index=pos, end_index=pos + len(sent), token_count=count
-            )
-            for sent, pos, count in zip(sentence_texts, positions, token_counts)
-        ]
+    # def _prepare_sentences(self, text: str) -> List[Sentence]:
+    #     """Prepare sentences with either estimated or accurate token counts."""
+    #     # Split text into sentences
+    #     sentence_texts = self._split_sentences(text)
+    #     if not sentence_texts:
+    #         return []
+
+    #     # Calculate positions once
+    #     positions = []
+    #     current_pos = 0
+    #     for sent in sentence_texts:
+    #         positions.append(current_pos)
+    #         current_pos += len(sent) + 1  # +1 for space/separator
+
+    #     if not self.approximate:
+    #         # Get accurate token counts in batch
+    #         token_counts = self._get_token_counts(sentence_texts)
+    #     else:
+    #         # Estimate token counts using character length
+    #         token_counts = self._estimate_token_counts(sentence_texts)
+
+    #     # Create sentence objects
+    #     return [
+    #         Sentence(
+    #             text=sent, start_index=pos, end_index=pos + len(sent), token_count=count
+    #         )
+    #         for sent, pos, count in zip(sentence_texts, positions, token_counts)
+    #     ]
 
     def _create_chunk(self, sentences: List[Sentence], token_count: int) -> Chunk:
         """Create a chunk from a list of sentences.
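
The position fix in this last patch is easy to verify in isolation. A small sketch, under the assumption stated in the commit message that the split sentences already carry their separators (the sample text and splitter output below are hypothetical): the old +1 double-counts the separator and shifts every later start position by one per sentence.

    # Illustration of the off-by-one drift removed in patch 4. Assumption
    # (inferred from the commit message): _split_sentences returns sentence
    # strings that already include their trailing separator.
    text = "One. Two. Three."
    sentences = ["One. ", "Two. ", "Three."]  # hypothetical splitter output

    def positions(sents, extra):
        pos, out = 0, []
        for s in sents:
            out.append(pos)
            pos += len(s) + extra
        return out

    old = positions(sentences, extra=1)  # [0, 6, 12] -- drifts one per sentence
    new = positions(sentences, extra=0)  # [0, 5, 10]
    assert all(text[p:p + len(s)] == s for p, s in zip(new, sentences))
    assert not all(text[p:p + len(s)] == s for p, s in zip(old, sentences))
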