chonkie-ai · bhavnicksm · Dec 11, 2024 · Dec 11, 2024 · Dec 11, 2024 · Dec 11, 2024
diff --git a/src/chonkie/chunker/sentence.py b/src/chonkie/chunker/sentence.py
@@ -236,7 +236,7 @@ def _prepare_sentences(self, text: str) -> List[Sentence]:
         current_pos = 0
         for sent in sentence_texts:
             positions.append(current_pos)
-            current_pos += len(sent) + 1  # +1 for space/separator
+            current_pos += len(sent)  # No +1 space because sentences are already separated by spaces
 
         if not self.approximate:
             # Get accurate token counts in batch
@@ -253,34 +253,34 @@ def _prepare_sentences(self, text: str) -> List[Sentence]:
             for sent, pos, count in zip(sentence_texts, positions, token_counts)
         ]
 
-    def _prepare_sentences(self, text: str) -> List[Sentence]:
-        """Prepare sentences with either estimated or accurate token counts."""
-        # Split text into sentences
-        sentence_texts = self._split_sentences(text)
-        if not sentence_texts:
-            return []
-
-        # Calculate positions once
-        positions = []
-        current_pos = 0
-        for sent in sentence_texts:
-            positions.append(current_pos)
-            current_pos += len(sent) + 1  # +1 for space/separator
-
-        if not self.approximate:
-            # Get accurate token counts in batch
-            token_counts = self._get_token_counts(sentence_texts)
-        else:
-            # Estimate token counts using character length
-            token_counts = self._estimate_token_counts(sentence_texts)
-
-        # Create sentence objects
-        return [
-            Sentence(
-                text=sent, start_index=pos, end_index=pos + len(sent), token_count=count
-            )
-            for sent, pos, count in zip(sentence_texts, positions, token_counts)
-        ]
+    # def _prepare_sentences(self, text: str) -> List[Sentence]:
+    #     """Prepare sentences with either estimated or accurate token counts."""
+    #     # Split text into sentences
+    #     sentence_texts = self._split_sentences(text)
+    #     if not sentence_texts:
+    #         return []
+
+    #     # Calculate positions once
+    #     positions = []
+    #     current_pos = 0
+    #     for sent in sentence_texts:
+    #         positions.append(current_pos)
+    #         current_pos += len(sent) + 1  # +1 for space/separator
+
+    #     if not self.approximate:
+    #         # Get accurate token counts in batch
+    #         token_counts = self._get_token_counts(sentence_texts)
+    #     else:
+    #         # Estimate token counts using character length
+    #         token_counts = self._estimate_token_counts(sentence_texts)
+
+    #     # Create sentence objects
+    #     return [
+    #         Sentence(
+    #             text=sent, start_index=pos, end_index=pos + len(sent), token_count=count
+    #         )
+    #         for sent, pos, count in zip(sentence_texts, positions, token_counts)
+    #     ]
 
     def _create_chunk(self, sentences: List[Sentence], token_count: int) -> Chunk:
         """Create a chunk from a list of sentences.

diff --git a/src/chonkie/chunker/token.py b/src/chonkie/chunker/token.py
@@ -48,6 +48,32 @@ def __init__(
             if isinstance(chunk_overlap, int)
             else int(chunk_overlap * chunk_size)
         )
+
+    def _create_chunks(
+        self,
+        chunk_texts: List[str],
+        token_counts: List[int],
+        decoded_text: str,
+    ) -> List[Chunk]:
+        """Build Chunk objects, locating each chunk text inside decoded_text."""
+        chunks = []
+        search_from = 0
+        for chunk_text, token_count in zip(chunk_texts, token_counts):
+            start_index = decoded_text.find(chunk_text, search_from)
+            end_index = start_index + len(chunk_text)
+            chunks.append(
+                Chunk(
+                    text=chunk_text,
+                    start_index=start_index,
+                    end_index=end_index,
+                    token_count=token_count,
+                )
+            )
+            # Overlapping chunks begin before the previous end_index, so searching
+            # from end_index would miss them. Resume just past the previous start;
+            # max() leaves the cursor untouched when find() misses (returns -1).
+            search_from = max(search_from, start_index + 1)
+        return chunks
 
     def chunk(self, text: str) -> List[Chunk]:
         """Split text into overlapping chunks of specified token size.
@@ -85,21 +111,7 @@ def chunk(self, text: str) -> List[Chunk]:
             token_groups
         )  # decrease the time by decoding in one go (?)
 
-        # package everything as Chunk objects and send out the result
-        chunks = []
-        for chunk_text, token_count in zip(chunk_texts, token_counts):
-            start_index = decoded_text.find(
-                chunk_text
-            )  # Find needs to be run every single time because of unknown overlap length
-            end_index = start_index + len(chunk_text)
-            chunks.append(
-                Chunk(
-                    text=chunk_text,
-                    start_index=start_index,
-                    end_index=end_index,
-                    token_count=token_count,
-                )
-            )
+        chunks = self._create_chunks(chunk_texts, token_counts, decoded_text)
 
         return chunks
 

diff --git a/src/chonkie/chunker/word.py b/src/chonkie/chunker/word.py
@@ -51,21 +51,26 @@ def _split_into_words(self, text: str) -> List[str]:
         return words
 
     def _create_chunk(
-        self, words: List[str], text: str, token_count: int
+        self,
+        words: List[str],
+        text: str,
+        token_count: int,
+        current_index: int = 0,
     ) -> Tuple[Chunk, int]:
         """Create a chunk from a list of words.
 
         Args:
             words: List of words to create chunk from
-            start_idx: Starting index in original text
-            end_idx: Ending index in original text
+            text: The original text
+            token_count: Number of tokens in the chunk
+            current_index: Character offset in text from which to search for the chunk
 
         Returns:
             Tuple of (Chunk object, number of tokens in chunk)
 
         """
         chunk_text = "".join(words)
-        start_index = text.find(chunk_text)
+        start_index = text.find(chunk_text, current_index)
         return Chunk(
             text=chunk_text,
             start_index=start_index,
@@ -110,19 +115,23 @@ def chunk(self, text: str) -> List[Chunk]:
         current_chunk = []
         current_chunk_length = 0
 
+        current_index = 0
+
         for i, (word, length) in enumerate(zip(words, lengths)):
             if current_chunk_length + length <= self.chunk_size:
                 current_chunk.append(word)
                 current_chunk_length += length
             else:
-                chunk = self._create_chunk(current_chunk, text, current_chunk_length)
+                chunk = self._create_chunk(
+                    current_chunk,
+                    text,
+                    current_chunk_length,
+                    current_index,
+                )
                 chunks.append(chunk)
-
                 # update the current_chunk and previous chunk
                 previous_chunk_length = current_chunk_length
-
-                current_chunk = []
-                current_chunk_length = 0
+                current_index = chunk.end_index
 
                 overlap = []
                 overlap_length = 0